### Exploratory Data Analysis
ensure `SA2_2021_AUST_SHP_GDA94` and `requirements.txt` are to the same directory as this Jupyter notebook file 

for linux command, run this beforehand:
```
sudo apt-get install gdal-bin
sudo apt-get install libgdal-dev

sudo apt install python-geopandas
```

In [25]:
!pip install -r ./requirements.txt

Collecting pandas==1.4.2
  Using cached pandas-1.4.2-cp39-cp39-macosx_10_9_x86_64.whl (11.1 MB)
Collecting dash==2.1
  Using cached dash-2.1.0-py3-none-any.whl (7.4 MB)
Installing collected packages: dash, pandas
  Attempting uninstall: dash
    Found existing installation: dash 2.3.1
    Uninstalling dash-2.3.1:
      Successfully uninstalled dash-2.3.1
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.4
    Uninstalling pandas-1.3.4:
      Successfully uninstalled pandas-1.3.4
Successfully installed dash-2.1.0 pandas-1.4.2


In [245]:
import pandas as pd
import geopandas as gpd
import json
import plotly.express as px
import plotly.graph_objects as go
from jupyter_dash import JupyterDash

from dash import Dash, html, dcc
# import dash_core_components as dcc
from dash.dependencies import Input, Output

import couchdb

from datetime import datetime

MAPBOX_ACCESS_TOKEN="pk.eyJ1Ijoia3VrYWhlYWRsYSIsImEiOiJjbDJ5Mml5MHEweHlkM2tvNWxodm1najcwIn0.h1p7x_rboO1iqycGcogJFQ"

### Exploratory Data Analysis for `Historical Tweets`

In [466]:
USER = 'user'
PASSWORD = 'password'

server = couchdb.Server('http://{}:{}@172.26.134.34:5984/'.format(USER, PASSWORD))

In [467]:
db = server['new_tweets']
envir_df = server['envir_test1']

In [468]:
# envir_db.view('area_week/area_week_topic')
topic_agg = db.view('area_week/area_week_topic', include_doc=True)
topic_df = pd.DataFrame((row.key+[row.value['sentiments']['compound']] for row in topic_agg),
                        columns = ['time', 'area', 'topic','sentiment'])

# Aggregate by average sentiment 
topic_df = topic_df.groupby(['time', 'area', 'topic']).mean().reset_index()

#filter out wNaN 
topic_df['week'] = topic_df.apply(lambda x: x['time'].split('-')[0], axis=1)
topic_df = topic_df[topic_df['week'] != 'wNaN']

#Change string to datetime format and filtering out non-valid string
topic_df['time'] = topic_df.apply(lambda row: datetime.strptime(row['time']+'-1', 'w%W-%Y-%w'), axis=1) #append -1 as Monday
topic_df = topic_df[topic_df['area'] != 'zzzzzzzzz'] 

topic_df.head()

Unnamed: 0,time,area,topic,sentiment,week
0,2020-01-06,berry - kangaroo valley,environment,0.3818,w1
1,2021-01-04,guildford west - merrylands west,environment,-0.0108,w1
2,2021-01-04,springvale south,environment,0.0,w1
3,2021-01-04,springvale south,health,0.254367,w1
4,2021-01-04,sydney (north) - millers point,environment,0.7839,w1


In [469]:
sa2_gdf = gpd.read_file("./SA2_2021_AUST_SHP_GDA94")
sa2_gdf['SA2_NAME21'] = sa2_gdf['SA2_NAME21'].str.lower()
sa2_gdf.to_crs(pyproj.CRS.from_epsg(4283), inplace=True)


In [470]:
topic_df = topic_df.merge(sa2_gdf, left_on='area', right_on='SA2_NAME21', how='left')
topic_df = gpd.GeoDataFrame(topic_df, crs="EPSG:4283", geometry=topic_df.geometry)
topic_df.head()

Unnamed: 0,time,area,topic,sentiment,week,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,...,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
0,2020-01-06,berry - kangaroo valley,environment,0.3818,w1,114011272,berry - kangaroo valley,0,No change,11401,...,Southern Highlands and Shoalhaven,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,537.0884,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((150.41186 -34.75384, 150.41140 -34.7..."
1,2021-01-04,guildford west - merrylands west,environment,-0.0108,w1,125031484,guildford west - merrylands west,0,No change,12503,...,Sydney - Parramatta,1GSYD,Greater Sydney,1,New South Wales,AUS,Australia,5.4654,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((150.96026 -33.84519, 150.96039 -33.8..."
2,2021-01-04,springvale south,environment,0.0,w1,212041318,springvale south,0,No change,21204,...,Melbourne - South East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,4.56,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.13575 -37.96907, 145.13531 -37.9..."
3,2021-01-04,springvale south,health,0.254367,w1,212041318,springvale south,0,No change,21204,...,Melbourne - South East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,4.56,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.13575 -37.96907, 145.13531 -37.9..."
4,2021-01-04,sydney (north) - millers point,environment,0.7839,w1,117031644,sydney (north) - millers point,1,New,11703,...,Sydney - City and Inner South,1GSYD,Greater Sydney,1,New South Wales,AUS,Australia,3.2122,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"MULTIPOLYGON (((151.22538 -33.85526, 151.22524..."


In [460]:
# dff = topic_df.copy()
# gdff = dff[dff['time'].dt.year == 2022]
# gdff = gdff[gdff['topic'] == 'health']
# gdff.set_index('area', inplace=True)
# gdff.head()

Unnamed: 0_level_0,time,topic,sentiment,week
area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
springvale south,2022-03-07,health,0.738814,w10
springvale south,2022-03-14,health,0.7373,w11
springvale south,2022-03-28,health,0.0,w13
kangaroo island,2022-04-04,health,0.1774,w14
springvale south,2022-04-04,health,0.0,w14


Considering  `envir_test1` data wrangglings below (feel fee to ignore)

In [216]:
envir_topic = envir_db.view('area_week/area_week_topic', include_doc=True)
envir_topic_df = pd.DataFrame((row.key+[row.value['sentiments']['compound']] for row in envir_topic),
                        columns = ['time', 'area', 'topic','sentiment'])
envir_topic_df = envir_topic_df.groupby(['time', 'area', 'topic']).mean().reset_index()
envir_topic_df['time'] = envir_topic_df.apply(lambda row: datetime.strptime(row['time']+'-1', 'w%W-%Y-%w'), axis=1) #append -1 as Monday
envir_topic_df = envir_topic_df[envir_topic_df['area'] != 'zzzzzzzzz']
envir_topic_df.head()

Unnamed: 0,time,area,topic,sentiment
0,2020-01-06,berry - kangaroo valley,environment,0.3818
1,2021-01-04,guildford west - merrylands west,environment,-0.0108
2,2021-01-04,springvale south,environment,0.0
3,2021-01-04,sydney (north) - millers point,environment,0.7839
5,2018-03-05,guildford west - merrylands west,environment,-0.1027


In [221]:
# envir_topic_df = envir_topic_df.merge(sa2_gdf, left_on='area', right_on='SA2_NAME21', how='left')
envir_topic_overall_df = envir_topic_df.groupby(['area', 'topic']).mean().reset_index()
# envir_topic_overall_df.head()

envir_topic_overall_df = envir_topic_overall_df.merge(sa2_gdf, left_on='area', right_on='SA2_NAME21', how='left')
envir_topic_overall_df

Unnamed: 0,area,topic,sentiment,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
0,adelaide,environment,-0.827100,401011001,adelaide,0,No change,40101,Adelaide City,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,South Australia,AUS,Australia,10.4824,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((138.58096 -34.93024, 138.58094 -34.9..."
1,alexandra,environment,0.378950,204011054,alexandra,0,No change,20401,Upper Goulburn Valley,204,Hume,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,2118.9554,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.59014 -37.22478, 145.58638 -37.2..."
2,armadale,environment,0.836000,206061135,armadale,0,No change,20606,Stonnington - West,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,2.1835,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.01167 -37.85358, 145.01176 -37.8..."
3,asquith - mount colah,environment,0.515200,121021403,asquith - mount colah,0,No change,12102,Hornsby,121,Sydney - North Sydney and Hornsby,1GSYD,Greater Sydney,1,New South Wales,AUS,Australia,34.6889,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((151.08587 -33.67043, 151.08610 -33.6..."
4,avoca,environment,0.000000,201031013,avoca,0,No change,20103,Maryborough - Pyrenees,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,1714.2397,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.10106 -37.10084, 143.10043 -37.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,warrnambool - south,environment,-0.313646,217041480,warrnambool - south,0,No change,21704,Warrnambool,217,Warrnambool and South West,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,120.1957,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((142.45281 -38.39127, 142.45229 -38.3..."
84,williamstown,environment,0.000000,213021346,williamstown,0,No change,21302,Hobsons Bay,213,Melbourne - West,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,7.3463,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((144.88699 -37.85080, 144.88729 -37.8..."
85,wollongong - west,environment,-0.726900,107041549,wollongong - west,0,No change,10704,Wollongong,107,Illawarra,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,5.6592,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((150.86880 -34.42428, 150.86929 -34.4..."
86,woollahra,environment,0.000000,118011347,woollahra,0,No change,11801,Eastern Suburbs - North,118,Sydney - Eastern Suburbs,1GSYD,Greater Sydney,1,New South Wales,AUS,Australia,1.2805,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((151.25350 -33.88694, 151.25378 -33.8..."


In [242]:
min(envir_topic_overall_df.sentiment)

-0.886

In [229]:
# envir_topic_overall.to_crs(pyproj.CRS.from_epsg(4283), inplace=True)
envir_topic_overall_df = gpd.GeoDataFrame(envir_topic_overall_df, crs="EPSG:4283", geometry=envir_topic_overall_df.geometry)
# envir_topic_overall_df.set_index('area', inplace=True)
envir_topic_overall_df.head()

Unnamed: 0_level_0,topic,sentiment,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
adelaide,environment,-0.8271,401011001,adelaide,0,No change,40101,Adelaide City,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,South Australia,AUS,Australia,10.4824,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((138.58096 -34.93024, 138.58094 -34.9..."
alexandra,environment,0.37895,204011054,alexandra,0,No change,20401,Upper Goulburn Valley,204,Hume,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,2118.9554,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.59014 -37.22478, 145.58638 -37.2..."
armadale,environment,0.836,206061135,armadale,0,No change,20606,Stonnington - West,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,2.1835,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.01167 -37.85358, 145.01176 -37.8..."
asquith - mount colah,environment,0.5152,121021403,asquith - mount colah,0,No change,12102,Hornsby,121,Sydney - North Sydney and Hornsby,1GSYD,Greater Sydney,1,New South Wales,AUS,Australia,34.6889,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((151.08587 -33.67043, 151.08610 -33.6..."
avoca,environment,0.0,201031013,avoca,0,No change,20103,Maryborough - Pyrenees,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,1714.2397,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.10106 -37.10084, 143.10043 -37.0..."


In [472]:
fig = px.choropleth_mapbox(envir_topic_overall_df, geojson=envir_topic_overall_df.geometry, 
                            locations=envir_topic_overall_df.index,
                            color=envir_topic_overall_df.sentiment,
                            zoom= 5,
                            center = {"lat": -37.8136, "lon": 144.9631}, mapbox_style="carto-positron")
fig.update_layout(mapbox_style="dark", mapbox_accesstoken=MAPBOX_ACCESS_TOKEN)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [250]:
geometry_json = json.loads(envir_topic_overall_df.geometry.to_json())

In [253]:
# fig = go.Figure(
#     data=go.Choroplethmapbox(geojson=geometry_json,
#                             locations=envir_topic_overall_df.index,
#                             z=envir_topic_overall_df.sentiment,
#                             zmin=min(envir_topic_overall_df.sentiment), zmax=max(envir_topic_overall_df.sentiment),
#                             colorscale="Viridis",
#                             marker_opacity=0.5, marker_line_width=0)
# )
# fig.update_layout(mapbox_style="carto-positron",
#                   mapbox_zoom=3, 
#                   mapbox_center = {"lat": -37.8136, "lon": 144.9631},
#                   margin={"r":0,"t":0,"l":0,"b":0})
# fig.show()

In [289]:
# envir_topic_df = envir_topic_df.merge(sa2_gdf, left_on='area', right_on='SA2_NAME21', how='left')
# envir_topic_df = gpd.GeoDataFrame(envir_topic_df, crs="EPSG:4283", geometry=envir_topic_df.geometry)
envir_topic_df.crs

<Geographic 2D CRS: EPSG:4283>
Name: GDA94
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: Australia including Lord Howe Island, Macquarie Island, Ashmore and Cartier Islands, Christmas Island, Cocos (Keeling) Islands, Norfolk Island. All onshore and offshore.
- bounds: (93.41, -60.55, 173.34, -8.47)
Datum: Geocentric Datum of Australia 1994
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [448]:
dff = topic_df.copy()
gdff = dff[dff['time'].dt.year == 2022]
gdff = gdff[gdff['topic'] == 'health']
gdff.set_index('area', inplace=True)
gdff.head()

Unnamed: 0_level_0,time,topic,sentiment,week,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,...,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
springvale south,2022-03-07,health,0.738814,w10,212041318,springvale south,0,No change,21204,Dandenong,...,Melbourne - South East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,4.56,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.13575 -37.96907, 145.13531 -37.9..."
springvale south,2022-03-14,health,0.7373,w11,212041318,springvale south,0,No change,21204,Dandenong,...,Melbourne - South East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,4.56,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.13575 -37.96907, 145.13531 -37.9..."
springvale south,2022-03-28,health,0.0,w13,212041318,springvale south,0,No change,21204,Dandenong,...,Melbourne - South East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,4.56,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.13575 -37.96907, 145.13531 -37.9..."
kangaroo island,2022-04-04,health,0.1774,w14,407011145,kangaroo island,0,No change,40701,Fleurieu - Kangaroo Island,...,South Australia - South East,4RSAU,Rest of SA,4,South Australia,AUS,Australia,4400.7382,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"MULTIPOLYGON (((136.69272 -36.08554, 136.69281..."
springvale south,2022-04-04,health,0.0,w14,212041318,springvale south,0,No change,21204,Dandenong,...,Melbourne - South East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,4.56,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((145.13575 -37.96907, 145.13531 -37.9..."


In [440]:
# fig = px.choropleth_mapbox(gdff, 
#                         geojson=gdff.geometry,
#                         color=gdff.sentiment,
#                         locations=gdff.index,
#                         zoom=5,
#                         center = {"lat": -37.8136, "lon": 144.9631},
#                         mapbox_style="carto-positron")
# fig.show()

In [471]:
app = JupyterDash(__name__)

drop_down_lst = list(map(lambda x: x.capitalize(), topic_df.topic.unique()))

app.layout = html.Div([
    html.H1('Sentiment Transportation on Dashboard', 
            style={'text_align': 'center', 'color':'#FDA172', 'font-family':'sans-serif'}),
    html.Div(id="heading-description", children=[
        html.P('This dashboard shows the sentiment of selected topic in Australia. (+ve means happy, -ve mean sad)', 
            style={'text_align': 'center', 'color':'#FFFFFF', 'font-family':'sans-serif'}),
    ]),
    dcc.Dropdown(drop_down_lst , drop_down_lst[0],
                id='topic-dropdown',
                style={'width': '50%', 
                    'font-family':'sans-serif', 
                    'margin': '0px 0px 5px 0px', 
                    'padding': '5px 0px 5px 0px',
                    'background-color':'#282828'},
                clearable=False
    ),
    dcc.Graph(id='choropleth-mapbox', figure={}, style={'width':'50%'}),
    html.Div(
        children=[
            html.P(
                id="slider-text",
                children="Drag the slider to change the year:",
                style={'color':'#FFECE8', 'font-family':'sans-serif'}
            ),
            dcc.Slider(
                    id='year-slider',
                    min=min(topic_df.time).year,
                    max=max(topic_df.time).year,
                    value=max(topic_df.time).year,
                    marks={str(year): {
                        'label':str(year), 
                        'style': {"color": "#FDA172", 'font-family':'sans-serif'} 
                    } for year in range(min(topic_df.time).year, max(topic_df.time).year+1, 1)},
                    step=None
            )
        ],
        style={'width': '48%', 'padding': '20px 20px 20px 20px', 'background-color':'#282828'}),
    html.Div(id='output-container', children=[], 
            style={'text_align': 'center', 'color':'#FDA172', 'font-family':'sans-serif', 'padding':'10px'},),
])

@app.callback(
    [Output(component_id='output-container', component_property='children'),
     Output(component_id='choropleth-mapbox', component_property='figure')
    ],
    [Input(component_id='year-slider', component_property='value'),
     Input(component_id='topic-dropdown', component_property='value')]
)

def update_graph(year, topic):
    container = "The year selected was: {}".format(year)
    
    dff = topic_df.copy()
    gdff = dff[dff['time'].dt.year == year]
    gdff = gdff[gdff['topic'] == topic.lower()]
    gdff.set_index('area', inplace=True)
    fig = px.choropleth_mapbox(gdff, 
                        geojson=gdff.geometry,
                        color=gdff.sentiment,
                        locations=gdff.index,
                        zoom=5,
                        center = {"lat": -37.8136, "lon": 144.9631},
                        mapbox_style="carto-positron",
                        color_continuous_scale="oranges")
    fig.update_layout(mapbox_style="dark", mapbox_accesstoken=MAPBOX_ACCESS_TOKEN)
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0}, 
                    paper_bgcolor='rgba(0,0,0,0)', 
                    plot_bgcolor='rgba(0,0,0,0)',
                    font_color='#FFFFFF')

    return container, fig

app.run_server(mode="external", debug=True)

Dash app running on http://127.0.0.1:8050/
