In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from datetime import datetime
import folium
from folium.plugins import MarkerCluster
from jupyter_dash import JupyterDash
from dash import Dash, dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objects as go



In [2]:
def query_es(body, index_name, timeout=30):
    """
    Query an Elasticsearch index through the local Flask proxy.

    Builds the JSON payload ``{"body": body, "index": index_name}`` and POSTs
    it to the '/data-extract' route on 127.0.0.1:9090. The Flask application
    behind that route is expected to forward the query to Elasticsearch.

    Parameters:
    - body (dict): The Elasticsearch query in the form of a dictionary.
    - index_name (str): The name of the Elasticsearch index to be queried.
    - timeout (float | tuple | None): Seconds handed to ``requests.post``.
      Defaults to 30 so an unresponsive proxy cannot hang the notebook
      forever; pass ``None`` to wait without limit.

    Returns:
    - The decoded JSON response when the request succeeds (HTTP 200).
    - str: On any other status code, a message containing that status code.
      Callers must check for this before iterating the result.

    Raises:
    - requests.exceptions.RequestException: connection errors and timeouts
      raised by ``requests.post`` are not caught here.

    Example of 'body':
    {
        "query": {
            "match": {
                "text": "search term"
            }
        }
    }
    """
    # route url: /data-extract --method POST
    url = "http://127.0.0.1:9090/data-extract"
    payload = {"body": body, "index": index_name}
    response = requests.post(url, json=payload, timeout=timeout)
    if response.status_code == 200:
        return response.json()  # Parse the response as JSON
    return f"Failed to fetch data: Status code {response.status_code}"

In [3]:
# Query the twitter index for tweets created inside a fixed window.
twitter_query = {
    "query": {
        "range": {
            "created_at": {
                # Modify these two bounds to query a different date range.
                "gte": "2022-05-11T00:00:00Z",
                "lte": "2022-07-31T23:59:59Z",
                "format": "strict_date_optional_time"
            }
        }
    },
    # Modify this to request a different number of results.
    "size": 1000
}

index_name = 'twitter'
twitter_data = query_es(body=twitter_query, index_name=index_name)

# query_es signals failure by returning a message string instead of raising.
# Iterating that string would fail later with an unrelated TypeError, so stop
# here with the real reason.
if isinstance(twitter_data, str):
    raise RuntimeError(twitter_data)

# Flatten each hit into one record: document id, timestamp, coordinates, sentiment.
extracted_twitter_data = [
    {
        "id": entry["_id"],
        "created_at": entry["_source"]["created_at"],
        "latitude": entry["_source"]["geo"]["latitude"],
        "longitude": entry["_source"]["geo"]["longitude"],
        "sentiment": entry["_source"]["sentiment"]
    }
    for entry in twitter_data
]

# Convert to DataFrame
df_twitter = pd.DataFrame(extracted_twitter_data)

# Display the DataFrame
df_twitter.head()

Unnamed: 0,id,created_at,latitude,longitude,sentiment
0,1524258746012106753,2022-05-11T05:22:59Z,-34.207857,146.950756,-0.081081
1,1524328442337976320,2022-05-11T09:59:55Z,-34.452279,142.290361,0.05
2,1524337742389256192,2022-05-11T10:36:53Z,-36.070411,144.596631,-0.057143
3,1524258523672354816,2022-05-11T05:22:06Z,-37.727357,145.508053,0.428571
4,1524353091755069440,2022-05-11T11:37:52Z,-36.818742,145.895791,0.050847


In [4]:
# Query the EPA index: fetch up to 100 documents with no filtering.
epa_query = {
    "query": {
        "match_all": {}
    },
    "size": 100
}
index_name = 'epa-000001'

epa_data = query_es(body=epa_query, index_name=index_name)

# query_es signals failure by returning a message string instead of raising.
# Iterating that string would fail later with an unrelated TypeError, so stop
# here with the real reason.
if isinstance(epa_data, str):
    raise RuntimeError(epa_data)

# Extract relevant information: one output row per (site document, parameter)
# pair, so a site with several parameters appears several times with the same id.
extracted_epa_data = []
for entry in epa_data:
    source = entry["_source"]
    for parameter in source["parameters"]:
        extracted_epa_data.append({
            "id": entry["_id"],
            "latitude": source["latitude"],
            "longitude": source["longitude"],
            "siteID": source["siteID"],
            "siteName": source["siteName"],
            "parameter_name": parameter["name"],
            "averageValue": parameter["averageValue"],
            "startDateTime": parameter["startDateTime"],
            "timeSeriesName": parameter["timeSeriesName"],
            "totalSample": parameter["totalSample"],
            "unit": parameter["unit"],
            "untilDateTime": parameter["untilDateTime"]
        })

# Convert to DataFrame
df_epa = pd.DataFrame(extracted_epa_data)

# Display the DataFrame
print(df_epa.head())

                     id  latitude  longitude  \
0  hb08a48BAp0N_qlVRIya  145.1324 -37.828728   
1  hb08a48BAp0N_qlVRIya  145.1324 -37.828728   
2  iL08a48BAp0N_qlVV4xG  146.5392 -38.295850   
3  iL08a48BAp0N_qlVV4xG  146.5392 -38.295850   
4  ir08a48BAp0N_qlVWoxs  146.4828 -38.129670   

                                 siteID         siteName parameter_name  \
0  77062cb7-3e3b-4984-b6d0-03dda76177f2         Box Hill          PM2.5   
1  77062cb7-3e3b-4984-b6d0-03dda76177f2         Box Hill          PM2.5   
2  cddf953a-b932-4918-97ea-1d19583d507a  Traralgon South      Particles   
3  cddf953a-b932-4918-97ea-1d19583d507a  Traralgon South      Particles   
4  69fa2d5e-557c-457a-9103-21bc2609f5eb      Tyers North      Particles   

   averageValue         startDateTime timeSeriesName  totalSample  \
0          4.39  2024-05-12T04:00:00Z         1HR_AV           13   
1          6.88  2024-05-11T05:00:00Z        24HR_AV          312   
2          4.65  2024-05-12T04:00:00Z         1HR_AV 

In [5]:
# Replace the real tweet timestamps with synthetic ones so the tweets fall in
# the same May 2024 window as the EPA readings.
df_twitter.drop(columns=['created_at'], inplace=True)

# Define the time range
start_time = datetime(2024, 5, 12)
end_time = datetime(2024, 5, 16)

# Convert the bounds to whole epoch seconds, treating the naive datetimes as UTC.
# datetime.timestamp() would interpret them in the machine's local timezone and
# return floats, so the seeded output would differ from machine to machine.
start_epoch_s = pd.Timestamp(start_time).value // 10**9
end_epoch_s = pd.Timestamp(end_time).value // 10**9

# Generate random times (upper bound is exclusive)
np.random.seed(0)  # Set random seed for reproducibility
random_dates = pd.to_datetime(np.random.randint(start_epoch_s, end_epoch_s, len(df_twitter)), unit='s')

# Assign random times to a new column
df_twitter['created_at'] = random_dates

# Display the result
print(df_twitter.head())


                    id   latitude   longitude  sentiment          created_at
0  1524258746012106753 -34.207857  146.950756  -0.081081 2024-05-15 02:55:11
1  1524328442337976320 -34.452279  142.290361   0.050000 2024-05-12 22:45:52
2  1524337742389256192 -36.070411  144.596631  -0.057143 2024-05-13 08:18:35
3  1524258523672354816 -37.727357  145.508053   0.428571 2024-05-15 02:28:57
4  1524353091755069440 -36.818742  145.895791   0.050847 2024-05-13 00:02:59


In [6]:
# Change all 'parameter_name' values to 'PM2.5'.
# NOTE: this overwrites whatever name each row had (the recorded head() output
# of the raw frame also shows 'Particles'), so from here on every reading is
# labelled PM2.5 regardless of what the source document called it.
df_epa['parameter_name'] = 'PM2.5'

# Generate random timestamps between May 12, 2024, and May 16, 2024
def random_dates(start, end, n):
    """
    Draw ``n`` random second-resolution timestamps from ``[start, end)``.

    Parameters:
    - start, end: objects exposing ``.value`` as nanoseconds since the epoch
      (the notebook passes ``pd.Timestamp`` values).
    - n: number of timestamps to draw.

    Returns:
    - The result of ``pd.to_datetime(..., unit='s')`` on the drawn epoch
      seconds. Draws come from NumPy's global random state.
    """
    nanos_per_second = 10**9
    lower_s, upper_s = (bound.value // nanos_per_second for bound in (start, end))
    epoch_seconds = np.random.randint(low=lower_s, high=upper_s, size=n)
    return pd.to_datetime(epoch_seconds, unit='s')

# Generate random timestamps for each record, replacing the real 'untilDateTime'
# values. No seed is set in this cell, so the draw continues from whatever state
# NumPy's global generator is in.
start_date = pd.to_datetime('2024-05-12')
end_date = pd.to_datetime('2024-05-16 23:59:59')
df_epa['untilDateTime'] = random_dates(start_date, end_date, len(df_epa))

# Drop 'startDateTime', 'siteID', 'timeSeriesName', 'unit', and 'totalSample' columns.
# errors='ignore' keeps the cell re-runnable: on a second run these columns are
# already gone and a plain drop would raise KeyError.
df_epa.drop(
    columns=['startDateTime', 'siteID', 'timeSeriesName', 'unit', 'totalSample'],
    inplace=True,
    errors='ignore',
)

# View the updated DataFrame
print(df_epa.head(20))


                      id    latitude  longitude         siteName  \
0   hb08a48BAp0N_qlVRIya  145.132400 -37.828728         Box Hill   
1   hb08a48BAp0N_qlVRIya  145.132400 -37.828728         Box Hill   
2   iL08a48BAp0N_qlVV4xG  146.539200 -38.295850  Traralgon South   
3   iL08a48BAp0N_qlVV4xG  146.539200 -38.295850  Traralgon South   
4   ir08a48BAp0N_qlVWoxs  146.482800 -38.129670      Tyers North   
5   ir08a48BAp0N_qlVWoxs  146.482800 -38.129670      Tyers North   
6   i708a48BAp0N_qlVW4we  145.198700 -37.985760        Dandenong   
7   i708a48BAp0N_qlVW4we  145.198700 -37.985760        Dandenong   
8   i708a48BAp0N_qlVW4we  145.198700 -37.985760        Dandenong   
9   i708a48BAp0N_qlVW4we  145.198700 -37.985760        Dandenong   
10  i708a48BAp0N_qlVW4we  145.198700 -37.985760        Dandenong   
11  i708a48BAp0N_qlVW4we  145.198700 -37.985760        Dandenong   
12  i708a48BAp0N_qlVW4we  145.198700 -37.985760        Dandenong   
13  jL08a48BAp0N_qlVXYx8  146.258331 -38.186466 

In [7]:
# Requires df_twitter and df_epa from the cells above.
# Make sure both timestamp columns are parsed as pandas datetimes.
for _frame, _column in ((df_epa, 'untilDateTime'), (df_twitter, 'created_at')):
    _frame[_column] = pd.to_datetime(_frame[_column])

In [8]:
# The EPA frame arrives with its coordinate columns under each other's names:
# the recorded output shows 'latitude' around 145 and 'longitude' around -37.
# A latitude can never exceed 90 in magnitude, so use that as the signal that
# the swap is still pending. Without this check, re-running the cell would swap
# the columns back to the wrong names.
if df_epa['latitude'].abs().gt(90).any():
    df_epa.rename(columns={'latitude': 'longitude', 'longitude': 'latitude'}, inplace=True)
df_epa.head()

Unnamed: 0,id,longitude,latitude,siteName,parameter_name,averageValue,untilDateTime
0,hb08a48BAp0N_qlVRIya,145.1324,-37.828728,Box Hill,PM2.5,4.39,2024-05-13 14:05:06
1,hb08a48BAp0N_qlVRIya,145.1324,-37.828728,Box Hill,PM2.5,6.88,2024-05-12 20:27:58
2,iL08a48BAp0N_qlVV4xG,146.5392,-38.29585,Traralgon South,PM2.5,4.65,2024-05-12 15:17:40
3,iL08a48BAp0N_qlVV4xG,146.5392,-38.29585,Traralgon South,PM2.5,5.33,2024-05-12 01:35:03
4,ir08a48BAp0N_qlVWoxs,146.4828,-38.12967,Tyers North,PM2.5,4.2,2024-05-12 08:26:31


In [9]:
# Build the first Dash app: a heading, a single-date picker and an iframe
# whose srcDoc is filled in by the update_map callback.
app = Dash(__name__)

# The picker is limited to the span of the EPA timestamps and opens on 12 May 2024.
_epa_until = df_epa['untilDateTime']
_map_default_day = datetime(2024, 5, 12)

_map_date_picker = dcc.DatePickerSingle(
    id='date-picker',
    min_date_allowed=_epa_until.min().date(),
    max_date_allowed=_epa_until.max().date(),
    initial_visible_month=_map_default_day.date(),
    date=_map_default_day.date(),
)
_map_frame = html.Iframe(id='map', srcDoc=None, width='100%', height='600')

# Define the application layout
app.layout = html.Div([
    html.H1("EPA and Twitter Data Interactive Map"),
    _map_date_picker,
    _map_frame,
])

@app.callback(
    Output('map', 'srcDoc'),
    Input('date-picker', 'date')
)
def update_map(selected_date):
    """
    Render the Folium map for one calendar day and return it as HTML.

    Parameters:
    - selected_date: value of the date picker's 'date' property, parsed with
      ``pd.to_datetime``. When it is None or empty (for example after the
      picker is cleared) the map is returned without any markers instead of
      raising.

    Returns:
    - str: ``m._repr_html_()``, used as the iframe's srcDoc. The map has one
      marker cluster for EPA rows and one for tweets, plus a layer control.
    """
    # Create a folium map
    map_center = [-37.8136, 144.9631]  # Center of Melbourne
    m = folium.Map(location=map_center, zoom_start=10)

    # Both clusters are always created so the layer control lists them even
    # when there is nothing to plot.
    epa_cluster = MarkerCluster(name='EPA Data').add_to(m)
    twitter_cluster = MarkerCluster(name='Twitter Data').add_to(m)

    if selected_date:
        selected_day = pd.to_datetime(selected_date).date()

        # Filter data to rows whose timestamp falls on the selected day
        filtered_epa = df_epa[df_epa['untilDateTime'].dt.date == selected_day]
        filtered_twitter = df_twitter[df_twitter['created_at'].dt.date == selected_day]

        for _, row in filtered_epa.iterrows():
            folium.Marker(
                location=[row['latitude'], row['longitude']],
                popup=f"<b>Site:</b> {row['siteName']}<br><b>Parameter:</b> {row['parameter_name']}<br><b>Value:</b> {row['averageValue']}",
                icon=folium.Icon(color='blue', icon='cloud')  # Use different icon and color
            ).add_to(epa_cluster)

        for _, row in filtered_twitter.iterrows():
            folium.Marker(
                location=[row['latitude'], row['longitude']],
                popup=f"<b>Sentiment:</b> {row['sentiment']}<br><b>Created At:</b> {row['created_at']}",
                icon=folium.Icon(color='red', icon='comment')  # Use different icon and color
            ).add_to(twitter_cluster)

    # Add layer control
    folium.LayerControl().add_to(m)

    # Return the HTML representation of the Folium map
    return m._repr_html_()

# Run the application on a different port.
# Dash.run is the supported entry point; run_server is its deprecated alias and
# is rejected by newer Dash releases.
if __name__ == '__main__':
    app.run(debug=True, port=8051)


In [10]:
# Preprocess data: parse both timestamp columns and remove any timezone
# information so they compare cleanly against the naive picker dates.
for _frame, _column in ((df_epa, 'untilDateTime'), (df_twitter, 'created_at')):
    _parsed = pd.to_datetime(_frame[_column])
    _frame[_column] = _parsed.dt.tz_localize(None)


In [11]:
# Build the second Dash app (this rebinds `app`): a pollutant dropdown and a
# date-range picker feeding five chart tabs, each filled in by its own callback.
app = Dash(__name__)

# Values derived from the EPA frame once, instead of recomputing them inline.
epa_pollutants = df_epa['parameter_name'].unique()
epa_first_day = df_epa['untilDateTime'].min().date()
epa_last_day = df_epa['untilDateTime'].max().date()

# (tab label, graph component id) for every chart; the ids are the callback outputs.
chart_tabs = [
    ('Time Series', 'time-series-graph'),
    ('Scatter Plot', 'scatter-plot'),
    ('Bar Chart', 'bar-chart'),
    ('Box Plot', 'box-plot'),
    ('Heatmap', 'heatmap'),
]

# Dash style dictionaries use camelCased CSS property names (fontFamily, not
# font-family).
app.layout = html.Div(
    style={'backgroundColor': 'white', 'color': 'black', 'fontFamily': 'Arial, sans-serif', 'padding': '20px'},
    children=[
        html.H1("EPA and Twitter Data Analysis", style={'textAlign': 'center', 'color': '#333'}),

        html.Label("Select Pollutant", style={'margin': '10px 0'}),
        dcc.Dropdown(
            id='pollutant-dropdown',
            options=[{'label': pollutant, 'value': pollutant} for pollutant in epa_pollutants],
            # Fall back to no selection rather than raising IndexError on an empty frame.
            value=epa_pollutants[0] if len(epa_pollutants) else None,
            style={'width': '50%', 'margin': 'auto'}
        ),

        html.Label("Select Date Range", style={'margin': '10px 0'}),
        dcc.DatePickerRange(
            id='date-picker',
            min_date_allowed=epa_first_day,
            max_date_allowed=epa_last_day,
            start_date=epa_first_day,
            end_date=epa_last_day,
            style={'width': '50%', 'margin': 'auto'}
        ),

        dcc.Tabs(
            [
                dcc.Tab(label=tab_label, children=[
                    dcc.Graph(id=graph_id, style={'height': '600px'})
                ])
                for tab_label, graph_id in chart_tabs
            ],
            style={'marginTop': '20px'}
        )
    ]
)

In [12]:
@app.callback(
    Output('time-series-graph', 'figure'),
    [Input('pollutant-dropdown', 'value'),
     Input('date-picker', 'start_date'),
     Input('date-picker', 'end_date')]
)
def update_time_series(selected_pollutant, start_date, end_date):
    """
    Plot the daily mean pollutant value and daily mean tweet sentiment as two lines.

    Parameters:
    - selected_pollutant: dropdown value; EPA rows are restricted to this
      'parameter_name'.
    - start_date, end_date: date-range picker values. Both days are included
      in full.

    Returns:
    - go.Figure: EPA values on the left axis and sentiment on a right-hand
      axis; an empty figure when a date is missing or either daily series
      has no rows.
    """
    # A cleared picker sends None; there is nothing sensible to filter on.
    if not start_date or not end_date:
        return go.Figure()

    range_start = pd.to_datetime(start_date).normalize()
    # The picker yields a date that parses to midnight, so "<= end_date" would
    # drop every record on the final day. Use an exclusive bound at the start
    # of the following day instead.
    range_end = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)

    epa_filtered = df_epa.loc[(df_epa['untilDateTime'] >= range_start) &
                              (df_epa['untilDateTime'] < range_end) &
                              (df_epa['parameter_name'] == selected_pollutant)].copy()
    twitter_filtered = df_twitter.loc[(df_twitter['created_at'] >= range_start) &
                                      (df_twitter['created_at'] < range_end)].copy()

    # Non-numeric entries become NaN and are skipped by mean().
    epa_filtered['averageValue'] = pd.to_numeric(epa_filtered['averageValue'], errors='coerce')
    twitter_filtered['sentiment'] = pd.to_numeric(twitter_filtered['sentiment'], errors='coerce')

    # Grouping by .dt.date keeps the original column name as the key column.
    epa_daily = epa_filtered.groupby(epa_filtered['untilDateTime'].dt.date)['averageValue'].mean().reset_index()
    twitter_daily = twitter_filtered.groupby(twitter_filtered['created_at'].dt.date)['sentiment'].mean().reset_index()

    if epa_daily.empty or twitter_daily.empty:
        return go.Figure()

    fig_time_series = go.Figure()
    fig_time_series.add_trace(go.Scatter(
        x=epa_daily['untilDateTime'], y=epa_daily['averageValue'],
        mode='lines+markers', name=f'{selected_pollutant} (EPA)',
        line=dict(color='#1f77b4', width=2),
        marker=dict(size=8, color='#1f77b4'),
        hoverinfo='text',
        hovertext=[f"Date: {d}<br>{selected_pollutant}: {v:.2f}" for d, v in zip(epa_daily['untilDateTime'], epa_daily['averageValue'])]
    ))
    fig_time_series.add_trace(go.Scatter(
        x=twitter_daily['created_at'], y=twitter_daily['sentiment'],
        mode='lines+markers', name='Twitter Sentiment',
        line=dict(color='#ff7f0e', width=2, dash='dash'),
        marker=dict(size=8, color='#ff7f0e'),
        yaxis='y2',
        hoverinfo='text',
        hovertext=[f"Date: {d}<br>Sentiment: {v:.2f}" for d, v in zip(twitter_daily['created_at'], twitter_daily['sentiment'])]
    ))

    fig_time_series.update_layout(
        # Title follows the dropdown rather than hard-coding one pollutant.
        title=f'Time Series of {selected_pollutant} (EPA) and Twitter Sentiment',
        xaxis_title='Date',
        yaxis_title=f'{selected_pollutant} (EPA)',
        yaxis2=dict(title='Twitter Sentiment', overlaying='y', side='right', showgrid=False),
        legend=dict(x=0.5, y=1, xanchor='center', yanchor='bottom', orientation='h'),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(
            tickformat='%Y-%m-%d',
            showgrid=True,
            gridcolor='lightgrey'
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor='lightgrey'
        ),
        hovermode='x unified'
    )

    return fig_time_series


In [13]:
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('date-picker', 'start_date'),
     Input('date-picker', 'end_date')]
)
def update_scatter_plot(start_date, end_date):
    """
    Scatter the daily mean EPA value against the daily mean tweet sentiment.

    Parameters:
    - start_date, end_date: date-range picker values. Both days are included
      in full.

    Returns:
    - go.Figure: one marker per day that has both an EPA mean and a sentiment
      mean; an empty figure when a date is missing.
    """
    # A cleared picker sends None; there is nothing sensible to filter on.
    if not start_date or not end_date:
        return go.Figure()

    range_start = pd.to_datetime(start_date).normalize()
    # The picker yields a date that parses to midnight, so "<= end_date" would
    # drop every record on the final day. Use an exclusive bound at the start
    # of the following day instead.
    range_end = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)

    epa_filtered = df_epa[(df_epa['untilDateTime'] >= range_start) &
                          (df_epa['untilDateTime'] < range_end)]
    twitter_filtered = df_twitter[(df_twitter['created_at'] >= range_start) &
                                  (df_twitter['created_at'] < range_end)]

    # Daily means, with the grouping key renamed to a shared 'date' column.
    epa_daily = epa_filtered.groupby(epa_filtered['untilDateTime'].dt.date)['averageValue'].mean().reset_index()
    epa_daily.columns = ['date', 'averageValue']
    twitter_daily = twitter_filtered.groupby(twitter_filtered['created_at'].dt.date)['sentiment'].mean().reset_index()
    twitter_daily.columns = ['date', 'sentiment']

    # Inner join: only days present in both sources yield a point.
    merged_data = pd.merge(epa_daily, twitter_daily, on='date')

    fig_scatter = go.Figure(data=go.Scatter(
        x=merged_data['averageValue'], y=merged_data['sentiment'],
        mode='markers',
        marker=dict(size=10, color='blue')
    ))

    fig_scatter.update_layout(
        title='Scatter Plot of PM2.5 vs Twitter Sentiment',
        xaxis_title='PM2.5 (EPA)',
        yaxis_title='Twitter Sentiment'
    )

    return fig_scatter


In [14]:
@app.callback(
    Output('bar-chart', 'figure'),
    [Input('pollutant-dropdown', 'value'),
     Input('date-picker', 'start_date'),
     Input('date-picker', 'end_date')]
)
def update_bar_chart(selected_pollutant, start_date, end_date):
    """
    Draw grouped daily bars of the mean pollutant value and mean tweet sentiment.

    Parameters:
    - selected_pollutant: dropdown value; EPA rows are restricted to this
      'parameter_name'.
    - start_date, end_date: date-range picker values. Both days are included
      in full.

    Returns:
    - go.Figure: one bar pair per day present in either source, with a
      missing value drawn as 0; an empty figure when a date is missing or no
      day has data.
    """
    # A cleared picker sends None; there is nothing sensible to filter on.
    if not start_date or not end_date:
        return go.Figure()

    range_start = pd.to_datetime(start_date).normalize()
    # The picker yields a date that parses to midnight, so "<= end_date" would
    # drop every record on the final day. Use an exclusive bound at the start
    # of the following day instead.
    range_end = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)

    epa_filtered = df_epa.loc[(df_epa['untilDateTime'] >= range_start) &
                              (df_epa['untilDateTime'] < range_end) &
                              (df_epa['parameter_name'] == selected_pollutant)].copy()
    twitter_filtered = df_twitter.loc[(df_twitter['created_at'] >= range_start) &
                                      (df_twitter['created_at'] < range_end)].copy()

    # Non-numeric entries become NaN and are skipped by mean().
    epa_filtered['averageValue'] = pd.to_numeric(epa_filtered['averageValue'], errors='coerce')
    twitter_filtered['sentiment'] = pd.to_numeric(twitter_filtered['sentiment'], errors='coerce')

    epa_daily = epa_filtered.groupby(epa_filtered['untilDateTime'].dt.date)['averageValue'].mean().reset_index()
    epa_daily.columns = ['date', 'averageValue']
    twitter_daily = twitter_filtered.groupby(twitter_filtered['created_at'].dt.date)['sentiment'].mean().reset_index()
    twitter_daily.columns = ['date', 'sentiment']

    # Merge on one shared 'date' key. Merging on two differently named keys
    # and then filling every NaN with 0 turned the date of a tweet-only day
    # into 0, which plotted a bogus x value and broke strftime below.
    combined_df = pd.merge(epa_daily, twitter_daily, on='date', how='outer').sort_values(by='date')
    value_columns = ['averageValue', 'sentiment']
    combined_df[value_columns] = combined_df[value_columns].fillna(0)

    if combined_df.empty:
        return go.Figure()

    fig_bar = go.Figure()
    fig_bar.add_trace(go.Bar(
        x=combined_df['date'], y=combined_df['averageValue'],
        name=f'{selected_pollutant} (EPA)',
        marker_color='#1f77b4',
        yaxis='y',
        offsetgroup=1
    ))
    fig_bar.add_trace(go.Bar(
        x=combined_df['date'], y=combined_df['sentiment'],
        name='Twitter Sentiment',
        marker_color='#ff7f0e',
        yaxis='y2',
        offsetgroup=2
    ))

    fig_bar.update_layout(
        # Title follows the dropdown rather than hard-coding one pollutant.
        title=f'Bar Chart of {selected_pollutant} (EPA) and Twitter Sentiment',
        xaxis_title='Date',
        yaxis=dict(
            title=f'{selected_pollutant} (EPA)',
            showgrid=True,
            gridcolor='lightgrey'
        ),
        yaxis2=dict(
            title='Twitter Sentiment',
            overlaying='y',
            side='right',
            showgrid=False
        ),
        legend=dict(
            x=0.5,
            y=1,
            xanchor='center',
            yanchor='bottom',
            orientation='h'
        ),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(255,255,255,0.9)',
        xaxis=dict(
            tickformat='%Y-%m-%d',
            showgrid=True,
            gridcolor='lightgrey',
            # One tick per plotted day, labelled explicitly.
            tickvals=combined_df['date'],
            ticktext=[d.strftime('%Y-%m-%d') for d in combined_df['date']]
        ),
        barmode='group',
        hovermode='x unified',
        bargap=0.2
    )

    return fig_bar



In [15]:
@app.callback(
    Output('box-plot', 'figure'),
    [Input('date-picker', 'start_date'),
     Input('date-picker', 'end_date')]
)
def update_box_plot(start_date, end_date):
    """
    Show the distributions of EPA values and tweet sentiment as two box plots.

    Parameters:
    - start_date, end_date: date-range picker values. Both days are included
      in full.

    Returns:
    - go.Figure: one box per source, each on its own y axis (EPA left,
      sentiment right); an empty figure when a date is missing.
    """
    # A cleared picker sends None; there is nothing sensible to filter on.
    if not start_date or not end_date:
        return go.Figure()

    range_start = pd.to_datetime(start_date).normalize()
    # The picker yields a date that parses to midnight, so "<= end_date" would
    # drop every record on the final day. Use an exclusive bound at the start
    # of the following day instead.
    range_end = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)

    epa_filtered = df_epa[(df_epa['untilDateTime'] >= range_start) &
                          (df_epa['untilDateTime'] < range_end)]
    twitter_filtered = df_twitter[(df_twitter['created_at'] >= range_start) &
                                  (df_twitter['created_at'] < range_end)]

    epa_color = '#1f77b4'
    sentiment_color = '#ff7f0e'

    fig_box = go.Figure()
    fig_box.add_trace(go.Box(
        y=epa_filtered['averageValue'], name='PM2.5 (EPA)',
        marker=dict(color=epa_color),
        yaxis='y1',
        hoverinfo='text',
        customdata=epa_filtered['untilDateTime'],
        hovertemplate='<b>Date</b>: %{customdata|%Y-%m-%d %H:%M:%S}<br><b>PM2.5</b>: %{y:.2f}'
    ))
    fig_box.add_trace(go.Box(
        y=twitter_filtered['sentiment'], name='Twitter Sentiment',
        marker=dict(color=sentiment_color),
        yaxis='y2',
        hoverinfo='text',
        customdata=twitter_filtered['created_at'],
        hovertemplate='<b>Date</b>: %{customdata|%Y-%m-%d %H:%M:%S}<br><b>Sentiment</b>: %{y:.2f}'
    ))

    fig_box.update_layout(
        title={'text': 'Box Plot of PM2.5 and Twitter Sentiment', 'x': 0, 'xanchor': 'left'},
        # The axis title colour goes in title.font; the older 'titlefont'
        # shorthand is deprecated and rejected by newer Plotly releases.
        yaxis=dict(
            title=dict(text='PM2.5 (EPA)', font=dict(color=epa_color)),
            side='left',
            tickfont=dict(color=epa_color),
            showgrid=True,
            gridcolor='lightgrey'
        ),
        yaxis2=dict(
            title=dict(text='Twitter Sentiment', font=dict(color=sentiment_color)),
            side='right',
            overlaying='y',
            tickfont=dict(color=sentiment_color),
            showgrid=False
        ),
        showlegend=True,
        legend=dict(x=0.5, y=1.2, xanchor='center', orientation="h"),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)'
    )

    return fig_box


In [16]:
@app.callback(
    Output('heatmap', 'figure'),
    [Input('date-picker', 'start_date'),
     Input('date-picker', 'end_date')]
)
def update_heatmap(start_date, end_date):
    """
    Draw a two-row heatmap of the daily mean EPA value and daily mean sentiment.

    Parameters:
    - start_date, end_date: date-range picker values. Both days are included
      in full.

    Returns:
    - go.Figure: columns are days present in both sources and rows are
      'PM2.5' and 'Sentiment', sharing one colour scale; an empty figure when
      a date is missing.
    """
    # A cleared picker sends None; there is nothing sensible to filter on.
    if not start_date or not end_date:
        return go.Figure()

    range_start = pd.to_datetime(start_date).normalize()
    # The picker yields a date that parses to midnight, so "<= end_date" would
    # drop every record on the final day. Use an exclusive bound at the start
    # of the following day instead.
    range_end = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)

    epa_filtered = df_epa[(df_epa['untilDateTime'] >= range_start) &
                          (df_epa['untilDateTime'] < range_end)]
    twitter_filtered = df_twitter[(df_twitter['created_at'] >= range_start) &
                                  (df_twitter['created_at'] < range_end)]

    # Daily means, with the grouping key renamed to a shared 'date' column.
    epa_daily = epa_filtered.groupby(epa_filtered['untilDateTime'].dt.date)['averageValue'].mean().reset_index()
    epa_daily.columns = ['date', 'averageValue']
    twitter_daily = twitter_filtered.groupby(twitter_filtered['created_at'].dt.date)['sentiment'].mean().reset_index()
    twitter_daily.columns = ['date', 'sentiment']

    # Inner join: only days present in both sources become heatmap columns.
    heatmap_data = pd.merge(epa_daily, twitter_daily, on='date')
    fig_heatmap = go.Figure(data=go.Heatmap(
        # Transpose so each metric is a row and each day a column.
        z=heatmap_data[['averageValue', 'sentiment']].values.T,
        x=heatmap_data['date'],
        y=['PM2.5', 'Sentiment'],
        colorscale='Plasma',
        colorbar=dict(title='Value'),
        hoverongaps=False,
        hovertemplate='Date: %{x}<br>%{y}: %{z:.2f}<extra></extra>'
    ))

    fig_heatmap.update_layout(
        title={'text': 'Heatmap of PM2.5 and Twitter Sentiment', 'x': 0.5},
        xaxis_title='Date',
        yaxis_title='',
        yaxis=dict(tickvals=[0, 1], ticktext=['PM2.5', 'Sentiment']),
        xaxis=dict(
            # One tick per plotted day, labelled explicitly.
            tickvals=heatmap_data['date'],
            ticktext=[d.strftime('%Y-%m-%d') for d in heatmap_data['date']],
            tickformat='%Y-%m-%d',
            showgrid=True,
            gridcolor='lightgrey',
            tickangle=45
        ),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        hovermode='x unified'
    )

    return fig_heatmap


In [18]:
# Start the analysis dashboard on its own port.
# Dash.run is the supported entry point; run_server is its deprecated alias and
# is rejected by newer Dash releases.
if __name__ == '__main__':
    app.run(debug=True, port=8052)