Author: Adafaly Matthieu <br>

# Code Explanation:

This code focuses on the data collected from mobile sensors in order to create a map. This map will include several filters (geographical, temporal, etc.) and will allow users to explore the data in more detail for further analysis.

The first cell imports the required libraries and silences the warnings raised by the rest of the notebook.

# Importing libraries


In [None]:
import pandas as pd
import tkinter as tk
from tkcalendar import Calendar
from datetime import datetime, timedelta
import folium
import json
import plotly.io as pio
import plotly.express as px
from sklearn.cluster import DBSCAN
from folium.plugins import Draw
import webbrowser
from IPython.display import display, IFrame, Markdown
from tkinter import filedialog
import warnings
warnings.filterwarnings("ignore")

# Data

In [None]:
# Load the measurements DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load a file from a trusted source.
df = pd.read_pickle("Data/pollution_rennes.pkl")
print("dataframe loaded")

In [None]:
# Keep only the rows coming from mobile GPS sensors that carry a PM 2.5 value,
# then turn the previous index into a regular column.
is_mobile_gps = df['sensor_type'] == 'mobileGps'
has_pm25 = df['PM_2.5'].notna()
df_mobile = df.loc[is_mobile_gps & has_pm25].reset_index()

# Analysis


## Map generation and filter configuration

In [None]:
def get_first_point(coords):
    """
    Extracts the first coordinate pair from a list of coordinates.

    Parameters:
    coords (list): A list of coordinate pairs (long,lat).

    Returns:
    list or None: A list containing the first longitude and latitude as floats,
                  or None if the input is invalid or improperly formatted
                  (not a list, empty, first element too short, not
                  subscriptable, or not convertible to float).
    """
    # Only a non-empty list can hold a first point
    if not isinstance(coords, list) or not coords:
        return None

    first = coords[0]
    try:
        return [float(first[0]), float(first[1])]  # Longitude, Latitude
    except (ValueError, TypeError, IndexError, KeyError):
        # IndexError: fewer than two components; KeyError: mapping instead of a pair
        return None

# Fill 'longitude' / 'latitude' of the 'LineString' rows with the first point of their geometry
def _first_point_component(coords, position):
    """Return component `position` of the first point of `coords`, or None when there is none."""
    if not isinstance(coords, list):
        return None
    first_point = get_first_point(coords)
    return None if first_point is None else first_point[position]


is_linestring = df_mobile['geo_type'] == 'LineString'
for column, position in (('longitude', 0), ('latitude', 1)):
    df_mobile.loc[is_linestring, column] = df_mobile['geo_coords'].apply(
        _first_point_component, args=(position,)
    )

# From here on `df` is another name for the mobile-sensor DataFrame
df = df_mobile

# Calendar day of each measurement, stored as a datetime column
df['date'] = pd.to_datetime(df['measure_date'].dt.date)

# Time of day of each measurement, stored separately
df['time'] = df['measure_date'].dt.time



In [None]:
# Module-level state shared with display_map(), which rebinds both names.
df_clustered = None  # per-cluster averages; set by display_map() when it has points to plot
df_superpose = pd.DataFrame()  # points kept by the date and sensor filters; rebuilt on each display_map() call

def load_geojson_rectangle():
    """
    Ask the user for a GeoJSON file through a file dialog and parse it.

    Returns:
    dict or None: The content of the chosen file decoded from JSON,
                  or None when the dialog is closed without a selection.
    """
    selected_path = filedialog.askopenfilename(filetypes=[("GeoJSON files", "*.geojson")])
    if selected_path:
        with open(selected_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    # Dialog cancelled: nothing to load
    print("No file selected.")
    return None



def filter_by_rectangle(df, geojson_data) -> pd.DataFrame:
    """
    Filters a DataFrame based on the geographic bounds defined by a GeoJSON rectangle.

    The bounds are the min/max longitude and latitude of the first ring of the
    first feature in `geojson_data`; both bounds are inclusive.

    Parameters:
    df (pd.DataFrame): The DataFrame to filter; must have 'latitude' and 'longitude' columns.
    geojson_data (dict): The GeoJSON data containing the rectangle coordinates for filtering.

    Returns:
    pd.DataFrame: An independent copy of the rows located inside the rectangle.
                  Returns an empty DataFrame if there is an error (the error is printed).
    """
    try:
        # First ring of the first feature: a sequence of [longitude, latitude] pairs
        ring = geojson_data['features'][0]['geometry']['coordinates'][0]
        lons = [point[0] for point in ring]
        lats = [point[1] for point in ring]

        inside = (
            df['latitude'].between(min(lats), max(lats))
            & df['longitude'].between(min(lons), max(lons))
        )
        # Copy so that callers can add or overwrite columns without touching `df`
        df_filtered = df[inside].copy()
    except Exception as e:
        # Documented contract: any failure yields an empty DataFrame
        print("Error while filtering with the rectangle:", e)
        return pd.DataFrame()

    print(f"{len(df_filtered)} points found in the rectangle.")
    return df_filtered



def get_color(v) -> str:
    """
    Map a PM 2.5 value to the colour used on the map.

    Parameters:
    v (float): The pollution PM 2.5 value (in µg/m³) of an observation.

    Returns:
    str: "green" below 5, "yellow" from 5 up to (not including) 15,
         "orange" from 15 up to (not including) 25, and "red" otherwise.
    """
    # Exclusive upper bounds, checked in ascending order
    for upper_bound, colour in ((5, "green"), (15, "yellow"), (25, "orange")):
        if v < upper_bound:
            return colour
    return "red"


def display_map(start, end, selected_filters) -> None:
    """
    Build the pollution map for the selected period and sensors, and show it.

    The rows of the module-level DataFrame `df_loca` whose 'date' lies in
    [start, end] and whose 'sensor_name' is in `selected_filters` are grouped
    into spatial clusters with DBSCAN; each cluster becomes one circle marker
    coloured by its mean PM 2.5 (see get_color).

    Side effects:
    - `df_loca['date']` is overwritten with its normalized (midnight) value;
    - the global `df_superpose` is rebound to the filtered points (with a
      'cluster' column when clustering ran);
    - the global `df_clustered` is rebound to one row per cluster when there
      is at least one point to plot;
    - "map_pollution.html" is written to the current directory and opened in
      the web browser. When no point can be plotted, an empty map is passed to
      IPython's display() instead and no file is written.

    Parameters:
    start (datetime): Beginning of the period (inclusive)
    end (datetime): End of the period (inclusive)
    selected_filters (list[str]): Names of the sensors to keep

    Returns:
    None
    """
    global df_clustered
    global df_superpose
    # Local import: only needed to build the file:// URL opened at the end
    from pathlib import Path

    df_loca['date'] = pd.to_datetime(df_loca['date']).dt.normalize()
    in_period = (df_loca['date'] >= start) & (df_loca['date'] <= end)
    filtered = df_loca[in_period].copy()
    print(f"Start Date: {start} \nEnd Date:   {end}")
    print(f"Number of points after date filtering: {len(filtered)}")

    filter_dict = {station: filtered[filtered['sensor_name'] == station] for station in filtered['sensor_name'].unique()}
    print(selected_filters)
    selected_frames = []
    for i in selected_filters:
        if i in filter_dict:
            print(f"Number of points for {i}: {len(filter_dict[i])}")
            selected_frames.append(filter_dict[i])
        else:
            print(f"[⚠️] Sensor '{i}' has no data in this period or area.")
    # Single concat at the end instead of growing the frame inside the loop
    df_superpose = pd.concat(selected_frames, ignore_index=True) if selected_frames else pd.DataFrame()

    print(f"Filtered sensors: {', '.join(selected_filters)}")
    print(f"Number of points after sensor filtering: {len(df_superpose)}")

    # Same map centre in every branch: the mean position of all mobile points
    center = [df_mobile['latitude'].mean(), df_mobile['longitude'].mean()]

    # DBSCAN cannot be fitted on missing coordinates, so cluster only located rows
    if len(df_superpose) == 0:
        located = df_superpose
    else:
        located = df_superpose.dropna(subset=['latitude', 'longitude'])

    if len(located) == 0:
        m = folium.Map(location=center, zoom_start=12)
        display(m)
        return

    coords = located[['latitude', 'longitude']].values
    db = DBSCAN(eps=0.0002, min_samples=1).fit(coords)
    # Index-aligned assignment: rows without coordinates get NaN and are left
    # out of the groupby below
    df_superpose['cluster'] = pd.Series(db.labels_, index=located.index)

    df_clustered = df_superpose.groupby('cluster').agg({
        'latitude': 'mean',
        'longitude': 'mean',
        'PM_2.5': 'mean',
        'sensor_name': lambda x: ', '.join(x.unique())
    }).reset_index()

    m = folium.Map(location=center, zoom_start=12)

    # One toggleable layer per pollution level, keyed by the colour get_color() returns
    groups = {
        "green": folium.FeatureGroup(name="Low (<5 µg/m³)", overlay=True),
        "yellow": folium.FeatureGroup(name="Moderate (5-15 µg/m³)", overlay=True),
        "orange": folium.FeatureGroup(name="High (15-25 µg/m³)", overlay=True),
        "red": folium.FeatureGroup(name="Critical (>25 µg/m³)", overlay=True),
    }

    for _, row in df_clustered.iterrows():
        color = get_color(row['PM_2.5'])
        marker = folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=max(2, min(3, row['PM_2.5'] / 5)),  # radius kept between 2 and 3
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            popup=f"Pollution: {round(row['PM_2.5'], 2)} µg/m³ - Station {row['sensor_name']}"
        )
        groups[color].add_child(marker)

    for group in groups.values():
        group.add_to(m)

    folium.LayerControl().add_to(m)

    # Drawing toolbar limited to rectangles, with export enabled
    draw = Draw(
        export=True,
        draw_options={
            'polyline': False,
            'polygon': False,
            'circle': False,
            'circlemarker': False,
            'marker': False,
            'rectangle': True
        },
        edit_options={'edit': False}
    )
    draw.add_to(m)

    output_path = Path("map_pollution.html").resolve()
    m.save(str(output_path))

    # Open the map; a file:// URL is used because opening a bare filename is not portable
    webbrowser.open(output_path.as_uri())

def request_dates():
    """
    Open the Tkinter window used to pick the period, the sensors and an optional rectangle.

    The window stacks a start calendar and an end calendar (both limited to
    the date range of `df_mobile`), one checkbox per sensor name found in
    `df`, and three buttons: "Load Rectangle", "Quit" and "Confirm".
    The function runs the Tk main loop, so it returns once that loop ends.

    Parameters:
    None

    Returns:
    None
    """
    first_day = df_mobile['date'].min().date()
    last_day = df_mobile['date'].max().date()

    root = tk.Tk()
    root.title("Select Dates")
    root.attributes("-topmost", True)

    # Both calendars share the same configuration and only differ by their initial selection
    calendar_options = dict(selectmode='day', date_pattern='yyyy-mm-dd', mindate=first_day, maxdate=last_day)

    calendar_start = Calendar(root, **calendar_options)
    calendar_start.selection_set(first_day)
    calendar_start.pack(pady=10)

    calendar_end = Calendar(root, **calendar_options)
    calendar_end.selection_set(last_day)
    calendar_end.pack(pady=10)

    filters_frame = tk.Frame(root)
    filters_frame.pack(pady=10)

    # One checkbox per sensor, ticked by default
    filter_vars = {}
    for station in df['sensor_name'].unique():
        checked = tk.BooleanVar(value=True)
        filter_vars[station] = checked
        tk.Checkbutton(filters_frame, text=station, variable=checked).pack(anchor=tk.W)

    def get_dates():
        """
        Read the two calendars and the checkboxes, then call display_map.

        The end date is moved to the last second of the chosen day. The ticked
        sensor names are also stored in the global `selected_filters`.

        Parameters:
        None

        Returns:
        None
        """
        global selected_filters
        day_format = "%Y-%m-%d"
        start_date = datetime.strptime(calendar_start.get_date(), day_format)
        end_date = datetime.strptime(calendar_end.get_date(), day_format) + timedelta(days=1) - timedelta(seconds=1)

        selected_filters = [station for station, filter_var in filter_vars.items() if filter_var.get()]
        display_map(start_date, end_date, selected_filters)

    def on_load_rectangle():
        """
        Load a GeoJSON file and store the rows of `df` inside its rectangle in the global `df_loca`.

        Nothing changes when no file is selected.

        Parameters:
        None

        Returns:
        None
        """
        global df_loca
        geojson_data = load_geojson_rectangle()
        if geojson_data is None:
            return
        df_loca = filter_by_rectangle(df, geojson_data)
        print(f"Number of points after filtering by rectangle: {len(df_loca)}")

    def on_validate():
        """
        Make sure the global `df_loca` exists, then run get_dates.

        When no global `df_loca` exists yet, it is set to `df_mobile`.

        Parameters:
        None

        Returns:
        None
        """
        global df_loca
        if 'df_loca' not in globals():
            df_loca = df_mobile
        print(f"Number of points after filtering by rectangle: {len(df_loca)}")
        get_dates()

    button_frame = tk.Frame(root)
    button_frame.pack(pady=20)

    # Buttons laid out left to right; "Quit" closes the window
    button_specs = (
        ("Load Rectangle", on_load_rectangle),
        ("Quit", root.destroy),
        ("Confirm", on_validate),
    )
    for label, action in button_specs:
        tk.Button(button_frame, text=label, command=action).pack(side=tk.LEFT, padx=10)

    root.mainloop()

request_dates()


## Map Visualizations Based on Active Filters

In [None]:
pio.renderers.default = 'notebook'  # or 'iframe', 'notebook_connected'

# Average PM 2.5 per month over the points kept by the last map filter.
# NOTE(review): relies on a 'month' column already present in df_superpose; it is
# not created in the cells above — confirm it comes with the loaded data.
df_monthly = df_superpose.groupby('month')['PM_2.5'].mean().reset_index()

# Line chart with Plotly. The label keys must be column names of df_monthly,
# otherwise Plotly leaves the raw column name in place.
fig = px.line(df_monthly,
              x='month',
              y='PM_2.5',
              title="Monthly Evolution of Pollution (PM_2.5 in µg/m³)",
              labels={'month': 'Month', 'PM_2.5': 'Pollution Value (µg/m³)'})

# The axis titles set here take precedence over the labels above
fig.update_layout(template='plotly_white', xaxis_title='Month', yaxis_title='µg/m³')
fig.show()

In [None]:
# Bounds of the y-axis: 5th and 95th percentiles of all PM 2.5 values
pm25 = df_superpose['PM_2.5']
y_low = pm25.quantile(0.05)
y_high = pm25.quantile(0.95)

# One box per station
fig = px.box(
    df_superpose,
    x="sensor_name",
    y="PM_2.5",
    title="Distribution of Pollution Values by Station",
    labels={"PM_2.5": "Pollution Value (µg/m³)", "sensor_name": "Station Name"},
)

# Margins (left, right, top, bottom), grouped box mode and clipped y-axis
fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    boxmode='group',
    yaxis=dict(range=[y_low, y_high]),
)

# Display the plot
fig.show()


In [None]:
# Average PM 2.5 per hour and per station.
# NOTE(review): relies on an 'hour' column already present in df_superpose; it is
# not created in the cells above — confirm it comes with the loaded data.
mean_values = df_superpose.groupby(['hour', 'sensor_name'])['PM_2.5'].mean().reset_index()

# Leave this station out of the chart
mean_values = mean_values[mean_values['sensor_name'] != 'standalone-LOPY-AQ05']

# One line per station. The label keys must be column names of mean_values
# ('PM_2.5', not the former 'v (ug/m3)'), otherwise the label is never applied.
fig = px.line(mean_values,
              x='hour',
              y='PM_2.5',
              color='sensor_name',
              title="Average PM_2.5 (ug/m³) by hour and station",
              labels={
                  "hour": "Hour of the Day",
                  "PM_2.5": "Pollution (µg/m³)",
                  "sensor_name": "Station"
              })

# Legend placed to the right of the plot area, with extra right margin for it
fig.update_layout(legend_title_text='Station',
                  legend=dict(x=1.05, y=0.5),
                  margin=dict(r=150))

fig.show()


In [None]:
# Weekdays in calendar order, used as an ordered categorical.
# NOTE(review): relies on a 'day_week' column already present in df_superpose — confirm.
order_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df_superpose['day_week'] = pd.Categorical(df_superpose['day_week'], categories=order_days, ordered=True)

# Mean PM 2.5 for every (weekday, station) pair observed in the data
by_day_and_station = df_superpose.groupby(['day_week', 'sensor_name'], observed=True)
mean_values = by_day_and_station['PM_2.5'].mean().reset_index()

# Text shown on the axes and in the legend
axis_labels = {'day_week': 'Day of the Week', 'PM_2.5': 'Concentration (µg/m³)', 'sensor_name': 'Station'}

# Vertical legend anchored on the right edge, vertically centred
legend_settings = dict(
    title='Station',
    orientation="v",
    yanchor="middle",
    y=0.5,
    xanchor="left",
    x=1
)

# One line (with markers) per station
fig = px.line(
    mean_values,
    x='day_week',
    y='PM_2.5',
    color='sensor_name',
    markers=True,
    title="Average PM_2.5 (ug/m³) by Day of the Week and Station",
    labels=axis_labels
)

# Extra right margin so the legend is not cut off
fig.update_layout(legend=legend_settings, margin=dict(r=150))

# Show the plot
fig.show()


In [None]:
# Weekdays in calendar order, used as an ordered categorical.
# NOTE(review): relies on a 'day_week' column already present in df_superpose — confirm.
order_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df_superpose['day_week'] = pd.Categorical(df_superpose['day_week'], categories=order_days, ordered=True)

# Mean PM 2.5 per weekday, every station pooled together
by_day = df_superpose.groupby('day_week', observed=True)
mean_values = by_day['PM_2.5'].mean().reset_index()

# Single line with markers (no per-station colouring)
axis_labels = {'day_week': 'Day of the Week', 'PM_2.5': 'Concentration (µg/m³)'}
fig = px.line(
    mean_values,
    x='day_week',
    y='PM_2.5',
    markers=True,
    title="Average PM_2.5 (µg/m³) by Day of the Week (All Stations Combined)",
    labels=axis_labels
)

fig.show()


In [None]:
# Calculate the global daily average (all stations)
global_mean = df_superpose.groupby(df_superpose['measure_date'].dt.date)['PM_2.5'].mean().reset_index()
global_mean.columns = ['date', 'mean_v']

# Add smoothed average: centred rolling mean over 7 rows.
# NOTE: the window counts rows (days that have data), so it spans more than
# 7 calendar days when some days are missing.
global_mean['smoothed_mean'] = global_mean['mean_v'].rolling(window=7, center=True).mean()

# Calculate the number of measurements per day
count_by_day = df_superpose.groupby(df_superpose['measure_date'].dt.date)['PM_2.5'].count().reset_index()
count_by_day.columns = ['date', 'count']
global_mean = global_mean.merge(count_by_day, on='date')

# Create the plot; 'count' travels as custom data so the hover text can show it
fig = px.line(
    global_mean,
    x='date',
    y='mean_v',
    markers=True,
    custom_data=['count'],
    title="Daily Average of PM_2.5 (ug/m³)",
    labels={
        'mean_v': 'Concentration (µg/m³)',
        'date': 'Date',
        'count': 'Number of Measurements'
    }
)

# Name the trace so it can be targeted by name afterwards
fig.update_traces(name='Raw Average')

# Hover text and colour of the main line. The selector has to use the trace's
# name: a selector with name=None matches no trace once every trace is named,
# so the styling would be silently skipped.
fig.update_traces(
    hovertemplate="<br>".join([
        "Date: %{x}",
        "Average: %{y:.2f} µg/m³",
        "Number of Measurements: %{customdata[0]}"
    ]),
    line_color='darkorange',
    line_width=3,
    selector=dict(name='Raw Average')
)

# Add the smoothed line
fig.add_scatter(
    x=global_mean['date'],
    y=global_mean['smoothed_mean'],
    mode='lines',
    name='Smoothed Average (7 days)',
    line=dict(color='black', width=3)
)

fig.update_layout(hovermode='x unified')

fig.show()
