Author: Adafaly Matthieu <br>
Licence:

# Importing libraries


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy.stats import mannwhitneyu
from scipy.stats import norm
from scipy.stats import wilcoxon

# Data


In [None]:
# Load the DataFrame stored in the pickle file (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code - only load this file if it
# comes from a trusted source.
df = pd.read_pickle("Data/pollution_rennes_filter.pkl")
print("dataframe loaded")

In [None]:
# Render the loaded DataFrame as the cell output.
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in - confirm when running outside a notebook.
display(df)

In [None]:
# Keep only the rows that carry a PM2.5 reading, split by the kind of sensor
# ('fixedGps' vs 'mobileGps') recorded in the 'sensor_type' column.
df_stationnary = df[df['sensor_type'].eq('fixedGps') & df['PM_2.5'].notna()]
df_mobile = df[df['sensor_type'].eq('mobileGps') & df['PM_2.5'].notna()]

## Data visualization

In [None]:
# Mean PM₂.₅ for every (hour, sensor) pair, with the
# 'standalone-LOPY-AQ05' station left out of the result.
mean_values = (
    df.groupby(['hour', 'sensor_name'])['PM_2.5']
    .mean()
    .reset_index()
    .loc[lambda frame: frame['sensor_name'] != 'standalone-LOPY-AQ05']
)

# One line per sensor on a 12x6 inch figure
plt.figure(figsize=(12, 6))
hourly_ax = sns.lineplot(x='hour', y='PM_2.5', hue='sensor_name', data=mean_values)

# Title and axis labels
hourly_ax.set_title("Average PM₂.₅ Concentration by Hour and Sensor")
hourly_ax.set_xlabel("Time of Day (Hour)")
hourly_ax.set_ylabel("PM₂.₅ Concentration (µg/m³)")

# Legend placed outside the axes, on the right-hand side
hourly_ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title="Station")
plt.tight_layout()

plt.show()


In [None]:
# Turn 'day_week' into an ordered categorical so that the days are grouped
# and plotted Monday -> Sunday instead of alphabetically.
week_days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['day_week'] = pd.Categorical(
    df['day_week'],
    categories=week_days_order,
    ordered=True,
)

# Mean PM2.5 for every (day of week, sensor) pair; observed=True keeps only
# the category combinations that actually occur in the data.
mean_values = (
    df.groupby(['day_week', 'sensor_name'], observed=True)['PM_2.5']
    .mean()
    .reset_index()
)

# One line (with point markers) per sensor
plt.figure(figsize=(12, 6))
weekday_ax = sns.lineplot(
    x='day_week',
    y='PM_2.5',
    hue='sensor_name',
    marker='o',
    data=mean_values,
)

# Title and axis labels
weekday_ax.set_title("Average PM₂.₅ Concentration by Day of the Week and Sensor")
weekday_ax.set_xlabel("Day of the Week")
weekday_ax.set_ylabel("PM₂.₅ Concentration (µg/m³)")

# Legend placed outside the axes, on the right-hand side
weekday_ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title="Sensor")
plt.tight_layout()

plt.show()

In [None]:
# Daily mean PM₂.₅ over all sensors, drawn together with a centered
# rolling-mean overlay.

# Smoothing window: number of consecutive daily means averaged together
lissage = 14

# One mean per calendar day: every measurement timestamp is truncated to midnight
global_mean = df.groupby(
    df.index.get_level_values('measure_date').floor('D')
)['PM_2.5'].mean().reset_index()

global_mean.columns = ['date', 'mean_v']

# Adding the smoothed mean. The window is centered, so rows at both ends
# that do not have a full window are NaN and are simply not drawn.
global_mean['smoothed_mean'] = global_mean['mean_v'].rolling(window=lissage, center=True).mean()

# Interactive plot of the global mean
fig = px.line(
    global_mean,
    x='date',
    y='mean_v',
    markers=True,
    title="Daily Mean PM₂.₅ Concentration",
    labels={
        'mean_v': 'Concentration (µg/m³)',
        'date': 'Date'
    }
)

# Style the main line now, while it is the only trace in the figure.
# A name-based selector such as dict(name=None) does not work here: the
# Plotly Express trace is named '' (empty string), so it never compares equal
# to None and the styling would silently be skipped.
fig.update_traces(line_color='darkorange', line_width=3)

# Adding the smoothed line (black line)
fig.add_scatter(
    x=global_mean['date'],
    y=global_mean['smoothed_mean'],
    mode='lines',
    name=f'Smoothed mean ({lissage} days)',
    line=dict(color='black', width=3)
)
fig.update_layout(hovermode='x unified')

fig.show()

In [None]:
# Boxplot of PM2.5 per sensor (Plotly).
# Work on a copy that exposes the 'sensor_name' index level as a regular column.
df_box = df.assign(sensor_name=df.index.get_level_values('sensor_name'))

fig = px.box(
    df_box,
    x="sensor_name",
    y="PM_2.5",
    title="Distribution of pollution value by sensor",
    labels={"PM_2.5": "Pollution value (µg/m³)", "sensor_name": "Sensor name"},
)

# Visible y-range: from the 5th to the 95th percentile of all PM2.5 values
y_low = df['PM_2.5'].quantile(0.05)
y_high = df['PM_2.5'].quantile(0.95)

fig.update_layout(
    # Uniform 40 px margins (left, right, top, bottom) to recenter the plot
    margin=dict(l=40, r=40, t=40, b=40),
    # Ensure that the boxes do not overlap
    boxmode='group',
    yaxis=dict(range=[y_low, y_high]),
)

# Display the graph
fig.show()

In [None]:
# Daily mean PM₂.₅ per sensor, as an interactive line chart.

# Convert the 'measure_date' index level to datetimes (in place on the
# MultiIndex levels) and force PM₂.₅ to numeric; unparsable values become NaN.
date_level_pos = df.index.names.index('measure_date')
df.index = df.index.set_levels(
    pd.to_datetime(df.index.levels[date_level_pos]),
    level='measure_date',
)
df['PM_2.5'] = pd.to_numeric(df['PM_2.5'], errors='coerce')

# Grouping keys: the calendar day of each measurement and the sensor name
measure_days = df.index.get_level_values('measure_date').date
sensor_names = df.index.get_level_values('sensor_name')

daily_stats = df.groupby([measure_days, sensor_names])['PM_2.5'].mean().reset_index()
daily_stats.columns = ['date', 'sensor_name', 'mean_value']

# One line (with markers) per sensor
axis_labels = {
    'mean_value': 'Concentration (µg/m³)',
    'date': 'Date',
    'sensor_name': 'Sensor'
}
fig = px.line(
    daily_stats,
    x='date',
    y='mean_value',
    color='sensor_name',
    markers=True,
    title="Daily Mean PM₂.₅ Concentration per Sensor",
    labels=axis_labels,
)

fig.update_layout(hovermode='x unified', legend_title='Sensor')

fig.show()


### PM₂.₅ Data Distribution Across Different Variables


In [None]:
# Overall PM₂.₅ distribution: density histogram + KDE, overlaid with the
# normal curve that has the same mean and standard deviation as the data.

# All PM₂.₅ readings, missing values dropped
data = df['PM_2.5'].dropna()

# Parameters and sample points of the reference normal curve
mu, sigma = data.mean(), data.std()
x = np.linspace(0, 40, 500)
y = norm.pdf(x, loc=mu, scale=sigma)

plt.figure(figsize=(10, 6))

# Empirical distribution
sns.histplot(
    data,
    bins=200,
    kde=True,
    color='orange',
    stat="density",
    edgecolor=None,
    label='Actual data (KDE)',
)

# Reference normal curve
plt.plot(
    x,
    y,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Normal distribution\nμ={mu:.2f}, σ={sigma:.2f}',
)

# Titles, labels and visible x-range
plt.title("Overall PM₂.₅ Distribution with KDE and Normal Curve")
plt.xlabel("PM₂.₅ (µg/m³)")
plt.ylabel("Density")
plt.xlim(0, 40)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# PM₂.₅ distribution for each calendar month (one subplot per month),
# each overlaid with the normal curve fitted to that month's mean and std.

# Month names in English, in calendar order
months_labels = ['January', 'February', 'March', 'April', 'May', 'June',
                 'July', 'August', 'September', 'October', 'November', 'December']

# 4 x 3 grid of subplots, flattened for sequential access
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 12))
axes = axes.flatten()

# Sample points shared by every normal curve
x = np.linspace(0, 40, 500)

for i, (ax, month_label) in enumerate(zip(axes, months_labels), start=1):
    data = df.loc[df['month'] == i, 'PM_2.5'].dropna()

    # A standard deviation needs at least two values
    if len(data) < 2:
        ax.set_title(f"{month_label} (Not enough data)")
        continue

    # Density histogram with KDE
    sns.histplot(data, bins=200, kde=True, ax=ax, color='orange', stat="density", edgecolor=None)

    # Normal curve with this month's mean and standard deviation
    mu, sigma = data.mean(), data.std()
    y = norm.pdf(x, loc=mu, scale=sigma)
    ax.plot(x, y, color='red', linestyle='--', linewidth=2, label='Normal')

    ax.set_title(month_label)
    ax.set_xlim(0, 40)
    ax.set_xlabel("PM₂.₅ (µg/m³)")
    ax.set_ylabel("Density")
    ax.legend()

# Drop any subplot beyond the number of months
for unused_ax in axes[len(months_labels):]:
    fig.delaxes(unused_ax)

fig.suptitle("PM₂.₅ Distribution by Month with Normal Curve", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# PM₂.₅ distribution for each day of the week (one subplot per day),
# each overlaid with the normal curve fitted to that day's mean and std.

# Days of the week, in plotting order
days_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# 2 x 4 grid: seven days plus one slot that stays empty
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6))
fig.suptitle("PM₂.₅ Distribution by Day of the Week with Normal Curve", fontsize=16)

# Sample points shared by every normal curve
x = np.linspace(0, 60, 500)

# axes.flat walks the grid row by row, so the days fill it left to right
for ax, day in zip(axes.flat, days_labels):
    data = df.loc[df['day_week'] == day, 'PM_2.5'].dropna()

    # A standard deviation needs at least two values
    if len(data) < 2:
        ax.set_title(f"{day} (Not enough data)", fontsize=10)
        ax.axis('off')
        continue

    # Density histogram with KDE
    sns.histplot(data, bins=200, kde=True, ax=ax, color='orange', stat="density", edgecolor=None)

    # Normal curve with this day's mean and standard deviation
    mu, sigma = data.mean(), data.std()
    y = norm.pdf(x, loc=mu, scale=sigma)
    ax.plot(x, y, color='red', linestyle='--', linewidth=2, label='Normal')

    ax.set_title(day, fontsize=10)
    ax.set_xlim(0, 60)
    ax.set_xlabel('PM₂.₅ (µg/m³)', fontsize=9)
    ax.set_ylabel('Density', fontsize=9)
    ax.legend(fontsize=8)

# Hide the eighth (unused) subplot
axes[1, 3].axis('off')

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
# PM₂.₅ distribution for each sensor (one subplot per sensor),
# each overlaid with the normal curve fitted to that sensor's mean and std.

# Sensor name of every row, and the distinct (non-null) sensor names
sensor_index = df.index.get_level_values('sensor_name')
sensor_labels = sensor_index.dropna().unique()

# Grid layout: three columns, as many rows as needed
cols = 3
n = len(sensor_labels)
rows = -(-n // cols)  # ceiling division

fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(15, 5 * rows))
fig.suptitle("PM₂.₅ Distribution by Sensor with Normal Curve", fontsize=16)

# 1-D view of the grid so subplots can be addressed by position
axes = axes.flatten()

# Sample points shared by every normal curve
x = np.linspace(0, 40, 500)

for i, sensor in enumerate(sensor_labels):
    ax = axes[i]
    data = df.loc[sensor_index == sensor, 'PM_2.5'].dropna()

    # A standard deviation needs at least two values
    if len(data) < 2:
        ax.set_title(f"{sensor} (Not enough data)", fontsize=10)
        ax.axis('off')
        continue

    # Density histogram with KDE
    sns.histplot(data, bins=200, kde=True, ax=ax, color='orange', stat="density", edgecolor=None)

    # Normal curve with this sensor's mean and standard deviation
    mu, sigma = data.mean(), data.std()
    y = norm.pdf(x, loc=mu, scale=sigma)
    ax.plot(x, y, color='red', linestyle='--', linewidth=2, label='Normal')

    ax.set_title(sensor, fontsize=10)
    ax.set_xlim(0, 40)
    ax.set_xlabel('PM₂.₅ (µg/m³)', fontsize=9)
    ax.set_ylabel('Density', fontsize=9)
    ax.legend(fontsize=8)

# Hide the grid slots that come after the last plotted sensor
for j in range(i + 1, len(axes)):
    axes[j].axis('off')

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
# Compare PM2.5 levels between each pair of consecutive months
# (January-February, February-March, ..., November-December).
#
# The two monthly samples are independent sets of measurements, so they are
# compared with the Mann-Whitney U test (also known as the Wilcoxon rank-sum
# test). The Wilcoxon signed-rank test (`scipy.stats.wilcoxon`) is meant for
# paired observations; on two independently drawn random samples it would
# pair values arbitrarily and its result would not be meaningful.

# Dictionary mapping month numbers to month names
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
    7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'
}

# Upper bound on the number of measurements drawn from each month
max_sample_size = 1000

for month in range(1, 12):  # Stop at 11 to compare month 1-2, 2-3, ..., 11-12
    # Select pollution data for each month
    data_month1 = df.loc[df['month'] == month, 'PM_2.5'].dropna()
    data_month2 = df.loc[df['month'] == month + 1, 'PM_2.5'].dropna()

    # Check if data is not empty
    if len(data_month1) > 0 and len(data_month2) > 0:
        # Draw a random sample (without replacement) from each month. The size
        # is capped at what the month actually holds: asking for more values
        # than are available raises ValueError when replace=False.
        data_month1 = np.random.choice(
            data_month1, min(max_sample_size, len(data_month1)), replace=False
        )
        data_month2 = np.random.choice(
            data_month2, min(max_sample_size, len(data_month2)), replace=False
        )

        # Two-sided test for independent samples (sizes may differ)
        stat, p_value = mannwhitneyu(data_month1, data_month2, alternative='two-sided')

        print(f"Comparison between {month_names[month]} and {month_names[month + 1]}:")
        print(f"  Statistic: {stat}, p-value: {p_value}")

        # Interpret the p-value at the 5% significance level
        if p_value < 0.05:
            print("  There is a significant difference between the pollution levels of the two months.")
        else:
            print("  There is no significant difference between the pollution levels of the two months.")
    else:
        print(f"  Not enough data to compare {month_names[month]} and {month_names[month + 1]}.")
    print("-" * 90)  # Separator between comparisons
