Author: Adafaly Matthieu <br>

This notebook is a work in progress: some modifications are still to be made.

# Importing libraries


In [None]:
import plotly.express as px
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from scipy.stats import wilcoxon
from scipy.stats import shapiro
import scipy.stats as stats
from sklearn.model_selection import train_test_split


# Data


In [None]:
# Load the pollution measurements from the pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle(filepath_or_buffer="Data/pollution_rennes_filter.pkl")
print("dataframe loaded")

In [None]:
# Show the loaded DataFrame using the notebook's rich display.
display(df)

In [None]:
# Split the measurements by sensor type, keeping only the rows that have a PM2.5 reading.
df_stationnary = df.loc[df['sensor_type'].eq('fixedGps') & df['PM_2.5'].notna()]
df_mobile = df.loc[df['sensor_type'].eq('mobileGps') & df['PM_2.5'].notna()]

## Data visualization

In [None]:
# Hourly mean PM2.5 for every sensor, with one specific station left out.
mean_values = (
    df.groupby(['hour', 'sensor_name'])['PM_2.5']
    .mean()
    .reset_index()
    .loc[lambda hourly: hourly['sensor_name'] != 'standalone-LOPY-AQ05']
)

# One line per sensor, drawn through the explicit Axes interface.
hourly_fig, hourly_ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=mean_values, x='hour', y='PM_2.5', hue='sensor_name', ax=hourly_ax)

# Title and axis labels.
hourly_ax.set_title("Average PM₂.₅ Concentration by Hour and Sensor")
hourly_ax.set_xlabel("Time of Day (Hour)")
hourly_ax.set_ylabel("PM₂.₅ Concentration (µg/m³)")

# Anchor the legend outside the plotting area, on the right-hand side.
hourly_ax.legend(title="Station", bbox_to_anchor=(1.05, 0.5), loc='center left')
hourly_fig.tight_layout()

plt.show()


In [None]:
# Make 'day_week' an ordered categorical so the days run Monday → Sunday on the x-axis.
week_days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['day_week'] = pd.Categorical(df['day_week'], categories=week_days_order, ordered=True)

# Mean PM2.5 for each (day of week, sensor) pair.
mean_values = (
    df.groupby(['day_week', 'sensor_name'], observed=True)['PM_2.5']
    .mean()
    .reset_index()
)

# One marked line per sensor, drawn through the explicit Axes interface.
weekday_fig, weekday_ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=mean_values,
    x='day_week',
    y='PM_2.5',
    hue='sensor_name',
    marker='o',
    ax=weekday_ax,
)

# Title and axis labels.
weekday_ax.set_title("Average PM₂.₅ Concentration by Day of the Week and Sensor")
weekday_ax.set_xlabel("Day of the Week")
weekday_ax.set_ylabel("PM₂.₅ Concentration (µg/m³)")

# Anchor the legend outside the plotting area, on the right-hand side.
weekday_ax.legend(title="Sensor", bbox_to_anchor=(1.05, 0.5), loc='center left')
weekday_fig.tight_layout()

plt.show()

In [None]:
# Plotly needs the sensor name as a regular column, so copy it out of the index
# into a separate frame (the working frame `df` is left untouched).
df_box = df.assign(sensor_name=df.index.get_level_values('sensor_name'))

# One box per sensor.
fig = px.box(
    df_box,
    x="sensor_name",
    y="PM_2.5",
    title="Distribution of pollution value by sensor",
    labels={"PM_2.5": "Pollution value (µg/m³)", "sensor_name": "Sensor name"},
)

# Tighten the margins, keep boxes side by side, and restrict the y-axis to the
# 5th–95th percentile of PM2.5 so extreme values do not flatten the boxes.
fig.update_layout(
    margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
    boxmode='group',
    yaxis={'range': df['PM_2.5'].quantile([0.05, 0.95]).tolist()},
)

fig.show()

In [None]:
# Width (in days) of the centered rolling window used to smooth the daily mean.
lissage = 14

# Mean PM2.5 over all rows for each calendar day (timestamps floored to the day).
global_mean = df.groupby(
    df.index.get_level_values('measure_date').floor('D')
)['PM_2.5'].mean().reset_index()
global_mean.columns = ['date', 'mean_v']

# Smoothed version of the daily mean.
global_mean['smoothed_mean'] = global_mean['mean_v'].rolling(window=lissage, center=True).mean()

# Interactive plot of the daily mean. The colour is set here, and the width is
# set right below while this is still the only trace in the figure, instead of
# relying on a `selector=dict(name=None)` lookup that may match no trace.
fig = px.line(
    global_mean,
    x='date',
    y='mean_v',
    markers=True,
    title="Daily Mean PM₂.₅ Concentration",
    labels={
        'mean_v': 'Concentration (µg/m³)',
        'date': 'Date'
    },
    color_discrete_sequence=['darkorange'],
)
fig.update_traces(line_width=3)  # main line only: the smoothed trace is added afterwards

# Smoothed line (black), added after the main line has been styled.
fig.add_scatter(
    x=global_mean['date'],
    y=global_mean['smoothed_mean'],
    mode='lines',
    name=f'Smoothed mean ({lissage} days)',
    line=dict(color='black', width=3)
)
fig.update_layout(hovermode='x unified')

fig.show()

### Creating a representative sample

In [None]:
# Number of rows wanted in the representative sample.
n_sample_target = 1000

if len(df) <= n_sample_target:
    # `df` is already at or below the target size — typically because this cell
    # has already run and replaced `df` with the sample. Sampling again would ask
    # train_test_split for a proportion >= 1, which it rejects, so keep `df` as is.
    sample_df = df.copy()
else:
    # Build the stratification key on a working copy so that `df` never carries
    # the helper columns: sensor name, month and PM2.5 decile.
    stratified = df.copy()
    stratified['pollution_bins'] = pd.qcut(stratified['PM_2.5'], q=10, labels=False)
    stratified['strata'] = (
        stratified.index.get_level_values('sensor_name')
        + '_' + stratified['month'].astype(str)
        + '_' + stratified['pollution_bins'].astype(str)
    )

    # Strata with fewer than two rows are dropped before the stratified split.
    counts = stratified['strata'].value_counts()
    rare_classes = counts[counts < 2].index
    stratified = stratified[~stratified['strata'].isin(rare_classes)]

    # Reported after the rare strata are removed, so it matches what is sampled.
    prop = n_sample_target / len(stratified)
    print("Proportion pour 1000 individus:", prop)

    # An integer train_size draws exactly the target number of rows. Only the
    # first part of the split is kept; `_` is not assigned because it is
    # IPython's last-output variable.
    sample_df = train_test_split(
        stratified,
        train_size=n_sample_target,
        stratify=stratified['strata'],
        random_state=42
    )[0]

    # From here on `df` is the sample, without the helper columns.
    df = sample_df.drop(columns=['strata', 'pollution_bins'])

print('fin')
print("Taille de l'échantillon :", len(sample_df))

### PM₂.₅ Data Distribution Across Different Variables


In [None]:
# PM2.5 readings with missing values removed.
data = df['PM_2.5'].dropna()

# Normal distribution with the same mean and standard deviation as the data,
# evaluated on a 0–40 grid.
mu, sigma = data.mean(), data.std()
x = np.linspace(0, 40, 500)
y = norm.pdf(x, mu, sigma)

# Density histogram with KDE, drawn through the explicit Axes interface.
dist_fig, dist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data,
    bins=200,
    kde=True,
    color='orange',
    stat="density",
    edgecolor=None,
    label='Actual data (KDE)',
    ax=dist_ax,
)

# Overlay the theoretical normal curve.
dist_ax.plot(
    x,
    y,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Normal distribution\nμ={mu:.2f}, σ={sigma:.2f}',
)

# Titles, labels and the visible x-range.
dist_ax.set_title("Overall PM₂.₅ Distribution Normal Curve")
dist_ax.set_xlabel("PM₂.₅ (µg/m³)")
dist_ax.set_ylabel("Density")
dist_ax.set_xlim(0, 40)
dist_ax.legend()
dist_fig.tight_layout()
plt.show()


In [None]:
# PM2.5 readings with missing values removed (also read by the normality test cell).
data = df['PM_2.5'].dropna()

# Density histogram with KDE on the current axes; keep the returned Axes for the overlay.
overall_ax = sns.histplot(
    data,
    bins=200,
    kde=True,
    color='orange',
    stat="density",
    edgecolor=None,
    label='Actual data (KDE)',
)

# Normal distribution with the data's mean and standard deviation, on a 0–40 grid.
mu, sigma = data.mean(), data.std()
x = np.linspace(0, 40, 500)
y = norm.pdf(x, mu, sigma)
overall_ax.plot(
    x,
    y,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Normal distribution\nμ={mu:.2f}, σ={sigma:.2f}',
)

# Titles, labels and the visible x-range.
overall_ax.set_title("Overall PM₂.₅ Distribution Normal Curve")
overall_ax.set_xlabel("PM₂.₅ (µg/m³)")
overall_ax.set_ylabel("Density")
overall_ax.set_xlim(0, 40)
overall_ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Shapiro–Wilk normality test on `data`, read at the 0.05 significance level.
stat, p_value = shapiro(data)
print(f"Shapiro-Wilk Test: statistic = {stat:.4f}, p-value = {p_value:.4f}")
print(
    "✅ The data probably follows a normal distribution."
    if p_value > 0.05
    else "❌ The data does not follow a normal distribution."
)


In [None]:
# Force PM2.5 to a numeric dtype in the working frame; values that cannot be
# parsed become NaN (errors='coerce'). The dtype is printed before and after on
# `df` itself — the frame being converted — so the effect is actually visible.
print(df['PM_2.5'].dtype)
df['PM_2.5'] = pd.to_numeric(df['PM_2.5'], errors='coerce')
print(df['PM_2.5'].dtype)

In [None]:
# Log-transform PM2.5. log1p(x) computes log(x + 1), so a reading of 0 maps to 0
# instead of -inf.
df['log_PM2.5'] = np.log1p(df['PM_2.5'])

# Log-transformed values with missing values removed.
data = df['log_PM2.5'].dropna()

# Create the figure
plt.figure(figsize=(10, 6))

# Density histogram with KDE
sns.histplot(data, bins=200, kde=True, color='orange', stat="density", edgecolor=None, label='Actual data (KDE)')

# Theoretical normal curve with the same mean and standard deviation. It is
# evaluated over the observed range of the log-transformed data: the 0–40 window
# used for raw concentrations would squeeze the whole distribution into a sliver.
mu = data.mean()
sigma = data.std()
x_min, x_max = data.min(), data.max()
x = np.linspace(x_min, x_max, 500)
y = norm.pdf(x, mu, sigma)
plt.plot(x, y, color='red', linestyle='--', linewidth=2, label=f'Normal distribution\nμ={mu:.2f}, σ={sigma:.2f}')

# Graph adjustments — the labels state that the axis is on the log scale.
plt.title("Overall log(PM₂.₅ + 1) Distribution with Normal Curve")
plt.xlabel("log(PM₂.₅ + 1)")
plt.ylabel("Density")
plt.xlim(x_min, x_max)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Shapiro–Wilk normality test on `data` (the log-transformed values), read at the
# 0.05 significance level. Missing values are dropped first so that a NaN in the
# input cannot propagate into the statistic and p-value.
stat, p_value = shapiro(data.dropna())
print(f"Shapiro-Wilk Test: statistic = {stat:.4f}, p-value = {p_value:.4f}")
if p_value > 0.05:
    print("✅ The data probably follows a normal distribution.")
else:
    print("❌ The data does not follow a normal distribution.")


In [None]:
# Q-Q plot of the log-transformed PM2.5 values against a normal distribution.
# Missing values are dropped so they cannot disturb the sorted quantiles or the
# fitted reference line.
data = df['log_PM2.5'].dropna()
stats.probplot(data, dist="norm", plot=plt)
plt.title("Q-Q plot for log(PM2.5)")
plt.grid()
plt.show()


The Wilcoxon signed-rank test is a non-parametric test for comparing two related (paired) samples. It evaluates whether the median of the differences between paired observations is significantly different from zero. Unlike the paired t-test, it does not assume that the data are normally distributed, which matters here because pollution measurements are typically skewed.

In [None]:
# Month number → month name, used in the printed report.
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
    7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'
}

# Number of readings drawn from each month, and a seeded generator so that the
# draws — and therefore the printed p-values — are the same on every run.
n_per_group = 50
rng = np.random.default_rng(42)

for month in range(1, 12):  # consecutive pairs: (1, 2), (2, 3), ..., (11, 12)
    # PM2.5 readings of the two months being compared
    data_month1 = df.loc[df['month'] == month, 'PM_2.5'].dropna()
    data_month2 = df.loc[df['month'] == month + 1, 'PM_2.5'].dropna()

    # Both months need more than `n_per_group` readings to be sampled
    if len(data_month1) > n_per_group and len(data_month2) > n_per_group:
        # Random sample (without replacement) of `n_per_group` readings per month
        data_month1 = rng.choice(data_month1.to_numpy(), size=n_per_group, replace=False)
        data_month2 = rng.choice(data_month2.to_numpy(), size=n_per_group, replace=False)

        # The two samples are drawn independently, so the i-th values are not a
        # matched pair. The Wilcoxon signed-rank test requires paired
        # observations; for independent samples the appropriate rank-based test
        # is the Mann–Whitney U test (also known as the Wilcoxon rank-sum test).
        stat, p_value = stats.mannwhitneyu(data_month1, data_month2, alternative='two-sided')

        print(f"Comparison between {month_names[month]} and {month_names[month + 1]}:")
        print(f"  Statistic: {stat}, p-value: {p_value}")

        # Interpret the p-value at the 0.05 level
        if p_value < 0.05:
            print("  ✅ Significant difference in pollution levels.")
        else:
            print("  ❌ No significant difference.")
    else:
        print(f"  Not enough data to compare {month_names[month]} and {month_names[month + 1]}.")
    print("-" * 90)  # Separator between comparisons

In [None]:
# 1. Map a month number to its season
def month_to_season(month):
    """Return the season name for a month number.

    Months 12, 1, 2 give 'Winter'; 3, 4, 5 give 'Spring'; 6, 7, 8 give 'Summer'.
    Every other value gives 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    return 'Fall'

# 2. Add the 'season' column to the DataFrame
df['season'] = df['month'].apply(month_to_season)

# 3. Define the order of the seasons
season_order = ['Winter', 'Spring', 'Summer', 'Fall']

# Number of readings drawn from each season, and a seeded generator so that the
# draws — and therefore the printed p-values — are the same on every run.
n_per_group = 50
rng = np.random.default_rng(42)

# 4. Compare each season with the next one in `season_order`
for season1, season2 in zip(season_order, season_order[1:]):
    data_season1 = df.loc[df['season'] == season1, 'PM_2.5'].dropna()
    data_season2 = df.loc[df['season'] == season2, 'PM_2.5'].dropna()

    # Both seasons need more than `n_per_group` readings to be sampled
    if len(data_season1) > n_per_group and len(data_season2) > n_per_group:
        # Random sample (without replacement) of `n_per_group` readings per season
        data_season1 = rng.choice(data_season1.to_numpy(), size=n_per_group, replace=False)
        data_season2 = rng.choice(data_season2.to_numpy(), size=n_per_group, replace=False)

        # The two samples are drawn independently, so the i-th values are not a
        # matched pair. The Wilcoxon signed-rank test requires paired
        # observations; for independent samples the appropriate rank-based test
        # is the Mann–Whitney U test (also known as the Wilcoxon rank-sum test).
        stat, p_value = stats.mannwhitneyu(data_season1, data_season2, alternative='two-sided')

        print(f"Comparison between {season1} and {season2}:")
        print(f"  Statistic: {stat:.3f}, p-value: {p_value:.4f}")

        # Interpret the p-value at the 0.05 level
        if p_value < 0.05:
            print("  ✅ Significant difference in pollution levels.")
        else:
            print("  ❌ No significant difference.")
    else:
        print(f"Not enough data to compare {season1} and {season2}.")
    print("-" * 80)

In [None]:
# Box plot of PM2.5 by season. The title and axis labels describe the variable
# actually on the x-axis (season), and the seasons are shown in a fixed order.
fig = px.box(df,
             x="season",
             y="PM_2.5",
             title="Distribution of pollution value by season",
             labels={"PM_2.5": "Pollution value (µg/m³)", "season": "Season"},
             category_orders={"season": ['Winter', 'Spring', 'Summer', 'Fall']})
# Update the margins to recenter the box
fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),  # Adjust the left, right, top, bottom margins
    boxmode='group',  # Ensure that the boxes do not overlap
    yaxis=dict(
        range=[df['PM_2.5'].quantile(0.05), df['PM_2.5'].quantile(0.95)]  # Limit the y-axis to the 5-95% of the data
    )
)
# Display the graph
fig.show()