Author: Adafaly Matthieu </br>

This notebook is used to prepare the Data_Exploration notebook.
Its purpose is to create a new dataset that combines a time filter and a location filter, allowing you to analyze a subset of the data according to your preferences.

# Importation of the libraries


In [57]:
import pandas as pd
from IPython.display import display, Markdown
from datetime import datetime, timedelta

# Data

In [58]:
df = pd.read_pickle("Data/pollution_rennes.pkl")
df=df.reset_index()
print("dataframe loaded")

dataframe loaded


Creating the dataframe for fixed stations and the one for mobile stations.

In [60]:
df_stationnary = df.loc[(df['sensor_type'] == 'fixedGps') & (df['PM_2.5'].notna())]
df_mobile = df.loc[(df['sensor_type'] == 'mobileGps') & (df['PM_2.5'].notna())]

In [61]:
df.groupby('sensor_name')['sensor_type'].unique()

sensor_name
parautarin02            [mobileGps]
parautarin30            [mobileGps]
parautarin31            [mobileGps]
parautarin32            [mobileGps]
parautarin33            [mobileGps]
parautarin34            [mobileGps]
parautarin35            [mobileGps]
standalone-LOPY-AQ03     [fixedGps]
standalone-LOPY-AQ09     [fixedGps]
Name: sensor_type, dtype: object

In [62]:
df_comparaison_sensor = pd.DataFrame({
    'Total': df['sensor_name'].value_counts().sort_index(),
}).fillna(0).astype(int)

display(Markdown("### 📊 Chart of the repartion if the pollution have a value or not by sensor"))
display(df_comparaison_sensor)

### 📊 Chart of the repartion if the pollution have a value or not by sensor

Unnamed: 0_level_0,Total
sensor_name,Unnamed: 1_level_1
parautarin02,102431
parautarin30,1066
parautarin31,6832
parautarin32,16111
parautarin33,116
parautarin34,12059
parautarin35,12121
standalone-LOPY-AQ03,403859
standalone-LOPY-AQ09,435155


In [63]:
# Create a series representing the month indices (1 to 12)
month_index = pd.Series(range(1, 13), name="month")
# Create a DataFrame 'df_sensor_type_comparison' by counting occurrences of 'mois' in different DataFrames
df_sensor_type_comparison = pd.DataFrame({
    'global': df['month'].value_counts().reindex(month_index).fillna(0),  # Counts from the 'df' DataFrame
    'mobile': df_mobile['month'].value_counts().reindex(month_index).fillna(0),  # Counts from 'df_mobile'
    'stationnary': df_stationnary['month'].value_counts().reindex(month_index).fillna(0)  # Counts from 'df_stationnary'
}).astype(int)  # Ensures all values are integers

month_names = {
    1: "January", 2: "February", 3: "March", 4: "April",
    5: "May", 6: "June", 7: "July", 8: "August",
    9: "September", 10: "October", 11: "November", 12: "December"
}


# Replace the index with the month names instead of month numbers
df_sensor_type_comparison.index = df_sensor_type_comparison.index.map(month_names)

# Calculate the total row by summing the columns
total_row = df_sensor_type_comparison[['global', 'mobile', 'stationnary']].sum()

# Add the total row to the DataFrame
df_sensor_type_comparison.loc['Total'] = total_row

# Display the title as markdown
display(Markdown("### 📊 Chart of the number of measure by month"))

# Display the resulting DataFrame
display(df_sensor_type_comparison)

### 📊 Chart of the number of measure by month

Unnamed: 0_level_0,global,mobile,stationnary
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
January,0,0,0
February,0,0,0
March,0,0,0
April,47029,776,46253
May,104913,2114,102799
June,110279,3185,107094
July,120965,23050,97915
August,119000,19155,99845
September,101320,17026,84294
October,163103,28500,134603


# Analysis


In [64]:
# Convert the 'measure_date' column to datetime
df['date'] = pd.to_datetime(df['measure_date'], format='%Y-%m-%d', utc=True)

# Function to validate a date in the YYYY-MM-DD format
def validate_date(date_str):
    try:
        # Try to convert the string to a date object
        datetime.strptime(date_str, '%Y-%m-%d')
        return True
    except ValueError:
        return False

# Ask the user for the start and end date
start_date = '2020-04-11'
end_date = "2020-12-16"

# If the user enters "start", use the first date from the DataFrame
if start_date.lower() == "start":
    start_date = df['date'].min().strftime('%Y-%m-%d')  # Use the earliest date in the DataFrame
    print(f"Start date set to: {start_date}")

# If the user enters "end", use the last date from the DataFrame
if end_date.lower() == "end":
    end_date = df['date'].max().strftime('%Y-%m-%d')  # Use the latest date in the DataFrame
    print(f"End date set to: {end_date}")

# Check that the dates are filled and in the correct format
if start_date and end_date:
    if validate_date(start_date) and validate_date(end_date):
        print("Dates are valid.")
    else:
        print("One or more dates are invalid. Please use the YYYY-MM-DD format.")
else:
    print("Date fields cannot be empty.")

# Convert start and end dates to datetime with the desired time
start_date = datetime.strptime(start_date + ' 00:00', '%Y-%m-%d %H:%M')
end_date = datetime.strptime(end_date + ' 23:59', '%Y-%m-%d %H:%M')

# Also convert them to datetime64[ns, UTC] for comparison with pandas
start_date = pd.to_datetime(start_date, utc=True)
end_date = pd.to_datetime(end_date, utc=True)

# Filter the DataFrame based on the selected date range
filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

# Display the filtered results
print("Filtered data:")
display(filtered_df)

Dates are valid.
Filtered data:


Unnamed: 0,sensor_name,measure_date,id,geo_type,geo_coords,start_date,sensor_type,p,PM_2.5,bn,longitude,latitude,day_week,day,hour,month,year,hour_minute_second,date
0,parautarin02,2020-04-14 11:14:39+00:00,5e95acdec541160cddaff95a,Point,"[-1.642062938, 48.11025748]",2020-04-14 09:14:29+00:00,mobileGps,4555.709,3.59,OPC_N3:12,-1.642063,48.110257,Tuesday,14,11,4,2020,11:14:39,2020-04-14 11:14:39+00:00
1,parautarin02,2020-04-14 12:39:46+00:00,5e9592f7c541160cddaff656,Point,"[-1.5904910540000001, 48.113529421]",2020-04-14 12:39:31+00:00,mobileGps,24.064,5.53,OPC_N3:12,-1.590491,48.113529,Tuesday,14,12,4,2020,12:39:46,2020-04-14 12:39:46+00:00
2,parautarin02,2020-04-14 12:40:11+00:00,5e959310c541160cddaff65d,Point,"[-1.5904910540000001, 48.113529421]",2020-04-14 12:39:56+00:00,mobileGps,24.052,5.6,OPC_N3:12,-1.590491,48.113529,Tuesday,14,12,4,2020,12:40:11,2020-04-14 12:40:11+00:00
3,parautarin02,2020-04-14 12:40:36+00:00,5e959328c541160cddaff666,Point,"[-1.5904910540000001, 48.113529421]",2020-04-14 12:40:21+00:00,mobileGps,24.074,5.63,OPC_N3:12,-1.590491,48.113529,Tuesday,14,12,4,2020,12:40:36,2020-04-14 12:40:36+00:00
4,parautarin02,2020-04-14 14:32:48+00:00,5e95b25bc541160cddaff9ab,Point,"[-1.6420568260000001, 48.110504919]",2020-04-14 14:32:48+00:00,mobileGps,24.208,3.72,OPC_N3:12,-1.642057,48.110505,Tuesday,14,14,4,2020,14:32:48,2020-04-14 14:32:48+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989745,standalone-LOPY-AQ09,2020-12-16 01:01:21+00:00,5fd95c7ce5283f00124f1692,Point,"[-1.651349, 48.120699]",2020-05-14 12:35:37+00:00,fixedGps,34.079,6.23,OPC_N3:05,-1.651349,48.120699,Wednesday,16,1,12,2020,01:01:21,2020-12-16 01:01:21+00:00
989746,standalone-LOPY-AQ09,2020-12-16 01:01:55+00:00,5fd95c9ce5283f00124f1696,Point,"[-1.651349, 48.120699]",2020-05-14 12:35:37+00:00,fixedGps,34.08,6.83,OPC_N3:05,-1.651349,48.120699,Wednesday,16,1,12,2020,01:01:55,2020-12-16 01:01:55+00:00
989747,standalone-LOPY-AQ09,2020-12-16 01:02:29+00:00,5fd95cbae5283f00124f169a,Point,"[-1.651349, 48.120699]",2020-05-14 12:35:37+00:00,fixedGps,34.063,7.5,OPC_N3:05,-1.651349,48.120699,Wednesday,16,1,12,2020,01:02:29,2020-12-16 01:02:29+00:00
989748,standalone-LOPY-AQ09,2020-12-16 01:03:03+00:00,5fd95ce0e5283f00124f169c,Point,"[-1.651349, 48.120699]",2020-05-14 12:35:37+00:00,fixedGps,34.06,7.87,OPC_N3:05,-1.651349,48.120699,Wednesday,16,1,12,2020,01:03:03,2020-12-16 01:03:03+00:00


In [65]:
# List of sensor available
print(f"sensor available {df['sensor_name'].unique()}")
sensor_selected = ['standalone-LOPY-AQ03', 'parautarin02', 'standalone-LOPY-AQ09', 'parautarin31', 'parautarin30', 'standalone-LOPY-AQ05', 'parautarin34', 'parautarin33', 'parautarin32', 'parautarin35']
print(f'sensor selected({sensor_selected})')

sensor available ['parautarin02' 'parautarin30' 'parautarin31' 'parautarin32'
 'parautarin33' 'parautarin34' 'parautarin35' 'standalone-LOPY-AQ03'
 'standalone-LOPY-AQ09']
sensor selected(['standalone-LOPY-AQ03', 'parautarin02', 'standalone-LOPY-AQ09', 'parautarin31', 'parautarin30', 'standalone-LOPY-AQ05', 'parautarin34', 'parautarin33', 'parautarin32', 'parautarin35'])


In [66]:
print(sensor_selected)
print(start_date)
print(end_date)

['standalone-LOPY-AQ03', 'parautarin02', 'standalone-LOPY-AQ09', 'parautarin31', 'parautarin30', 'standalone-LOPY-AQ05', 'parautarin34', 'parautarin33', 'parautarin32', 'parautarin35']
2020-04-11 00:00:00+00:00
2020-12-16 23:59:00+00:00


In [67]:
df_filter = df[(df['measure_date'] >= start_date) & (df['measure_date'] <= end_date) & (df['sensor_name'].isin(sensor_selected))]
print("Data filter :")
display(df_filter)

Data filter :


Unnamed: 0,sensor_name,measure_date,id,geo_type,geo_coords,start_date,sensor_type,p,PM_2.5,bn,longitude,latitude,day_week,day,hour,month,year,hour_minute_second,date
0,parautarin02,2020-04-14 11:14:39+00:00,5e95acdec541160cddaff95a,Point,"[-1.642062938, 48.11025748]",2020-04-14 09:14:29+00:00,mobileGps,4555.709,3.59,OPC_N3:12,-1.642063,48.110257,Tuesday,14,11,4,2020,11:14:39,2020-04-14 11:14:39+00:00
1,parautarin02,2020-04-14 12:39:46+00:00,5e9592f7c541160cddaff656,Point,"[-1.5904910540000001, 48.113529421]",2020-04-14 12:39:31+00:00,mobileGps,24.064,5.53,OPC_N3:12,-1.590491,48.113529,Tuesday,14,12,4,2020,12:39:46,2020-04-14 12:39:46+00:00
2,parautarin02,2020-04-14 12:40:11+00:00,5e959310c541160cddaff65d,Point,"[-1.5904910540000001, 48.113529421]",2020-04-14 12:39:56+00:00,mobileGps,24.052,5.6,OPC_N3:12,-1.590491,48.113529,Tuesday,14,12,4,2020,12:40:11,2020-04-14 12:40:11+00:00
3,parautarin02,2020-04-14 12:40:36+00:00,5e959328c541160cddaff666,Point,"[-1.5904910540000001, 48.113529421]",2020-04-14 12:40:21+00:00,mobileGps,24.074,5.63,OPC_N3:12,-1.590491,48.113529,Tuesday,14,12,4,2020,12:40:36,2020-04-14 12:40:36+00:00
4,parautarin02,2020-04-14 14:32:48+00:00,5e95b25bc541160cddaff9ab,Point,"[-1.6420568260000001, 48.110504919]",2020-04-14 14:32:48+00:00,mobileGps,24.208,3.72,OPC_N3:12,-1.642057,48.110505,Tuesday,14,14,4,2020,14:32:48,2020-04-14 14:32:48+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989745,standalone-LOPY-AQ09,2020-12-16 01:01:21+00:00,5fd95c7ce5283f00124f1692,Point,"[-1.651349, 48.120699]",2020-05-14 12:35:37+00:00,fixedGps,34.079,6.23,OPC_N3:05,-1.651349,48.120699,Wednesday,16,1,12,2020,01:01:21,2020-12-16 01:01:21+00:00
989746,standalone-LOPY-AQ09,2020-12-16 01:01:55+00:00,5fd95c9ce5283f00124f1696,Point,"[-1.651349, 48.120699]",2020-05-14 12:35:37+00:00,fixedGps,34.08,6.83,OPC_N3:05,-1.651349,48.120699,Wednesday,16,1,12,2020,01:01:55,2020-12-16 01:01:55+00:00
989747,standalone-LOPY-AQ09,2020-12-16 01:02:29+00:00,5fd95cbae5283f00124f169a,Point,"[-1.651349, 48.120699]",2020-05-14 12:35:37+00:00,fixedGps,34.063,7.5,OPC_N3:05,-1.651349,48.120699,Wednesday,16,1,12,2020,01:02:29,2020-12-16 01:02:29+00:00
989748,standalone-LOPY-AQ09,2020-12-16 01:03:03+00:00,5fd95ce0e5283f00124f169c,Point,"[-1.651349, 48.120699]",2020-05-14 12:35:37+00:00,fixedGps,34.06,7.87,OPC_N3:05,-1.651349,48.120699,Wednesday,16,1,12,2020,01:03:03,2020-12-16 01:03:03+00:00


In [68]:
df_filter = df_filter.set_index(["sensor_name", "measure_date"])
df_filter.to_pickle('Data/pollution_rennes_filter.pkl')