Author: Adafaly Matthieu <br>

This notebook is used to prepare the Data_Exploration notebook.
Its purpose is to create a new dataset that combines a time filter and a location filter, allowing you to analyze a subset of the data according to your preferences.

# Importing the libraries


In [None]:
import pandas as pd
from IPython.display import display, Markdown
from datetime import datetime, timedelta

# Data

In [None]:
# Load the raw measurements and move the index levels back into regular columns.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle("Data/pollution_rennes.pkl").reset_index()

# Split the rows by sensor type, keeping only those that carry a PM_2.5 value
_has_pm25 = df['PM_2.5'].notna()
df_stationnary = df.loc[(df['sensor_type'] == 'fixedGps') & _has_pm25]
df_mobile = df.loc[(df['sensor_type'] == 'mobileGps') & _has_pm25]

print("dataframe loaded")

### Visualization to help you choose how to filter the data

In [None]:
# List the distinct sensor type(s) recorded for each sensor name
df.groupby('sensor_name')['sensor_type'].unique()

In [None]:
# Per-sensor row counts, split by whether the PM_2.5 measurement is present.
# The heading below announces a "value or not" breakdown, so the table carries
# the two corresponding columns next to the overall total.
_has_value = df['PM_2.5'].notna()

df_comparaison_sensor = pd.DataFrame({
    'Total': df['sensor_name'].value_counts().sort_index(),
    'With PM_2.5': df.loc[_has_value, 'sensor_name'].value_counts(),
    'Without PM_2.5': df.loc[~_has_value, 'sensor_name'].value_counts(),
}).fillna(0).astype(int)  # a sensor missing from one split gets 0 instead of NaN

display(Markdown("### Chart of the repartion if the pollution have a value or not by sensor"))
display(df_comparaison_sensor)

In [None]:
# Month numbers 1..12, used to give every count series the same 12-row index
month_index = pd.Series(range(1, 13), name="month")

# Count the rows per value of the 'month' column for the full dataset and for
# each sensor-type subset; months without any row are filled with 0 so that
# the whole table can be cast to integers
df_sensor_type_comparison = pd.DataFrame({
    column: frame['month'].value_counts().reindex(month_index).fillna(0)
    for column, frame in [
        ('global', df),
        ('mobile', df_mobile),
        ('stationnary', df_stationnary),
    ]
}).astype(int)

# Month number -> English month name
month_names = dict(enumerate(
    [
        "January", "February", "March", "April",
        "May", "June", "July", "August",
        "September", "October", "November", "December",
    ],
    start=1,
))

# Label the rows with month names rather than month numbers
df_sensor_type_comparison.index = df_sensor_type_comparison.index.map(month_names)

# Append a 'Total' row holding the column sums
total_row = df_sensor_type_comparison.loc[:, ['global', 'mobile', 'stationnary']].sum(axis=0)
df_sensor_type_comparison.loc['Total'] = total_row

# Heading, then the table itself
display(Markdown("### Chart of the number of measure by month"))
display(df_sensor_type_comparison)

### Select the analysis period

In [None]:
# Add a UTC-aware 'date' column parsed from 'measure_date'; the period filter
# further down in this cell compares this column against the selected bounds
df['date'] = pd.to_datetime(df['measure_date'], format='%Y-%m-%d', utc=True)

def validate_date(date_str):
    """Tell whether ``date_str`` is a real calendar date written as YYYY-MM-DD.

    Parameters
    ----------
    date_str : str
        Candidate date string.

    Returns
    -------
    bool
        True when the string parses with the '%Y-%m-%d' format, False for
        anything else (wrong format, impossible date, empty string, or a
        value that is not a string at all, such as None).
    """
    # strptime raises TypeError (not ValueError) on non-string input, so rule
    # that case out first instead of letting it escape as an exception
    if not isinstance(date_str, str):
        return False
    try:
        datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError:
        return False
    return True

# Analysis period - edit these two values.
# Use the YYYY-MM-DD format, or the keywords "start" / "end" to fall back to
# the earliest / latest date found in the DataFrame.
start_date = '2020-04-11'
end_date = "2020-12-16"

# "start" selects the earliest date present in the DataFrame
if start_date.lower() == "start":
    start_date = df['date'].min().strftime('%Y-%m-%d')
    print(f"Start date set to: {start_date}")

# "end" selects the latest date present in the DataFrame
if end_date.lower() == "end":
    end_date = df['date'].max().strftime('%Y-%m-%d')
    print(f"End date set to: {end_date}")

# Stop on bad input with an explicit message instead of printing a warning
# and then failing further down on an unrelated parsing error
if not (start_date and end_date):
    raise ValueError("Date fields cannot be empty.")
if not (validate_date(start_date) and validate_date(end_date)):
    raise ValueError("One or more dates are invalid. Please use the YYYY-MM-DD format.")
print("Dates are valid.")

# Expand the two dates so that they cover whole days: the start bound is the
# first instant of its day and the end bound the last microsecond of its day
# (a 23:59 bound would drop every measurement taken during the final minute)
start_date = datetime.strptime(start_date + ' 00:00', '%Y-%m-%d %H:%M')
end_date = datetime.strptime(end_date + ' 23:59:59.999999', '%Y-%m-%d %H:%M:%S.%f')

# An inverted period would silently produce an empty dataset
if start_date > end_date:
    raise ValueError("The start date must not be later than the end date.")

# Make both bounds UTC-aware so they can be compared with the 'date' column
start_date = pd.to_datetime(start_date, utc=True)
end_date = pd.to_datetime(end_date, utc=True)

# Keep the rows whose date falls inside the selected period (bounds included)
filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

# Display the filtered results
print("Filtered data:")
display(filtered_df)

### Choose sensors to filter

In [None]:
# Show every sensor name present in the data, then pick the ones to keep
print(f"sensor available {df['sensor_name'].unique()}")

sensor_selected = [
    'standalone-LOPY-AQ03',
    'parautarin02',
    'standalone-LOPY-AQ09',
    'parautarin31',
    'parautarin30',
    'standalone-LOPY-AQ05',
    'parautarin34',
    'parautarin33',
    'parautarin32',
    'parautarin35',
]

print(f'sensor selected({sensor_selected})')

In [None]:
# Recap of the active filters: selected sensors, then period start and end (one per line)
print(sensor_selected, start_date, end_date, sep="\n")

In [None]:
# Combine the period filter and the sensor filter.
# The comparison uses the UTC-aware 'date' column (the same one the period
# preview is filtered on) rather than the raw 'measure_date' column:
# start_date / end_date are UTC-aware timestamps, and comparing them with a
# timezone-naive or string-typed column raises a TypeError.
_in_period = (df['date'] >= start_date) & (df['date'] <= end_date)
_is_selected = df['sensor_name'].isin(sensor_selected)

df_filter = df[_in_period & _is_selected]

print("Data filter :")
display(df_filter)

In [None]:
# Index the filtered rows by (sensor_name, measure_date). df_filter is rebound
# here, so the filter cell has to be re-run before this cell is run again.
df_filter = df_filter.set_index(keys=["sensor_name", "measure_date"])

# Write the filtered dataset to disk as a pickle file
df_filter.to_pickle(path='Data/pollution_rennes_filter.pkl')