In [13]:
# Code for extracting the daily ATMO score for each station in AIRPARIF (Région: Île-de-France)

In [14]:
# Importing required packages
import pandas as pd


import os

# Base directory path of the raw air-quality exports.
# Built from separate components so that no backslash ends up inside a normal
# string literal: a backslash before the closing quote would escape it, and
# "\r" would be read as a carriage return. The trailing "" component keeps a
# trailing path separator on the result.
base_path = os.path.join(
    ".",
    "data",
    "raw",
    "Données de la qualité de l'air (Indicateurs SOMO35 et ATMO)",
    "",
)


In [15]:
# PM10 AIRPARIF

# File path to the data. The sub-directory and file name are passed as separate
# components so the path does not rely on a literal backslash (an invalid
# escape sequence in a normal string, and not a separator outside Windows).
PM10_AIRPARIF_file_path = os.path.join(
    base_path,
    "PM10 Moyenne Journalière",
    "Export Moy. journalière - PM10 - AIRPARIF.csv",
)

# Read the export with pandas; the file is read as ';'-delimited
df_PM10 = pd.read_csv(PM10_AIRPARIF_file_path, delimiter=';')

# Display the first few rows of the DataFrame
print(df_PM10.head())

# ATMO sub-score thresholds for PM10 (Moyenne journalière): each sub-score is
# paired with a (lower, upper) interval of measured values.
ATMO_subscore_PM10 = {
    'Subscore': [1, 2, 3, 4, 5, 6],
    'Range': [(0.0, 20.0), (20.0, 40.0), (40.0, 50.0), (50.0, 100.0), (100.0, 150.0), (150.0, 10000.0)]
}

# Tabular view of the thresholds. 'Range' keeps the (lower, upper) tuples:
# expanding every interval into a list of 0.1 steps is slow to search and
# misses any value that is not exactly on the 0.1 grid.
ATMO_subscore_df_PM10 = pd.DataFrame(ATMO_subscore_PM10)


def assign_subscore_PM10(valeur):
    """Return the ATMO sub-score for a PM10 measurement.

    Parameters
    ----------
    valeur : float or value convertible to float
        The measured value, in the same unit as the thresholds above.

    Returns
    -------
    int or None
        The sub-score (1-6) of the first interval with
        ``lower <= valeur <= upper``; ``None`` when the value is missing,
        not numeric, or outside every interval (negative or above the
        last upper bound).
    """
    try:
        valeur = float(valeur)
    except (TypeError, ValueError):
        return None  # missing or non-numeric measurement

    for subscore, (lower, upper) in zip(ATMO_subscore_df_PM10['Subscore'], ATMO_subscore_df_PM10['Range']):
        # Intervals are checked in ascending order and the first match wins,
        # so a value exactly on a shared bound (e.g. 20.0) gets the lower
        # sub-score. NaN fails every comparison and falls through to None.
        if lower <= valeur <= upper:
            return int(subscore)
    return None  # Return None if no matching subscore found

# Score every PM10 measurement: each entry of the 'valeur' column is passed
# through assign_subscore_PM10 and the result is stored in 'ATMO sub-score'
df_PM10['ATMO sub-score'] = df_PM10['valeur'].map(assign_subscore_PM10)

# Preview the first five rows, now including the sub-score column
print(df_PM10.head(5))


         Date de début          Date de fin Organisme   code zas        Zas  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   

  code site            nom site type d'implantation Polluant type d'influence  \
0   FR04002       GENNEVILLIERS             Urbaine     PM10             Fond   
1   FR04004         PARIS 18eme             Urbaine     PM10             Fond   
2   FR04012  Place Victor Basch             Urbaine     PM10           Trafic   
3   FR04023      CERGY-PONTOISE             Urbaine     PM10             Fond   
4   FR04031   Av Champs Elysees             Urbaine     PM10           Trafic   

   ... valeur valeur brute unité de me

In [16]:
# PM2.5 AIRPARIF

# File path to the data. The sub-directory and file name are passed as separate
# components so the path does not rely on a literal backslash (an invalid
# escape sequence in a normal string, and not a separator outside Windows).
PM25_AIRPARIF_file_path = os.path.join(
    base_path,
    "PM2.5 Moyenne Journalière",
    "Export Moy. journalière - PM2.5 - AIRPARIF.csv",
)

# Read the export with pandas; the file is read as ';'-delimited
df_PM25 = pd.read_csv(PM25_AIRPARIF_file_path, delimiter=';')

# Display the first few rows of the DataFrame
print(df_PM25.head())

# ATMO sub-score thresholds for PM2.5 (Moyenne journalière): each sub-score is
# paired with a (lower, upper) interval of measured values.
ATMO_subscore_PM25 = {
    'Subscore': [1, 2, 3, 4, 5, 6],
    'Range': [(0.0, 10.0), (10.0, 20.0), (20.0, 25.0), (25.0, 50.0), (50.0, 75.0), (75.0, 10000.0)]
}

# Tabular view of the thresholds. 'Range' keeps the (lower, upper) tuples:
# expanding every interval into a list of 0.1 steps is slow to search and
# misses any value that is not exactly on the 0.1 grid.
ATMO_subscore_df_PM25 = pd.DataFrame(ATMO_subscore_PM25)


def assign_subscore_PM25(valeur):
    """Return the ATMO sub-score for a PM2.5 measurement.

    Parameters
    ----------
    valeur : float or value convertible to float
        The measured value, in the same unit as the thresholds above.

    Returns
    -------
    int or None
        The sub-score (1-6) of the first interval with
        ``lower <= valeur <= upper``; ``None`` when the value is missing,
        not numeric, or outside every interval (negative or above the
        last upper bound).
    """
    try:
        valeur = float(valeur)
    except (TypeError, ValueError):
        return None  # missing or non-numeric measurement

    for subscore, (lower, upper) in zip(ATMO_subscore_df_PM25['Subscore'], ATMO_subscore_df_PM25['Range']):
        # Intervals are checked in ascending order and the first match wins,
        # so a value exactly on a shared bound (e.g. 10.0) gets the lower
        # sub-score. NaN fails every comparison and falls through to None.
        if lower <= valeur <= upper:
            return int(subscore)
    return None  # Return None if no matching subscore found

# Score every PM2.5 measurement: each entry of the 'valeur' column is passed
# through assign_subscore_PM25 and the result is stored in 'ATMO sub-score'
df_PM25['ATMO sub-score'] = df_PM25['valeur'].map(assign_subscore_PM25)

# Preview the first five rows, now including the sub-score column
print(df_PM25.head(5))


         Date de début          Date de fin Organisme   code zas        Zas  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   

  code site           nom site type d'implantation Polluant type d'influence  \
0   FR04002      GENNEVILLIERS             Urbaine    PM2.5             Fond   
1   FR04004        PARIS 18eme             Urbaine    PM2.5             Fond   
2   FR04024            GONESSE         Périurbaine    PM2.5             Fond   
3   FR04031  Av Champs Elysees             Urbaine    PM2.5           Trafic   
4   FR04034    VITRY-SUR-SEINE             Urbaine    PM2.5             Fond   

   ... valeur valeur brute unité de mesure  

In [17]:
# NO2 AIRPARIF

# File path to the data. The sub-directory and file name are passed as separate
# components so the path does not rely on a literal backslash (an invalid
# escape sequence in a normal string, and not a separator outside Windows).
NO2_AIRPARIF_file_path = os.path.join(
    base_path,
    "NO2 Max Horaire Journalier",
    "Export Max. journalier moy. hor. - NO2 - AIRPARIF.csv",
)

# Read the export with pandas; the file is read as ';'-delimited
df_NO2 = pd.read_csv(NO2_AIRPARIF_file_path, delimiter=';')

# Display the first few rows of the DataFrame
print(df_NO2.head())

# ATMO sub-score thresholds for NO2 (Max horaire journalier): each sub-score is
# paired with a (lower, upper) interval of measured values.
ATMO_subscore_NO2 = {
    'Subscore': [1, 2, 3, 4, 5, 6],
    'Range': [(0.0, 40.0), (40.0, 90.0), (90.0, 120.0), (120.0, 230.0), (230.0, 340.0), (340.0, 10000.0)]
}

# Tabular view of the thresholds. 'Range' keeps the (lower, upper) tuples:
# expanding every interval into a list of 0.1 steps is slow to search and
# misses any value that is not exactly on the 0.1 grid.
ATMO_subscore_df_NO2 = pd.DataFrame(ATMO_subscore_NO2)


def assign_subscore_NO2(valeur):
    """Return the ATMO sub-score for an NO2 measurement.

    Parameters
    ----------
    valeur : float or value convertible to float
        The measured value, in the same unit as the thresholds above.

    Returns
    -------
    int or None
        The sub-score (1-6) of the first interval with
        ``lower <= valeur <= upper``; ``None`` when the value is missing,
        not numeric, or outside every interval (negative or above the
        last upper bound).
    """
    try:
        valeur = float(valeur)
    except (TypeError, ValueError):
        return None  # missing or non-numeric measurement

    for subscore, (lower, upper) in zip(ATMO_subscore_df_NO2['Subscore'], ATMO_subscore_df_NO2['Range']):
        # Intervals are checked in ascending order and the first match wins,
        # so a value exactly on a shared bound (e.g. 40.0) gets the lower
        # sub-score. NaN fails every comparison and falls through to None.
        if lower <= valeur <= upper:
            return int(subscore)
    return None  # Return None if no matching subscore found

# Score every NO2 measurement: each entry of the 'valeur' column is passed
# through assign_subscore_NO2 and the result is stored in 'ATMO sub-score'
df_NO2['ATMO sub-score'] = df_NO2['valeur'].map(assign_subscore_NO2)

# Preview the first five rows, now including the sub-score column
print(df_NO2.head(5))


         Date de début          Date de fin Organisme   code zas        Zas  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   

  code site            nom site type d'implantation Polluant type d'influence  \
0   FR04002       GENNEVILLIERS             Urbaine      NO2             Fond   
1   FR04004         PARIS 18eme             Urbaine      NO2             Fond   
2   FR04012  Place Victor Basch             Urbaine      NO2           Trafic   
3   FR04014         PARIS 12eme             Urbaine      NO2             Fond   
4   FR04017   NEUILLY-SUR-SEINE             Urbaine      NO2             Fond   

   ... valeur valeur brute unité de me

In [18]:
# O3 AIRPARIF

# File path to the data. The sub-directory and file name are passed as separate
# components so the path does not rely on a literal backslash (an invalid
# escape sequence in a normal string, and not a separator outside Windows).
O3_AIRPARIF_file_path = os.path.join(
    base_path,
    "O3 Max Horaire Journalier",
    "Export Max. journalier moy. hor. - O3 - AIRPARIF.csv",
)

# Read the export with pandas; the file is read as ';'-delimited
df_O3 = pd.read_csv(O3_AIRPARIF_file_path, delimiter=';')

# Display the first few rows of the DataFrame
print(df_O3.head())

# ATMO sub-score thresholds for O3 (Max horaire journalier): each sub-score is
# paired with a (lower, upper) interval of measured values.
ATMO_subscore_O3 = {
    'Subscore': [1, 2, 3, 4, 5, 6],
    'Range': [(0.0, 50.0), (50.0, 100.0), (100.0, 130.0), (130.0, 240.0), (240.0, 380.0), (380.0, 10000.0)]
}

# Tabular view of the thresholds. 'Range' keeps the (lower, upper) tuples:
# expanding every interval into a list of 0.1 steps is slow to search and
# misses any value that is not exactly on the 0.1 grid.
ATMO_subscore_df_O3 = pd.DataFrame(ATMO_subscore_O3)


def assign_subscore_O3(valeur):
    """Return the ATMO sub-score for an O3 measurement.

    Parameters
    ----------
    valeur : float or value convertible to float
        The measured value, in the same unit as the thresholds above.

    Returns
    -------
    int or None
        The sub-score (1-6) of the first interval with
        ``lower <= valeur <= upper``; ``None`` when the value is missing,
        not numeric, or outside every interval (negative or above the
        last upper bound).
    """
    try:
        valeur = float(valeur)
    except (TypeError, ValueError):
        return None  # missing or non-numeric measurement

    for subscore, (lower, upper) in zip(ATMO_subscore_df_O3['Subscore'], ATMO_subscore_df_O3['Range']):
        # Intervals are checked in ascending order and the first match wins,
        # so a value exactly on a shared bound (e.g. 50.0) gets the lower
        # sub-score. NaN fails every comparison and falls through to None.
        if lower <= valeur <= upper:
            return int(subscore)
    return None  # Return None if no matching subscore found

# Score every O3 measurement: each entry of the 'valeur' column is passed
# through assign_subscore_O3 and the result is stored in 'ATMO sub-score'
df_O3['ATMO sub-score'] = df_O3['valeur'].map(assign_subscore_O3)

# Preview the first five rows, now including the sub-score column
print(df_O3.head(5))


         Date de début          Date de fin Organisme   code zas        Zas  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   

  code site           nom site type d'implantation Polluant type d'influence  \
0   FR04004        PARIS 18eme             Urbaine       O3             Fond   
1   FR04017  NEUILLY-SUR-SEINE             Urbaine       O3             Fond   
2   FR04023     CERGY-PONTOISE             Urbaine       O3             Fond   
3   FR04029         VERSAILLES         Périurbaine       O3             Fond   
4   FR04034    VITRY-SUR-SEINE             Urbaine       O3             Fond   

   ... valeur valeur brute unité de mesure  

In [19]:
# SO2 AIRPARIF (Missing for AIRPARIF)

In [20]:
# Stack the per-pollutant frames into one long table (SO2 is not available for
# AIRPARIF, so only four frames are combined), renumber the rows from 0, and
# expose the per-pollutant sub-score under the name 'ATMO Score'.
pollutant_frames = [df_PM10, df_PM25, df_NO2, df_O3]  # , df_SO2
combined_df_AIRPARIF = (
    pd.concat(pollutant_frames, axis=0)
    .reset_index(drop=True)
    .rename(columns={'ATMO sub-score': 'ATMO Score'})
)

# Sanity check: show the first five rows
print(combined_df_AIRPARIF.head(5))


         Date de début          Date de fin Organisme   code zas        Zas  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   

  code site            nom site type d'implantation Polluant type d'influence  \
0   FR04002       GENNEVILLIERS             Urbaine     PM10             Fond   
1   FR04004         PARIS 18eme             Urbaine     PM10             Fond   
2   FR04012  Place Victor Basch             Urbaine     PM10           Trafic   
3   FR04023      CERGY-PONTOISE             Urbaine     PM10             Fond   
4   FR04031   Av Champs Elysees             Urbaine     PM10           Trafic   

   ... valeur brute unité de mesure ta

In [31]:
# Ignore rows without a score before looking for the maximum: a (site, day)
# group holding only missing scores has no maximum, so idxmax would yield NaN
# (or raise, depending on the pandas version) and break the .loc lookup below.
scored_rows = combined_df_AIRPARIF.dropna(subset=['ATMO Score'])

# Index label of the row with the highest 'ATMO Score' within each
# (site, day) group; on ties the first occurrence is kept
idx_max_scores = scored_rows.groupby(['nom site', 'Date de fin'])['ATMO Score'].idxmax()

# Keep only those rows and renumber them from 0
final_df_AIRPARIF = combined_df_AIRPARIF.loc[idx_max_scores].reset_index(drop=True)

In [32]:
# Save the daily ATMO score table for AIRPARIF as a latin1-encoded CSV in the
# working directory, without writing the row index
final_df_AIRPARIF.to_csv(path_or_buf='final_df_AIRPARIF.csv', encoding='latin1', index=False)

In [23]:
# Constructing indicator for AIRPARIF

# Locate the daily ATMO score file. This notebook exports it to the working
# directory, so that copy is preferred; a copy stored under base_path is used
# as a fallback so an existing setup keeps working.
ATMO_AIRPARIF_file_path = 'final_df_AIRPARIF.csv'
if not os.path.exists(ATMO_AIRPARIF_file_path):
    ATMO_AIRPARIF_file_path = os.path.join(base_path, "final_df_AIRPARIF.csv")

# The file is comma-delimited and latin1-encoded, matching how it is exported
df_ATMO = pd.read_csv(ATMO_AIRPARIF_file_path, delimiter=',', encoding='latin1')

print(df_ATMO.head())

# Number of days each site spent at each ATMO score
counts = df_ATMO.groupby('nom site')['ATMO Score'].value_counts().reset_index(name='counts')

# Getting a df of the site-specific information e.g., code, latitude, longitude
# (first non-null value of every column per site)
first_row_per_nom_site = df_ATMO.groupby('nom site').first().reset_index()

# Merge the site-specific information with the counts DataFrame. Both frames
# carry an 'ATMO Score' column, so the merge suffixes them: '_x' is the score
# being counted, '_y' comes from the per-site first row and is dropped along
# with the per-day date columns.
result = pd.merge(counts, first_row_per_nom_site, on='nom site', how='inner').drop(
    columns=['Date de fin', 'Date de début', 'ATMO Score_y']
)
print(result)

         Date de début          Date de fin Organisme   code zas        Zas  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
1  2023/01/02 00:00:00  2023/01/02 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
2  2023/01/03 00:00:00  2023/01/03 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
3  2023/01/04 00:00:00  2023/01/04 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   
4  2023/01/05 00:00:00  2023/01/05 23:59:59  AIRPARIF  FR11ZAG01  ZAG PARIS   

  code site              nom site type d'implantation Polluant  \
0   FR04180      RN20 - MONTLHERY         Périurbaine      NO2   
1   FR04180      RN20 - MONTLHERY         Périurbaine      NO2   
2   FR04058  Auto A1 -Saint-Denis             Urbaine     PM10   
3   FR04012    Place Victor Basch             Urbaine      NO2   
4   FR04180      RN20 - MONTLHERY         Périurbaine    PM2.5   

  type d'influence  ... valeur brute unité de mesure taux de saisie  \
0           Trafic  ...    92.100000           µg-m3     

In [24]:
# Total number of scored days per site
total_counts_by_nom = result.groupby('nom site')['counts'].sum()

# Keep only the (site, score) rows whose score is 3 or higher
filtered_result = result.loc[result['ATMO Score_x'].ge(3)]

# Number of days per site with a score of 3 or higher
counts_greater_than_3_by_nom = filtered_result.groupby('nom site')['counts'].sum()

# Share of days with a score of 3 or higher; sites that never reach 3 are
# absent from the numerator and therefore get 0 instead of NaN
ratio_counts_greater_than_3 = counts_greater_than_3_by_nom.div(total_counts_by_nom).fillna(0)

# Display the resulting Series
print(ratio_counts_greater_than_3)

# Attach the site-specific information to the ratio, drop the per-day columns,
# and expose the ratio under the name 'indicator'
ratio_counts_greater_than_3_df = (
    pd.merge(ratio_counts_greater_than_3, first_row_per_nom_site, on='nom site', how='inner')
    .drop(columns=['Date de fin', 'Date de début', 'ATMO Score'])
    .rename(columns={'counts': 'indicator'})
)
print(ratio_counts_greater_than_3_df)

nom site
AUBERVILLIERS           1.000000
Auto A1 -Saint-Denis    0.311321
Av Champs Elysees       0.000000
Bld peripherique Est    0.625000
Boulevard Haussmann     0.000000
CERGY-PONTOISE          1.000000
GENNEVILLIERS           0.722222
LES ULIS                1.000000
LOGNES                  1.000000
NEUILLY-SUR-SEINE       0.833333
PARIS 18eme             0.823529
Place Victor Basch      0.125000
Place de l'Opéra        0.375000
Quai des Celestins      1.000000
RAMBOUILLET             0.800000
RD934 Coulommiers       0.342105
RN2-PANTIN              0.666667
RN20 - MONTLHERY        0.800000
RN6-MELUN               0.000000
TREMBLAY-EN-FRANCE      1.000000
VILLEMOMBLE             1.000000
VITRY-SUR-SEINE         0.750000
Zone Rurale Est         1.000000
Zone Rurale NO          0.625000
Zone Rurale Nord        0.694444
Zone Rurale Nord-Est    1.000000
Zone Rurale SE          1.000000
Zone Rurale SO          0.170213
Zone rurale Sud         0.481481
Name: counts, dtype: float64
     

In [25]:
# Save the final AIRPARIF indicator table as a latin1-encoded CSV in the
# working directory, without writing the row index
ratio_counts_greater_than_3_df.to_csv(path_or_buf='indicator_AIRPARIF.csv', encoding='latin1', index=False)

In [26]:
# -----------------------------------------------------------------------------------------------------------------------------------------------------
#                                                               Sanity Checks
# -----------------------------------------------------------------------------------------------------------------------------------------------------

In [27]:
# Sanity check: first five PM2.5 measurements and the dtype of the column
print(df_PM25['valeur'].head(5))

0     8.4
1    11.0
2     8.8
3     9.7
4     9.2
Name: valeur, dtype: float64


In [28]:
# Sanity check: summary statistics of the PM2.5 sub-scores (count, min, max, quartiles)
df_PM25.loc[:, 'ATMO sub-score'].describe()

count    6551.000000
mean        1.424668
std         0.711151
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         5.000000
Name: ATMO sub-score, dtype: float64

In [29]:
# Sanity check (repeated): first five PM2.5 measurements
print(df_PM25['valeur'].head(5))

0     8.4
1    11.0
2     8.8
3     9.7
4     9.2
Name: valeur, dtype: float64
