In [1]:
# Code for extracting the daily ATMO score for each station in ATMO BOURGOGNE-FRANCHE-COMTÉ (Région: BOURGOGNE-FRANCHE-COMTÉ)

# Note: will use abbreviation ATMO BFC

In [1]:
# Importing required packages
import pandas as pd

import os

# Base directory path.
# Built from separate components so that no backslash escape (e.g. "\r") ends up
# inside the string and the notebook also runs outside Windows. The empty last
# component keeps the trailing path separator.
base_path = os.path.join(
    ".",
    "data",
    "raw",
    "Données de la qualité de l'air (Indicateurs SOMO35 et ATMO)",
    "",
)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# PM10 ATMO BFC

# File path to the data. Folder and file name are passed as separate components
# so that os.path.join inserts the separator (no backslash inside the literals).
PM10_ATMO_BFC_file_path = os.path.join(
    base_path,
    "PM10 Moyenne Journalière",
    "Export Moy. journalière - PM10 - ATMO BFC.csv",
)

# Read the semicolon-delimited export into a DataFrame
df_PM10 = pd.read_csv(PM10_ATMO_BFC_file_path, delimiter=';')

# Display the first few rows of the DataFrame
print(df_PM10.head())

# ATMO sub-score bands for PM10 (Moyenne journalière / daily mean).
# Each (lower, upper) tuple is the closed interval of values mapped to the
# sub-score at the same position in 'Subscore'.
ATMO_subscore_PM10 = {
    'Subscore': [1, 2, 3, 4, 5, 6],
    'Range': [(0.0, 20.0), (20.0, 40.0), (40.0, 50.0), (50.0, 100.0), (100.0, 150.0), (150.0, 10000.0)]
}

# Tabular view of the same bands (one row per sub-score)
ATMO_subscore_df_PM10 = pd.DataFrame(ATMO_subscore_PM10)


def assign_subscore_PM10(valeur):
    """Return the ATMO sub-score (1-6) for a PM10 daily-mean value.

    Bands are closed on both ends and tested in ascending order, so a value
    lying exactly on a shared threshold receives the lower sub-score.

    Parameters
    ----------
    valeur : float
        PM10 measurement to classify.

    Returns
    -------
    int or None
        The matching sub-score, or None when the value is missing, negative
        or above the upper bound of the last band.
    """
    if pd.isna(valeur):
        return None
    for subscore, (lower, upper) in zip(ATMO_subscore_PM10['Subscore'], ATMO_subscore_PM10['Range']):
        # Interval comparison works for any precision, not only 0.1 steps
        if lower <= valeur <= upper:
            return subscore
    return None  # No band contains the value

# Score every PM10 measurement and store the result next to it
df_PM10['ATMO sub-score'] = df_PM10['valeur'].map(assign_subscore_PM10)

# Show the first rows, now including the sub-score column
print(df_PM10.head())


         Date de début          Date de fin                     Organisme  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   

    code zas                         Zas code site          nom site  \
0  FR27ZAR02                   ZAR DIJON   FR26005     Dijon Péjoces   
1  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26012            Morvan   
2  FR27ZAR02                   ZAR DIJON   FR26014  Dijon Trémouille   
3  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26017            Nevers   
4  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26019           Auxerre   

  type d'implantation Polluant type d'influence  ... valeur valeur brute  \
0         Périurbaine     PM

In [4]:
# PM2.5 ATMO BFC

# File path to the data. Folder and file name are passed as separate components
# so that os.path.join inserts the separator (no backslash inside the literals).
PM25_ATMO_BFC_file_path = os.path.join(
    base_path,
    "PM2.5 Moyenne Journalière",
    "Export Moy. journalière - PM2.5 - ATMO BFC.csv",
)

# Read the semicolon-delimited export into a DataFrame
df_PM25 = pd.read_csv(PM25_ATMO_BFC_file_path, delimiter=';')

# Display the first few rows of the DataFrame
print(df_PM25.head())

# ATMO sub-score bands for PM2.5 (Moyenne journalière / daily mean).
# Each (lower, upper) tuple is the closed interval of values mapped to the
# sub-score at the same position in 'Subscore'.
ATMO_subscore_PM25 = {
    'Subscore': [1, 2, 3, 4, 5, 6],
    'Range': [(0.0, 10.0), (10.0, 20.0), (20.0, 25.0), (25.0, 50.0), (50.0, 75.0), (75.0, 10000.0)]
}

# Tabular view of the same bands (one row per sub-score)
ATMO_subscore_df_PM25 = pd.DataFrame(ATMO_subscore_PM25)


def assign_subscore_PM25(valeur):
    """Return the ATMO sub-score (1-6) for a PM2.5 daily-mean value.

    Bands are closed on both ends and tested in ascending order, so a value
    lying exactly on a shared threshold receives the lower sub-score.

    Parameters
    ----------
    valeur : float
        PM2.5 measurement to classify.

    Returns
    -------
    int or None
        The matching sub-score, or None when the value is missing, negative
        or above the upper bound of the last band.
    """
    if pd.isna(valeur):
        return None
    for subscore, (lower, upper) in zip(ATMO_subscore_PM25['Subscore'], ATMO_subscore_PM25['Range']):
        # Interval comparison works for any precision, not only 0.1 steps
        if lower <= valeur <= upper:
            return subscore
    return None  # No band contains the value

# Score every PM2.5 measurement and store the result next to it
df_PM25['ATMO sub-score'] = df_PM25['valeur'].map(assign_subscore_PM25)

# Show the first rows, now including the sub-score column
print(df_PM25.head())


         Date de début          Date de fin                     Organisme  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   

    code zas                         Zas code site             nom site  \
0  FR27ZAR02                   ZAR DIJON   FR26005        Dijon Péjoces   
1  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26012               Morvan   
2  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26019              Auxerre   
3  FR27ZAR02                   ZAR DIJON   FR26094       Dijon Ardennes   
4  FR27ZAR04                  ZAR CHALON   FR32001  Chalon Centre Ville   

  type d'implantation Polluant type d'influence  ... valeur valeur brute  \
0         

In [5]:
# NO2 ATMO BFC

# File path to the data. Folder and file name are passed as separate components
# so that os.path.join inserts the separator (no backslash inside the literals).
NO2_ATMO_BFC_file_path = os.path.join(
    base_path,
    "NO2 Max Horaire Journalier",
    "Export Max. journalier moy. hor. - NO2 - ATMO BFC.csv",
)

# Read the semicolon-delimited export into a DataFrame
df_NO2 = pd.read_csv(NO2_ATMO_BFC_file_path, delimiter=';')

# Display the first few rows of the DataFrame
print(df_NO2.head())

# ATMO sub-score bands for NO2 (Max horaire journalier / daily maximum of the
# hourly values). Each (lower, upper) tuple is the closed interval of values
# mapped to the sub-score at the same position in 'Subscore'.
ATMO_subscore_NO2 = {
    'Subscore': [1, 2, 3, 4, 5, 6],
    'Range': [(0.0, 40.0), (40.0, 90.0), (90.0, 120.0), (120.0, 230.0), (230.0, 340.0), (340.0, 10000.0)]
}

# Tabular view of the same bands (one row per sub-score)
ATMO_subscore_df_NO2 = pd.DataFrame(ATMO_subscore_NO2)


def assign_subscore_NO2(valeur):
    """Return the ATMO sub-score (1-6) for an NO2 daily-maximum value.

    Bands are closed on both ends and tested in ascending order, so a value
    lying exactly on a shared threshold receives the lower sub-score.

    Parameters
    ----------
    valeur : float
        NO2 measurement to classify.

    Returns
    -------
    int or None
        The matching sub-score, or None when the value is missing, negative
        or above the upper bound of the last band.
    """
    if pd.isna(valeur):
        return None
    for subscore, (lower, upper) in zip(ATMO_subscore_NO2['Subscore'], ATMO_subscore_NO2['Range']):
        # Interval comparison works for any precision, not only 0.1 steps
        if lower <= valeur <= upper:
            return int(subscore)
    return None  # No band contains the value

# Score every NO2 measurement and store the result next to it
df_NO2['ATMO sub-score'] = df_NO2['valeur'].map(assign_subscore_NO2)

# Show the first rows, now including the sub-score column
print(df_NO2.head())


         Date de début          Date de fin                     Organisme  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   

    code zas                         Zas code site          nom site  \
0  FR27ZAR02                   ZAR DIJON   FR26005     Dijon Péjoces   
1  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26012            Morvan   
2  FR27ZAR02                   ZAR DIJON   FR26014  Dijon Trémouille   
3  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26017            Nevers   
4  FR27ZAR02                   ZAR DIJON   FR26094    Dijon Ardennes   

  type d'implantation Polluant type d'influence  ... valeur valeur brute  \
0         Périurbaine      N

In [6]:
# O3 ATMO BFC

# File path to the data. Folder and file name are passed as separate components
# so that os.path.join inserts the separator (no backslash inside the literals).
O3_ATMO_BFC_file_path = os.path.join(
    base_path,
    "O3 Max Horaire Journalier",
    "Export Max. journalier moy. hor. - O3 - ATMO BFC.csv",
)

# Read the semicolon-delimited export into a DataFrame
df_O3 = pd.read_csv(O3_ATMO_BFC_file_path, delimiter=';')

# Display the first few rows of the DataFrame
print(df_O3.head())

# ATMO sub-score bands for O3 (Max horaire journalier / daily maximum of the
# hourly values). Each (lower, upper) tuple is the closed interval of values
# mapped to the sub-score at the same position in 'Subscore'.
ATMO_subscore_O3 = {
    'Subscore': [1, 2, 3, 4, 5, 6],
    'Range': [(0.0, 50.0), (50.0, 100.0), (100.0, 130.0), (130.0, 240.0), (240.0, 380.0), (380.0, 10000.0)]
}

# Tabular view of the same bands (one row per sub-score)
ATMO_subscore_df_O3 = pd.DataFrame(ATMO_subscore_O3)


def assign_subscore_O3(valeur):
    """Return the ATMO sub-score (1-6) for an O3 daily-maximum value.

    Bands are closed on both ends and tested in ascending order, so a value
    lying exactly on a shared threshold receives the lower sub-score.

    Parameters
    ----------
    valeur : float
        O3 measurement to classify.

    Returns
    -------
    int or None
        The matching sub-score, or None when the value is missing, negative
        or above the upper bound of the last band.
    """
    if pd.isna(valeur):
        return None
    for subscore, (lower, upper) in zip(ATMO_subscore_O3['Subscore'], ATMO_subscore_O3['Range']):
        # Interval comparison works for any precision, not only 0.1 steps
        if lower <= valeur <= upper:
            return int(subscore)
    return None  # No band contains the value

# Score every O3 measurement and store the result next to it
df_O3['ATMO sub-score'] = df_O3['valeur'].map(assign_subscore_O3)

# Show the first rows, now including the sub-score column
print(df_O3.head())


         Date de début          Date de fin                     Organisme  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   

    code zas                         Zas code site       nom site  \
0  FR27ZAR02                   ZAR DIJON   FR26005  Dijon Péjoces   
1  FR27ZAR02                   ZAR DIJON   FR26010           Daix   
2  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26012         Morvan   
3  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26017         Nevers   
4  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26019        Auxerre   

  type d'implantation Polluant type d'influence  ... valeur valeur brute  \
0         Périurbaine       O3             Fon

In [7]:
# SO2 ATMO BFC

# File path to the data. Folder and file name are passed as separate components
# so that os.path.join inserts the separator (no backslash inside the literals).
SO2_ATMO_BFC_file_path = os.path.join(
    base_path,
    "SO2 Max Horaire Journalier",
    "Export Max. journalier moy. hor. - SO2 - ATMO BFC.csv",
)

# Read the semicolon-delimited export into a DataFrame
df_SO2 = pd.read_csv(SO2_ATMO_BFC_file_path, delimiter=';')

# Display the first few rows of the DataFrame
print(df_SO2.head())

# ATMO sub-score bands for SO2 (Max horaire journalier / daily maximum of the
# hourly values). Each (lower, upper) tuple is the closed interval of values
# mapped to the sub-score at the same position in 'Subscore'.
ATMO_subscore_SO2 = {
    'Subscore': [1, 2, 3, 4, 5, 6],
    'Range': [(0.0, 100.0), (100.0, 200.0), (200.0, 350.0), (350.0, 500.0), (500.0, 750.0), (750.0, 10000.0)]
}

# Tabular view of the same bands (one row per sub-score)
ATMO_subscore_df_SO2 = pd.DataFrame(ATMO_subscore_SO2)


def assign_subscore_SO2(valeur):
    """Return the ATMO sub-score (1-6) for an SO2 daily-maximum value.

    Bands are closed on both ends and tested in ascending order, so a value
    lying exactly on a shared threshold receives the lower sub-score.

    Parameters
    ----------
    valeur : float
        SO2 measurement to classify.

    Returns
    -------
    int or None
        The matching sub-score, or None when the value is missing, negative
        or above the upper bound of the last band.
    """
    if pd.isna(valeur):
        return None
    for subscore, (lower, upper) in zip(ATMO_subscore_SO2['Subscore'], ATMO_subscore_SO2['Range']):
        # Interval comparison works for any precision, not only 0.1 steps
        if lower <= valeur <= upper:
            return int(subscore)
    return None  # No band contains the value

# Apply the SO2 scoring function to fill in the sub-score column.
# The SO2-specific function must be used here: scoring SO2 with the O3
# thresholds would assign the wrong band to the measurements.
df_SO2['ATMO sub-score'] = df_SO2['valeur'].apply(assign_subscore_SO2)

# Display the updated DataFrame
print(df_SO2.head())


         Date de début          Date de fin                     Organisme  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
3  2023/01/02 00:00:00  2023/01/02 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
4  2023/01/02 00:00:00  2023/01/02 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   

    code zas                         Zas code site   nom site  \
0  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR82041     Tavaux   
1  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR82042   Damparis   
2  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR82043  Chatenois   
3  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR82041     Tavaux   
4  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR82042   Damparis   

      type d'implantation Polluant type d'influence  ... valeur valeur brute  \
0  Rurale près des villes      SO2     Industrielle  ...    2.2   

In [8]:
# Stack the five per-pollutant frames on top of each other with a fresh
# 0..n-1 index, then expose the per-pollutant sub-score as 'ATMO Score'
combined_df_ATMO_BFC = (
    pd.concat([df_PM10, df_PM25, df_NO2, df_O3, df_SO2], axis=0, ignore_index=True)
    .rename(columns={'ATMO sub-score': 'ATMO Score'})
)

# Sanity check: first five rows of the combined frame
print(combined_df_ATMO_BFC.head())


         Date de début          Date de fin                     Organisme  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
1  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
2  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
3  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
4  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   

    code zas                         Zas code site          nom site  \
0  FR27ZAR02                   ZAR DIJON   FR26005     Dijon Péjoces   
1  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26012            Morvan   
2  FR27ZAR02                   ZAR DIJON   FR26014  Dijon Trémouille   
3  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26017            Nevers   
4  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26019           Auxerre   

  type d'implantation Polluant type d'influence  ... valeur brute  \
0         Périurbaine     PM10     

In [17]:
# Only rows that actually received a score can be the daily maximum. Dropping
# the unscored rows first also prevents idxmax from returning a missing label
# for a (site, day) group whose scores are all NaN, which would make the
# .loc lookup below fail.
scored_df_ATMO_BFC = combined_df_ATMO_BFC.dropna(subset=['ATMO Score'])

# Index label of the row with the highest 'ATMO Score' within each site and day
idx_max_scores = scored_df_ATMO_BFC.groupby(['nom site', 'Date de fin'])['ATMO Score'].idxmax()

# Select those rows (labels are shared with the combined frame) and renumber them
final_df_ATMO_BFC = combined_df_ATMO_BFC.loc[idx_max_scores].reset_index(drop=True)

In [18]:
# Save the daily ATMO score table for ATMO BOURGOGNE-FRANCHE-COMTÉ as a
# latin-1 encoded CSV in the working directory (row index not written)
final_df_ATMO_BFC.to_csv(
    'final_df_ATMO_BFC.csv',
    encoding='latin1',
    index=False,
)

In [19]:
# Constructing indicator for ATMO BOURGOGNE-FRANCHE-COMTÉ (ATMO BFC)

# File path to the data: the daily-score CSV is written to the working
# directory by the export step, so it is read back from that same location.
ATMO_BFC_file_path = 'final_df_ATMO_BFC.csv'

# Read the comma-delimited, latin-1 encoded file produced by the export step
df_ATMO = pd.read_csv(ATMO_BFC_file_path, delimiter=',', encoding='latin1')

print(df_ATMO.head())

# Number of rows per site for each 'ATMO Score' value
counts = (
    df_ATMO
    .groupby('nom site')['ATMO Score']
    .value_counts()
    .reset_index(name='counts')
)

# One row of site-specific information per site (e.g. code, latitude, longitude)
first_row_per_nom_site = df_ATMO.groupby('nom site').first().reset_index()

# Attach the site information to the counts and drop the per-day columns;
# the score from `counts` is kept under the merge-suffixed name 'ATMO Score_x'
result = (
    counts
    .merge(first_row_per_nom_site, on='nom site', how='inner')
    .drop(columns=['Date de fin', 'Date de début', 'ATMO Score_y'])
)
print(result)

         Date de début          Date de fin                     Organisme  \
0  2023/01/01 00:00:00  2023/01/01 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
1  2023/01/02 00:00:00  2023/01/02 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
2  2023/01/03 00:00:00  2023/01/03 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
3  2023/01/04 00:00:00  2023/01/04 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   
4  2023/01/05 00:00:00  2023/01/05 23:59:59  ATMO BOURGOGNE-FRANCHE-COMTE   

    code zas                         Zas code site nom site  \
0  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26019  Auxerre   
1  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26019  Auxerre   
2  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26019  Auxerre   
3  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26019  Auxerre   
4  FR27ZRE01  ZR BOURGOGNE-FRANCHE-COMTE   FR26019  Auxerre   

  type d'implantation Polluant type d'influence  ... valeur brute  \
0             Urbaine       O3             Fond  ...       78.400   
1             Urbain

In [20]:
# Total count of rows per site, over all score values
total_counts_by_nom = result.groupby('nom site')['counts'].sum()

# Keep only the (site, score) rows whose score is 3 or higher
filtered_result = result.loc[result['ATMO Score_x'] >= 3]

# Count of rows with a score of 3 or higher, per site
counts_greater_than_3_by_nom = filtered_result.groupby('nom site')['counts'].sum()

# Share of rows with a score of 3 or higher; sites with none get 0
ratio_counts_greater_than_3 = counts_greater_than_3_by_nom.div(total_counts_by_nom).fillna(0)

# Display the resulting Series
print(ratio_counts_greater_than_3)

# Attach the site information to the ratio and expose the ratio as 'indicator'
ratio_counts_greater_than_3_df = (
    pd.merge(ratio_counts_greater_than_3, first_row_per_nom_site, on='nom site', how='inner')
    .drop(columns=['Date de fin', 'Date de début', 'ATMO Score'])
    .rename(columns={'counts': 'indicator'})
)
print(ratio_counts_greater_than_3_df)

nom site
Auxerre                          0.249315
Baume-les-Dames                  0.032877
Belfort Octroi                   0.016484
Besancon Mégevand                0.013736
Besancon Prevoyance              0.288462
Chalon Centre Ville              0.082192
Champforgueil                    0.212707
Chatenois                        0.005479
Daix                             0.264463
Dambenois Citoyen                0.221918
Damparis                         0.000000
Dijon Ardennes                   0.030137
Dijon Péjoces                    0.236264
Dijon Trémouille                 0.030137
Dole centre                      0.243836
Lons-le-Saunier CV               0.290411
Macon Paul Bert                  0.273973
Montandon Baresans               0.167131
Montbéliard centre               0.235616
Montceau-les-Mines 9me écluse    0.231405
Montfaucon                       0.282548
Morvan                           0.235457
Nevers                           0.172603
Sens                     

In [21]:
# Save the final ATMO BFC indicator table as a latin-1 encoded CSV in the
# working directory (row index not written)
ratio_counts_greater_than_3_df.to_csv(
    'indicator_ATMO_BFC.csv',
    encoding='latin1',
    index=False,
)

In [13]:
# -----------------------------------------------------------------------------------------------------------------------------------------------------
#                                                               Sanity Checks
# -----------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Sanity check: first PM2.5 measurements and the dtype of the 'valeur' column
print(df_PM25.loc[:, 'valeur'].head())

0    7.7
1    8.7
2    7.3
3    6.8
4    9.3
Name: valeur, dtype: float64


In [15]:
# Sanity check: summary statistics of the PM2.5 sub-scores
df_PM25.loc[:, 'ATMO sub-score'].describe()

count    4770.000000
mean        1.190776
std         0.555269
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         5.000000
Name: ATMO sub-score, dtype: float64

In [16]:
# Sanity check (repeated): first PM2.5 measurements after scoring
print(df_PM25.loc[:, 'valeur'].head())

0    7.7
1    8.7
2    7.3
3    6.8
4    9.3
Name: valeur, dtype: float64
