Concat whole csv files with removing the repeating of days

In [18]:
import pandas as pd
import os
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.metrics import classification_report
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import os

# Set the path to your folder with CSV files
folder_path = r"C:\Users\nadim\Desktop\ttair\data\gazs"

# Get a list of all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Sort the files by their names to ensure they are processed in order
csv_files.sort()

# Load the first CSV file completely
data = pd.read_csv(os.path.join(folder_path, csv_files[0]))

# For each subsequent CSV file, read the entire file, then drop the first day's data
for file_name in csv_files[1:]:
    file_path = os.path.join(folder_path, file_name)
    temp_data = pd.read_csv(file_path)
    temp_data = temp_data.iloc[4:]
    data = pd.concat([data, temp_data])

# Save the concatenated DataFrame to a new CSV file
data.to_csv("output/concatenated.csv", index=False)

df = pd.read_csv("output/concatenated.csv")

# Preprocessing steps
# Convert 'Date de fin' column into a datetime object
df['Date de fin'] = pd.to_datetime(df['Date de fin'])

# Check for missing values
missing_values = df.isnull().sum()
missing_values

# If there are missing values in 'code qualité', fill them with 'U'
if 'code qualité' in missing_values.index:
    df['code qualité'] = df['code qualité'].fillna('U')

# Drop 'unité de mesure' column if it has a single unique value
if df['unité de mesure'].nunique() == 1:
    df = df.drop(columns='unité de mesure')

# Perform one-hot encoding on the categorical variables, including 'Zas'
encoded_df = pd.get_dummies(df, columns=['Polluant', 'Zas', 'type d\'implantation', 'type d\'influence', 'type d\'évaluation', 'procédure de mesure', 'code qualité'])

# Split the data into a training set and a testing set
train_df = encoded_df[encoded_df['Date de fin'] <= '2022-12-31']
test_df = encoded_df[encoded_df['Date de fin'] > '2022-12-31']

# Separate the target variable ('valeur') from the predictors
X_train = train_df.drop(columns=['Date de fin', 'valeur'])
y_train = train_df['valeur']
X_test = test_df.drop(columns=['Date de fin', 'valeur'])
y_test = test_df['valeur']

X_train.head(), y_train.head()


(   Polluant_NO2  Polluant_O3  Polluant_PM10  Polluant_PM25  \
 0          True        False          False          False   
 1          True        False          False          False   
 2         False         True          False          False   
 3         False         True          False          False   
 4         False        False           True          False   
 
    Zas_ZAR SAINT-DENIS  Zas_ZR ILE-DE-FRANCE  type d'implantation_Périurbaine  \
 0                 True                 False                            False   
 1                False                  True                            False   
 2                 True                 False                             True   
 3                False                  True                            False   
 4                 True                 False                            False   
 
    type d'implantation_Rurale régionale  type d'implantation_Urbaine  \
 0                                 False             

Analys and cleaning

In [19]:
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.metrics import classification_report
from datetime import datetime, timedelta
import numpy as np

# Remove the 'code qualité' columns from the training data
X_train_reg = X_train.drop(columns=['code qualité_A', 'code qualité_R'])
X_test_reg = X_test.drop(columns=['code qualité_A', 'code qualité_R'])

# Define a RidgeCV model with built-in cross-validation of the alpha parameter
ridge = RidgeCV(alphas=np.logspace(-6, 6, 13))

# Fit the model to the training data
ridge.fit(X_train_reg, y_train)

# Separate the target variable ('code qualité') from the predictors for the classification task
y_train_class = train_df[['code qualité_A', 'code qualité_R']].idxmax(axis=1).str[-1]
y_test_class = test_df[['code qualité_A', 'code qualité_R']].idxmax(axis=1).str[-1]# Separate the target variable ('code qualité') from the predictors for the classification task
y_train_class = train_df[['code qualité_A', 'code qualité_R']].idxmax(axis=1).astype(str).str[-1]
y_test_class = test_df[['code qualité_A', 'code qualité_R']].idxmax(axis=1).astype(str).str[-1]

# Define a LogisticRegressionCV model with built-in cross-validation
logreg = LogisticRegressionCV(cv=5, multi_class='multinomial', random_state=42)

# Fit the model to the training data
logreg.fit(X_train, y_train_class)

# Get the most recent record for each gas
recent_NO2 = train_df[train_df['Polluant_NO2'] == 1].iloc[-1]
recent_O3 = train_df[train_df['Polluant_O3'] == 1].iloc[-1]
recent_PM10 = train_df[train_df['Polluant_PM10'] == 1].iloc[-1]
recent_PM25 = train_df[train_df['Polluant_PM25'] == 1].iloc[-1]

# Create a new dataframe for the predictions
predict_df = pd.DataFrame([recent_NO2, recent_O3, recent_PM10, recent_PM25])

# Remove the 'Date de fin', 'valeur', and 'code qualité' columns from the prediction data
predict_df = predict_df.drop(columns=['Date de fin', 'valeur', 'code qualité_A', 'code qualité_R'])

# Make the predictions for 'valeur'
predict_df['valeur'] = ridge.predict(predict_df)



In [20]:
# Identify unique 'Zas' in the dataset
unique_zas = [col for col in train_df.columns if 'Zas' in col]

# Initialize a dictionary to hold the prediction dataframes for each 'Zas'
predict_dfs = {}

# For each unique 'Zas', create a subset of the data, train the models, and make predictions
for zas in unique_zas:
    # Create a subset of the data for this 'Zas'
    train_df_zas = train_df[train_df[zas] == 1]
    test_df_zas = test_df[test_df[zas] == 1]

    # Separate the target variable ('valeur') from the predictors
    X_train_zas = train_df_zas.drop(columns=['Date de fin', 'valeur'])
    y_train_zas = train_df_zas['valeur']
    X_test_zas = test_df_zas.drop(columns=['Date de fin', 'valeur'])
    y_test_zas = test_df_zas['valeur']

    # Define a RidgeCV model with built-in cross-validation of the alpha parameter
    ridge_zas = RidgeCV(alphas=np.logspace(-6, 6, 13))

    # Fit the model to the training data
    ridge_zas.fit(X_train_zas, y_train_zas)

    # Separate the target variable ('code qualité') from the predictors for the classification task
    y_train_class_zas = train_df_zas[['code qualité_A', 'code qualité_R']].idxmax(axis=1).astype(str).str[-1]
    y_test_class_zas = test_df_zas[['code qualité_A', 'code qualité_R']].idxmax(axis=1).astype(str).str[-1]

    # Define a LogisticRegressionCV model with built-in cross-validation
    logreg_zas = LogisticRegressionCV(cv=5, multi_class='multinomial', random_state=42)

    # Fit the model to the training data if there are at least two classes
    if y_train_class_zas.nunique() >= 2:
        logreg_zas.fit(X_train_zas, y_train_class_zas)
        predict_qualite = logreg_zas.predict
    else:
        # If there is only one class, predict that class for all instances
        predict_qualite = lambda x: [y_train_class_zas.mode()[0]] * len(x)

    # Get the most recent record for each gas
    recent_records = [train_df_zas[train_df_zas['Polluant_' + gas] == 1].iloc[-1] for gas in ['NO2', 'O3', 'PM10', 'PM25']]

    # Create a new dataframe for the predictions
    predict_df_zas = pd.DataFrame(recent_records)

    # Check if any pollutants are missing and if so, add them
    for gas in ['NO2', 'O3', 'PM10', 'PM25']:
        if 'Polluant_' + gas not in predict_df_zas.columns:
            new_row = pd.Series(0, index=predict_df_zas.columns)
            new_row['Polluant_' + gas] = 1
            predict_df_zas = predict_df_zas.append(new_row, ignore_index=True)

    # Make sure the prediction data has the same columns as the training data
    for column in X_train_zas.columns:
        if column not in predict_df_zas.columns:
            predict_df_zas[column] = 0

    # Make the predictions for 'valeur'
    predict_df_zas['valeur'] = ridge_zas.predict(predict_df_zas[X_train_zas.columns])

    # Prepare the prediction data for the logistic regression model
    predict_data_class_zas = predict_df_zas[X_train_zas.columns].copy()
    predict_data_class_zas['code qualité_A'] = 0
    predict_data_class_zas['code qualité_R'] = 0
    predict_data_class_zas['code qualité_' + y_train_class_zas.mode()[0]] = 1

    # Make the predictions for 'code qualité'
    predict_df_zas['code qualité'] = predict_qualite(predict_data_class_zas)

    # Reverse one-hot encoding for 'Polluant'
    predict_df_zas['Polluant'] = predict_df_zas[['Polluant_NO2', 'Polluant_O3', 'Polluant_PM10', 'Polluant_PM25']].idxmax(axis=1).map(gas_mapping)

    # Add the date, which is tomorrow's date
    predict_df_zas['Date de fin'] = tomorrow.strftime('%Y-%m-%d')

    # Add the 'Zas' column
    predict_df_zas['Zas'] = zas.split("_")[1]

    # Store the prediction dataframe in the dictionary
    predict_dfs[zas] = predict_df_zas[['Date de fin', 'Polluant', 'Zas', 'valeur', 'code qualité']]

# Combine all the prediction dataframes into one dataframe
all_predictions = pd.concat(predict_dfs.values(), ignore_index=True)

all_predictions



Unnamed: 0,Date de fin,Polluant,Zas,valeur,code qualité
0,2023-07-21,NO2,ZAR SAINT-DENIS,6.361169,A
1,2023-07-21,O3,ZAR SAINT-DENIS,31.176393,A
2,2023-07-21,PM10,ZAR SAINT-DENIS,13.176477,A
3,2023-07-21,PM25,ZAR SAINT-DENIS,4.984362,A
4,2023-07-21,NO2,ZR ILE-DE-FRANCE,5.9263,A
5,2023-07-21,O3,ZR ILE-DE-FRANCE,49.216719,A
6,2023-07-21,PM10,ZR ILE-DE-FRANCE,15.61418,A
7,2023-07-21,PM25,ZR ILE-DE-FRANCE,8.605785,A


In [9]:
all_predictions

Unnamed: 0,Date de fin,Polluant,Zas,valeur,code qualité
0,2023-07-21,NO2,ZAR SAINT-DENIS,6.361169,A
1,2023-07-21,O3,ZAR SAINT-DENIS,31.176393,A
2,2023-07-21,PM10,ZAR SAINT-DENIS,13.176477,A
3,2023-07-21,PM25,ZAR SAINT-DENIS,4.984362,A
4,2023-07-21,NO2,ZR ILE-DE-FRANCE,5.9263,A
5,2023-07-21,O3,ZR ILE-DE-FRANCE,49.216719,A
6,2023-07-21,PM10,ZR ILE-DE-FRANCE,15.61418,A
7,2023-07-21,PM25,ZR ILE-DE-FRANCE,8.605785,A


In [23]:
print(df.columns)


Index(['Date de fin,Polluant,Zas,type d'implantation,type d'influence,type d'évaluation,procédure de mesure,valeur,code qualité,unité de mesure'], dtype='object')
