In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Root folder holding the project's data files.
# A raw string literal cannot end with a backslash (the final \" would escape
# the closing quote and the file would not even parse), so the path is written
# without a trailing separator and sub-folders are appended with os.path.join.
base_dir = r"C:\Users\nadim\Desktop\ttair\data"

# Define the directories where the CSV files are located
dir_path = os.path.join(base_dir, "gazs")
dir_path2 = os.path.join(base_dir, "gazs_output")

def custom_agg(x):
    """Aggregate a pandas Series according to its dtype.

    Args:
        x: The Series (e.g. one column of a group) to reduce to one value.

    Returns:
        The first element when ``x`` has a string dtype, the mean when it
        has a numeric dtype, and ``np.nan`` for any other dtype.
    """
    dtype_checks = pd.api.types

    # The string check comes first, exactly as the numeric check comes second:
    # the order decides which rule wins for a dtype matching both predicates.
    if dtype_checks.is_string_dtype(x):
        return x.iloc[0]
    if dtype_checks.is_numeric_dtype(x):
        return x.mean()
    return np.nan

# Load and process the CSV files.
# The merged file is written into the same directory that is being read, so it
# must be excluded from the inputs; otherwise every re-run would ingest its own
# previous output and duplicate the data.  The listing is taken once and sorted
# so the file treated as "first" is deterministic.
concatenated_name = 'concatenated.csv'
csv_names = sorted(
    name for name in os.listdir(dir_path2)
    if name.lower().endswith('.csv') and name != concatenated_name
)
if not csv_names:
    raise FileNotFoundError(f"No input CSV files found in {dir_path2}")

# The first file is kept whole.
data = pd.read_csv(os.path.join(dir_path2, csv_names[0]))
frames = [data]
for file_name in csv_names[1:]:
    temp_data = pd.read_csv(os.path.join(dir_path2, file_name))
    # NOTE(review): the first four rows of every file except the first are
    # skipped; the reason is not visible here -- confirm against the data.
    temp_data = temp_data.iloc[4:]
    frames.append(temp_data)

# One concat over the collected frames instead of re-copying `data` per file.
data = pd.concat(frames)
data.to_csv(os.path.join(dir_path2, concatenated_name), index=False)

# Re-read the merged file so `df` starts from what is stored on disk.
df = pd.read_csv(os.path.join(dir_path2, concatenated_name))

# Preprocessing steps
# NOTE(review): the last column is dropped by position; which column that is
# cannot be seen from here -- confirm against the CSV layout.
# .copy() makes the frame independent so the column assignments below do not
# write into a slice of the frame that was read from disk.
df = df.iloc[:, :-1].copy()
df['Date de fin'] = pd.to_datetime(df['Date de fin'])
# Missing quality codes are replaced by the code 'U'.
df['code qualité'] = df['code qualité'].fillna('U')
# Merge the nitrogen-oxide labels into 'NO2' and rename 'PM2.5' to 'PM25'.
df['Polluant'] = df['Polluant'].replace({'NO': 'NO2', 'NOX': 'NO2', 'NOX as NO2': 'NO2', 'PM2.5': 'PM25'})
df = df[~df['Polluant'].isin(['C6H6', 'SO2', 'CO'])]

# Create the shifted target variables: within each (Zas, Polluant) series,
# sorted by end date, the target of a row is the value found two rows later.
df = df.sort_values('Date de fin')
group_keys = ['Zas', 'Polluant']
df['valeur_shifted'] = df.groupby(group_keys)['valeur'].shift(-2)
df['code_quality_shifted'] = df.groupby(group_keys)['code qualité'].shift(-2)

# Drop the last two rows of each group (they have no shifted target).
# The groups are trimmed explicitly rather than through DataFrameGroupBy.apply:
# recent pandas deprecates handing the grouping columns to apply, and without
# them 'Zas' and 'Polluant' would be missing from the result and the one-hot
# encoding below would fail.  Iterating yields the groups in sorted key order,
# which is the row order apply produced.
trimmed_groups = [group.iloc[:-2] for _, group in df.groupby(group_keys)]
df = pd.concat(trimmed_groups) if trimmed_groups else df.iloc[0:0]
df = df.reset_index(drop=True)

# One-hot encoding
encoded_df = pd.get_dummies(df, columns=['Polluant', 'Zas', 'type d\'implantation',
                                         'type d\'influence', 'type d\'évaluation',
                                         'procédure de mesure', 'code qualité'])

# Sample a subset of the data for model training (10 %, fixed seed).
sample_df = encoded_df.sample(frac=0.1, random_state=42)

# Separate features and targets
features_sample = sample_df.drop(columns=['Date de fin', 'valeur_shifted', 'code_quality_shifted'])
targets_value_sample = sample_df['valeur_shifted']
targets_quality_sample = sample_df['code_quality_shifted']

# Split the data into train and test sets for value prediction
X_train_value_sample, X_test_value_sample, y_train_value_sample, y_test_value_sample = train_test_split(
    features_sample, targets_value_sample, test_size=0.2, random_state=42)

# Split the data into train and test sets for quality prediction.
# Same sizes and same random_state as above, so both splits select the same rows.
X_train_quality_sample, X_test_quality_sample, y_train_quality_sample, y_test_quality_sample = train_test_split(
    features_sample, targets_quality_sample, test_size=0.2, random_state=42)

# Train a Linear Regression model for 'valeur_shifted' prediction
linear_regressor_sample = LinearRegression()
linear_regressor_sample.fit(X_train_value_sample, y_train_value_sample)

# Encode the quality labels as integers for Logistic Regression.
# The encoder is fitted on every label of the sample, not only on the training
# part: a quality code that happens to occur only in the test split would
# otherwise make transform() fail with "previously unseen labels".
le_sample = LabelEncoder()
le_sample.fit(targets_quality_sample)
y_train_quality_encoded_sample = le_sample.transform(y_train_quality_sample)
y_test_quality_encoded_sample = le_sample.transform(y_test_quality_sample)

# Train a Logistic Regression model for 'code_quality_shifted' prediction
logistic_classifier_sample = LogisticRegression(max_iter=1000)
logistic_classifier_sample.fit(X_train_quality_sample, y_train_quality_encoded_sample)

# Load the 'today' data
today_df = pd.read_csv(r'C:\Users\nadim\Desktop\ttair\data\gazs_output\FR_E2_2023-07-20_output.csv')

# Preprocess the 'today' data with the same cleaning steps as the training data.
# .copy() makes the frame independent so the assignments below do not write
# into a slice of the frame that was read from disk.
today_df = today_df.iloc[:, :-1].copy()
today_df['Date de fin'] = pd.to_datetime(today_df['Date de fin'])
today_df['code qualité'] = today_df['code qualité'].fillna('U')
today_df['Polluant'] = today_df['Polluant'].replace({'NO': 'NO2', 'NOX': 'NO2', 'NOX as NO2': 'NO2', 'PM2.5': 'PM25'})
today_df = today_df[~today_df['Polluant'].isin(['C6H6', 'SO2', 'CO'])]

# Get the features in the training data
training_features = features_sample.columns

# One-hot encode the 'today' data
encoded_today_df = pd.get_dummies(today_df)

# Get the features in the 'today' data after one-hot encoding
today_features = encoded_today_df.columns

# Find the missing and extra features (kept for inspection)
missing_features = set(training_features) - set(today_features)
extra_features = set(today_features) - set(training_features)

# Align the 'today' data with the training layout in a single step: columns
# absent from 'today' are added with a value of 0, columns unknown to the
# model are dropped, and the column order follows the training data.  One
# reindex replaces the column-by-column insertion, which fragments the frame
# when many dummy columns are missing.
encoded_today_df = encoded_today_df.reindex(columns=training_features, fill_value=0)

# Make predictions for 'valeur' and 'code qualité'.  Both models were trained
# on targets taken two rows ahead within each (Zas, Polluant) series.
predictions_value = linear_regressor_sample.predict(encoded_today_df)
predictions_quality_encoded = logistic_classifier_sample.predict(encoded_today_df)

# Decode the predicted 'code qualité'
predictions_quality = le_sample.inverse_transform(predictions_quality_encoded)

# Assemble the prediction table; each row is dated one day after the
# 'Date de fin' of the input row it was predicted from.
predictions_df = pd.DataFrame({
    'Date de fin': today_df['Date de fin'] + pd.DateOffset(days=1),
    'Polluant': today_df['Polluant'],
    'Zas': today_df['Zas'],
    'valeur': predictions_value,
    'code qualité': predictions_quality,
})

# Build the rows for the day after tomorrow: the same predictions with the
# date pushed one more day forward, then stack both days.
predictions_df_2 = predictions_df.assign(
    **{'Date de fin': predictions_df['Date de fin'] + pd.DateOffset(days=1)}
)
predictions_df = pd.concat([predictions_df, predictions_df_2])

# Keep only the rows of the 'ZR ILE-DE-FRANCE' zone
is_ile_de_france = predictions_df['Zas'] == 'ZR ILE-DE-FRANCE'
predictions_df_filtered = predictions_df.loc[is_ile_de_france]

# Show the filtered predictions
print(predictions_df_filtered)

# Write the filtered predictions to a CSV file under the data root
output_prediction_csv_path = os.path.join(base_dir, "prediction_output.csv")
predictions_df_filtered.to_csv(output_prediction_csv_path, index=False)

print("Les prédictions ont été sauvegardées dans :", output_prediction_csv_path)


In [11]:
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

class AirQualityPredictor:
    """Train and apply two-rows-ahead models for pollutant value and quality code.

    Intended call order: construct with a CSV path, then ``preprocess_data()``,
    ``train_model()`` and finally ``make_predictions(today_df)``.

    Attributes created along the way:
        df: The loaded frame; replaced by the cleaned frame with the shifted
            target columns after ``preprocess_data()``.
        encoded_df: One-hot encoded version of ``df`` (``preprocess_data()``).
        training_features: Column index of the feature matrix used for fitting.
        le_sample: ``LabelEncoder`` mapping quality codes to the integers the
            classifier predicts; use ``le_sample.inverse_transform`` to decode.
        linear_regressor_sample: Fitted ``LinearRegression`` for 'valeur_shifted'.
        logistic_classifier_sample: Fitted ``LogisticRegression`` for
            'code_quality_shifted'.
        holdout_sample: ``(X_test, y_value_test, y_quality_encoded_test)`` kept
            out of training, available for evaluation.
    """

    # Columns identifying one time series; targets are shifted within each.
    _GROUP_KEYS = ['Zas', 'Polluant']
    # Columns that are not model inputs (the date and the two targets).
    _NON_FEATURE_COLUMNS = ['Date de fin', 'valeur_shifted', 'code_quality_shifted']
    # Categorical columns expanded by one-hot encoding for training.
    _CATEGORICAL_COLUMNS = ['Polluant', 'Zas', 'type d\'implantation',
                            'type d\'influence', 'type d\'évaluation',
                            'procédure de mesure', 'code qualité']
    # Pollutant labels merged into a common name.
    _POLLUTANT_ALIASES = {'NO': 'NO2', 'NOX': 'NO2', 'NOX as NO2': 'NO2', 'PM2.5': 'PM25'}
    # Pollutants removed from the data.
    _EXCLUDED_POLLUTANTS = ['C6H6', 'SO2', 'CO']

    def __init__(self, filepath):
        """Remember ``filepath`` and load its content into ``self.df``."""
        self.filepath = filepath
        self.df = self.load_data()

    def load_data(self):
        """Read the CSV file at ``self.filepath`` and return it as a DataFrame."""
        return pd.read_csv(self.filepath)

    @classmethod
    def _clean(cls, frame):
        """Return a cleaned copy of ``frame`` (steps shared by training and prediction)."""
        # NOTE(review): the last column is dropped by position; which column
        # that is cannot be seen from here -- confirm against the CSV layout.
        # .copy() keeps the assignments below from writing into a slice of the
        # caller's frame.
        frame = frame.iloc[:, :-1].copy()
        frame['Date de fin'] = pd.to_datetime(frame['Date de fin'])
        frame['code qualité'] = frame['code qualité'].fillna('U')
        frame['Polluant'] = frame['Polluant'].replace(cls._POLLUTANT_ALIASES)
        return frame[~frame['Polluant'].isin(cls._EXCLUDED_POLLUTANTS)]

    def preprocess_data(self):
        """Clean ``self.df``, add the shifted targets and build ``self.encoded_df``.

        Within each (Zas, Polluant) group, ordered by 'Date de fin', the targets
        'valeur_shifted' and 'code_quality_shifted' hold the values found two
        rows later; the last two rows of each group, which have no target, are
        removed.  Each call drops the last column of ``self.df`` again, so the
        method is meant to be called once per loaded frame.
        """
        keys = self._GROUP_KEYS
        frame = self._clean(self.df).sort_values('Date de fin')
        frame['valeur_shifted'] = frame.groupby(keys)['valeur'].shift(-2)
        frame['code_quality_shifted'] = frame.groupby(keys)['code qualité'].shift(-2)

        # Trim the groups explicitly instead of via DataFrameGroupBy.apply:
        # recent pandas deprecates handing the grouping columns to apply, and
        # without them 'Zas' and 'Polluant' would be lost before encoding.
        trimmed = [group.iloc[:-2] for _, group in frame.groupby(keys)]
        frame = pd.concat(trimmed) if trimmed else frame.iloc[0:0]
        self.df = frame.reset_index(drop=True)

        # One-hot encoding
        self.encoded_df = pd.get_dummies(self.df, columns=self._CATEGORICAL_COLUMNS)

    def train_model(self):
        """Fit both models on a 10 % sample of ``self.encoded_df``.

        80 % of the sample is used for fitting; the remaining 20 % is stored in
        ``self.holdout_sample``.  Requires ``preprocess_data()`` to have run.
        """
        # Sample a subset of the data for model training
        sample_df = self.encoded_df.sample(frac=0.1, random_state=42)

        # Separate features and targets; the feature layout is remembered so
        # make_predictions() can align new data without relying on globals.
        features_sample = sample_df.drop(columns=self._NON_FEATURE_COLUMNS)
        self.training_features = features_sample.columns

        # Fit the encoder on every sampled label so a quality code that only
        # occurs in the held-out part cannot raise "previously unseen labels".
        self.le_sample = LabelEncoder()
        targets_quality_encoded = self.le_sample.fit_transform(sample_df['code_quality_shifted'])

        # One split for both targets keeps features and labels row-aligned.
        (X_train, X_test,
         y_train_value, y_test_value,
         y_train_quality, y_test_quality) = train_test_split(
            features_sample, sample_df['valeur_shifted'], targets_quality_encoded,
            test_size=0.2, random_state=42)
        self.holdout_sample = (X_test, y_test_value, y_test_quality)

        # Train a Linear Regression model for 'valeur_shifted' prediction
        self.linear_regressor_sample = LinearRegression()
        self.linear_regressor_sample.fit(X_train, y_train_value)

        # Train a Logistic Regression model for 'code_quality_shifted' prediction
        self.logistic_classifier_sample = LogisticRegression(max_iter=1000)
        self.logistic_classifier_sample.fit(X_train, y_train_quality)

    def make_predictions(self, today_df):
        """Predict value and encoded quality code for the rows of ``today_df``.

        Args:
            today_df: Raw frame with the same column layout as the training
                CSV; it is not modified.

        Returns:
            A tuple ``(predictions_value, predictions_quality_encoded)`` with
            one entry per row kept after cleaning.  The quality predictions are
            integers; decode them with ``self.le_sample.inverse_transform``.

        Raises:
            RuntimeError: If ``train_model()`` has not been called yet.
        """
        if getattr(self, 'training_features', None) is None:
            raise RuntimeError("train_model() must be called before make_predictions()")

        # Preprocess and one-hot encode the 'today' data
        encoded_today_df = pd.get_dummies(self._clean(today_df))

        # Match the training layout: add absent columns filled with 0, drop
        # columns the models never saw, and use the training column order.
        encoded_today_df = encoded_today_df.reindex(columns=self.training_features, fill_value=0)

        # Make predictions for 'valeur' and 'code qualité'
        predictions_value = self.linear_regressor_sample.predict(encoded_today_df)
        predictions_quality_encoded = self.logistic_classifier_sample.predict(encoded_today_df)

        return predictions_value, predictions_quality_encoded

# Usage:
# predictor = AirQualityPredictor('/path/to/your/data.csv')
# predictor.preprocess_data()
# predictor.train_model()
# predictions = predictor.make_predictions(today_df)
