In [None]:
from cProfile import label

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.dates as mdates
import lightgbm as lgb

# Pipeline creazione train e test per ogni dataset

In [None]:
# suddivisione train-test: divido per mesi e considero le istanze 
# questa la divisione che ho usato io 
def split_train_test_by_month_istanza(df, target):
    """Split ``df`` into train/test month by month, holding out whole (day, hour) slots.

    For every distinct year-month the distinct day-hour slots are split 75/25
    with ``train_test_split``; every row sharing a slot lands on the same side.
    Months with at most one slot are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``year``, ``month``, ``day``, ``hour`` and ``target``.
        Two helper columns, ``year_month`` and ``day_hour``, are written into
        ``df`` itself. This side effect is kept on purpose: later cells of the
        notebook drop these two columns from the frames that went through here.
    target : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the feature frames contain
        neither ``target`` nor the two helper columns.

    Raises
    ------
    ValueError
        If no month has more than one day-hour slot.
    """
    import zlib  # local import: only used to derive a stable per-month seed

    # 'year_month' keeps the same month of different years apart.
    df['year_month'] = df['year'].astype(str) + '-' + df['month'].astype(str)
    df['day_hour'] = df['day'].astype(str) + '-' + df['hour'].astype(str)

    train_parts = []
    test_parts = []
    for year_month in df['year_month'].unique():
        month_data = df[df['year_month'] == year_month]
        hours = month_data['day_hour'].unique()
        if len(hours) <= 1:
            continue
        # The built-in hash() of a str is salted per interpreter, so it yields a
        # different seed (and a different split) after every kernel restart.
        # CRC32 of the label is stable across runs and machines.
        random_seed = zlib.crc32(year_month.encode('utf-8')) % (2 ** 32 - 1)
        train_hours, test_hours = train_test_split(hours, test_size=0.25, random_state=random_seed)

        train_parts.append(month_data[month_data['day_hour'].isin(train_hours)])
        test_parts.append(month_data[month_data['day_hour'].isin(test_hours)])

    if not train_parts:
        raise ValueError("No month with more than one (day, hour) slot: nothing to split.")

    train = pd.concat(train_parts)
    test = pd.concat(test_parts)
    non_features = [target, 'year_month', 'day_hour']

    return train.drop(columns=non_features), test.drop(columns=non_features), train[target], test[target]


# Pipeline completa
def pipeline_train_test(df):
    """Split one dataset and save the two halves as CSV files.

    The city and sensor type are read from the first row of ``df``. The split
    comes from ``split_train_test_by_month_istanza`` with ``pm2p5_y`` as target;
    features and target are glued back together and written to
    ``../dataset/training_{city}.csv`` and ``../dataset/testing_{city}.csv``.
    For Southampton and Lima the sensor type is appended to the city so the
    datasets of the same city get distinct file names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``city``, ``type_sensor`` and the columns required by
        ``split_train_test_by_month_istanza``.

    Returns
    -------
    None
    """
    target = 'pm2p5_y'
    city = df['city'].iloc[0]
    type_sensor = df['type_sensor'].iloc[0]
    # Same rule as the cells that read these files back: only the Lima PMS7003
    # dataset is renamed to AIRBEAM. Without the city check the file written
    # here and the file looked up later could carry different names.
    if type_sensor == 'PMS7003' and city == 'Lima':
        type_sensor = 'AIRBEAM'
    X_train, X_test, y_train, y_test = split_train_test_by_month_istanza(df, target)

    training = pd.concat([X_train, y_train], axis=1)
    testing = pd.concat([X_test, y_test], axis=1)

    if city in ('Southampton', 'Lima'):
        # Join city name and sensor type.
        city = city + '_' + type_sensor

    # Save the splits so later cells can reuse them.
    training.to_csv(f'../dataset/training_{city}.csv', index=False)
    testing.to_csv(f'../dataset/testing_{city}.csv', index=False)


## Esempio

In [None]:
# aosta = pd.read_csv('../dataset/aosta.csv')
# pipeline_train_test(aosta)

# TEST A COPPIE E PROGRESSIVO

In [None]:
# elimino le features non importanti: city(dato che uso lat e lon) e valid_at
def delete_features(X_train, X_test, column):
    """Drop the given column(s) from both feature frames.

    ``column`` may be a single column name or an iterable of names. The inputs
    are not modified; new frames are returned as ``(X_train, X_test)``.
    """
    names = [column] if isinstance(column, str) else column
    for name in names:
        X_train, X_test = X_train.drop(columns=name), X_test.drop(columns=name)
    return X_train, X_test


#funzione per gestire i valori mancanti --> non ce ne sono 
""" def handle_missing_values(X_train, X_test, strategy='drop'):
    if strategy == 'drop':
        return X_train.dropna(), X_test.dropna()
    elif strategy == 'mean':
        return X_train.fillna(X_train.mean()), X_test.fillna(X_test.mean())
    else:
        raise ValueError("Strategia non supportata. Usa 'drop' o 'mean'.") """


#funzione per preprocessare le variabili categoriche: type_sensor, sensor_id, day_of_week
def preprocess_categorical(X, categorical_columns):
    """Encode the categorical columns of ``X`` and return the encoded frame.

    ``sensor_id`` and ``type_sensor`` are replaced by integer category codes;
    every other listed column is one-hot encoded with ``pd.get_dummies``
    (first level dropped).

    The caller's frame is left untouched: encoding is done on a copy.

    NOTE(review): the integer codes are derived from the values present in
    ``X`` only, so two frames encoded separately (e.g. train and test) do not
    necessarily share the same value -> code mapping. Confirm this is acceptable.

    Parameters
    ----------
    X : pandas.DataFrame
    categorical_columns : iterable of str

    Returns
    -------
    pandas.DataFrame
    """
    X = X.copy()  # the original wrote the codes straight into the caller's frame
    for col in categorical_columns:
        if col in ('sensor_id', 'type_sensor'):  # turned into numeric codes
            X[col] = pd.Categorical(X[col]).codes
        else:
            X = pd.get_dummies(X, columns=[col], drop_first=True)  # e.g. day_of_week: one-hot
    return X


#funzione per trasformare in variabili cicliche le variabili temporali
def add_cyclic_features(X_train, X_test, columns, periods=None):
    """Replace each column in ``columns`` by its ``<col>_sin`` / ``<col>_cos`` pair.

    Both frames are encoded with the same period, so a given value (say
    hour 6) maps to the same point in train and test. Previously each frame
    was divided by its own maximum, which gave train and test different
    encodings whenever their maxima differed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Not modified; encoded copies are returned.
    columns : iterable of str
        Columns to encode; each is dropped after encoding.
    periods : dict, optional
        Maps a column name to its period (e.g. ``{'hour': 24}``). Columns
        without an entry fall back to the maximum seen in ``X_train``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` with the encoded columns appended.
    """
    periods = periods or {}
    # Work on copies: the original added the new columns to the caller's frames.
    X_train = X_train.copy()
    X_test = X_test.copy()
    for col in columns:
        period = periods[col] if col in periods else X_train[col].max()
        for frame in (X_train, X_test):
            angle = 2 * np.pi * frame[col] / period
            frame[col + '_sin'] = np.sin(angle)
            frame[col + '_cos'] = np.cos(angle)
        X_train = X_train.drop(columns=[col])
        X_test = X_test.drop(columns=[col])
    return X_train, X_test


#definizione pipeline preprocessamento
def preprocess_pipeline_simple_model(X_train, X_test, column, categorical_columns, strategy):
    """Encode categorical columns, then add cyclic time features, for both frames.

    ``column`` and ``strategy`` are accepted but not used by this version:
    the columns named in ``column`` are removed later by the caller (``valid_at``
    is still needed for plotting) and missing-value handling is disabled here.

    Returns the pair ``(X_train, X_test)``.
    """
    encoded_train = preprocess_categorical(X_train, categorical_columns)
    encoded_test = preprocess_categorical(X_test, categorical_columns)
    return add_cyclic_features(encoded_train, encoded_test, ['hour', 'day', 'month', 'year'])


#funzione per scalare le feature
def scale_features(X_train, X_test):
    """Standardise both feature sets with a ``StandardScaler`` fitted on ``X_train`` only.

    Returns ``(X_train_scaled, X_test_scaled)`` as produced by the scaler's
    ``transform``.
    """
    standardizer = StandardScaler().fit(X_train)
    return standardizer.transform(X_train), standardizer.transform(X_test)


# Valutazione del modello
def evaluate_model(model, X_test, y_test, y_pred=None):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    model : object with ``predict``
        Only used when ``y_pred`` is ``None``.
    X_test : array-like
        Features passed to ``model.predict`` when ``y_pred`` is ``None``.
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like, optional
        Pre-computed predictions. The body already handled ``None``; the
        default makes the argument genuinely optional, matching the later
        definition of this function in the notebook.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'``.
    """
    if y_pred is None:
        y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}


#funzione per addestrare e valutare il modello
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Returns ``(metrics, model, y_pred)`` where ``metrics`` is the dictionary
    produced by ``evaluate_model`` and ``y_pred`` the test-set predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return evaluate_model(model, X_test, y_test, predictions), model, predictions


def plot_predictions(y_pred, X_test_plot):
    """Plot raw sensor PM2.5, predicted PM2.5 and reference PM2.5 against time.

    ``X_test_plot`` must provide the columns ``valid_at`` (x axis),
    ``pm2p5_x`` (raw reading) and ``pm2p5_y`` (reference station);
    ``y_pred`` holds the model predictions for the same rows.
    """
    timestamps = X_test_plot['valid_at']
    curves = (
        (X_test_plot['pm2p5_x'], 'PM2.5 originale', 'red'),
        (y_pred, 'PM2.5 predetto', 'blue'),
        (X_test_plot['pm2p5_y'], 'PM2.5 ref station', 'black'),
    )

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.grid(True)
    for values, curve_label, curve_color in curves:
        ax.plot(timestamps, values, label=curve_label, color=curve_color)

    # One major tick per month, formatted as year-month.
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    ax.set_xlabel('Data')
    ax.set_ylabel('PM2.5')
    ax.set_title('PM2.5 originale vs PM2.5 predetto')
    plt.xticks(rotation=90)
    ax.legend()
    plt.show()


def full_pipeline(X_train, y_train, X_test, y_test, model, column, categorical_columns, strategy):
    """Preprocess, scale, fit, plot and score in one call.

    Steps: preprocessing via ``preprocess_pipeline_simple_model``; a copy of
    the preprocessed test frame (plus ``pm2p5_y``) is kept for plotting; the
    columns listed in ``column`` are removed; features are standardised; the
    model is fitted and evaluated; predictions are plotted and the metrics
    printed.

    Returns ``(metrics, model)``.
    """
    train_features, test_features = preprocess_pipeline_simple_model(
        X_train, X_test, column, categorical_columns, strategy
    )
    assert train_features.shape[1] == test_features.shape[1], "Mismatch in feature dimensions after preprocessing!"

    # Keep the still-complete test frame for the plot before dropping columns.
    plot_frame = test_features.copy()
    plot_frame['pm2p5_y'] = y_test
    train_features, test_features = delete_features(train_features, test_features, column)

    train_scaled, test_scaled = scale_features(train_features, test_features)
    metrics, model, predictions = train_and_evaluate_model(model, train_scaled, y_train, test_scaled, y_test)
    plot_predictions(predictions, plot_frame)
    print(metrics)
    return metrics, model

## TEST PROGRESSIVO: DATASET TARGET --> TRAIN = TRAINING DATASET TARGET + ALTRO DATASET AGGIUNTO UNO ALLA VOLTA E TEST = TESTING DATASET TARGET

In [None]:
#carico i singoli dataset 
# One raw CSV per (city, sensor) dataset, read in a fixed order.
(aosta, badajoz, bangalore, calgary, delhi,
 hamirpur, lima_iqair, lima_airbeam, uk_pms5003, uk_sps030) = map(pd.read_csv, (
    '../dataset/aosta.csv',
    '../dataset/badajoz.csv',
    '../dataset/bangalore.csv',
    '../dataset/calgary.csv',
    '../dataset/delhi.csv',
    '../dataset/hamirpur.csv',
    '../dataset/lima_iqair.csv',
    '../dataset/lima_airbeam.csv',
    '../dataset/uk_pms5003.csv',
    '../dataset/uk_sps030.csv',
))

In [None]:
# Write the training/testing CSV pair of every dataset, in loading order.
for dataset in (aosta, badajoz, bangalore, calgary, delhi,
                hamirpur, lima_iqair, lima_airbeam, uk_pms5003, uk_sps030):
    pipeline_train_test(dataset)

In [None]:
import lightgbm as lgb
import pandas as pd

categorical_columns = ['type_sensor', 'sensor_id', 'day_of_week']

# Datasets loaded in a previous cell.
datasets = [aosta, badajoz, bangalore, calgary, delhi, hamirpur, lima_iqair, lima_airbeam, uk_pms5003, uk_sps030]

# Metric records are collected in a list and turned into a frame once at the
# end, instead of concatenating onto a DataFrame at every iteration.
incremental_rows = []

# Every dataset takes a turn as the target.
for dataset in datasets:
    city = dataset['city'].iloc[0]
    print(f"Testo la città {city}")
    type_sensor = dataset['type_sensor'].iloc[0]
    if type_sensor == 'PMS7003' and city == 'Lima':
        type_sensor = 'AIRBEAM'
    if city == 'Southampton' or city == 'Lima':
        # Join city name and sensor type (matches the saved file names).
        city = city + '_' + type_sensor

    training = pd.read_csv(f'../dataset/training_{city}.csv', parse_dates=['valid_at'])
    testing = pd.read_csv(f'../dataset/testing_{city}.csv', parse_dates=['valid_at'])

    # Number of datasets currently in the training set; starts at 1 for the
    # target's own training split. The previous counter was incremented before
    # the skip, so its value depended on the target's position in `datasets`.
    num_datasets = 1

    # Add the other datasets to the training set one at a time.
    for data in datasets:
        # Skip the target itself by identity. Comparing the suffixed `city`
        # (e.g. 'Lima_IQAir') with the raw city of `data` never matched for
        # Lima/Southampton, so the target's full raw dataset -- test rows
        # included -- ended up in the training set.
        if data is dataset:
            continue
        num_datasets += 1
        training = pd.concat([training, data], axis=0)
        trained_cities = ', '.join(training['city'].unique())
        print('Traino sui dataset con le città aggiunte', training['city'].unique())
        # The helper columns only exist if the raw frames went through the
        # split function in this session; tolerate their absence.
        X_train = training.drop(columns='pm2p5_y').drop(columns=['year_month', 'day_hour'], errors='ignore')
        y_train = training['pm2p5_y']
        X_test = testing.drop(columns='pm2p5_y')
        y_test = testing['pm2p5_y']
        model = lgb.LGBMRegressor(num_leaves=50, learning_rate=0.05, max_depth=-1, n_estimators=500,
                                  random_state=982, verbose=0)
        metrics, model = full_pipeline(X_train, y_train, X_test, y_test, model, ['city', 'valid_at'],
                                       categorical_columns, 'drop')
        incremental_rows.append({'city': trained_cities, 'type_sensor': type_sensor,
                                 'num dataset': num_datasets, 'R2': metrics['R2']})

metrics_incremental_df = pd.DataFrame(incremental_rows, columns=['city', 'type_sensor', 'num dataset', 'R2'])
metrics_incremental_df

In [None]:
# Persist the incremental-test metrics as CSV (without the index column).
metrics_incremental_df.to_csv('../dataset/metrics_incremental_df.csv', index=False)

## TEST A COPPIE: DATASET TARGET --> TRAIN = TRAINING DATASET TARGET + ALTRO DATASET E TEST = TESTING DATASET TARGET

In [None]:
categorical_columns = ['type_sensor', 'sensor_id', 'day_of_week']

datasets = [aosta, badajoz, bangalore, calgary, delhi, hamirpur, lima_iqair, lima_airbeam, uk_pms5003, uk_sps030]

# Metric records are collected in a list and turned into a frame once at the end.
coppie_rows = []

# Every dataset takes a turn as the target.
for dataset in datasets:
    city = dataset['city'].iloc[0]
    type_sensor = dataset['type_sensor'].iloc[0]
    if type_sensor == 'PMS7003' and city == 'Lima':
        type_sensor = 'AIRBEAM'
    if city == 'Southampton' or city == 'Lima':
        # Join city name and sensor type (matches the saved file names).
        city = city + '_' + type_sensor

    training = pd.read_csv(f'../dataset/training_{city}.csv', parse_dates=['valid_at'])
    testing = pd.read_csv(f'../dataset/testing_{city}.csv', parse_dates=['valid_at'])

    # Pair the target's training split with each other dataset in turn.
    for data in datasets:
        # Skip the target itself by identity. Comparing the suffixed `city`
        # (e.g. 'Lima_IQAir') with the raw city of `data` never matched for
        # Lima/Southampton, so the target's full raw dataset -- test rows
        # included -- ended up in the training set.
        if data is dataset:
            continue
        type_sensor_2 = data['type_sensor'].iloc[0]
        training_new = pd.concat([training, data], axis=0, ignore_index=True)
        trained_cities = ', '.join(training_new['city'].unique())
        print('Traino sui dataset con le città:', training_new['city'].unique())
        print('Traino sui dataset con i sensori:', training_new['type_sensor'].unique())
        # The helper columns only exist if the raw frames went through the
        # split function in this session; tolerate their absence.
        X_train = training_new.drop(columns='pm2p5_y').drop(columns=['day_hour', 'year_month'], errors='ignore')
        y_train = training_new['pm2p5_y']
        X_test = testing.drop(columns='pm2p5_y')
        y_test = testing['pm2p5_y']
        model = lgb.LGBMRegressor(num_leaves=50, learning_rate=0.05, max_depth=-1, n_estimators=500,
                                  random_state=982, verbose=0)
        metrics, model = full_pipeline(X_train, y_train, X_test, y_test, model, ['city', 'valid_at'],
                                       categorical_columns, 'drop')
        coppie_rows.append({'city': trained_cities, 'sensor': type_sensor_2, 'R2': metrics['R2']})

metrics_df_coppie = pd.DataFrame(coppie_rows, columns=['city', 'sensor', 'R2'])
metrics_df_coppie

In [None]:
# Persist the pairwise-test metrics as CSV (without the index column).
metrics_df_coppie.to_csv('../dataset/metrics_df_coppie.csv', index=False)

# Test Generalizzazione
# 1. intero dataset come test
# 2. 25% dataset target e training restante 75% dataset target + altri dataset

In [None]:
# Preprocessing helpers for the generalisation tests: feature removal, missing values, categorical and cyclic encoding
def delete_features(X_train, X_test, column):
    """Remove the given column(s) from the train and test feature frames.

    ``column`` is either one column name or an iterable of names. New frames
    are returned as ``(X_train, X_test)``; the inputs are left as they were.
    """
    to_drop = [column] if isinstance(column, str) else column
    for name in to_drop:
        X_train, X_test = X_train.drop(columns=name), X_test.drop(columns=name)
    return X_train, X_test


#funzione per gestire i valori mancanti
def handle_missing_values(X_train, X_test, strategy='drop'):
    """Remove or impute missing values in the two feature frames.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
    strategy : {'drop', 'mean'}
        ``'drop'`` removes every row containing a missing value. Only the
        feature frames are touched, so the caller must keep its targets
        aligned. ``'mean'`` fills numeric columns with the column means of
        ``X_train``; the same training means are used for ``X_test`` so no
        test statistics are used and both frames are imputed consistently.
        Non-numeric columns are left as they are.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as new frames.

    Raises
    ------
    ValueError
        For any other ``strategy``.
    """
    if strategy == 'drop':
        return X_train.dropna(), X_test.dropna()
    elif strategy == 'mean':
        # numeric_only: the frames also carry text/datetime columns, on which
        # a plain .mean() raises with current pandas.
        train_means = X_train.mean(numeric_only=True)
        return X_train.fillna(train_means), X_test.fillna(train_means)
    else:
        raise ValueError("Strategia non supportata. Usa 'drop' o 'mean'.")


#funzione per preprocessare le variabili categoriche
def preprocess_categorical(X, categorical_columns):
    """Encode the categorical columns of ``X`` and return the encoded frame.

    ``sensor_id`` and ``type_sensor`` become integer category codes; any other
    listed column is one-hot encoded with ``pd.get_dummies`` (first level
    dropped).

    Encoding is done on a copy, so the caller's frame is not modified.

    NOTE(review): the integer codes depend on the values present in ``X``
    only; frames encoded separately (train vs. test) may map the same value to
    different codes. Confirm this is acceptable.

    Parameters
    ----------
    X : pandas.DataFrame
    categorical_columns : iterable of str

    Returns
    -------
    pandas.DataFrame
    """
    X = X.copy()  # the original wrote the codes straight into the caller's frame
    for col in categorical_columns:
        if col in ('sensor_id', 'type_sensor'):
            X[col] = pd.Categorical(X[col]).codes
        else:
            X = pd.get_dummies(X, columns=[col], drop_first=True)
    return X


#funzione per trasformare in variabili cicliche
def add_cyclic_features(X_train, X_test, columns, periods=None):
    """Replace each column in ``columns`` by its ``<col>_sin`` / ``<col>_cos`` pair.

    Train and test share one period per column, so the same raw value gets
    the same encoding in both frames. Dividing each frame by its own maximum,
    as was done before, shifted the test encoding whenever the two maxima
    differed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Not modified; encoded copies are returned.
    columns : iterable of str
        Columns to encode; each is dropped after encoding.
    periods : dict, optional
        Column name -> period (e.g. ``{'hour': 24}``). Columns without an
        entry use the maximum observed in ``X_train``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` with the encoded columns appended.
    """
    periods = periods or {}
    # Copies keep the caller's frames free of the temporary columns.
    X_train = X_train.copy()
    X_test = X_test.copy()
    for col in columns:
        period = periods[col] if col in periods else X_train[col].max()
        for frame in (X_train, X_test):
            angle = 2 * np.pi * frame[col] / period
            frame[col + '_sin'] = np.sin(angle)
            frame[col + '_cos'] = np.cos(angle)
        X_train = X_train.drop(columns=[col])
        X_test = X_test.drop(columns=[col])
    return X_train, X_test


#definizione pipeline preprocessamento
def preprocess_pipeline_simple_model(X_train, X_test, column, categorical_columns, strategy):
    """Handle missing values, encode categoricals and add cyclic time features.

    ``strategy`` is forwarded to ``handle_missing_values``. ``column`` is
    accepted but not used here: those columns are removed later by the caller.

    Returns the pair ``(X_train, X_test)``.
    """
    train_filled, test_filled = handle_missing_values(X_train, X_test, strategy)
    train_encoded = preprocess_categorical(train_filled, categorical_columns)
    test_encoded = preprocess_categorical(test_filled, categorical_columns)
    return add_cyclic_features(train_encoded, test_encoded, ['hour', 'day', 'month', 'year'])

In [None]:

#funzione per scalare le feature
def scale_features(X_train, X_test):
    """Standardise train and test features with a scaler fitted on the training set.

    Returns ``(X_train_scaled, X_test_scaled)``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)


# Valutazione del modello
def evaluate_model(model, X_test, y_test, y_pred=None):
    """Return MAE, MSE, RMSE and R2 of the predictions against ``y_test``.

    When ``y_pred`` is ``None`` the predictions are obtained from
    ``model.predict(X_test)``; otherwise ``model`` and ``X_test`` are not used.
    The result is a dict with the keys ``'MAE'``, ``'MSE'``, ``'RMSE'``, ``'R2'``.
    """
    predictions = model.predict(X_test) if y_pred is None else y_pred
    squared_error = mean_squared_error(y_test, predictions)
    return {
        'MAE': mean_absolute_error(y_test, predictions),
        'MSE': squared_error,
        'RMSE': np.sqrt(squared_error),
        'R2': r2_score(y_test, predictions),
    }


#funzione per addestrare e valutare il modello
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, predict on the test features and compute the metrics.

    Returns ``(metrics, model, y_pred)``; ``metrics`` comes from
    ``evaluate_model``.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    scores = evaluate_model(model, X_test, y_test, test_predictions)
    return scores, model, test_predictions


def plot_predictions(y_pred, X_test_plot):
    """Draw raw, predicted and reference PM2.5 over time on a single figure.

    ``X_test_plot`` supplies ``valid_at`` (x axis), ``pm2p5_x`` (raw sensor
    reading) and ``pm2p5_y`` (reference station); ``y_pred`` holds the model
    predictions for the same rows.
    """
    timestamps = X_test_plot['valid_at']
    curves = (
        (X_test_plot['pm2p5_x'], 'PM2.5 originale', 'red'),
        (y_pred, 'PM2.5 predetto', 'blue'),
        (X_test_plot['pm2p5_y'], 'PM2.5 ref station', 'black'),
    )

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.grid(True)
    for values, curve_label, curve_color in curves:
        ax.plot(timestamps, values, label=curve_label, color=curve_color)

    # One major tick per month, formatted as year-month, labels rotated.
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.xticks(rotation=90)
    ax.set_xlabel('Data')
    ax.set_ylabel('PM2.5')
    ax.set_title('PM2.5 originale vs PM2.5 predetto')
    ax.legend()
    plt.show()


def pipeline_generalizzazione_intero(X_train, y_train, X_test, y_test, model, column, categorical_columns, strategy):
    """Preprocess, scale, fit, plot and score for the generalisation tests.

    Steps: preprocessing via ``preprocess_pipeline_simple_model``; a copy of
    the preprocessed test frame (plus ``pm2p5_y``) is kept for plotting; the
    columns listed in ``column`` are removed; features are standardised; the
    model is fitted and evaluated; predictions are plotted and the metrics
    printed.

    If preprocessing removes rows (the ``'drop'`` strategy calls ``dropna`` on
    the feature frames only), the matching entries are removed from
    ``y_train`` / ``y_test`` so features and targets stay aligned. Before this
    change a single missing value caused a length mismatch at fit/score time.

    Returns ``(metrics, model)``.
    """
    # Remember which rows are complete *before* preprocessing. A positional
    # mask is used because the training index may contain duplicates after
    # several frames were concatenated.
    train_complete = X_train.notna().all(axis=1).to_numpy()
    test_complete = X_test.notna().all(axis=1).to_numpy()
    n_train, n_test = len(X_train), len(X_test)

    X_train, X_test = preprocess_pipeline_simple_model(X_train, X_test, column, categorical_columns, strategy)
    assert X_train.shape[1] == X_test.shape[1], "Mismatch in feature dimensions after preprocessing!"

    # Only realign when rows were actually dropped.
    if len(X_train) != n_train:
        y_train = y_train[train_complete]
    if len(X_test) != n_test:
        y_test = y_test[test_complete]

    X_test_plot = X_test.copy()
    X_test_plot['pm2p5_y'] = y_test
    X_train, X_test = delete_features(X_train, X_test, column)

    X_train_scaled, X_test_scaled = scale_features(X_train, X_test)
    metrics, model, y_pred = train_and_evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test)
    plot_predictions(y_pred, X_test_plot)
    print(metrics)
    return metrics, model

## Intero dataset come test

In [None]:
# Combined dataset of all cities; valid_at is parsed as datetime for the time-axis plots.
df = pd.read_csv('../dataset/all_dataset.csv', parse_dates=['valid_at'])

In [None]:
#dataframe per metriche
categorical_columns = ['type_sensor', 'sensor_id', 'day_of_week']

# Metric records are collected in a list and turned into a frame once at the end.
intero_rows = []

# Every (city, sensor type) pair takes a turn as the test set.
for city in df['city'].unique():
    # Sensor types present in this city.
    city_data = df[df['city'] == city]

    for sensor in city_data['type_sensor'].unique():
        # Test rows: this city with this sensor type. Masks are built once and reused.
        is_test = (df['city'] == city) & (df['type_sensor'] == sensor)
        # Training rows: a different city AND a different sensor type.
        # NOTE(review): this also leaves out other cities that use the same
        # sensor type and other sensors of the same city -- kept as written,
        # confirm it is the intended filter.
        is_train = (df['city'] != city) & (df['type_sensor'] != sensor)

        X_test = df[is_test].drop(columns=['pm2p5_y'])
        y_test = df.loc[is_test, 'pm2p5_y']
        X_train = df[is_train].drop(columns=['pm2p5_y'])
        y_train = df.loc[is_train, 'pm2p5_y']

        print(f"Testo la città {city} e il sensore {sensor}")
        model = lgb.LGBMRegressor(num_leaves=50, learning_rate=0.05, max_depth=-1, n_estimators=500, random_state=982,
                                  verbose=0)
        metrics, model = pipeline_generalizzazione_intero(X_train, y_train, X_test, y_test, model, ['valid_at', 'city'],
                                                          categorical_columns, 'drop')
        intero_rows.append({'city': city, 'type_sensor': sensor, 'MAE': metrics['MAE'], 'MSE': metrics['MSE'],
                            'RMSE': metrics['RMSE'], 'R2': metrics['R2']})

metrics_df_intero = pd.DataFrame(intero_rows, columns=['city', 'type_sensor', 'MAE', 'MSE', 'RMSE', 'R2'])
metrics_df_intero

In [None]:
# Persist the whole-dataset-as-test metrics as CSV (without the index column).
metrics_df_intero.to_csv('../dataset/metrics_df_intero.csv', index=False)

## Stessa analisi precedente, ma aggiungo il training del dataset target: 25% dataset target e training restante 75% dataset target + altri dataset

In [None]:
#carico i singoli dataset 
# Fresh copies of the raw per-dataset CSV files, read in a fixed order.
(aosta, badajoz, bangalore, calgary, delhi,
 hamirpur, lima_iqair, lima_airbeam, uk_pms5003, uk_sps030) = map(pd.read_csv, (
    '../dataset/aosta.csv',
    '../dataset/badajoz.csv',
    '../dataset/bangalore.csv',
    '../dataset/calgary.csv',
    '../dataset/delhi.csv',
    '../dataset/hamirpur.csv',
    '../dataset/lima_iqair.csv',
    '../dataset/lima_airbeam.csv',
    '../dataset/uk_pms5003.csv',
    '../dataset/uk_sps030.csv',
))

In [None]:
#lista dei dataset caricati 
datasets = [aosta, badajoz, bangalore, calgary, delhi, hamirpur, lima_iqair, lima_airbeam, uk_pms5003, uk_sps030]
categorical_columns = ['type_sensor', 'sensor_id', 'day_of_week']

# Metric records are collected in a list and turned into a frame once at the end.
new_rows = []

for data in datasets:
    city = data['city'].iloc[0]
    type_sensor = data['type_sensor'].iloc[0]
    if type_sensor == 'PMS7003' and city == 'Lima':
        type_sensor = 'AIRBEAM'
    if city == 'Southampton' or city == 'Lima':
        # Join city name and sensor type (matches the saved file names).
        city = city + '_' + type_sensor

    # The saved split is the one actually used. The sequential split on `df`
    # that used to be computed here was overwritten immediately by these two
    # reads, so it has been removed.
    training = pd.read_csv(f'../dataset/training_{city}.csv', parse_dates=['valid_at'])
    testing = pd.read_csv(f'../dataset/testing_{city}.csv', parse_dates=['valid_at'])

    # Append every other raw dataset to the target's training split in one concat.
    training = pd.concat([training] + [other for other in datasets if other is not data])

    # The helper columns are only present if the raw frames went through the
    # split function in this session; tolerate their absence.
    X_train = training.drop(columns='pm2p5_y').drop(columns=['year_month', 'day_hour'], errors='ignore')
    y_train = training['pm2p5_y']
    X_test = testing.drop(columns=['pm2p5_y'])
    y_test = testing['pm2p5_y']

    model = lgb.LGBMRegressor(num_leaves=50, learning_rate=0.05, max_depth=-1, n_estimators=500, random_state=982,
                              verbose=0)
    metrics, model = pipeline_generalizzazione_intero(X_train, y_train, X_test, y_test, model, ['valid_at', 'city'],
                                                      categorical_columns, 'drop')
    new_rows.append({'city': city, 'type_sensor': type_sensor, 'MAE': metrics['MAE'], 'MSE': metrics['MSE'],
                     'RMSE': metrics['RMSE'], 'R2': metrics['R2']})

metrics_df_new = pd.DataFrame(new_rows, columns=['city', 'type_sensor', 'MAE', 'MSE', 'RMSE', 'R2'])
metrics_df_new

In [None]:
# Persist the "target training split + all other datasets" metrics as CSV (without the index column).
metrics_df_new.to_csv('../dataset/metrics_df_25_ele.csv', index=False)

# PCA

In [None]:
from sklearn.decomposition import PCA


In [None]:
# Reload the combined dataset for the PCA section (no date parsing requested here).
df = pd.read_csv('../dataset/all_dataset.csv')

#
# # df not categorical to categorical
# df['day_of_week'] = pd.Categorical(df['day_of_week'])
# df['city'] = pd.Categorical(df['city'])
# df['sensor_id'] = pd.Categorical(df['sensor_id'])
# df['type_sensor'] = pd.Categorical(df['type_sensor'])
#
# cities_dic = {i: city for i, city in enumerate(df['city'].unique())}
# print(cities_dic)
#
# df['day_of_week'] = pd.Categorical(df['day_of_week']).codes
# df['city'] = pd.Categorical(df['city']).codes
# df['sensor_id'] = pd.Categorical(df['sensor_id']).codes
# df['type_sensor'] = pd.Categorical(df['type_sensor']).codes

In [None]:
# Show the combined dataset as the cell output.
df

In [None]:
# Give the datasets that share a city distinct labels:
#   Lima + IQAir             -> 'Lima_IQAir'
#   Lima + any other sensor  -> 'Lima_AIRBEAM'
#   Southampton              -> 'Southampton_<type_sensor>'
# Vectorised mask assignments replace the row-by-row df.apply(axis=1).
# All masks are computed before the first assignment so they refer to the
# original city values.
is_lima = df['city'] == 'Lima'
is_southampton = df['city'] == 'Southampton'
is_iqair = df['type_sensor'] == 'IQAir'

df.loc[is_lima & is_iqair, 'city'] = 'Lima_IQAir'
df.loc[is_lima & ~is_iqair, 'city'] = 'Lima_AIRBEAM'
df.loc[is_southampton, 'city'] = 'Southampton_' + df.loc[is_southampton, 'type_sensor']


In [None]:
# concat datasets and perform pca
# df_for_pca = df.drop(columns=['valid_at'])
# Numeric measurement columns that go into the PCA.
df_for_pca = df.loc[:, ['pm2p5_x', 'pm2p5_y', 'relative_humidity', 'temperature', 'wind_speed', 'rain']]

# Two-component projection: per-component variance ratio, singular values, total explained variance.
pca_2 = PCA(n_components=2)
pca_res = pca_2.fit_transform(df_for_pca)
pca_df_2 = pd.DataFrame(pca_res, columns=['principal_component_1', 'principal_component_2'])
for summary in (pca_2.explained_variance_ratio_, pca_2.singular_values_, pca_2.explained_variance_ratio_.sum()):
    print(summary)

# Three-component projection, same diagnostics.
pca_3 = PCA(n_components=3)
pca_res = pca_3.fit_transform(df_for_pca)
pca_df_3 = pd.DataFrame(
    pca_res,
    columns=['principal_component_1', 'principal_component_2', 'principal_component_3'],
)
for summary in (pca_3.explained_variance_ratio_, pca_3.singular_values_, pca_3.explained_variance_ratio_.sum()):
    print(summary)

In [None]:
# add city to pca_df
# Attach the city label to both projections (aligned on the row index).
for projection in (pca_df_2, pca_df_3):
    projection['city'] = df['city']


In [None]:
 # scatter the first two principal components, one colour per city
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = pca_df_2['city'].unique()
# Qualitative palette with ten distinct colours. The previous hand-written
# list contained 'w' (white, invisible on the white axes) and, combined with
# zip(), silently dropped any city beyond the tenth.
palette = plt.get_cmap('tab10')
for position, target in enumerate(targets):
    indicesToKeep = pca_df_2['city'] == target
    ax.scatter(pca_df_2.loc[indicesToKeep, 'principal_component_1'],
               pca_df_2.loc[indicesToKeep, 'principal_component_2'],
               color=palette(position % palette.N),  # colours repeat only past ten cities
               alpha=0.5,
               s=50,
               label=target)
ax.legend()
ax.grid()
plt.show()

In [None]:
# Pairwise grid of the two principal components, coloured by city.
import seaborn as sns

sns.pairplot(data=pca_df_2, hue='city', palette='bright')

# TODO: put the legend in the top-left corner inside the plot

plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# First vs. second principal component, coloured by city.
scatter_ax = sns.scatterplot(x='principal_component_1', y='principal_component_2',
                             hue='city', palette='bright', data=pca_df_2)
scatter_ax.set_title('PCA Scatter Plot')
scatter_ax.set_xlabel('First Principal Component')
scatter_ax.set_ylabel('Second Principal Component')
plt.show()

In [None]:
# Pairwise grid of the three principal components, coloured by city.
import seaborn as sns

sns.pairplot(data=pca_df_3, hue='city', palette='bright')
plt.show()


In [None]:
# The three 2-D views of the 3-component projection: (x column, y column, title prefix).
component_views = (
    ('principal_component_1', 'principal_component_2', 'C1 - C2: '),
    ('principal_component_2', 'principal_component_3', 'C2 - C3: '),
    ('principal_component_1', 'principal_component_3', 'C1 - C3: '),
)

for column in ['pm2p5_x', 'pm2p5_y', 'relative_humidity', 'temperature', 'wind_speed', 'rain']:
    # Copy the original feature next to the components so it can drive the hue.
    pca_df_3[column] = df[column]

    for x_component, y_component, title_prefix in component_views:
        plt.figure(figsize=(10, 8))
        sns.scatterplot(data=pca_df_3, x=x_component, y=y_component, hue=column)
        plt.title(title_prefix + column)
        plt.show()

In [None]:
# # pca visualization for 3 dimension hue based on relative_humidity with ranges
# pca_df_3['relative_humidity'] = df['relative_humidity']
# # pca_df_3['relative_humidity'] = pd.cut(pca_df_3['relative_humidity'], bins=5)
# sns.pairplot(pca_df_3, hue='relative_humidity')
# plt.show()

In [None]:

fig = plt.figure(figsize=(15, 8))
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_zlabel('Principal Component 3', fontsize=15)
ax.set_title('3 component PCA', fontsize=20)

targets = pca_df_3['city'].unique()
# Qualitative palette with ten distinct colours. The previous hand-written
# list contained 'w' (white, invisible on the white axes) and, combined with
# zip(), silently dropped any city beyond the tenth.
palette = plt.get_cmap('tab10')
for position, target in enumerate(targets):
    indicesToKeep = pca_df_3['city'] == target
    ax.scatter(pca_df_3.loc[indicesToKeep, 'principal_component_1'],
               pca_df_3.loc[indicesToKeep, 'principal_component_2'],
               pca_df_3.loc[indicesToKeep, 'principal_component_3'],
               color=palette(position % palette.N),  # colours repeat only past ten cities
               alpha=0.5,
               s=50,
               label=target)
ax.legend()
ax.grid()
plt.show()

In [None]:
# use pca for selecting datasets

In [None]:
# Select the rows of the other cities that fall inside the target city's PCA bounding box
def calculate_intersected_data(pca_df, number_of_components, target_city, addition=50):
    """Return the index of non-target rows lying inside the target city's PCA range.

    For each of the first ``number_of_components`` principal components the
    minimum and maximum over the rows of ``target_city`` are computed and
    widened by ``addition`` on both sides. A row belonging to any other city is
    selected when every considered component lies inside the widened interval
    (bounds inclusive).

    Parameters
    ----------
    pca_df : pandas.DataFrame
        Frame with a ``city`` column and the columns ``principal_component_1``,
        ``principal_component_2`` (and ``principal_component_3`` when three
        components are requested).
    number_of_components : int
        Number of leading components to use; must be 2 or 3.
    target_city : object
        Value of ``city`` identifying the target rows.
    addition : float, optional
        Margin added to each side of every component range. Defaults to 50,
        the value that was previously hard-coded.

    Returns
    -------
    pandas.Index
        Index labels of the selected rows, in their original order.

    Raises
    ------
    ValueError
        If ``number_of_components`` is neither 2 nor 3. Other values used to
        be silently treated as a mix of the two cases.
    """
    if number_of_components not in (2, 3):
        raise ValueError(
            f"number_of_components must be 2 or 3, got {number_of_components!r}")

    component_columns = [f'principal_component_{i}'
                         for i in range(1, number_of_components + 1)]

    is_target = pca_df['city'] == target_city
    target_data = pca_df.loc[is_target, component_columns]
    other_data = pca_df.loc[~is_target, component_columns]

    # Per-component bounds of the target city, widened by the margin
    lower_bounds = target_data.min() - addition
    upper_bounds = target_data.max() + addition

    # A row is kept only if every component is inside its interval; NaN bounds
    # (no target rows) or NaN values compare as False, so nothing is selected.
    inside_box = (other_data.ge(lower_bounds, axis='columns')
                  & other_data.le(upper_bounds, axis='columns')).all(axis=1)

    return other_data[inside_box].index


## 2 principal components: whole target city as test set ("tutto") and 75% split

In [None]:
# Maps each target city to a copy of pca_df_3 carrying an 'intersection' label
results = {}

# For every city, label the rows of the other cities that fall inside its
# bounding box on the first 2 principal components (no model is trained here)
for target_city in pca_df_3['city'].unique():
    data_intersected_indexes = calculate_intersected_data(pca_df_3, 2, target_city)

    print(f"Intersected data for {target_city}: {data_intersected_indexes}")

    pca_df_3_ = pca_df_3.copy()
    # 1 = row of the target city, 2 = row selected from another city, 0 = anything else.
    # np.select takes the first matching condition, like the former row-wise
    # lambda, but is vectorised instead of a Python call per row.
    pca_df_3_['intersection'] = np.select(
        [pca_df_3_['city'] == target_city,
         pca_df_3_.index.isin(data_intersected_indexes)],
        [1, 2],
        default=0)

    results[target_city] = pca_df_3_

    # scatterplot of PC1 vs PC2 coloured by intersection class
    plt.figure(figsize=(10, 8))
    sns.scatterplot(data=pca_df_3_, x='principal_component_1', y='principal_component_2', hue='intersection')
    plt.title(f'PCA 2D for {target_city}')
    plt.show()


In [None]:
# Summarise the labelling for every target city: first the size of each
# intersection class, then the source cities of the selected (class 2) rows.
for city, data in results.items():
    print(f"City: {city}")
    class_counts = data['intersection'].value_counts()
    print(class_counts)

    # rows labelled 2, counted by the city they come from
    selected_mask = data['intersection'] == 2
    print(data.loc[selected_mask, 'city'].value_counts())


### tutto

In [None]:
# "tutto" experiment (2 components): train only on the rows selected from the
# other cities and test on every row of the target city.
categorical_columns = ['type_sensor', 'sensor_id', 'day_of_week']
# One dict per evaluated city; the frame is built once after the loop instead
# of being re-concatenated (starting from an empty frame) on every iteration.
metrics_rows = []

for city, data in results.items():
    target_city_indexes = data[data['intersection'] == 1].index
    intersected_cities_indexes = data[data['intersection'] == 2].index

    # NOTE(review): assumes df shares its index labels with pca_df_3 - confirm
    target_city = df[df.index.isin(target_city_indexes)]
    intersected_cities = df[df.index.isin(intersected_cities_indexes)]

    print(f"Testo la città {city}")
    # print unique cities value count
    print(target_city['city'].value_counts())
    print(intersected_cities['city'].value_counts())

    # nothing to train on: skip this city
    if len(intersected_cities) == 0:
        continue

    X_train = intersected_cities.drop(columns=['pm2p5_y'])
    y_train = intersected_cities['pm2p5_y']
    X_test = target_city.drop(columns=['pm2p5_y'])
    y_test = target_city['pm2p5_y']
    model = lgb.LGBMRegressor(num_leaves=50, learning_rate=0.05, max_depth=-1, n_estimators=500, random_state=982,
                              verbose=0)
    # pipeline_generalizzazione_intero is defined elsewhere in the notebook;
    # presumably ['valid_at', 'city'] are columns it removes and 'drop' selects
    # how that is done - TODO confirm against its definition
    metrics, model = pipeline_generalizzazione_intero(X_train, y_train, X_test, y_test, model, ['valid_at', 'city'],
                                                      categorical_columns, 'drop')
    # collect the metrics of this city
    new_row = {'city': city, 'intersected_cities': ", ".join(map(str, intersected_cities['city'].unique())),
               'MAE': metrics['MAE'], 'MSE': metrics['MSE'], 'RMSE': metrics['RMSE'], 'R2': metrics['R2']}
    metrics_rows.append(new_row)

metrics_df_pca = pd.DataFrame(metrics_rows, columns=['city', 'intersected_cities', 'MAE', 'MSE', 'RMSE', 'R2'])
metrics_df_pca

In [None]:
# Persist the 2-component "tutto" metrics as CSV, without the row index
metrics_df_pca.to_csv('../dataset/metrics_df_pca_2_intero.csv', index=False)

### 75%

In [None]:
# 75% experiment (2 components): train on the rows selected from the other
# cities plus the target city's saved training split, test on its testing split.
categorical_columns = ['type_sensor', 'sensor_id', 'day_of_week']
# One dict per evaluated city; the frame is built once after the loop instead
# of being re-concatenated (starting from an empty frame) on every iteration.
metrics_rows = []

for city, data in results.items():
    intersected_cities_indexes = data[data['intersection'] == 2].index

    intersected_cities = df[df.index.isin(intersected_cities_indexes)]
    # check before touching the disk: skipped cities need no CSV reads
    if len(intersected_cities) == 0:
        continue

    target_city_training = pd.read_csv(f'../dataset/training_{city}.csv', parse_dates=['valid_at'])
    target_city_testing = pd.read_csv(f'../dataset/testing_{city}.csv', parse_dates=['valid_at'])

    # NOTE(review): assumes the CSV splits have the same columns as df - confirm
    training = pd.concat([intersected_cities, target_city_training])

    print(f"Testo la città {city}")
    # print unique cities value count
    print(training['city'].value_counts())
    print(target_city_testing['city'].value_counts())

    X_train = training.drop(columns=['pm2p5_y'])
    y_train = training['pm2p5_y']
    X_test = target_city_testing.drop(columns=['pm2p5_y'])
    y_test = target_city_testing['pm2p5_y']
    model = lgb.LGBMRegressor(num_leaves=50, learning_rate=0.05, max_depth=-1, n_estimators=500, random_state=982,
                              verbose=0)
    # pipeline_generalizzazione_intero is defined elsewhere in the notebook;
    # presumably ['valid_at', 'city'] are columns it removes and 'drop' selects
    # how that is done - TODO confirm against its definition
    metrics, model = pipeline_generalizzazione_intero(X_train, y_train, X_test, y_test, model, ['valid_at', 'city'],
                                                      categorical_columns, 'drop')
    # collect the metrics of this city
    new_row = {'city': city, 'intersected_cities': ", ".join(map(str, intersected_cities['city'].unique())),
               'MAE': metrics['MAE'], 'MSE': metrics['MSE'], 'RMSE': metrics['RMSE'], 'R2': metrics['R2']}
    metrics_rows.append(new_row)

metrics_df_pca_75 = pd.DataFrame(metrics_rows, columns=['city', 'intersected_cities', 'MAE', 'MSE', 'RMSE', 'R2'])
metrics_df_pca_75

In [None]:
# Persist the 2-component 75% metrics as CSV, without the row index
metrics_df_pca_75.to_csv('../dataset/metrics_df_pca_2_75.csv', index=False)

### 25 initial (first 25% of the target city's rows used for training, remaining 75% for testing)

In [None]:
# "25 initial" experiment: train on the rows selected from the other cities
# plus the first 25% of the target city's rows, test on the remaining 75%.
categorical_columns = ['type_sensor', 'sensor_id', 'day_of_week']
# One dict per evaluated city; the frame is built once after the loop instead
# of being re-concatenated (starting from an empty frame) on every iteration.
metrics_rows = []

for city, data in results.items():
    intersected_cities_indexes = data[data['intersection'] == 2].index

    intersected_cities = df[df.index.isin(intersected_cities_indexes)]
    # nothing to add from other cities: skip before doing any further work
    if len(intersected_cities) == 0:
        continue

    target_city_indexes = data[data['intersection'] == 1].index
    target_city = df[df.index.isin(target_city_indexes)]

    # Calculate the split index
    split_index = int(len(target_city) * 0.25)

    # Perform the sequential split (positional, in df row order)
    # NOTE(review): "initial" is only chronological if df is sorted by time - confirm
    target_city_training = target_city.iloc[:split_index]
    target_city_testing = target_city.iloc[split_index:]

    training = pd.concat([intersected_cities, target_city_training])

    print(f"Testo la città {city}")
    # print unique cities value count
    print(training['city'].value_counts())
    print(target_city_testing['city'].value_counts())

    X_train = training.drop(columns=['pm2p5_y'])
    y_train = training['pm2p5_y']
    X_test = target_city_testing.drop(columns=['pm2p5_y'])
    y_test = target_city_testing['pm2p5_y']
    model = lgb.LGBMRegressor(num_leaves=50, learning_rate=0.05, max_depth=-1, n_estimators=500, random_state=982,
                              verbose=0)
    # pipeline_generalizzazione_intero is defined elsewhere in the notebook;
    # presumably ['valid_at', 'city'] are columns it removes and 'drop' selects
    # how that is done - TODO confirm against its definition
    metrics, model = pipeline_generalizzazione_intero(X_train, y_train, X_test, y_test, model, ['valid_at', 'city'],
                                                      categorical_columns, 'drop')
    # collect the metrics of this city
    new_row = {'city': city, 'intersected_cities': ", ".join(map(str, intersected_cities['city'].unique())),
               'MAE': metrics['MAE'], 'MSE': metrics['MSE'], 'RMSE': metrics['RMSE'], 'R2': metrics['R2']}
    metrics_rows.append(new_row)

metrics_df_pca_75_init = pd.DataFrame(metrics_rows,
                                      columns=['city', 'intersected_cities', 'MAE', 'MSE', 'RMSE', 'R2'])
metrics_df_pca_75_init

In [None]:
# Persist the "25 initial" metrics as CSV, without the row index
metrics_df_pca_75_init.to_csv('../dataset/metrics_df_pca_75_init_range50.csv', index=False)


## 3 principal components: whole target city as test set ("tutto") and 75% split

In [None]:
# Show the PCA frame as the cell's output
pca_df_3

In [None]:
# Maps each target city to a copy of pca_df_3 carrying an 'intersection' label
results = {}

# For every city, label the rows of the other cities that fall inside its
# bounding box on the first 3 principal components (no model is trained here)
for target_city in pca_df_3['city'].unique():
    # This section is the 3-component variant; the call used to pass 2, which
    # reproduced the 2-component selection under the 3-component output names.
    data_intersected_indexes = calculate_intersected_data(pca_df_3, 3, target_city)

    print(f"Intersected data for {target_city}: {data_intersected_indexes}")

    pca_df_3_ = pca_df_3.copy()
    # 1 = row of the target city, 2 = row selected from another city, 0 = anything else.
    # np.select takes the first matching condition, like the former row-wise
    # lambda, but is vectorised instead of a Python call per row.
    pca_df_3_['intersection'] = np.select(
        [pca_df_3_['city'] == target_city,
         pca_df_3_.index.isin(data_intersected_indexes)],
        [1, 2],
        default=0)

    results[target_city] = pca_df_3_

    # scatterplot of the PC1 vs PC2 projection coloured by intersection class
    plt.figure(figsize=(10, 8))
    sns.scatterplot(data=pca_df_3_, x='principal_component_1', y='principal_component_2', hue='intersection')
    plt.title(f'PCA 2D for {target_city}')
    plt.show()

### tutto

In [None]:
# "tutto" experiment (3-component section): train only on the rows selected
# from the other cities and test on every row of the target city.
categorical_columns = ['type_sensor', 'sensor_id', 'day_of_week']
# One dict per evaluated city; the frame is built once after the loop instead
# of being re-concatenated (starting from an empty frame) on every iteration.
metrics_rows = []

for city, data in results.items():
    target_city_indexes = data[data['intersection'] == 1].index
    intersected_cities_indexes = data[data['intersection'] == 2].index

    # NOTE(review): assumes df shares its index labels with pca_df_3 - confirm
    target_city = df[df.index.isin(target_city_indexes)]
    intersected_cities = df[df.index.isin(intersected_cities_indexes)]

    print(f"Testo la città {city}")
    # print unique cities value count
    print(target_city['city'].value_counts())
    print(intersected_cities['city'].value_counts())

    # nothing to train on: skip this city
    if len(intersected_cities) == 0:
        continue

    X_train = intersected_cities.drop(columns=['pm2p5_y'])
    y_train = intersected_cities['pm2p5_y']
    X_test = target_city.drop(columns=['pm2p5_y'])
    y_test = target_city['pm2p5_y']
    model = lgb.LGBMRegressor(num_leaves=50, learning_rate=0.05, max_depth=-1, n_estimators=500, random_state=982,
                              verbose=0)
    # pipeline_generalizzazione_intero is defined elsewhere in the notebook;
    # presumably ['valid_at', 'city'] are columns it removes and 'drop' selects
    # how that is done - TODO confirm against its definition
    metrics, model = pipeline_generalizzazione_intero(X_train, y_train, X_test, y_test, model, ['valid_at', 'city'],
                                                      categorical_columns, 'drop')
    # collect the metrics of this city
    new_row = {'city': city, 'intersected_cities': ", ".join(map(str, intersected_cities['city'].unique())),
               'MAE': metrics['MAE'], 'MSE': metrics['MSE'], 'RMSE': metrics['RMSE'], 'R2': metrics['R2']}
    metrics_rows.append(new_row)

metrics_df_pca_3 = pd.DataFrame(metrics_rows, columns=['city', 'intersected_cities', 'MAE', 'MSE', 'RMSE', 'R2'])
metrics_df_pca_3

In [None]:
# Persist the 3-component "tutto" metrics as CSV, without the row index
metrics_df_pca_3.to_csv('../dataset/metrics_df_pca_3_intero.csv', index=False)


### 75%


In [None]:
# 75% experiment (3-component section): train on the rows selected from the
# other cities plus the target city's saved training split, test on its
# testing split.
categorical_columns = ['type_sensor', 'sensor_id', 'day_of_week']
# One dict per evaluated city; the frame is built once after the loop instead
# of being re-concatenated (starting from an empty frame) on every iteration.
metrics_rows = []

for city, data in results.items():
    intersected_cities_indexes = data[data['intersection'] == 2].index

    intersected_cities = df[df.index.isin(intersected_cities_indexes)]
    # check before touching the disk: skipped cities need no CSV reads
    if len(intersected_cities) == 0:
        continue

    target_city_training = pd.read_csv(f'../dataset/training_{city}.csv', parse_dates=['valid_at'])
    target_city_testing = pd.read_csv(f'../dataset/testing_{city}.csv', parse_dates=['valid_at'])

    # NOTE(review): assumes the CSV splits have the same columns as df - confirm
    training = pd.concat([intersected_cities, target_city_training])

    print(f"Testo la città {city}")
    # print unique cities value count
    print(training['city'].value_counts())
    print(target_city_testing['city'].value_counts())

    X_train = training.drop(columns=['pm2p5_y'])
    y_train = training['pm2p5_y']
    X_test = target_city_testing.drop(columns=['pm2p5_y'])
    y_test = target_city_testing['pm2p5_y']
    model = lgb.LGBMRegressor(num_leaves=50, learning_rate=0.05, max_depth=-1, n_estimators=500, random_state=982,
                              verbose=0)
    # pipeline_generalizzazione_intero is defined elsewhere in the notebook;
    # presumably ['valid_at', 'city'] are columns it removes and 'drop' selects
    # how that is done - TODO confirm against its definition
    metrics, model = pipeline_generalizzazione_intero(X_train, y_train, X_test, y_test, model, ['valid_at', 'city'],
                                                      categorical_columns, 'drop')
    # collect the metrics of this city
    new_row = {'city': city, 'intersected_cities': ", ".join(map(str, intersected_cities['city'].unique())),
               'MAE': metrics['MAE'], 'MSE': metrics['MSE'], 'RMSE': metrics['RMSE'], 'R2': metrics['R2']}
    metrics_rows.append(new_row)

metrics_df_pca_3_75 = pd.DataFrame(metrics_rows, columns=['city', 'intersected_cities', 'MAE', 'MSE', 'RMSE', 'R2'])
metrics_df_pca_3_75


In [None]:
# Persist the 3-component 75% metrics as CSV, without the row index
metrics_df_pca_3_75.to_csv('../dataset/metrics_df_pca_3_75_range50.csv', index=False)

### amplified ranges

In [None]:
import matplotlib.pyplot as plt

# Hard-coded R² series, one list of ten values per city/sensor
cities_data = {
    'Aosta': [0.9046, 0.8926, 0.7981, 0.7668, 0.7609, 0.7242, 0.7140, 0.6827, 0.5262, -0.2862],
    'Calgary': [0.9846, 0.9761, 0.9638, 0.6745, -0.1688, -0.7081, 0.6078, -2.2869, -2.0571, -0.8954],
    'Badajoz': [0.8903, 0.7905, 0.7067, 0.7550, 0.6827, 0.6522, 0.5752, 0.5238, 0.5323, 0.2876],
    'Bangalore': [0.9051, 0.9036, 0.8847, 0.8898, 0.8388, 0.8536, 0.6078, 0.5507, 0.6692, 0.5144],
    'Delhi': [0.9023, 0.9042, 0.8993, 0.8280, 0.8223, 0.8273, 0.8145, 0.8062, 0.8045, 0.8029],
    'Hamirpur': [0.9598, 0.9590, 0.9408, 0.9441, 0.9378, 0.8956, 0.8972, 0.9070, 0.8907, 0.8828],
    'Lima_IQAir': [0.7318, -0.1936, -0.2543, -0.1882, -0.1417, -0.4159, -0.4512, -0.4322, -0.4463, -0.4951],
    'Lima_AIRBEAM': [0.8041, 0.8307, 0.8201, 0.7759, 0.7469, 0.7074, 0.7568, -0.2989, 0.0590, 0.2788],
    'Southampton_PMS5003': [0.8851, 0.9399, 0.9377, 0.9334, 0.9298, 0.9294, 0.9056, 0.9016, 0.6052, 0.0803],
    'Southampton_SPS030': [0.8895, 0.9021, 0.8793, 0.8712, 0.8644, 0.8603, 0.2201, -0.1713, 0.4067, 0.4933]
}

# Positions 0..9 on the x axis, shared by every series
x_values = list(range(10))

# One line with round markers per city, drawn on an explicit figure/axes pair
r2_fig, r2_ax = plt.subplots(figsize=(12, 5))
for city in cities_data:
    r_squared_values = cities_data[city]
    r2_ax.plot(x_values, r_squared_values, marker='o', label=city)

r2_ax.set_title('R² Values by Number of Datasets Combined', fontsize=16)
r2_ax.set_xlabel('Number of Datasets Combined', fontsize=12)
r2_ax.set_ylabel('R² Value', fontsize=12)
# legend is anchored outside the axes, to the right of the plot
r2_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
r2_ax.grid(True, linestyle='--', alpha=0.7)
r2_fig.tight_layout()
plt.show()