# 📈 Modeling.

### In this Notebook we'll build our lightgbm model and save it for future use.

### Importing Necessary Libraries.TimeSeriesSplit as TSS

In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split as TTS, TimeSeriesSplit as TSS
from sklearn.preprocessing import StandardScaler as SS
from sklearn.metrics import mean_absolute_error as MAE
import optuna

from datetime import datetime
import time

import os
from dotenv import load_dotenv

load_dotenv("../.env")

import sys

sys.path.append("../Scripts/")
sys.path.append("../")

import paths
import config
import sourcing
import featureengineering

### Loading the Data From Disk. (Later we might want to load it from the Database)

In [2]:
FEData = pd.read_parquet(paths.TRANSFORMED_DATA_DIR / "TRANSFORMED_All_Cities_HistoricalData_01092022_30042024.parquet")
FEData.head()

Unnamed: 0,Temperature_2m,Relative_Humidity_2m,Dew_Point_2m,Precipitation,Pressure_msl,Surface_Pressure,Cloud_Cover,Wind_Speed_10m,Wind_Speed_100m,Wind_Wirection_10m,...,IsHour_14,IsHour_15,IsHour_16,IsHour_17,IsHour_18,IsHour_19,IsHour_20,IsHour_21,IsHour_22,IsHour_23
0,26.9,52,16.4,0.2,1014.9,1011.2,39,1.8,1.8,143,...,0,0,0,0,0,0,0,0,0,0
1,24.2,68,18.0,0.0,1014.4,1010.7,25,4.8,6.6,153,...,0,0,0,0,0,0,0,0,0,0
2,26.8,58,17.7,0.0,1013.8,1010.1,39,3.1,3.6,249,...,0,0,0,0,0,0,0,0,0,0
3,25.1,62,17.3,0.0,1013.4,1009.7,4,4.0,7.2,265,...,0,0,0,0,0,0,0,0,0,0
4,24.4,74,19.5,0.0,1013.6,1009.9,11,3.9,7.7,236,...,0,0,0,0,0,0,0,0,0,0


### Splitting the Data.

In [3]:
Features = np.array(FEData.drop(["EuropeanAQI"], axis=1, inplace=False))
Targets = np.array(FEData["EuropeanAQI"])

# Creating Train and Test Set.
xTrain, xTest, yTrain, yTest = TTS(Features, Targets, test_size=0.2)

print(f"{xTrain.shape = }")
print(f"{yTrain.shape = }")
print(f"{xTest.shape = }")
print(f"{yTest.shape = }")

xTrain.shape = (233472, 49)
yTrain.shape = (233472,)
xTest.shape = (58368, 49)
yTest.shape = (58368,)


### Scaling the Data.

In [4]:
Scaler = SS()
ScaledxTrain = Scaler.fit_transform(xTrain) 

### Creating the Model and searching for the best parameters.

In [6]:
def Objective(T:optuna.trial.Trial) -> float:
    
    Hyperparams = {"metric":"mae",
                   "verbose":-1,
                   "num_leaves":T.suggest_int("num_leaves", 2, 256),
                   "feature_fraction":T.suggest_float("feature_fraction", 0.2, 1.0),
                   "bagging_fraction":T.suggest_float("bagging_fraction", 0.2, 1.0),
                   "min_child_samples":T.suggest_int("min_child_samples", 3, 100),
                  }
    
    tss = TSS(n_splits=5)
    Scores = []
    
    for trainIndex, valIndex in tss.split(ScaledxTrain):
        
        #Split Data for Training and Validation
        xTrain_, xVal_ = ScaledxTrain[trainIndex, :], ScaledxTrain[valIndex, :]
        yTrain_, yVal_ = yTrain[trainIndex], yTrain[valIndex]
        
        #Train the Model
        LGB = lgb.LGBMRegressor(**Hyperparams)
        LGB.fit(xTrain_, yTrain_)
        
        #Evaluate the Model
        yPred = LGB.predict(xVal_)
        mae = MAE(yVal_, yPred)
        
        Scores.append(mae)
        
    #Return Avg Score
    return np.array(Scores).mean()

In [7]:
Study = optuna.create_study(direction="minimize")
Study.optimize(Objective, n_trials=5)

[I 2024-05-03 10:00:38,665] A new study created in memory with name: no-name-2431f7a8-3caa-4231-b637-05329f8af953
[I 2024-05-03 10:00:46,957] Trial 0 finished with value: 4.994062285698539 and parameters: {'num_leaves': 151, 'feature_fraction': 0.6157120368385596, 'bagging_fraction': 0.6480122204791041, 'min_child_samples': 25}. Best is trial 0 with value: 4.994062285698539.
[I 2024-05-03 10:00:51,409] Trial 1 finished with value: 6.1590849266430405 and parameters: {'num_leaves': 75, 'feature_fraction': 0.22019819140994, 'bagging_fraction': 0.68204175544525, 'min_child_samples': 67}. Best is trial 0 with value: 4.994062285698539.
[I 2024-05-03 10:00:58,606] Trial 2 finished with value: 4.698438716480531 and parameters: {'num_leaves': 183, 'feature_fraction': 0.892969458261786, 'bagging_fraction': 0.21665666426395414, 'min_child_samples': 9}. Best is trial 2 with value: 4.698438716480531.
[I 2024-05-03 10:01:06,043] Trial 3 finished with value: 4.862616038683241 and parameters: {'num_le

In [8]:
BestParams = Study.best_trial.params
BestParams

{'num_leaves': 183,
 'feature_fraction': 0.892969458261786,
 'bagging_fraction': 0.21665666426395414,
 'min_child_samples': 9}

In [9]:
BestModel = lgb.LGBMRegressor(**BestParams)
BestModel.fit(ScaledxTrain, yTrain)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032583 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4175
[LightGBM] [Info] Number of data points in the train set: 233472, number of used features: 49
[LightGBM] [Info] Start training from score 35.934605


In [10]:
Preds = BestModel.predict(Scaler.transform(xTest))
testMae = MAE(yTest, Preds)
print(f"{testMae = :.4f}")

testMae = 4.4712


### Dumping Model, Scaler and Data Used to Disk.

In [18]:
ModelVersion = len(os.listdir(paths.MODELS_DIR))+1
ModelVersion

1

In [21]:
import joblib

joblib.dump(BestModel, paths.MODELS_DIR / f'ModelVersion{ModelVersion}.pkl')
joblib.dump(Scaler, paths.MODELS_DIR / f'ScalerVersion{ModelVersion}.pkl')

['/localsynch/Models/ScalerVersion1.pkl']

In [22]:
FEData.to_parquet(paths.MODELS_DIR / f'DataVersion{ModelVersion}.parquet')