# Import Packages

In [2]:
import src.preprocessing
from src.preprocessing import Dataset,Preprocessor
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor


# Notebook-wide pandas display settings: show every column of wide frames
# and render floats with two decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

## Chargement des données

In [3]:
# Wrap the training CSV in the project's Dataset helper and load it.
train=Dataset("data/train.csv")
data_train=train.load_data()
# Same for the test CSV (the rows to predict for the submission files).
test=Dataset("data/test.csv")
data_test=test.load_data()

# Preprocessing



In [4]:
# One Preprocessor per frame; the `train` flag tells it which split it wraps.
# NOTE(review): later cells read data_train / data_test directly after calling
# preprocessor methods, so those methods presumably mutate the frames in place
# — confirm in src.preprocessing.
train_preprocessor=Preprocessor(data_train,train=True)
test_preprocessor=Preprocessor(data_test,train=False)

## Récupération d'observations

Lorsque la voiture est électrique : on peut mettre `ec(cm3)`, `Fuel consumption `, `z (Wh/km)` à 0.

Lorsque la voiture n'est ni hybride ni électrique : on peut mettre `Electric range (km)` à 0.

In [5]:
# Run the electric-vehicle recovery step on both splits.
# NOTE(review): presumably fills values that can be deduced from the powertrain
# type, as the notebook text for this section describes — confirm in
# src.preprocessing.
train_preprocessor.recup_electric()
test_preprocessor.recup_electric()

## Delete columns

Supprimer les colonnes avec une seule valeur unique (aucune information) ou aucune valeur unique (que des NaN)

Supprimer les colonnes avec **+ de 50%** de NaN

In [6]:
# Drop uninformative columns from both splits. The recorded output shows the
# same seven columns being removed from the train and the test data.
# NOTE(review): the exact selection criteria live in src.preprocessing — confirm there.
train_preprocessor.delete_useless_columns()
test_preprocessor.delete_useless_columns()

['MMS', 'r', 'Vf', 'Enedc (g/km)', 'De', 'Status', 'Ernedc (g/km)'] have been deleted on train data
['MMS', 'r', 'Vf', 'Enedc (g/km)', 'De', 'Status', 'Ernedc (g/km)'] have been deleted on test data


## Outliers 

Utilisation de l'écart interquartile pour identifier les valeurs aberrantes.

Imputation des outliers:

Fixer les valeurs aberrantes à un certain percentile (par exemple, les 5e et 95e percentiles).

### Winsorization of outliers

In [7]:
# Numeric feature columns of the training frame; the row identifier and the
# target are taken out so they are never winsorized.
numerical_cols = list(data_train.select_dtypes(include='number').columns)
for excluded in ('ID', 'Ewltp (g/km)'):
    numerical_cols.remove(excluded)

# Winsorize each numeric feature, train split first and then test split.
for col in numerical_cols:
    for preprocessor in (train_preprocessor, test_preprocessor):
        preprocessor.winsorize_outliers(col)

## Impute NaN by median/mode



In [8]:
# Explanatory variables: every training column except the identifier, the
# target and the registration date.
x_variables = [
    name
    for name in data_train.columns.tolist()
    if name not in ['ID', 'Ewltp (g/km)', 'Date of registration']
]

# Impute missing values column by column, train split first and then test split.
for col in x_variables:
    for preprocessor in (train_preprocessor, test_preprocessor):
        preprocessor.fill_missing_values(col)


## Encode categorical columns

Customized encoding. 

- Count Encoder (nunique >=15)
```
Country 29  valeurs uniques
VFN 8456  valeurs uniques
Mh 95  valeurs uniques
Man 104  valeurs uniques
Tan 6318  valeurs uniques
T 1506  valeurs uniques
Va 5413  valeurs uniques
Ve 25570  valeurs uniques
Mk 694  valeurs uniques
Cn 8323  valeurs uniques
IT 487  valeurs uniques
```
- OHE Encoder (nunique < 15) rajoute 35 colonnes 
```
Mp 10  valeurs uniques
Ct 5  valeurs uniques
Cr 3  valeurs uniques
Ft 11  valeurs uniques
Fm 6  valeurs uniques
```

In [9]:
# Categorical features are the explanatory variables that are not numeric.
col_categoricals = [name for name in x_variables if name not in numerical_cols]

# Columns with at least 15 distinct values are count-encoded; the others are
# one-hot encoded. The same encoder is applied to train and then to test.
for col in col_categoricals:
    high_cardinality = Preprocessor.nombre_val_unique[col] >= 15
    if high_cardinality:
        for preprocessor in (train_preprocessor, test_preprocessor):
            preprocessor.count_encoder(col)
        print(f"encoding count : {col}")
    else:
        for preprocessor in (train_preprocessor, test_preprocessor):
            preprocessor.ohe_encoder(col)
        print(f"encoding OHE : {col}")


encoding count : Country
encoding count : VFN
encoding OHE : Mp
encoding count : Mh
encoding count : Man
encoding count : Tan
encoding count : T
encoding count : Va
encoding count : Ve
encoding count : Mk
encoding count : Cn
encoding OHE : Ct
encoding OHE : Cr
encoding OHE : Ft
encoding OHE : Fm
encoding count : IT


In [10]:
# Remove the columns that are not used as model inputs. "ID" is dropped from
# the training frame only: the test frame keeps it because the submission
# files are written as (ID, prediction) pairs.
train_cols_to_drop = ['Date of registration', 'ID', 'Erwltp (g/km)']
test_cols_to_drop = ['Date of registration', 'Erwltp (g/km)']

data_train.drop(train_cols_to_drop, axis=1, inplace=True)
data_test.drop(test_cols_to_drop, axis=1, inplace=True)

## Split 

In [11]:
# Hold out 33% of the labelled data (fixed seed) and give both parts a fresh
# 0..n-1 index. Note that `train` / `test` now name DataFrames, replacing the
# Dataset objects created when the CSV files were loaded.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(data_train, test_size=0.33, random_state=42)
)

# Separate the target from the features in each part.
y_train, y_test = train["Ewltp (g/km)"], test["Ewltp (g/km)"]
X_train = train.drop(columns=["Ewltp (g/km)"])
X_test = test.drop(columns=["Ewltp (g/km)"])

# Model Testing

### Xgboost final 


In [12]:
# Final XGBoost model: fit on the complete labelled frame (no hold-out), then
# predict the competition test set and write the submission file.
params = {'n_estimators':4000,
          'max_depth': 35,
          'learning_rate': 0.005,
          'colsample_bytree':0.80,
          'gamma':10,
          'reg_alpha':0.8,
          'reg_lambda':0.1,
          'objective': 'reg:squarederror',
          'tree_method': 'hist',
          'n_jobs':-1
}

model = xgb.XGBRegressor(**params)

model.fit(data_train.drop(columns='Ewltp (g/km)'),data_train['Ewltp (g/km)'])

# Build the feature frame without "ID" and without the target column.
# The prediction is stored in data_test below, so on any re-run (or after
# another model cell has run) data_test already carries "Ewltp (g/km)";
# leaving it in would hand the model an extra, unseen feature.
# "ID" is still dropped strictly so that a missing ID column fails loudly.
submission_features = (
    data_test
    .drop(columns="ID")
    .drop(columns="Ewltp (g/km)", errors="ignore")
)

data_test["Ewltp (g/km)"] = model.predict(submission_features)
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_final_xgb_no_erwltp.csv", index=False)

### Xgboost k-fold = 15 (Modèle Challenger)

Estimated time: 8 hours

Measured computing time: 614 min (≈ 10 hours)

In [12]:
# Challenger: 15-fold cross-validated XGBoost ensemble.
# One model is trained per fold on X_train; each fold reports its validation
# MAE, the fold models are averaged on the hold-out split (X_test / y_test),
# and the same average is used for the competition submission.
params = {'n_estimators':4000,
          'max_depth': 35,
          'learning_rate': 0.005,
          'colsample_bytree':0.80,
          'gamma':10,
          'reg_alpha':0.8,
          'reg_lambda':0.1,
          'objective': 'reg:squarederror',
          'tree_method': 'hist',
          'n_jobs':-1
}
# Fitted fold models, kept so they can be reused for the submission below.
models=[]

num_folds = 15
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Per-fold predictions on the hold-out split.
test_predictions = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    model = xgb.XGBRegressor(**params)
    model.fit(X_train_fold, y_train_fold)
    models.append(model)

    val_preds = model.predict(X_val_fold)

    fold_mae = mean_absolute_error(y_val_fold, val_preds)
    print(f'MAE for this fold: {fold_mae}')

    test_fold_preds = model.predict(X_test)
    test_predictions.append(test_fold_preds)

# Ensemble = plain mean of the fold models' predictions.
final_test_predictions = np.mean(np.array(test_predictions), axis=0)

final_mae = mean_absolute_error(y_test, final_test_predictions)
print(f'\nFinal MAE on test set: {final_mae}')

# Feature frame for the competition test set, built once for all fold models.
# Besides "ID", the target column is excluded: if another model cell already
# stored its predictions in data_test (or this cell is re-run), the frame
# carries "Ewltp (g/km)" and passing it on would hand the models an extra,
# unseen feature — only after all the folds have been trained.
submission_features = (
    data_test
    .drop(columns="ID")
    .drop(columns="Ewltp (g/km)", errors="ignore")
)

test_predictions = []

for model in models:
    test_fold_preds = model.predict(submission_features)
    test_predictions.append(test_fold_preds)

data_test["Ewltp (g/km)"] = np.mean(np.array(test_predictions), axis=0)

data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_kf15_xgb.csv", index=False)

MAE for this fold: 2.8137879227060356
MAE for this fold: 2.807222170572855
MAE for this fold: 2.8214681395667456
MAE for this fold: 2.8266807353876753
MAE for this fold: 2.8236043179970443
MAE for this fold: 2.808979971720696
MAE for this fold: 2.8249682811011914
MAE for this fold: 2.814930391837997
MAE for this fold: 2.81335782161276
MAE for this fold: 2.8092082567323153
MAE for this fold: 2.8295350483578
MAE for this fold: 2.8119301966516437
MAE for this fold: 2.8212127127769056
MAE for this fold: 2.817554941246961
MAE for this fold: 2.8095714221765045

Final MAE on test set: 2.8178609466498625
