In [1]:
import src.preprocessing
from src.preprocessing import Dataset,Preprocessor
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import xgboost as xgb
from xgboost.callback import EarlyStopping, LearningRateScheduler


### Chargement de la donnée

In [2]:
# Wrap the train and test CSV files in the project's Dataset helper and read
# each one with load_data().
# NOTE(review): the returned objects are used as pandas DataFrames further down
# (select_dtypes / drop are called on them) - confirm against src.preprocessing.
train=Dataset("data/train.csv")
data_train=train.load_data()
test=Dataset("data/test.csv")
data_test=test.load_data()

### Preprocessing de la donnée

In [3]:
# One Preprocessor per split. The `train` flag tells the helper which split it
# is handling; its exact effect is defined in src.preprocessing and is not
# visible in this notebook.
# NOTE(review): later cells read data_train / data_test directly after calling
# preprocessor methods, so the preprocessors presumably modify these frames in
# place - confirm.
train_preprocessor=Preprocessor(data_train,train=True)
test_preprocessor=Preprocessor(data_test,train=False)

In [4]:
# Run the `recup_electric` step on the train split, then on the test split.
# NOTE(review): the step is implemented in src.preprocessing; judging by its
# name it deals with electric vehicles - confirm what it changes.
for preprocessor in (train_preprocessor, test_preprocessor):
    preprocessor.recup_electric()

In [5]:
# Remove the columns the project considers useless, train split first and then
# test split. Each call prints the list of columns it deleted.
for preprocessor in (train_preprocessor, test_preprocessor):
    preprocessor.delete_useless_columns()

['Vf', 'Ernedc (g/km)', 'MMS', 'De', 'Enedc (g/km)', 'Status', 'r'] have been deleted on train data
['Vf', 'Ernedc (g/km)', 'MMS', 'De', 'Enedc (g/km)', 'Status', 'r'] have been deleted on test data


In [6]:
# Names of every numeric-dtype column currently present in the training frame.
numeric_frame = data_train.select_dtypes(include='number')
numerical_cols = list(numeric_frame.columns)

In [7]:
# Keep only real features in `numerical_cols`: the row identifier and the
# target ('Ewltp (g/km)') must not be winsorized or treated as inputs.
# The list is rebuilt with a filter instead of calling list.remove(), so the
# cell can be executed more than once without raising ValueError on the second
# run (list.remove fails when the value is already gone).
non_feature_cols = {'ID', 'Ewltp (g/km)'}
numerical_cols = [col for col in numerical_cols if col not in non_feature_cols]

In [8]:
# Winsorize every numerical feature; for each column the train split is
# processed before the test split.
# NOTE(review): the clipping bounds are chosen inside
# Preprocessor.winsorize_outliers (not visible here) - confirm the test split
# is clipped with bounds derived from the training data.
for col in numerical_cols:
    for preprocessor in (train_preprocessor, test_preprocessor):
        preprocessor.winsorize_outliers(col)

In [9]:
# Candidate feature columns: every training column except the row identifier,
# the target and the registration date. 'Erwltp (g/km)' is deliberately still
# part of the list at this stage.
excluded_cols = ['ID', 'Ewltp (g/km)', 'Date of registration']
x_variables = [name for name in data_train.columns.tolist() if name not in excluded_cols]

In [10]:
# Impute missing values feature by feature, train split first and then test
# split for each column. The imputation strategy is defined by
# Preprocessor.fill_missing_values in src.preprocessing.
for col in x_variables:
    for preprocessor in (train_preprocessor, test_preprocessor):
        preprocessor.fill_missing_values(col)

In [11]:
# Categorical features are the feature columns that are not in the numerical
# list; the order of `x_variables` is preserved.
col_categoricals = [name for name in x_variables if name not in numerical_cols]

In [12]:
# Encode each categorical feature on both splits (train first, then test).
# Columns whose entry in Preprocessor.nombre_val_unique is at least 15 are
# count-encoded; the remaining ones are one-hot encoded.
# TODO: consider a CatBoost (target-based) encoder for the high-cardinality
# columns - it uses the target, so the train/test split must be done first.
for col in col_categoricals:
    use_count_encoding = Preprocessor.nombre_val_unique[col] >= 15
    for preprocessor in (train_preprocessor, test_preprocessor):
        if use_count_encoding:
            preprocessor.count_encoder(col)
        else:
            preprocessor.ohe_encoder(col)
    if use_count_encoding:
        print(f"encoding count : {col}")
    else:
        print(f"encoding OHE : {col}")

encoding count : Country
encoding count : VFN
encoding OHE : Mp
encoding count : Mh
encoding count : Man
encoding count : Tan
encoding count : T
encoding count : Va
encoding count : Ve
encoding count : Mk
encoding count : Cn
encoding OHE : Ct
encoding OHE : Cr
encoding OHE : Ft
encoding OHE : Fm
encoding count : IT


In [13]:
# Remove the columns that must not reach the model. 'ID' is dropped from the
# training frame only; the test frame keeps it.
# The drop stays in place because the preprocessors were built on these same
# frame objects. errors="ignore" makes the cell safe to re-run: without it a
# second execution raises KeyError because the columns are already gone.
# Trade-off: a misspelled column name is now skipped silently.
train_only_drop = ['Date of registration', 'ID', 'Erwltp (g/km)']
test_only_drop = ['Date of registration', 'Erwltp (g/km)']
data_train.drop(columns=train_only_drop, inplace=True, errors="ignore")
data_test.drop(columns=test_only_drop, inplace=True, errors="ignore")

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [15]:
# 'Erwltp (g/km)' is no longer a feature, so take it out of the numerical list.
# Filtering (rather than list.remove) keeps the cell re-runnable: list.remove
# raises ValueError once the value has already been removed.
numerical_cols = [col for col in numerical_cols if col != 'Erwltp (g/km)']

In [16]:
# Disabled experiment: degree-2 polynomial expansion of the numerical columns,
# fitted on the training frame and applied to both frames. Every line is
# commented out, so this cell currently does nothing.
# NOTE(review): any output still attached to this cell looks like a leftover
# from a run where the code was active - confirm, then clear the output or
# delete the cell.
# poly = PolynomialFeatures(degree=2,include_bias=False)
# poly.fit(data_train[numerical_cols])

# poly_columns=poly.get_feature_names_out(numerical_cols)
# data_train[poly_columns]=poly.transform(data_train[numerical_cols])
# data_test[poly_columns]=poly.transform(data_test[numerical_cols])


  data_train[poly_columns]=poly.transform(data_train[numerical_cols])
  data_train[poly_columns]=poly.transform(data_train[numerical_cols])
  data_train[poly_columns]=poly.transform(data_train[numerical_cols])
  data_train[poly_columns]=poly.transform(data_train[numerical_cols])
  data_train[poly_columns]=poly.transform(data_train[numerical_cols])
  data_train[poly_columns]=poly.transform(data_train[numerical_cols])
  data_train[poly_columns]=poly.transform(data_train[numerical_cols])
  data_test[poly_columns]=poly.transform(data_test[numerical_cols])
  data_test[poly_columns]=poly.transform(data_test[numerical_cols])
  data_test[poly_columns]=poly.transform(data_test[numerical_cols])
  data_test[poly_columns]=poly.transform(data_test[numerical_cols])
  data_test[poly_columns]=poly.transform(data_test[numerical_cols])
  data_test[poly_columns]=poly.transform(data_test[numerical_cols])
  data_test[poly_columns]=poly.transform(data_test[numerical_cols])


In [17]:
# Hold out 33% of the labelled data for evaluation (fixed seed for a
# reproducible split), then convert both parts to NumPy feature/target arrays.
# NOTE(review): `train` and `test` rebind the names that held the Dataset
# objects in the loading cell, and `test` here is a slice of the labelled
# training data, not the frame loaded from data/test.csv.
target_col = "Ewltp (g/km)"

train, test = train_test_split(data_train, test_size=0.33, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

X_train = train.drop(columns=[target_col]).to_numpy()
y_train = train[target_col].to_numpy()
X_test = test.drop(columns=[target_col]).to_numpy()
y_test = test[target_col].to_numpy()

### Modélisation

In [19]:
# XGBoost hyper-parameters: many deep trees with a small learning rate,
# histogram tree construction on a CUDA device.
# 'eval_metric' only controls what is reported on the evaluation set; it does
# not change how the trees are built (the objective is still squared error).
params = {'n_estimators':4000,
          'max_depth': 35,
          'learning_rate': 0.005,
          'colsample_bytree':0.80,
          'gamma':10,
          'reg_alpha':0.8,
          'reg_lambda':0.1,
          'objective': 'reg:squarederror',
          'tree_method': 'hist',
          'n_jobs':-1,
          'device':'cuda',
          'eval_metric':'mae'
}

model = xgb.XGBRegressor(**params)

# `verbose` prints nothing unless an eval_set is supplied, so the original
# fit(..., verbose=True) trained 4000 rounds silently. The hold-out split is
# passed for monitoring only: no early stopping is configured, so the fitted
# model is the same as before. An integer `verbose` prints the metric every
# that many boosting rounds instead of on every round.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=200)

# Score the hold-out split with the same metric that is monitored above.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

### Prédiction