In [None]:
import src.final_preprocessing
from src.final_preprocessing import Dataset,Preprocessor
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import xgboost as xgb
from xgboost.callback import EarlyStopping, LearningRateScheduler


### Chargement de la donnée

In [None]:
# Wrap both CSV files in the project's Dataset helper, then load each one.
train = Dataset("data/train.csv")
test = Dataset("data/test.csv")

data_train = train.load_data()
data_test = test.load_data()

### Preprocessing de la donnée

In [None]:
# One Preprocessor per split; the `train` flag tells the helper which split it is handling.
train_preprocessor = Preprocessor(data_train, train=True)
test_preprocessor = Preprocessor(data_test, train=False)

In [None]:
# Run the `recup_electric` step on both preprocessors so that the train and test
# splits go through the same pipeline stage.
# NOTE(review): the method lives in src.final_preprocessing; judging by its name it
# deals with electric-vehicle rows — confirm its exact effect there.
train_preprocessor.recup_electric()
test_preprocessor.recup_electric()

In [None]:
# Run the `delete_useless_columns` step on both preprocessors.
# NOTE(review): which columns count as "useless" is decided inside
# src.final_preprocessing — confirm the list there.
train_preprocessor.delete_useless_columns()
test_preprocessor.delete_useless_columns()

In [None]:
# Names of every numeric-dtype column of the training frame, as a plain list.
numeric_frame = data_train.select_dtypes(include="number")
numerical_cols = list(numeric_frame.columns)

In [None]:
# Keep the row identifier and the regression target out of the numeric feature list.
# A filter is used instead of list.remove() so that re-running this cell is a no-op:
# list.remove() raises ValueError once the entries are already gone, which made the
# cell fail on a second execution in the same kernel.
numerical_cols = [
    name for name in numerical_cols if name not in ('ID', 'Ewltp (g/km)')
]

In [None]:
# Apply `winsorize_outliers` to each numeric feature, train split first, then test split,
# one column at a time.
for col in numerical_cols:
    for preprocessor in (train_preprocessor, test_preprocessor):
        preprocessor.winsorize_outliers(col)

In [None]:
# Feature columns: every column of the training frame except the row identifier,
# the regression target and the registration date.
# Note: 'Erwltp (g/km)' is not excluded at this point.
x_variables = [
    name
    for name in data_train.columns.tolist()
    if name not in ['ID', 'Ewltp (g/km)', 'Date of registration']
]

In [None]:
# Fill missing values feature by feature, train split first, then test split.
for col in x_variables:
    for preprocessor in (train_preprocessor, test_preprocessor):
        preprocessor.fill_missing_values(col)

In [None]:
# Categorical features are the feature columns that are not in the numeric list.
col_categoricals = [name for name in x_variables if name not in numerical_cols]

In [None]:
# Encode each categorical feature on both splits. Columns with 15 or more unique
# values (as recorded in Preprocessor.nombre_val_unique) are count-encoded; the
# others are one-hot encoded.
# TODO: consider a CatBoost (target) encoder for the high-cardinality case; it uses
# the target, so the train/test split would have to be done first.
for col in col_categoricals:
    n_unique = Preprocessor.nombre_val_unique[col]
    if n_unique >= 15:
        train_preprocessor.count_encoder(col)
        test_preprocessor.count_encoder(col)
        print(f"encoding count : {col}")
    else:
        train_preprocessor.ohe_encoder(col)
        test_preprocessor.ohe_encoder(col)
        print(f"encoding OHE : {col}")

In [None]:
# Remove the columns that are not used as model inputs, mutating both frames in place.
# 'ID' is dropped from the training frame only; the test frame keeps it.
data_train.drop(
    ['Date of registration', 'ID', 'Erwltp (g/km)'],
    axis="columns",
    inplace=True,
)
data_test.drop(
    ['Date of registration', 'Erwltp (g/km)'],
    axis="columns",
    inplace=True,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
# Hold out 33% of the labelled rows for validation; the seed is fixed for reproducibility.
train, test = train_test_split(data_train, test_size=0.33, random_state=42)

# Renumber the rows of each split from 0, in place.
for split in (train, test):
    split.reset_index(drop=True, inplace=True)

# Separate the features from the target column 'Ewltp (g/km)' in each split.
X_train = train.drop(columns=["Ewltp (g/km)"])
y_train = train["Ewltp (g/km)"]
X_test = test.drop(columns=["Ewltp (g/km)"])
y_test = test["Ewltp (g/km)"]

### Modélisation

In [None]:
# Hyper-parameters of the XGBoost regressor: 4000 boosting rounds at a learning rate
# of 0.005, trees up to depth 35, 80% of the columns sampled per tree, with gamma,
# L1 (reg_alpha) and L2 (reg_lambda) regularisation, histogram tree method, all cores.
params = {
    'n_estimators': 4000,
    'max_depth': 35,
    'learning_rate': 0.005,
    'colsample_bytree': 0.80,
    'gamma': 10,
    'reg_alpha': 0.8,
    'reg_lambda': 0.1,
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'n_jobs': -1,
}

model = xgb.XGBRegressor(**params)

# `verbose` only prints when an evaluation set is supplied, so the original
# `verbose=True` was silent. The hold-out split is passed for monitoring only: no
# early stopping is configured, so it does not influence the fitted model.
# Logging every 200 rounds keeps the output of the 4000-round fit readable.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=200)

# Score the fitted model on the hold-out split.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

### Prédiction