# Import Data + Packages

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, RobustScaler, PolynomialFeatures, TargetEncoder
from sklearn.metrics import mean_absolute_error
from category_encoders import LeaveOneOutEncoder
import joblib

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_theme(style="ticks", palette="pastel")


In [2]:
data_train= pd.read_csv("data/train.csv",sep=",",low_memory=False)
data_test = pd.read_csv("data/test.csv",sep=",",low_memory=False)

# Preprocessing



## Récupération d'observations

Lorsque la voiture est électrique: on peut se permettre de set `ec(cm3)`, `Fuel consumption `, `z (Wh/km)` à 0

lorsque la voiture n'est pas hybride / électrique : on peut mettre `Electric range (km)` à 0

A voir si on choisir de prendre le traitement.

## Delete columns

Supprimer les colonnes avec 1 seul valeur unique (aucune info) ou 0 valeur unique (que des NaN)

In [3]:
valeurs_uniques = {}
nombre_val_unique={}
for col in data_train.columns:
    valeurs_uniques[col]=data_train[col].unique().tolist()
    nombre_val_unique[col]=data_train[col].nunique()

for element in nombre_val_unique:
    if nombre_val_unique[element]<=1:
        print(f"colonne supprimée: {element}")
        data_train.drop(columns=element, inplace=True)
        data_test.drop(columns=element, inplace=True)

colonne supprimée: MMS
colonne supprimée: r
colonne supprimée: Ernedc (g/km)
colonne supprimée: De
colonne supprimée: Vf
colonne supprimée: Status


Supprimer les colonnes avec + de 50% de NaN

In [4]:
for col in data_train.columns:
    if (data_train[col].isna().sum()/data_train.shape[0] > 0.5):
        print(f"colonne supprimée: {col}")
        data_train.drop(columns=col, inplace=True)
        data_test.drop(columns=col, inplace=True)


colonne supprimée: Enedc (g/km)
colonne supprimée: z (Wh/km)
colonne supprimée: Electric range (km)


Supprimer les `Date` et `ID` (seulement pour train)

In [5]:
data_train.drop(columns=['Date of registration','ID'], inplace=True)
data_test.drop(columns='Date of registration', inplace=True)

print(f"colonne supprimée pour data_train: Date of registration, ID")
print(f"colonne supprimée pour data_test: Date of registration")

colonne supprimée pour data_train: Date of registration, ID
colonne supprimée pour data_test: Date of registration


In [6]:
col_categoricals = data_test.select_dtypes(include="object").columns.tolist()
col_numericals = [col for col in data_test.columns if col not in col_categoricals]
col_numericals.remove("ID")

## Outliers 

Utilisation de l'écart interquartile pour identifier les valeurs aberrantes.

Imputation des outliers:

Fixer les valeurs aberrantes à un certain pourcentage (par exemple, 5e et 95e percentiles).

**on pourrait aussi tenter d'imputer par la médiane si cela n'aboutit pas** 

In [7]:
def winsorize_outliers(data, column_name, lower_percentile=5, upper_percentile=95):
    """
    Detects and imputes outliers using winsorizing for a specific column in a DataFrame.

    Parameters:
    - data: Pandas DataFrame, input data
    - column_name: str, name of the column to be winsorized
    - lower_percentile: int, lower percentile for winsorizing (default: 5)
    - upper_percentile: int, upper percentile for winsorizing (default: 95)

    Returns:
    - winsorized_data: Pandas DataFrame, data with outliers winsorized for the specified column
    """

    column_data = data[column_name]

    q1 = np.percentile(column_data, lower_percentile)
    q3 = np.percentile(column_data, upper_percentile)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    data[column_name] = np.clip(column_data, lower_bound, upper_bound)

    return data


In [8]:
for col in col_numericals:
    data_train=winsorize_outliers(data_train,col)
    data_test=winsorize_outliers(data_test,col)

## Impute NaN by median/mode



In [9]:
imputers={}
_coefficient_variation= lambda series : series.std()/series.mean()

def fill_missing_values(colname : str,data:pd.DataFrame) -> None:
    
    if data[colname].dtype in ["float64"]:
        if _coefficient_variation(data[colname]) > 0.15 :
            imputers[colname]=SimpleImputer(missing_values=np.nan,strategy="median")
        else:
            imputers[colname]=SimpleImputer(missing_values=np.nan,strategy="mean")
    else:
        imputers[colname]=SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imputers[colname].fit(data[colname].to_numpy().reshape(-1,1))
    pass

for col in data_test.columns[1:]:
    fill_missing_values(col,data_train)
    data_train[col]=pd.Series(imputers[col].transform(data_train[col].to_numpy().reshape(-1,1)).flatten())
    data_test[col]=pd.Series(imputers[col].transform(data_test[col].to_numpy().reshape(-1,1)).flatten())


## Encode categorical columns

Many choices:
- Label/Ordinal encoding
- Target encoding
- Impact encoding


Label/Ordinal encoding

In [10]:
encoders={}
def ordinal_encoder(colname:str,data:pd.DataFrame,train=True):
    if train:
        encoders[colname]=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        data[colname]=encoders[colname].fit_transform(data[[colname]])
        pass
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
        pass

for col in col_categoricals:
    ordinal_encoder(col,data_train)
    ordinal_encoder(col,data_test,False)

Target Encoding

In [20]:
encoders={}
def target_encoder(colname:str,data:pd.DataFrame,train=True):
    if train:
        encoders[colname]=TargetEncoder(target_type='continuous', smooth='auto',random_state=42)
        data[colname]=encoders[colname].fit_transform(data[[colname]],data['Ewltp (g/km)'])
        pass
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
        pass

for col in col_categoricals:
    target_encoder(col,data_train)
    target_encoder(col,data_test,False)

Impact Encoding

Proposé par Sam B. J'ai utilisé Chat GPT pour l'implémenter honnêtement.

In [25]:
encoders={}
def impact_encoder(colname:str,data:pd.DataFrame,train=True):
    if train:
        encoders[colname]=LeaveOneOutEncoder(handle_unknown=-1)
        data[colname]=encoders[colname].fit_transform(data[[colname]],data['Ewltp (g/km)'])
        pass
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
        pass

for col in col_categoricals:
    impact_encoder(col,data_train)
    impact_encoder(col,data_test,False)

## Feature Engineering

Doit-on utiliser ces variables ?

In [None]:
def create_conforme(df):
    df['conforme'] = df['Tan'].isna()
    df['conforme'] = df['conforme'].apply(lambda x: 1 if x==False else 0)
    df.drop(columns='Tan', inplace=True)
    pass
def compute_surface(obs):
    max_largeur= max(obs['At1 (mm)'], obs['At2 (mm)'])
    return obs['W (mm)']*obs['At1 (mm)'] if max_largeur == obs['At1 (mm)'] else obs['W (mm)'] * obs['At2 (mm)']

def create_surface(df):
    df['surface']= df.apply(compute_surface, axis=1)
    pass

create_conforme(data_train)
create_surface(data_train)

create_conforme(data_test)
create_surface(data_test)

## Split 

In [11]:
train, test = train_test_split(data_train, test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

# Model Testing

## Random Forest 

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering.

Computation time: 41min

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.8986731992878796


In [13]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("data/new_simple_rf.csv", index=False)

In [15]:
joblib.dump(random_forest, 'models/random_forest_simple_model.joblib')

['random_forest_simple_model.joblib']

## Bagging

# Boosting

# Xgboost