# Modélisation

In [8]:
import pandas as pd

In [9]:
df = pd.read_csv("clean_dataset.csv", index_col=0)

In [10]:
df

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,distance
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,205.527899
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,203.952229
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,205.244676
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,206.075218
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,206.075218
...,...,...,...,...,...,...,...,...,...
20635,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,266.229776
20636,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,271.008439
20637,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,265.183978
20638,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,268.667236


## Transformation des variables catégorielles en variables numériques

In [11]:
df.dropna(inplace=True)

In [12]:
df = df.sample(frac=0.1, random_state=88)

In [13]:
# Créer une liste contenant toutes les variables catégorielles

list_cat = [i for i in df.columns if df.dtypes[i]=="object"]
list_cat

['ocean_proximity']

In [14]:
# Encodage a chaud (one-hot encoding)
df_dumies = pd.get_dummies(df, columns=list_cat, prefix="Column", dtype=float)
df_dumies

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,distance,Column_<1H OCEAN,Column_INLAND,Column_NEAR BAY,Column_NEAR OCEAN
14048,41.0,1545.0,420.0,747.0,415.0,2.3750,154400.0,563.504941,0.0,0.0,0.0,1.0
4762,45.0,1579.0,357.0,713.0,335.0,2.1711,179200.0,386.400335,1.0,0.0,0.0,0.0
16273,21.0,1185.0,237.0,960.0,245.0,2.0893,65000.0,130.797597,0.0,1.0,0.0,0.0
1849,48.0,2083.0,298.0,685.0,286.0,7.3089,331200.0,210.795738,0.0,0.0,1.0,0.0
12577,36.0,2430.0,426.0,1199.0,437.0,3.1667,81900.0,188.757995,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3659,45.0,972.0,181.0,554.0,187.0,4.8194,181300.0,367.210167,1.0,0.0,0.0,0.0
16787,18.0,4172.0,806.0,3226.0,790.0,5.7535,297900.0,219.143675,0.0,0.0,0.0,1.0
9667,14.0,1250.0,272.0,721.0,234.0,2.3500,95700.0,149.184397,0.0,1.0,0.0,0.0
6306,22.0,3272.0,618.0,1784.0,591.0,4.0324,211300.0,409.604399,1.0,0.0,0.0,0.0


## Séparation du jeu de données (jeux d'entraînement et jeu de test)

In [15]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df_dumies, test_size = 0.3, random_state = 88)

In [16]:
# Choix des variables cibles et predictives entrainement
Y_train = train_set["median_house_value"]
X_train = train_set.drop(["median_house_value"], axis=1)

In [17]:
# Choix des variables cibles et predictives test
Y_test = test_set["median_house_value"]
X_test = test_set.drop(["median_house_value"], axis=1)

## Normalisation des variables ou mise a l'échelle

In [18]:
# Pour la normalisation plusieurs techniquaes sont employées : min-max scaler, standardscaler et robustscaler

from sklearn.preprocessing import StandardScaler
scaler1 = StandardScaler()
X_train_norm = scaler1.fit_transform(X_train)


In [19]:
scaler2 = StandardScaler()
X_test_norm = scaler2.fit_transform(X_test)

## Predictions 

In [20]:
from lazypredict.Supervised import LazyRegressor
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, prediction = reg.fit(X_train_norm, X_test_norm, Y_train, Y_test)

100%|██████████| 42/42 [00:15<00:00,  2.72it/s]


In [26]:
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GradientBoostingRegressor,0.63,0.64,69734.85,0.3
LGBMRegressor,0.63,0.64,69870.02,0.1
HistGradientBoostingRegressor,0.61,0.62,71211.65,0.46
RandomForestRegressor,0.61,0.62,71768.05,0.67
ExtraTreesRegressor,0.61,0.61,71846.95,0.32
BaggingRegressor,0.59,0.6,73094.59,0.08
XGBRegressor,0.58,0.59,73897.52,0.1
LarsCV,0.56,0.56,76433.35,0.02
LassoLarsCV,0.56,0.56,76433.35,0.01
LassoCV,0.55,0.56,76539.81,0.05


In [27]:
prediction

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GradientBoostingRegressor,0.63,0.64,69734.85,0.3
LGBMRegressor,0.63,0.64,69870.02,0.1
HistGradientBoostingRegressor,0.61,0.62,71211.65,0.46
RandomForestRegressor,0.61,0.62,71768.05,0.67
ExtraTreesRegressor,0.61,0.61,71846.95,0.32
BaggingRegressor,0.59,0.6,73094.59,0.08
XGBRegressor,0.58,0.59,73897.52,0.1
LarsCV,0.56,0.56,76433.35,0.02
LassoLarsCV,0.56,0.56,76433.35,0.01
LassoCV,0.55,0.56,76539.81,0.05
