1. Récupérer la donnée : train.csv ; test.csv : https://discord.com/channels/1020279842798841876/1020279843599958109/1059747069553803285
2. Entraîner les modèles : régression linéaire, Ridge, Lasso, ElasticNet (alpha et solveur).
3. Ajuster les hyperparamètres de ces modèles à l'aide de GridSearchCV et RandomizedSearchCV.
Utiliser ces attributs :
.best_score_
.best_params_
.best_estimator_

4. Récupérer le .best_estimator_ et afficher la learning curve : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html
5. Interpréter la learning curve et conclure : sur-apprentissage (overfitting), sous-apprentissage (underfitting) ou bonne généralisation

6. Essayer la GridSearch avec un pipeline 


In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import set_config
set_config(display="diagram") 
from sklearn.model_selection import cross_validate

In [3]:
# Load the training and test sets from the CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Preview the first rows of the training set (columns, units and target).
df_train.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


In [5]:
# Preview the first rows of the test set to check it has the same columns as the training set.
df_test.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,3.0,2850.0,sqft,4200.0,sqft,98119,1175000.0
1,4,5.0,3040.0,sqft,5002.0,sqft,98106,1057500.0
2,3,1.0,1290.0,sqft,6048.0,sqft,98125,799000.0
3,3,2.0,2360.0,sqft,0.28,acre,98188,565000.0
4,3,3.5,1942.0,sqft,1603.0,sqft,98107,1187000.0


In [6]:
# Print dtypes and non-null counts for the test set; in the recorded output
# only lot_size and lot_size_units contain missing values.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            505 non-null    int64  
 1   baths           505 non-null    float64
 2   size            505 non-null    float64
 3   size_units      505 non-null    object 
 4   lot_size        428 non-null    float64
 5   lot_size_units  428 non-null    object 
 6   zip_code        505 non-null    int64  
 7   price           505 non-null    float64
dtypes: float64(4), int64(2), object(2)
memory usage: 31.7+ KB


In [14]:
# Summary statistics of the numeric training columns.
# NOTE(review): the very small lot_size minimum in the recorded output suggests
# acre and sqft values are mixed in that column — see lot_size_units.
df_train.describe()

Unnamed: 0,beds,baths,size,lot_size,zip_code,price
count,2016.0,2016.0,2016.0,1669.0,2016.0,2016.0
mean,2.857639,2.15997,1735.740575,3871.059694,98123.638889,963625.2
std,1.255092,1.002023,920.132591,2719.402066,22.650819,944095.4
min,1.0,0.5,250.0,0.23,98101.0,159000.0
25%,2.0,1.5,1068.75,1252.0,98108.0,601750.0
50%,3.0,2.0,1560.0,4000.0,98117.0,800000.0
75%,4.0,2.5,2222.5,6000.0,98126.0,1105250.0
max,15.0,9.0,11010.0,9998.0,98199.0,25000000.0


## Préparation des données
- vérification des doublons
- transformation des colonnes 'beds', 'baths', 'zip_code' en données catégorielles
- transformation de la colonne lot_size en uniformisant l'unité de mesure (1 acre = 43 560 sqft)

In [10]:
# True if at least one training row is an exact duplicate of an earlier row.
df_train.duplicated().any()

True

In [22]:
# Keep the first occurrence of every row and discard exact repeats.
is_repeated_row = df_train.duplicated(keep='first')
df_train_2 = df_train.loc[~is_repeated_row]

In [23]:
# Check row count, dtypes and missing values of the de-duplicated training set.
df_train_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2007 entries, 0 to 2015
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            2007 non-null   int64  
 1   baths           2007 non-null   float64
 2   size            2007 non-null   float64
 3   size_units      2007 non-null   object 
 4   lot_size        1660 non-null   float64
 5   lot_size_units  1660 non-null   object 
 6   zip_code        2007 non-null   int64  
 7   price           2007 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 141.1+ KB


In [36]:
# List the distinct values taken by the 'baths' column.
df_train_2["baths"].unique()

array([2.5, 2. , 3. , 1. , 3.5, 1.5, 5.5, 5. , 4. , 8.5, 4.5, 6. , 0.5,
       7. , 9. , 6.5])

In [24]:
# Count the distinct zip codes present in the training set.
df_train_2["zip_code"].nunique()

28

In [25]:
# Separate the target from the features. `drop` returns a new frame, so the
# unit conversion below does not modify df_train_2.
y_train = df_train_2['price']
X_train = df_train_2.drop(columns=['price'])

# lot_size is recorded either in square feet or in acres (see lot_size_units).
# Express every lot in square feet so that imputation and scaling operate on
# a single unit. Rows with a missing unit are left untouched.
SQFT_PER_ACRE = 43560
acre_rows = X_train['lot_size_units'].eq('acre')
X_train.loc[acre_rows, 'lot_size'] = X_train.loc[acre_rows, 'lot_size'] * SQFT_PER_ACRE
X_train.loc[acre_rows, 'lot_size_units'] = 'sqft'
# NOTE(review): apply the same conversion to df_test before predicting on it.

Pipeline

In [29]:
# Numeric branch: fill missing values with the column median, then rescale
# each feature with MinMaxScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('minmax_scaler', MinMaxScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [30]:
# Categorical branch: one-hot encode, dropping the first level of each feature.
# NOTE(review): per the scikit-learn docs (>= 1.0), a category unseen during fit
# is encoded as all zeros with handle_unknown='ignore', which coincides with the
# dropped first level — confirm this is acceptable for the test set.
cat_transformer = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
)

In [31]:
# List the available column names before assigning them to transformers.
df_train_2.columns

Index(['beds', 'baths', 'size', 'size_units', 'lot_size', 'lot_size_units',
       'zip_code', 'price'],
      dtype='object')

In [32]:
# Route each column to its branch: continuous measures go through the numeric
# pipeline, while beds / baths / zip_code are treated as categories.
# size_units and lot_size_units are not routed to any transformer.
numeric_features = ["size", "lot_size"]
categorical_features = ["beds", "baths", "zip_code"]
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_pipeline, numeric_features),
        ('cat_transformer', cat_transformer, categorical_features),
    ]
)

In [33]:
# Chain the preprocessing step with a plain linear regression; make_pipeline
# assigns the step names automatically.
base_regressor = LinearRegression()
pipeline_workflow = make_pipeline(preprocessor, base_regressor)

In [34]:
# Display the assembled pipeline (shown as a diagram because of set_config(display="diagram")).
pipeline_workflow