In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the training data from the CSV file next to the notebook.
df = pd.read_csv("train.csv")

# Preview the first five rows (last expression of the cell is displayed).
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
#######
# Missing-value analysis
#######

def missing_statistics(df):
    """Summarise the missing values of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns:
        ``'Variable'`` (column label), ``'Valeurs Manquantes'`` (number of
        null cells), ``'Nombre de lignes'`` (row count of ``df``) and
        ``'% Manquant'`` (null share in percent, rounded to 2 decimals).
    """
    row_count = df.shape[0]

    # Per-column null counts, with the column labels moved out of the index
    # into a regular column.
    summary = df.isnull().sum().reset_index()
    summary.columns = ['Variable', "Valeurs Manquantes"]

    summary['Nombre de lignes'] = row_count
    missing_ratio = summary['Valeurs Manquantes'] / summary['Nombre de lignes']
    summary['% Manquant'] = (missing_ratio * 100).round(2)
    return summary

# Show the per-column missing-value summary for the loaded dataset.
missing_statistics(df)

Unnamed: 0,Variable,Valeurs Manquantes,Nombre de lignes,% Manquant
0,Id,0,1460,0.00
1,MSSubClass,0,1460,0.00
2,MSZoning,0,1460,0.00
3,LotFrontage,259,1460,17.74
4,LotArea,0,1460,0.00
5,Street,0,1460,0.00
6,Alley,1369,1460,93.77
7,LotShape,0,1460,0.00
8,LandContour,0,1460,0.00
9,Utilities,0,1460,0.00


In [4]:
##########
# Keep only the 3 variables used for this test
##########

# filter() keeps the listed columns that exist in the frame.
df = df.filter(items=['LotArea', 'SalePrice', 'Neighborhood'])

# Model creation

In [5]:
# Feature matrix. SalePrice is the prediction target (see y below), so it must
# NOT be part of the predictors: keeping it in X leaks the answer into the
# model and makes every score meaningless.
# .copy() makes X an independent frame before its columns are reassigned later.
X = df.filter(['LotArea', 'Neighborhood']).copy()

# Target vector: the sale price to predict.
y = df.SalePrice

In [6]:
from sklearn import preprocessing

# Replace the Neighborhood labels by integer codes; the fitted encoder is kept
# in `le` so the same mapping can be reused.
le = preprocessing.LabelEncoder()
X["Neighborhood"] = le.fit_transform(X["Neighborhood"])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored exhaustively by the search below.
param_grid = dict(
    bootstrap=[True],
    max_depth=[20, 40, 60, None],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[10, 30, 50, 100, 200],
)

# Base estimator whose hyperparameters are tuned; seeded for reproducibility.
rf = RandomForestRegressor(random_state=0)

# 3-fold cross-validated search over every grid combination, run on all
# available cores (n_jobs=-1) with progress logging (verbose=2).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:   34.9s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [20, 40, 60, None], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [10, 30, 50, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [9]:
# Display the hyperparameter combination selected by the grid search.
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 20,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 50}

In [10]:
# Final model: build the forest with the hyperparameters selected by the grid
# search. They are read from grid_search.best_params_ instead of being retyped
# by hand, so this cell cannot drift from the search result when the search is
# re-run. The model is then fitted on the full dataset (X, y), not only on the
# training split.
model_opti = RandomForestRegressor(random_state=0, **grid_search.best_params_)
model_opti.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=8,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [11]:
import pickle

# Serialize the fitted model to disk so it can be reloaded without retraining.
# NOTE(review): only the regressor is written here; the label encoder `le`
# used to encode Neighborhood is not saved by this cell.
pkl_filename = "pickle_model.pkl"
with open(pkl_filename, mode='wb') as pkl_file:
    pickle.dump(model_opti, pkl_file)