In [1]:
import os
import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import statsmodels.api as sm

import copy
from statsmodels.formula.api import ols
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
import LibrairiePerso_v4_5 as ownLibrary

pd.set_option('display.max_columns', 500)

In [2]:
path ='C:/Users/Julie/Documents/Big_Data/github/'

X_train = pd.read_csv(path + "Initial_train_rwrk.csv", sep=",")
y_train=X_train['SalePrice']

X_test = pd.read_csv(path + "Initial_test_rwrk.csv", sep=",")
y_test=X_test['SalePrice']

submission = pd.read_csv(path + "Initial_submission_rwrk.csv", sep=",")


for df in [X_train, X_test]:
    df.drop(['SalePrice'], axis=1, inplace=True)


## Define features transformation based on Best transformation finder

In [3]:
base_features = ['Id', 'MiscVal', 'SalePrice', 'SalePrice_log']
continuous_features_created = ['YrBltAndRemod','TotalSF','Total_sqr_footage','Total_Bathrooms','Total_porch_sf']
categorical_features_created = ['haspool','has2ndfloor','hasgarage','hasbsmt','hasfireplace']

scaleMethod3 = continuous_features_created + ['LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea','GarageYrBlt','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','ScreenPorch']
scaleMethod1 = []
scaleMethod0 = []

no_modification = ['MasVnrArea','ScreenPorch']
dtr_best_performance = ['LotFrontage','LotArea','YearBuilt','YearRemodAdd','BsmtFinSF2','GarageYrBlt','WoodDeckSF','OpenPorchSF','EnclosedPorch']
pr2_best_performance = ['1stFlrSF','GarageArea']
pr3_best_performance = ['BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','2ndFlrSF','GrLivArea']


ignore = base_features + pr2_best_performance + pr3_best_performance + no_modification + continuous_features_created

## Scale features

In [4]:
if (scaleMethod1 !=[]): 
    X_train = ownLibrary.scale_features(X_train, scaleMethod1, 1)
    X_test = ownLibrary.scale_features(X_test, scaleMethod1, 1)
    submission = ownLibrary.scale_features(submission, scaleMethod1, 1)

In [5]:
if (scaleMethod3 !=[]): 
    X_train = ownLibrary.scale_features(X_train, scaleMethod3, 3)
    X_test = ownLibrary.scale_features(X_test, scaleMethod3, 3)
    submission = ownLibrary.scale_features(submission, scaleMethod3, 3)

## Transformations

In [6]:
ownLibrary.discretisationSupervise(X_train[dtr_best_performance], X_test[dtr_best_performance], y_train, y_test, False, True, 0)

'LotFrontage': {
	1: [0,-0.5068718561420614],
	2: [-0.45212412448097644,0.25959638711312877],
	3: [0.31434411877421375,1.135560093690489],
	4: [1.190307825351574,4.461029453824379]
},
'LotArea': {
	1: [0,-0.17955721657972745],
	2: [-0.17907197505472838,0.0470505755948299],
	3: [0.04908858999982592,0.31005148214431616],
	4: [0.31024557875431574,39.74318615886221]
},
'YearBuilt': {
	1: [0,-0.5053470033892713],
	2: [-0.47231440745298503,0.41956568282674467],
	3: [0.45259827876303094,1.1132501974887565],
	4: [1.1462827934250428,2.556826354340376]
},
'YearRemodAdd': {
	1: [0,-1.4037144315974344],
	2: [-1.3552270503457842,-0.0945551378028778],
	3: [-0.046067756551227565,1.0691420122367281],
	4: [1.1176293934883783,2.4292083119833574]
},
'BsmtFinSF2': {
	1: [0,0.1754504262377594],
	2: [0.18130622244346756,5.732601025454819],
	3: [5.744312617866235,5.896563319214648],
	4: [5.931698096448897,16.676860673989918]
},
'GarageYrBlt': {
	1: [0,-0.5884145189416965],
	2: [-0.5535914464789935,0.42145458

Unnamed: 0,DTR0_R²
LotFrontage,14.68
LotArea,20.71
YearBuilt,35.79
YearRemodAdd,29.07
BsmtFinSF2,1.13
GarageYrBlt,33.39
WoodDeckSF,7.82
OpenPorchSF,21.43
EnclosedPorch,2.29


## Categorical replacements

In [7]:
ListeDesReglesQuali = { 

    'HeatingQC': {
        1: ["Ex"],
        2: ["Gd","TA"],
        3: ["Fa","Po"]
    },
    'CentralAir': {
        1: ["Y"],
        2: ["N"]
    },
    'Electrical': {
        1: ["SBrkr"],
        2: ["FuseA","FuseF","FuseP"],
        3: ["Mix","eNA"] 
    },
    'BsmtFullBath': {
        1: ["0"],
        2: ["1"],
        3: ["2","3"]
    },
    'BsmtHalfBath': {
        1: ["0"],
        2: ["1","2"]
    },
    'FullBath': {
        1: ["0","1"],
        2: ["2","3","4"]
    },
    'HalfBath': {
        1: ["0"],
        2: ["1"],
        3: ["2"]
    },
    'BedroomAbvGr': {
        1: ["0","1","2"],
        2: ["3"],
        3: ["4"],
        4: ["5","6","7","8"]
    },
    'KitchenAbvGr': {
        1: ["0","1"],
        2: ["2","3"]
    },
    'KitchenQual': {
        1: ["Ex"],
        2: ["Fa","Po","TA"],
        3: ["Gd"]
    },
    'TotRmsAbvGrd': {
        1: ["2","3"],
        2: ["4"],
        3: ["5","6","7","8","9"],
        4: ["10","11","12","13","14","15"]
    },
    'Functional': {
        1: ["Typ","Mod"],
        2: ["Min1","Min2"],
        3:["Maj1","Maj2","Sev"]
    },
    'Fireplaces': {
        1: ["0"],
        2: ["1","2","3","4"]
    },
    'FireplaceQu': {
        1: ["Ex"],
        2: ["Gd"],
        3: ["TA"],
        4: ["Fa","Po","eNA"]
    },
    'GarageType': {
        1: ["Attchd"],
        2: ["BuiltIn"],
        3: ["2Types","Detchd"],
        4: ["Basment"],
        5: ["CarPort", "eNA"]
    },
    'GarageFinish': {
        1: ["RFn","Fin"],
        2: ["Unf"],
        3: ["eNA"]
    },
    
    'GarageCars': {
        1: ["0"],
        2: ["1"],
        3: ["2"],
        4: ["3","4","5"]
    },
    
    'GarageQual': {
        1: ["TA","Gd"],
        2: ["Ex"],
        3: ["Fa","eNA","Po"]
    },
    'GarageCond': {
        1: ["TA"],
        2:  ["Gd"],
        3:  ["Fa","eNA"],
        4:  ["Po"],
        5:  ["Ex"]
    },
    'PavedDrive': {
        1: ["Y"],
        2: ["N","P"]
    },
    'Fence': {
        1: ["eNA"],
        2: ["MnPrv","GdWo","MnWw"],
        3: ["GdPrv"]
    },
    #'MiscFeature': {
    #    1: ["eNA"],
    #    2: ["Shed","Gar2","Othr","TenC"]
    #},
    'MoSold': {
        1: ["1","3","4","5","6","8","9","10","11","12"],
        2: ["7"],
        3: ["2"]
    },
    'YrSold': {
        1: ["2006"],
        2: ["2007"],
        3: ["2008"],
        4: ["2009"],
        5: ["2010"]
    },
    'SaleType': {
        1: ["WD","COD","ConLw","ConLI","ConLD","Oth","CWD","VWD","Con"],
        2: ["New"]
    },
    'SaleCondition': {
        1: ["Normal"],
        2: ["Abnorml","AdjLand"],
        3: ["Alloca","Family","Partial"]
    },
    'MSSubClass': {
        1: ["20","60","120"],
        2: ["30","45","160","190","40","180","90","150"],
        3: ["50","70"],
        4: ["75","80","85"]
    },
    'RoofStyle': {
        1: ["Hip"],
        2: ["Gable","Gambrel","Mansard","Flat","Shed"]
    },
    'Exterior1st': {
        1: ["CemntBd","VinylSd","BrkFace"],
        2: ["MetalSd","Wd Sdng","HdBoard","Plywood","Stucco","WdShing","AsbShng"],
        3: ["BrkComm","AsphShn","ImStucc","CBlock","Stone"]
    },
    'Exterior2nd': {
        1: ["CmentBd","VinylSd","BrkFace","ImStucc","Stucco","Stone"],
        2: ["MetalSd","Wd Sdng","HdBoard","Plywood","AsbShng","AsphShn","Wd Shng"],
        3: ["Brk Cmn","CBlock","Other"]
    },
    'MasVnrType': {
        1: ["None", "BrkCmn", "eNA"],
        2: ["Stone", "BrkFace"]
    },
    'ExterQual': {
        1: ["Ex","Gd"],
        2: ["TA","Fa"]
    },
    'ExterCond': {
        1: ["Ex","Gd","TA"],
        2: ["Fa","Po"]
    },
    'Foundation': {
        1: ["PConc","Wood","Stone"],
        2: ["CBlock","BrkTil","Slab"]
    },
    'BsmtQual': {
        1: ["Ex"],
        2: ["Gd","TA"],
        3: ["Fa","eNA"]
    },
    'BsmtCond': {
        1: ["TA"],
        2: ["Gd"],
        3: ["Fa","Po","eNA"]
    },
    'BsmtExposure': {
        1: ["No","Mn","Av"],
        2: ["Gd"],
        3: ["eNA"]
    },
    'BsmtFinType1': {
        1: ["GLQ"],
        2: ["ALQ","Rec","BLQ","LwQ","Unf","eNA"]
    },
    'BsmtFinType2': {
        1: ["Unf"],
        2: ["Rec","BLQ","LwQ","eNA"],
        3: ["ALQ","GLQ"]
    },

    'MSZoning': {
        1: ["A","I","RP"],
        2: ["FV","RL"],
        3: ["C (all)","RH","RM"]
    },
    'Alley': {
        1: ["Grvl","Pave"],
        2: ["eNA"]
    },
    'LotShape': {
        1: ["Reg"],
        2: ["IR1","IR2","IR3"]
    },
    'LandContour': {
        1: ["Lvl"],
        2: ["Bnk","HLS","Low"]
    },
    'LotConfig': {
        1: ["Inside"],
        2: ["Corner","FR2"],
        3: ["CulDSac","FR3"]
    },
    'LandSlope': {
        1: ["Gtl"],
        2: ["Mod","Sev"]
    },
    'Condition1': {
        1: ["Artery","Feedr","RRAe","RRAn","RRNe"],
        2: ["Norm"],
        3: ["PosA","PosN","RRNn"]
    },
    'BldgType': {
        1: ["1Fam"],
        2: ["2fmCon","Duplex","Twnhs"],
        3: ["TwnhsE"]
    },
    'HouseStyle': {
        1: ["1.5Fin","1.5Unf","2.5Unf","SFoyer","SLvl"],
        2: ["1Story","2.5Fin","2Story"]
    },
    'OverallQual': {
        1: ["1","2","3","4"],
        2: ["5"],
        3: ["6"],
        4: ["7","8","9","10"]
    },
    
    'Neighborhood' : {
        1: ["NoRidge","NridgHt","StoneBr"],
        2: ["CollgCr","Veenker","Crawfor","Somerst","NWAmes","SawyerW","Timber","Gilbert","ClearCr","Blmngtn"],
        3: ["Mitchel","OldTown","BrkSide","Sawyer","NAmes","IDOTRR","MeadowV","Edwards","NPkVill","BrDale","SWISU","Blueste"]
    },    
    'OverallCond': {
        1: ["1","2","3","4"],
        2: ["5"],
        3: ["6","7","8","9"]
    }
   
}




## Regroupement KB discretizer

If you set a new list of replacements you will probably have to fill manually the gaps between classes

In [8]:
#Left cap excluded, right cap included


'''
ListeDesReglesQuantiKB = { 
    'LotFrontage': {
        1: [0,60.0],
        2: [61.0,74.0],
        3: [75.0,81.0],
        4: [82.0,220.0]
    },
    'LotArea': {
        1: [0,9100],
        2: [9101,10980],
        3: [10980,13680],
        4: [13682,430490]
    },
    'YearBuilt': {
        1: [0,1952],
        2: [1953,1984],
        3: [1985,2005],
        4: [2006,4020]
    },
    'YearRemodAdd': {
        1: [0,1955],
        2: [1956,1983],
        3: [1984,2007],
        4: [2008,4020]
    },
    'BsmtFinSF2': {
        1: [0,234],
        2: [234,1029],
        3: [1031,1060],
        4: [1060,2948]
    },
    'GarageYrBlt': {
        1: [0,1958.0],
        2: [1959.0,1987.0],
        3: [1988.0,2005.0],
        4: [2006.0,4020.0]
    },
    'WoodDeckSF': {
        1: [0,84],
        2: [85,96],
        3: [97,158],
        4: [159,1714]
    },
    'OpenPorchSF': {
        1: [0,0],
        2: [4,8],
        3: [10,29],
        4: [30,1046]
    },
    'EnclosedPorch': {
        1: [0,36],
        2: [37,45],
        3: [45,140],
        4: [143,1104]
    },
}

'''

"\nListeDesReglesQuantiKB = { \n    'LotFrontage': {\n        1: [0,60.0],\n        2: [61.0,74.0],\n        3: [75.0,81.0],\n        4: [82.0,220.0]\n    },\n    'LotArea': {\n        1: [0,9100],\n        2: [9101,10980],\n        3: [10980,13680],\n        4: [13682,430490]\n    },\n    'YearBuilt': {\n        1: [0,1952],\n        2: [1953,1984],\n        3: [1985,2005],\n        4: [2006,4020]\n    },\n    'YearRemodAdd': {\n        1: [0,1955],\n        2: [1956,1983],\n        3: [1984,2007],\n        4: [2008,4020]\n    },\n    'BsmtFinSF2': {\n        1: [0,234],\n        2: [234,1029],\n        3: [1031,1060],\n        4: [1060,2948]\n    },\n    'GarageYrBlt': {\n        1: [0,1958.0],\n        2: [1959.0,1987.0],\n        3: [1988.0,2005.0],\n        4: [2006.0,4020.0]\n    },\n    'WoodDeckSF': {\n        1: [0,84],\n        2: [85,96],\n        3: [97,158],\n        4: [159,1714]\n    },\n    'OpenPorchSF': {\n        1: [0,0],\n        2: [4,8],\n        3: [10,29],\n   

In [9]:

ListeDesReglesQuantiKB = { 
    'LotFrontage': {
        1: [-3,-0.5068718561420614],
        2: [-0.45212412448097644,0.25959638711312877],
        3: [0.31434411877421375,0.6428305087407239],
        4: [0.6975782404018088,4.461029453824379]
    },
    'LotArea': {
        1: [-3,-0.1344297547548156],
        2: [-0.1343327064498158,0.0470505755948299],
        3: [0.04908858999982592,0.31005148214431616],
        4: [0.31024557875431574,39.74318615886221]
    },
    'YearBuilt': {
        1: [-3,-0.6374773871344165],
        2: [-0.6044447911981302,0.41956568282674467],
        3: [0.45259827876303094,1.1132501974887565],
        4: [1.1462827934250428,2.556826354340376]
    },
    'YearRemodAdd': {
        1: [-3,-1.4522018128490846],
        2: [-1.4037144315974344,-0.0945551378028778],
        3: [-0.046067756551227565,1.0691420122367281],
        4: [1.1176293934883783,2.4292083119833574]
    },
    'BsmtFinSF2': {
        1: [-3,1.0421082646825695],
        2: [1.1065220229453594,5.732601025454819],
        3: [5.744312617866235,5.896563319214648],
        4: [5.931698096448897,16.676860673989918]
    },
    'GarageYrBlt': {
        1: [-3,-0.5884145189416965],
        2: [-0.5535914464789935,0.42145458247668904],
        3: [0.456277654939392,1.0482698868053422],
        4: [1.0830929592680452,12.444770498237714]
    },
    'WoodDeckSF': {
        1: [-3,-0.06870833970910595],
        2: [-0.06067310419214677,0.027714486494404236],
        3: [0.035749722011363415,0.5258990885458735],
        4: [0.5339343240628327,12.285057429800684]
    },
    'OpenPorchSF': {
        1: [-3,-0.7100281535780881],
        2: [-0.6495863841795244,-0.5891446147809607],
        3: [-0.5589237300816788,-0.27182532543850135],
        4: [-0.2567148830888604,14.38546639056823]
    },
    'EnclosedPorch': {
        1: [-3,0.20081280741520016],
        2: [0.21664118079005246,0.29578304766431385],
        3: [0.39075328791342745,1.8469636383998367],
        4: [1.8944487585243937,16.736506937677948]
    },
}


# Regroupement de modalités

In [10]:
X_train_rgrpd = copy.deepcopy(X_train)
X_test_rgrpd = copy.deepcopy(X_test)
submission_rgrpd = copy.deepcopy(submission)

In [11]:
for dataset in [X_train_rgrpd, X_test_rgrpd, submission_rgrpd]: 
    # ----------- Pour chaque colonne du DataSet nous allons regrouper ------------
    for feature in dataset:      
        # ------------------------ If the feature is categorical ----------------------    
        if (feature in ListeDesReglesQuali):
            featureDiscretise = ownLibrary.discretise_1col_quali(dataset[feature], feature, ListeDesReglesQuali[feature])
            dataset.drop([feature], axis=1, inplace=True)
            dataset[feature] = featureDiscretise

        # ------------------------ If the feature is continuous ----------------------                            
        elif (feature in ListeDesReglesQuantiKB):
            featureDiscretise = ownLibrary.discretise_1col_quanti(dataset[feature], feature, ListeDesReglesQuantiKB[feature])
            if( len(dataset[feature]) != len(featureDiscretise)):
                print('len(dataset[feature]) != len(featureDiscretise) : ' + str(dataset[feature]) + ' != ' + str(featureDiscretise))
            else:    
                dataset.drop([feature], axis=1, inplace=True)
                dataset[feature] = featureDiscretise          
                    
     


In [12]:
X_train = copy.deepcopy(X_train_rgrpd)
X_test = copy.deepcopy(X_test_rgrpd)
submission = copy.deepcopy(submission_rgrpd)

In [13]:
X_train.reset_index(drop=True, inplace=True)   
y_train.reset_index(drop=True, inplace=True) 
rgrpd_train = pd.concat([X_train, y_train], axis=1, sort=True)

X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True) 
rgrpd_test = pd.concat([X_test, y_test], axis=1, sort=True)

In [14]:
rgrpd_train.rename(columns={0: "SalePrice"}, inplace=True)
rgrpd_test.rename(columns={0: "SalePrice"}, inplace=True)
submission.rename(columns={0: "SalePrice"}, inplace=True)

# Features regroupment

In [15]:

for df in [rgrpd_train, rgrpd_test, submission]:
    df["NewFirePlaces"] = df["Fireplaces"].astype(str) + df["FireplaceQu"].astype(str)
    df["NewExterQualCond"] = df["ExterQual"].astype(str) + df["ExterCond"].astype(str)
    df["NewCentrAirElec"] = df["CentralAir"].astype(str) + df["Electrical"].astype(str)
    df["NewKitchen"] = df["KitchenAbvGr"].astype(str) + df["KitchenQual"].astype(str)
    df["NewSale"] = df["SaleType"].astype(str) + df["SaleCondition"].astype(str)
    
    df.drop(['Fireplaces'], axis=1, inplace=True)
    df.drop(['FireplaceQu'], axis=1, inplace=True)
    df.drop(['ExterQual'], axis=1, inplace=True)
    df.drop(['ExterCond'], axis=1, inplace=True)
    df.drop(['CentralAir'], axis=1, inplace=True)
    df.drop(['Electrical'], axis=1, inplace=True)
    df.drop(['KitchenAbvGr'], axis=1, inplace=True)
    df.drop(['KitchenQual'], axis=1, inplace=True)
    df.drop(['SaleType'], axis=1, inplace=True)
    df.drop(['SaleCondition'], axis=1, inplace=True)
    

## Dichotomization

In [16]:
train_dicho = pd.DataFrame()
test_dicho = pd.DataFrame()
submission_dicho = pd.DataFrame()

In [17]:
ignore

['Id',
 'MiscVal',
 'SalePrice',
 'SalePrice_log',
 '1stFlrSF',
 'GarageArea',
 'BsmtFinSF1',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '2ndFlrSF',
 'GrLivArea',
 'MasVnrArea',
 'ScreenPorch',
 'YrBltAndRemod',
 'TotalSF',
 'Total_sqr_footage',
 'Total_Bathrooms',
 'Total_porch_sf']

In [18]:
train_dicho = ownLibrary.dichotomize_dataset(rgrpd_train, ignore)
test_dicho = ownLibrary.dichotomize_dataset(rgrpd_test, ignore)
submission_dicho = ownLibrary.dichotomize_dataset(submission, ignore)

In [19]:
train_dicho.head(10)

Unnamed: 0,Id,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,ScreenPorch,MiscVal,YrBltAndRemod,TotalSF,Total_sqr_footage,Total_Bathrooms,Total_porch_sf,haspool_0,haspool_1,has2ndfloor_0,has2ndfloor_1,hasgarage_0,hasgarage_1,hasbsmt_0,hasbsmt_1,hasfireplace_0,hasfireplace_1,SalePrice_log,MSSubClass_1,MSSubClass_2,MSSubClass_3,MSSubClass_4,MSZoning_2,MSZoning_3,LotFrontage_1,LotFrontage_2,LotFrontage_4,LotArea_2,LotArea_4,Alley_1,Alley_2,LotShape_1,LotShape_2,LandContour_1,LandContour_2,LotConfig_1,LotConfig_2,LotConfig_3,LandSlope_1,LandSlope_2,Neighborhood_1,Neighborhood_2,Neighborhood_3,Condition1_1,Condition1_2,Condition1_3,BldgType_1,BldgType_2,BldgType_3,HouseStyle_1,HouseStyle_2,OverallQual_1,OverallQual_2,OverallQual_3,OverallQual_4,OverallCond_1,OverallCond_2,OverallCond_3,YearBuilt_1,YearBuilt_2,YearBuilt_3,YearRemodAdd_2,YearRemodAdd_3,RoofStyle_1,RoofStyle_2,Exterior1st_1,Exterior1st_2,Exterior1st_3,Exterior2nd_1,Exterior2nd_2,Exterior2nd_3,MasVnrType_1,MasVnrType_2,Foundation_1,Foundation_2,BsmtQual_1,BsmtQual_2,BsmtQual_3,BsmtCond_1,BsmtCond_2,BsmtCond_3,BsmtExposure_1,BsmtExposure_2,BsmtExposure_3,BsmtFinType1_1,BsmtFinType1_2,BsmtFinType2_1,BsmtFinType2_2,BsmtFinType2_3,BsmtFinSF2_1,BsmtFinSF2_2,BsmtFinSF2_4,HeatingQC_1,HeatingQC_2,HeatingQC_3,BsmtFullBath_1,BsmtFullBath_2,BsmtFullBath_3,BsmtHalfBath_1,BsmtHalfBath_2,FullBath_1,FullBath_2,HalfBath_1,HalfBath_2,HalfBath_3,BedroomAbvGr_1,BedroomAbvGr_2,BedroomAbvGr_3,BedroomAbvGr_4,TotRmsAbvGrd_1,TotRmsAbvGrd_2,TotRmsAbvGrd_3,TotRmsAbvGrd_4,Functional_1,Functional_2,Functional_3,GarageType_1,GarageType_2,GarageType_3,GarageType_4,GarageType_5,GarageYrBlt_1,GarageYrBlt_2,GarageYrBlt_3,GarageFinish_1,GarageFinish_2,GarageFinish_3,GarageCars_1,GarageCars_2,GarageCars_3,GarageCars_4,GarageQual_1,GarageQual_2,GarageQual_3,GarageCond_1,GarageCond_2,GarageCond_3,GarageCond_4,GarageCond_5,PavedDrive_1,PavedDrive_2,WoodDeckSF_2,WoodDeckSF_4,OpenPorchSF_4,EnclosedPorch_1,EnclosedPorch_3,EnclosedPorch_4,Fence_1,Fence_2,Fence_3,MoSold_1,MoSold_2,MoSold_3,YrSold_1,YrSold_2,YrSold_3,YrSold_4,YrSold_5,SalePrice,NewFirePlaces_14,NewFirePlaces_21,NewFirePlaces_22,NewFirePlaces_23,NewFirePlaces_24,NewExterQualCond_11,NewExterQualCond_21,NewExterQualCond_22,NewCentrAirElec_11,NewCentrAirElec_12,NewCentrAirElec_21,NewCentrAirElec_22,NewKitchen_11,NewKitchen_12,NewKitchen_13,NewKitchen_22,NewKitchen_23,NewSale_11,NewSale_12,NewSale_13,NewSale_23
0,211,-0.606718,0.042623,-0.373606,-0.472654,-0.80637,-0.788218,-1.341479,-2.277767,-0.274846,0,-1.777435,-0.995746,-0.852669,-0.287816,-0.547527,1,0,1,0,1,0,0,1,1,0,11.492723,0,1,0,0,1,0,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,98000,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0
1,319,0.946845,1.159022,-0.453865,0.72536,0.567413,2.137704,2.398699,0.905895,-0.274846,0,0.651531,1.672308,2.042231,1.630538,2.344288,1,0,0,1,0,1,0,1,0,1,12.468437,1,0,0,0,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,1,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,260000,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0
2,240,-0.606718,-0.761872,0.172598,-0.79262,-0.984853,0.794169,-0.033886,-1.210076,-0.274846,0,-1.339783,-0.413839,-0.539401,-0.927267,-0.26351,1,0,0,1,0,1,0,1,0,1,11.635143,0,0,1,0,1,0,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,1,113000,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0
3,987,-0.606718,-0.964071,-0.175189,-1.41271,-0.435881,0.667853,0.276746,-1.040216,-0.274846,0,-0.945897,-0.53399,-0.471463,-1.566718,1.376046,1,0,0,1,0,1,0,1,1,0,11.669929,0,0,1,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,1,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,117000,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0
4,1417,-0.606718,-0.964071,0.475797,-0.688445,0.226671,1.609476,1.651503,0.439993,-0.274846,0,-2.652738,0.581527,0.352596,-0.287816,-0.431338,1,0,0,1,0,1,0,1,1,0,11.715866,0,1,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,122500,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0
5,391,-0.606718,-0.445667,-0.745917,-0.480095,-0.544053,0.144219,-0.28575,-0.41416,-0.274846,0,-2.324499,-0.406771,-0.013513,-0.287816,0.394895,1,0,0,1,0,1,0,1,1,0,11.686879,0,0,1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,0,0,119000,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0
6,1006,0.728375,0.315808,-0.386983,-0.172531,-0.47915,-0.788218,-1.087516,-0.685936,-0.274846,0,-0.048711,-0.710682,-0.540659,-0.287816,0.188337,1,0,1,0,0,1,0,1,1,0,11.917724,0,0,0,1,1,0,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,149900,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
7,569,-0.606718,2.135602,-1.133834,1.094933,0.902746,0.672446,1.319882,0.692356,-0.274846,0,0.782827,1.242357,1.966744,0.351636,1.343772,1,0,0,1,0,1,0,1,0,1,12.665394,0,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,316600,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0
8,343,1.456608,-0.964071,-1.256451,-2.355247,-0.330414,-0.788218,-0.972078,-0.33651,-0.274846,0,-1.252253,-1.806174,-1.220036,-0.287816,-1.167202,1,0,1,0,0,1,1,0,1,0,11.379394,0,1,0,0,1,0,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,87500,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0
9,115,-0.606718,0.700847,-1.024593,-0.065875,0.740488,1.242014,1.714469,-1.404201,-0.274846,0,-0.201889,0.912531,1.552828,1.630538,0.278706,1,0,0,1,0,1,0,1,0,1,12.466512,0,0,1,0,1,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,1,1,0,0,0,1,0,0,1,0,0,1,0,0,0,259500,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0


## Polynomial Transformation

In [20]:

train_pr2_transformed = ownLibrary.PolynomialRegrTransformationReturnDF(train_dicho, pr2_best_performance, 2)
test_pr2_transformed = ownLibrary.PolynomialRegrTransformationReturnDF(test_dicho, pr2_best_performance, 2)
submission_pr2_transformed = ownLibrary.PolynomialRegrTransformationReturnDF(submission_dicho, pr2_best_performance, 2)


['1stFlrSF²', '1stFlrSFGarageArea', 'GarageArea²']
['1stFlrSF²', '1stFlrSFGarageArea', 'GarageArea²']
['1stFlrSF²', '1stFlrSFGarageArea', 'GarageArea²']


In [21]:

train_pr3_transformed = ownLibrary.PolynomialRegrTransformationReturnDF(train_pr2_transformed, pr3_best_performance, 3)
test_pr3_transformed = ownLibrary.PolynomialRegrTransformationReturnDF(test_pr2_transformed, pr3_best_performance, 3)
submission_pr3_transformed = ownLibrary.PolynomialRegrTransformationReturnDF(submission_pr2_transformed, pr3_best_performance, 3)


['PR3_6', 'PR3_7', 'PR3_8', 'PR3_9', 'PR3_10', 'PR3_11', 'PR3_12', 'PR3_13', 'PR3_14', 'PR3_15', 'PR3_16', 'PR3_17', 'PR3_18', 'PR3_19', 'PR3_20', 'PR3_21', 'PR3_22', 'PR3_23', 'PR3_24', 'PR3_25', 'PR3_26', 'PR3_27', 'PR3_28', 'PR3_29', 'PR3_30', 'PR3_31', 'PR3_32', 'PR3_33', 'PR3_34', 'PR3_35', 'PR3_36', 'PR3_37', 'PR3_38', 'PR3_39', 'PR3_40', 'PR3_41', 'PR3_42', 'PR3_43', 'PR3_44', 'PR3_45', 'PR3_46', 'PR3_47', 'PR3_48', 'PR3_49', 'PR3_50', 'PR3_51', 'PR3_52', 'PR3_53', 'PR3_54', 'PR3_55']
['PR3_6', 'PR3_7', 'PR3_8', 'PR3_9', 'PR3_10', 'PR3_11', 'PR3_12', 'PR3_13', 'PR3_14', 'PR3_15', 'PR3_16', 'PR3_17', 'PR3_18', 'PR3_19', 'PR3_20', 'PR3_21', 'PR3_22', 'PR3_23', 'PR3_24', 'PR3_25', 'PR3_26', 'PR3_27', 'PR3_28', 'PR3_29', 'PR3_30', 'PR3_31', 'PR3_32', 'PR3_33', 'PR3_34', 'PR3_35', 'PR3_36', 'PR3_37', 'PR3_38', 'PR3_39', 'PR3_40', 'PR3_41', 'PR3_42', 'PR3_43', 'PR3_44', 'PR3_45', 'PR3_46', 'PR3_47', 'PR3_48', 'PR3_49', 'PR3_50', 'PR3_51', 'PR3_52', 'PR3_53', 'PR3_54', 'PR3_55']
['PR3_

In [22]:
train_rwrk = copy.deepcopy(train_pr3_transformed)
test_rwrk = copy.deepcopy(test_pr3_transformed) 
submission_rwrk = copy.deepcopy(submission_pr3_transformed) 

## Features harmonization between datasets

Each data must have all features available

In [23]:
a = train_rwrk.columns.difference(test_rwrk.columns)
b = submission_rwrk.columns.difference(test_rwrk.columns)
manqueTest = a.tolist() + b.tolist()
print (manqueTest)


c = test_rwrk.columns.difference(train_rwrk.columns)
d = submission_rwrk.columns.difference(train_rwrk.columns)
manqueTrain = c.tolist() + d.tolist()
print (manqueTrain)

e = test_rwrk.columns.difference(submission_rwrk.columns)
f = train_rwrk.columns.difference(submission_rwrk.columns)
manqueSubmission = e.tolist() + f.tolist()
print (manqueSubmission)


for colonne in manqueTest :
    test_rwrk[colonne] = 0

for colonne in manqueTrain :
    train_rwrk[colonne] = 0

for colonne in manqueSubmission :
    submission_rwrk[colonne] = 0


['Exterior1st_3', 'NewKitchen_23', 'Exterior1st_3', 'GarageYrBlt_4', 'NewExterQualCond_12', 'NewKitchen_23']
['LotArea_1', 'NewCentrAirElec_13', 'GarageYrBlt_4', 'LotArea_1', 'NewExterQualCond_12']
['GarageQual_2', 'NewCentrAirElec_13', 'SalePrice_log', 'GarageQual_2', 'SalePrice_log']


In [24]:
a = train_rwrk.columns.difference(test_rwrk.columns)
b = submission_rwrk.columns.difference(test_rwrk.columns)
manqueTest = a.tolist() + b.tolist()
print (manqueTest)


c = test_rwrk.columns.difference(train_rwrk.columns)
d = submission_rwrk.columns.difference(train_rwrk.columns)
manqueTrain = c.tolist() + d.tolist()
print (manqueTrain)

e = test_rwrk.columns.difference(submission_rwrk.columns)
f = train_rwrk.columns.difference(submission_rwrk.columns)
manqueSubmission = e.tolist() + f.tolist()
print (manqueSubmission)


[]
[]
[]


# Modelisation

In [25]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge

import scipy
from scipy import stats

## Research of bests hyperparameters

In [26]:
# ---------------- KNN ----------------
'''
params = {
    'n_neighbors':[5,7,9,11,13,15,17],
    'algorithm':['auto']
    }
#'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']
knn = KNeighborsRegressor()

model = GridSearchCV(knn, params, cv=5)
model.fit(train_rwrk[X], train_rwrk[y])
model.best_params_
'''

#{'algorithm': 'auto', 'n_neighbors': 9}

"\nparams = {\n    'n_neighbors':[5,7,9,11,13,15,17],\n    'algorithm':['auto']\n    }\n#'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']\nknn = KNeighborsRegressor()\n\nmodel = GridSearchCV(knn, params, cv=5)\nmodel.fit(train_rwrk[X], train_rwrk[y])\nmodel.best_params_\n"

In [27]:
# ---------------- Réseaux de neurones ----------------
'''
params = {
    'solver': ['adam','lbfgs','sgd'], 
    }

rn = MLPClassifier()

model = GridSearchCV(rn, params, cv=5)
model.fit(train_rwrk[X], train_rwrk[y])
model.best_params_
'''
#{'solver': 'adam'}

"\nparams = {\n    'solver': ['adam','lbfgs','sgd'], \n    }\n\nrn = MLPClassifier()\n\nmodel = GridSearchCV(rn, params, cv=5)\nmodel.fit(train_rwrk[X], train_rwrk[y])\nmodel.best_params_\n"

In [28]:
# ---------------- Forêt aléatoire ----------------
'''params = {
    'n_estimators':[40,45,50,55,60], 
    'max_depth':[5,6,7,8,9,10], 
    'min_samples_split':[2,3,4],
    'min_samples_leaf':[1,2,3], 
    }

rfr = RandomForestRegressor()

model = GridSearchCV(rfr, params, cv=5)
model.fit(train_rwrk[X], train_rwrk[y])
model.best_params_
'''
#{'max_depth': 8,
# 'min_samples_leaf': 2,
# 'min_samples_split': 2,
# 'n_estimators': 55}

"params = {\n    'n_estimators':[40,45,50,55,60], \n    'max_depth':[5,6,7,8,9,10], \n    'min_samples_split':[2,3,4],\n    'min_samples_leaf':[1,2,3], \n    }\n\nrfr = RandomForestRegressor()\n\nmodel = GridSearchCV(rfr, params, cv=5)\nmodel.fit(train_rwrk[X], train_rwrk[y])\nmodel.best_params_\n"

In [29]:
# ---------------- Arbre de décision regressor ----------------

'''
params = {
    'max_depth':[40,45,50,55,60], 
    'min_samples_split':[2,3,4],
    'min_samples_leaf':[1,2,3], 
    }

dtr = DecisionTreeClassifier()

model = GridSearchCV(dtr, params, cv=5)
model.fit(train_rwrk[X], train_rwrk[y])
model.best_params_
'''
#{'max_depth': 45, 'min_samples_leaf': 2, 'min_samples_split': 3}

"\nparams = {\n    'max_depth':[40,45,50,55,60], \n    'min_samples_split':[2,3,4],\n    'min_samples_leaf':[1,2,3], \n    }\n\ndtr = DecisionTreeClassifier()\n\nmodel = GridSearchCV(dtr, params, cv=5)\nmodel.fit(train_rwrk[X], train_rwrk[y])\nmodel.best_params_\n"

# Features selection

In [31]:
print('For 261 features the stepwise needs around 3 minutes 17')
print('You have ' + str(len(train_rwrk.columns)) + ' features')

perform_stepwise = False
perform_exhaustive_features_selection = False


features = train_rwrk.columns
features_selection_ignore = ['SalePrice','Id','SalePrice_log']
features_to_use = [var for var in features if var not in features_selection_ignore ]
target_feature = 'SalePrice_log'

X = train_rwrk[features_to_use]
y = train_rwrk[target_feature]

For 261 features the stepwise needs around 3 minutes 17
You have 246 features


## Stepwise function

In [32]:
if (perform_stepwise == True):

    modelisation_feature = ownLibrary.stepwise_selection(X, y)

    print('resulting features:')
    print()
    print(result_stepwise)
    print(str(len(result_stepwise)) + ' features selected')

else:
    modelisation_feature = ['OpenPorchSF_4', 'TotalSF', 'YrBltAndRemod', 'PR3_24', 'NewFirePlaces_14', 'hasfireplace_1', 'hasfireplace_0', 'GarageArea', 'OverallCond_1', 'BsmtUnfSF', 'Total_sqr_footage', 'OverallQual_4', 'MSSubClass_2', 'PR3_52', 'OverallCond_3', 'OverallCond_2', 'Neighborhood_3', 'BsmtQual_1', 'MSZoning_3', 'MSZoning_2', 'PR3_28', 'PR3_20', 'NewKitchen_11', 'NewSale_12', 'PR3_27', 'BsmtExposure_2', 'OverallQual_1', 'PR3_47', 'Functional_3', 'Condition1_2', 'NewCentrAirElec_22', 'MSSubClass_3', 'LotArea_2', 'OverallQual_2', 'OverallQual_3', 'Exterior1st_1', 'PR3_21', 'LotArea_4', 'PR3_41', 'PR3_6', 'YrSold_4', 'Total_porch_sf', 'YearRemodAdd_3', 'YearRemodAdd_2', 'BsmtFinType2_1', 'PR3_40', 'PR3_48', 'BsmtCond_3', 'PR3_25']


## Exhaustive Feature Selector

In [33]:
if (perform_exhaustive_features_selection == True):
    ownLibrary.ExhaustiveFeatureSelector(train_rwrk[var_stepwise], train_rwrk['SalePrice_log'], min_features=1 , max_features=10)


In [34]:
len(modelisation_feature)

49

# Correlation

In [None]:
matrice_corr = pd.concat([train_rwrk[result_stepwise],train_rwrk['SalePrice_log']], axis=1, sort=True)
plt.matshow(matrice_corr.corr())
plt.show()

#matrice_corr.corr().to_excel (path + "corr.xlsx", index = False, header=True)

# Correlation des variables issue du stepwise

In [35]:
y = 'SalePrice_log'

In [36]:
    models = {
        'fa' : {
            'label' : 'Forêt aléatoire',
            #'function' : RandomForestRegressor(n_estimators=7, max_depth=11, min_samples_split=3, min_samples_leaf=1, random_state=0, n_jobs=-1)
            'function' : RandomForestRegressor(n_estimators=35, max_depth=11, min_samples_split=3, min_samples_leaf=1, random_state=0, n_jobs=-1)
        },
        'knn' : {
            'label' : 'KNN',
            'function' : KNeighborsRegressor(n_neighbors=9, algorithm = "auto")
        },
        'dtr' : {
            'label' : 'Arbre de décision - Regressor',
            'function' : DecisionTreeRegressor(min_samples_leaf = 2, min_samples_split = 3, max_depth =45)
        },
        'mrl' : {
            'label' : 'Regression linéaire multivariée',
            'function' : LinearRegression()
        },
        'rr' : {
            'label' : 'Ridge Regression',
            'function' : Ridge(alpha=9)
        }
    }
    
    
'''
'rn' : {
    'label' : 'Réseaux de neurones',
    'function' : MLPClassifier(hidden_layer_sizes=(5, 15), random_state=0, max_iter = 500, solver = 'adam', alpha= 0.05),
},
'''
'''
'dtc' : {
    'label' : 'Arbre de décision - Classifier',
    'function' : DecisionTreeClassifier(max_depth=6)
},
'''

"\n'dtc' : {\n'label' : 'Arbre de décision - Classifier',\n'function' : DecisionTreeClassifier(max_depth=6)\n},\n"

## Lancement modélisation

In [38]:
#var_to_use = soluce[1]
#var_to_use = colonnes
modelisation_feature = ['YrBltAndRemod', 'hasfireplace_0', 'GarageArea', 'BsmtUnfSF', 'OverallCond_1', 'MSSubClass_2', 'NewKitchen_11', 'OverallCond_2', 'PR3_24', 'BsmtExposure_1', 'MSZoning_2', 'YearBuilt_2', 'OverallQual_1', 'OverallQual_2', 'OverallQual_3', 'NewSale_12', 'Functional_1', 'Neighborhood_1', 'Neighborhood_2', 'BsmtCond_3', '2ndFlrSF', '1stFlrSF', 'TotalBsmtSF', 'YearBuilt_3', 'BsmtFinSF1']
#var_to_use = result_stepwise
print(len(modelisation_feature))

25


In [39]:
# Les deux méthodes font la même chose

# -------------------- fit model + Train et test en deux fois + affichage résultats -----------------

for model in models :
    models[model]['function'].fit(train_rwrk[modelisation_feature], train_rwrk[y])
predictions_train = ownLibrary.runModels1DS(train_rwrk, 'train', modelisation_feature, y, models)
predictions_test = ownLibrary.runModels1DS(test_rwrk, 'test', modelisation_feature, y, models)

ownLibrary.afficheResults(predictions_train, predictions_test, 'SalePrice', models)


------------- KNN -------------

Train : R² = 82.5% , rsquared = 83.9% , corr = 91.6% , MSE = 1096509505.94
Test : R² = 79.8% , rsqaured = 80.9% , corr = 90.0% , MSE = 1302144443.70

------------- Ridge Regression -------------

Train : R² = 90.1% , rsquared = 90.3% , corr = 95.0% , MSE = 615773556.84
Test : R² = 89.4% , rsqaured = 89.4% , corr = 94.6% , MSE = 683837382.63

------------- Regression linéaire multivariée -------------

Train : R² = 90.3% , rsquared = 90.4% , corr = 95.1% , MSE = 605357588.78
Test : R² = 89.7% , rsqaured = 89.7% , corr = 94.7% , MSE = 665333400.48

------------- Arbre de décision - Regressor -------------

Train : R² = 98.1% , rsquared = 98.1% , corr = 99.1% , MSE = 118150752.96
Test : R² = 70.7% , rsqaured = 73.1% , corr = 85.5% , MSE = 1884126757.06

------------- Forêt aléatoire -------------

Train : R² = 97.0% , rsquared = 97.5% , corr = 98.8% , MSE = 184701539.94
Test : R² = 83.4% , rsqaured = 83.6% , corr = 91.4% , MSE = 1069800265.71

