In [354]:
import numpy as np
import pandas as pd

In [355]:
# Load the training set; the relative path means train.csv must sit in the working directory.
houses = pd.read_csv("train.csv")
# Leave the preview as the bare last expression so Jupyter renders it as a rich table;
# print() falls back to wrapped plain text, which is hard to read for a frame this wide.
houses.head()

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [356]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [357]:
houses["MSSubClass"].value_counts()

20     536
60     299
50     144
120     87
30      69
160     63
70      60
80      58
90      52
190     30
85      20
75      16
45      12
180     10
40       4
Name: MSSubClass, dtype: int64

We'll split the dataset into expensive (SalePrice > 180000) and non-expensive houses. A classification approach will let us see which variables are important for our problem.

In [358]:
# Binary target: 1 when SalePrice is strictly above 180000, otherwise 0
# (rows whose SalePrice is missing compare as False and therefore get 0).
houses["Expensive"] = (houses["SalePrice"] > 180000).astype("int64")

In [359]:
houses["Expensive"].value_counts()

0    896
1    564
Name: Expensive, dtype: int64

In [360]:
pd.crosstab(houses["MSSubClass"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
MSSubClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20,327,209,536
30,69,0,69
40,3,1,4
45,12,0,12
50,122,22,144
60,64,235,299
70,42,18,60
75,10,6,16
80,46,12,58
85,19,1,20


Categories 60 and 120 seem to contain more expensive houses, while category 30 contains only non-expensive houses. 30 means old and small houses.

In [361]:
# Collapse the MSSubClass codes into three price tiers; any code not listed falls back to "Cheap".
# NOTE(review): code 150 does not appear in the value counts shown earlier, so it matches
# no row of this dataset - confirm whether it is kept on purpose.
subclass_tier = {20: "Normal", 150: "Normal", 60: "Expensive", 120: "Expensive"}
houses["MSSubClass_Recode"] = houses["MSSubClass"].map(subclass_tier).fillna("Cheap")

In [362]:
pd.crosstab(houses["MSSubClass_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
MSSubClass_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,469,69,538
Expensive,100,286,386
Normal,327,209,536
Total,896,564,1460


In [363]:
pd.crosstab(houses["MSZoning"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
MSZoning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C (all),10,0,10
FV,22,43,65
RH,14,2,16
RL,643,508,1151
RM,207,11,218
Total,896,564,1460


Category RM seems to contain mostly non-expensive houses. RM means Residential Medium Density; RL means Residential Low Density. We can see that in residential areas where the density is higher, houses tend to be cheaper.

In [364]:
# Three-way zoning tier: "RL" -> Normal, "FV" -> Expensive, every other value (or a missing one) -> Cheap.
houses["MSZoning_Recode"] = np.select(
    [houses["MSZoning"] == "RL", houses["MSZoning"] == "FV"],
    ["Normal", "Expensive"],
    default="Cheap",
)

In [365]:
pd.crosstab(houses["MSZoning_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
MSZoning_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,231,13,244
Expensive,22,43,65
Normal,643,508,1151
Total,896,564,1460


In [366]:
houses.groupby(houses["Expensive"])["LotFrontage"].mean()

Expensive
0    64.954485
1    78.433921
Name: LotFrontage, dtype: float64

Not surprisingly, more expensive houses have more feet of street connected to property.

In [367]:
houses.groupby(houses["Expensive"])["LotArea"].mean()

Expensive
0     8816.839286
1    13217.519504
Name: LotArea, dtype: float64

Same conclusion for the lot area. Bigger is more expensive.

In [368]:
pd.crosstab(houses["Street"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Street,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Grvl,4,2,6
Pave,892,562,1454
Total,896,564,1460


The type of road access is not usable for our analysis

In [369]:
pd.crosstab(houses["Alley"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Alley,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Grvl,49,1,50
Pave,27,14,41
Total,76,15,91


Alley has too much missing data to be useful (only 91 non-null for 1460 houses)

In [370]:
pd.crosstab(houses["LotShape"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
LotShape,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IR1,213,271,484
IR2,13,28,41
IR3,3,7,10
Reg,667,258,925
Total,896,564,1460


Slightly irregular properties tend to be more expensive, which is surprising

In [371]:
# Regular lots ("Reg") are labelled Normal; every other shape (and any missing value) is labelled Expensive.
houses["LotShape_Recode"] = np.where(houses["LotShape"] == "Reg", "Normal", "Expensive")

In [372]:
pd.crosstab(houses["LotShape_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
LotShape_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Expensive,229,306,535
Normal,667,258,925
Total,896,564,1460


In [373]:
pd.crosstab(houses["LandContour"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
LandContour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bnk,51,12,63
HLS,19,31,50
Low,15,21,36
Lvl,811,500,1311
Total,896,564,1460


HLS and Low seem more expensive, but few houses are impacted by this factor.

In [374]:
# "Bnk" -> Cheap, "Lvl" -> Normal; whatever is left (HLS and Low in the table above) -> Expensive.
contour_tier = {"Bnk": "Cheap", "Lvl": "Normal"}
houses["LandContour_Recode"] = houses["LandContour"].map(contour_tier).fillna("Expensive")

In [375]:
pd.crosstab(houses["LandContour_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
LandContour_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,51,12,63
Expensive,34,52,86
Normal,811,500,1311
Total,896,564,1460


In [376]:
pd.crosstab(houses["Utilities"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Utilities,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AllPub,895,564,1459
NoSeWa,1,0,1
Total,896,564,1460


Not useful

In [377]:
pd.crosstab(houses["LotConfig"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
LotConfig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Corner,161,102,263
CulDSac,32,62,94
FR2,28,19,47
FR3,1,3,4
Inside,674,378,1052
Total,896,564,1460


Inside might be slightly cheaper, but it's not obvious.

In [378]:
# "Inside" -> Cheap, "CulDSac" -> Expensive; all remaining configurations stay Normal.
houses["LotConfig_Recode"] = np.select(
    [houses["LotConfig"] == "Inside", houses["LotConfig"] == "CulDSac"],
    ["Cheap", "Expensive"],
    default="Normal",
)

In [379]:
pd.crosstab(houses["LotConfig_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
LotConfig_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,674,378,1052
Expensive,32,62,94
Normal,190,124,314
Total,896,564,1460


In [380]:
pd.crosstab(houses["LandSlope"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
LandSlope,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Gtl,858,524,1382
Mod,32,33,65
Sev,6,7,13
Total,896,564,1460


Most of the houses have a gentle slope, so this variable is not going to be very discriminating.

In [381]:
# Gentle slope ("Gtl") is the Normal tier; any other slope value (or a missing one) is labelled Expensive.
houses["LandSlope_Recode"] = np.where(houses["LandSlope"] == "Gtl", "Normal", "Expensive")

In [382]:
pd.crosstab(houses["LandSlope_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
LandSlope_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Expensive,38,40,78
Normal,858,524,1382
Total,896,564,1460


In [383]:
pd.crosstab(houses["Neighborhood"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Blmngtn,7,10,17
Blueste,2,0,2
BrDale,16,0,16
BrkSide,51,7,58
ClearCr,7,21,28
CollgCr,52,98,150
Crawfor,21,30,51
Edwards,91,9,100
Gilbert,37,42,79
IDOTRR,37,0,37


Neighborhood is important, but it is to be used with a good knowledge of the city.

In [384]:
# Hand-picked price tiers for each neighborhood.
cheap_neighborhood = ["Blueste", "BrDale", "BrkSide", "Edwards", "IDOTRR", "MeadowV", "Mitchel", "NAmes", "NPkVill",
                     "OldTown", "SWISU", "Sawyer"]
medium_neighborhood = ["Blmngtn", "CollgCr", "Crawfor", "Gilbert", "NWAmes", "SawyerW"]
expensive_neighborhood = ["ClearCr", "NoRidge", "NridgHt", "Somerst", "StoneBr", "Timber", "Veenker"]

# Flatten the three lists into a single neighborhood -> tier lookup (the lists do not overlap).
neighborhood_tier = {
    name: tier
    for tier, names in (
        ("Cheap", cheap_neighborhood),
        ("Medium", medium_neighborhood),
        ("Expensive", expensive_neighborhood),
    )
    for name in names
}

# Neighborhoods missing from every list keep the "Unassigned" marker so they stay visible in the crosstab.
houses["Neighborhood_Recode"] = houses["Neighborhood"].map(neighborhood_tier).fillna("Unassigned")


In [385]:
pd.crosstab(houses["Neighborhood_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Neighborhood_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,668,57,725
Expensive,46,260,306
Medium,182,247,429
Total,896,564,1460


In [386]:
pd.crosstab(houses["Condition1"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Condition1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Artery,44,4,48
Feedr,68,13,81
Norm,747,513,1260
PosA,1,7,8
PosN,6,13,19
RRAe,11,0,11
RRAn,17,9,26
RRNe,0,2,2
RRNn,2,3,5
Total,896,564,1460


Most of the houses are normal, so not very discriminant

In [387]:
pd.crosstab(houses["Condition2"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Condition2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Artery,2,0,2
Feedr,6,0,6
Norm,885,560,1445
PosA,0,1,1
PosN,0,2,2
RRAe,0,1,1
RRAn,1,0,1
RRNn,2,0,2
Total,896,564,1460


Not enough data

In [388]:
pd.crosstab(houses["BldgType"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BldgType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1Fam,714,506,1220
2fmCon,29,2,31
Duplex,48,4,52
Twnhs,39,4,43
TwnhsE,66,48,114
Total,896,564,1460


Not discriminant

In [389]:
pd.crosstab(houses["HouseStyle"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
HouseStyle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.5Fin,130,24,154
1.5Unf,14,0,14
1Story,465,261,726
2.5Fin,3,5,8
2.5Unf,9,2,11
2Story,188,257,445
SFoyer,35,2,37
SLvl,52,13,65
Total,896,564,1460


2Story is generally more expensive than 1Story

In [390]:
# "2Story" and "2.5Fin" are flagged Expensive; every other style (or a missing value) is Cheap.
houses["HouseStyle_Recode"] = np.where(
    houses["HouseStyle"].isin(["2Story", "2.5Fin"]), "Expensive", "Cheap"
)

In [391]:
pd.crosstab(houses["HouseStyle_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
HouseStyle_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,705,302,1007
Expensive,191,262,453
Total,896,564,1460


In [392]:
pd.crosstab(houses["OverallQual"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
OverallQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,0,2
2,3,0,3
3,20,0,20
4,115,1,116
5,378,19,397
6,277,97,374
7,89,230,319
8,11,157,168
9,0,43,43
10,1,17,18


More quality = More expensive

In [393]:
pd.crosstab(houses["OverallCond"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
OverallCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,1
2,4,1,5
3,23,2,25
4,54,3,57
5,386,435,821
6,203,49,252
7,155,50,205
8,58,14,72
9,12,10,22
Total,896,564,1460


Results are a bit surprising. Low condition houses seem to be cheaper, but they are not very numerous. Less useful than the quality of the materials.

In [394]:
pd.crosstab(houses["YearBuilt"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
YearBuilt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1872,1,0,1
1875,1,0,1
1880,2,2,4
1882,1,0,1
1885,2,0,2
...,...,...,...
2007,8,41,49
2008,1,22,23
2009,2,16,18
2010,0,1,1


In [395]:
houses.groupby(houses["Expensive"])["YearBuilt"].mean()

Expensive
0    1958.851562
1    1990.992908
Name: YearBuilt, dtype: float64

Older houses are cheaper

In [396]:
houses.groupby(houses["Expensive"])["YearRemodAdd"].mean()

Expensive
0    1976.712054
1    1997.819149
Name: YearRemodAdd, dtype: float64

To explore further

In [397]:
pd.crosstab(houses["RoofStyle"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
RoofStyle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Flat,6,7,13
Gable,730,411,1141
Gambrel,7,4,11
Hip,149,137,286
Mansard,4,3,7
Shed,0,2,2
Total,896,564,1460


Might be a bit useful

In [398]:
pd.crosstab(houses["RoofMatl"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
RoofMatl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ClyTile,1,0,1
CompShg,886,548,1434
Membran,0,1,1
Metal,1,0,1
Roll,1,0,1
Tar&Grv,6,5,11
WdShake,0,5,5
WdShngl,1,5,6
Total,896,564,1460


Not useful

In [399]:
pd.crosstab(houses["Exterior1st"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Exterior1st,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AsbShng,20,0,20
AsphShn,1,0,1
BrkComm,2,0,2
BrkFace,28,22,50
CBlock,1,0,1
CemntBd,23,38,61
HdBoard,169,53,222
ImStucc,0,1,1
MetalSd,185,35,220
Plywood,69,39,108


To be used

In [400]:
# Exterior coverings flagged Expensive; anything else (or a missing value) is Cheap.
expensive_exteriors = ["CemntBd", "VinylSd", "Stone"]
houses["Exterior1st_Recode"] = np.where(
    houses["Exterior1st"].isin(expensive_exteriors), "Expensive", "Cheap"
)


In [401]:
pd.crosstab(houses["Exterior1st_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Exterior1st_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,681,201,882
Expensive,215,363,578
Total,896,564,1460


In [402]:
pd.crosstab(houses["Exterior2nd"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Exterior2nd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AsbShng,19,1,20
AsphShn,3,0,3
Brk Cmn,7,0,7
BrkFace,15,10,25
CBlock,1,0,1
CmentBd,22,38,60
HdBoard,151,56,207
ImStucc,5,5,10
MetalSd,180,34,214
Other,0,1,1


Mostly the same as Exterior1st

In [403]:
pd.crosstab(houses["MasVnrType"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
MasVnrType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BrkCmn,13,2,15
BrkFace,219,226,445
,639,225,864
Stone,25,103,128
Total,896,556,1452


No veneer is cheaper

In [404]:
# "BrkFace" -> Normal, "Stone" -> Expensive; every other veneer type, and rows where the
# type is missing, fall back to Cheap.
veneer_tier = {"BrkFace": "Normal", "Stone": "Expensive"}
houses["MasVnrType_Recode"] = houses["MasVnrType"].map(veneer_tier).fillna("Cheap")

In [405]:
pd.crosstab(houses["MasVnrType_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
MasVnrType_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,652,235,887
Expensive,25,103,128
Normal,219,226,445
Total,896,564,1460


In [406]:
houses.groupby(houses["Expensive"])["MasVnrArea"].mean()

Expensive
0     60.393973
1    173.449640
Name: MasVnrArea, dtype: float64

In [407]:
pd.crosstab(houses["ExterQual"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
ExterQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,3,49,52
Fa,13,1,14
Gd,109,379,488
TA,771,135,906
Total,896,564,1460


Good quality exterior means more expensive house

In [408]:
# Exterior quality "Gd" or "Ex" -> Expensive; all other grades -> Cheap.
houses["ExterQual_Recode"] = np.where(
    houses["ExterQual"].isin(["Gd", "Ex"]), "Expensive", "Cheap"
)

In [409]:
pd.crosstab(houses["ExterQual_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
ExterQual_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,784,136,920
Expensive,112,428,540
Total,896,564,1460


In [410]:
pd.crosstab(houses["ExterCond"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
ExterCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,2,1,3
Fa,28,0,28
Gd,104,42,146
Po,1,0,1
TA,761,521,1282
Total,896,564,1460


Not discriminant

In [411]:
pd.crosstab(houses["Foundation"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Foundation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BrkTil,128,18,146
CBlock,525,109,634
PConc,214,433,647
Slab,23,1,24
Stone,4,2,6
Wood,2,1,3
Total,896,564,1460


Has a significant impact

In [412]:
# Only "PConc" foundations are flagged Expensive; every other foundation type is Cheap.
houses["Foundation_Recode"] = np.where(houses["Foundation"] == "PConc", "Expensive", "Cheap")

In [413]:
pd.crosstab(houses["Foundation_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Foundation_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,682,131,813
Expensive,214,433,647
Total,896,564,1460


In [414]:
pd.crosstab(houses["BsmtQual"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,11,110,121
Fa,33,2,35
Gd,246,372,618
TA,570,79,649
Total,860,563,1423


Higher basement means more expensive. We have some missing values.

In [415]:
# "Gd" -> Normal, "Ex" -> Expensive; the remaining grades and rows with a missing BsmtQual -> Cheap.
houses["BsmtQual_Recode"] = np.select(
    [houses["BsmtQual"] == "Gd", houses["BsmtQual"] == "Ex"],
    ["Normal", "Expensive"],
    default="Cheap",
)

In [416]:
pd.crosstab(houses["BsmtQual_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtQual_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,639,82,721
Expensive,11,110,121
Normal,246,372,618
Total,896,564,1460


In [417]:
pd.crosstab(houses["BsmtCond"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fa,39,6,45
Gd,27,38,65
Po,2,0,2
TA,792,519,1311
Total,860,563,1423


Not discriminant

In [418]:
pd.crosstab(houses["BsmtExposure"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtExposure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Av,105,116,221
Gd,41,93,134
Mn,57,57,114
No,657,296,953
Total,860,562,1422


An exposure means slightly more expensive house

In [419]:
# "No" -> Cheap, "Gd" -> Expensive; everything else stays Normal.
# NOTE(review): rows with a missing BsmtExposure also end up as Normal here, whereas
# BsmtQual_Recode sends missing values to Cheap - confirm this asymmetry is intended.
exposure_tier = {"No": "Cheap", "Gd": "Expensive"}
houses["BsmtExposure_Recode"] = houses["BsmtExposure"].map(exposure_tier).fillna("Normal")

In [420]:
pd.crosstab(houses["BsmtExposure_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtExposure_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,657,296,953
Expensive,41,93,134
Normal,198,175,373
Total,896,564,1460


In [421]:
pd.crosstab(houses["BsmtFinType1"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtFinType1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALQ,168,52,220
BLQ,120,28,148
GLQ,127,291,418
LwQ,56,18,74
Rec,115,18,133
Unf,274,156,430
Total,860,563,1423


GLQ accounts for most of the expensive houses

In [422]:
# Only "GLQ" basements are flagged Expensive; every other finish type (or a missing value) is Cheap.
houses["BsmtFinType1_Recode"] = np.where(houses["BsmtFinType1"] == "GLQ", "Expensive", "Cheap")

In [423]:
pd.crosstab(houses["BsmtFinType1_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtFinType1_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,769,273,1042
Expensive,127,291,418
Total,896,564,1460


In [424]:
houses.groupby(houses["Expensive"])["BsmtFinSF1"].mean()

Expensive
0    360.664062
1    575.459220
Name: BsmtFinSF1, dtype: float64

More expensive houses have higher finished square feet

In [425]:
pd.crosstab(houses["BsmtFinType2"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtFinType2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALQ,11,8,19
BLQ,27,6,33
GLQ,6,8,14
LwQ,34,12,46
Rec,40,14,54
Unf,742,514,1256
Total,860,562,1422


Not useful

In [426]:
houses.groupby(houses["Expensive"])["BsmtFinSF2"].mean()

Expensive
0    48.357143
1    43.677305
Name: BsmtFinSF2, dtype: float64

Not useful because type 2 is often absent.

In [427]:
houses.groupby(houses["Expensive"])["BsmtUnfSF"].mean()

Expensive
0    481.475446
1    703.491135
Name: BsmtUnfSF, dtype: float64

Bigger means more expensive

In [428]:
houses.groupby(houses["Expensive"])["TotalBsmtSF"].mean()

Expensive
0     890.496652
1    1322.627660
Name: TotalBsmtSF, dtype: float64

Bigger means more expensive

In [429]:
pd.crosstab(houses["Heating"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Heating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Floor,1,0,1
GasA,870,558,1428
GasW,12,6,18
Grav,7,0,7
OthW,2,0,2
Wall,4,0,4
Total,896,564,1460


Not useful

In [430]:
pd.crosstab(houses["HeatingQC"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
HeatingQC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,298,443,741
Fa,41,8,49
Gd,187,54,241
Po,1,0,1
TA,369,59,428
Total,896,564,1460


The higher the quality and condition of heating, the more expensive the house

In [431]:
# "Gd" -> Normal, "Ex" -> Expensive; the remaining heating grades -> Cheap.
heating_tier = {"Gd": "Normal", "Ex": "Expensive"}
houses["HeatingQC_Recode"] = houses["HeatingQC"].map(heating_tier).fillna("Cheap")

In [432]:
pd.crosstab(houses["HeatingQC_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
HeatingQC_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,411,67,478
Expensive,298,443,741
Normal,187,54,241
Total,896,564,1460


In [433]:
pd.crosstab(houses["CentralAir"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
CentralAir,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,91,4,95
Y,805,560,1365
Total,896,564,1460


Most of the houses have central air conditioning, so it is not clear that it will have a big impact.

In [434]:
pd.crosstab(houses["Electrical"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Electrical,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FuseA,87,7,94
FuseF,27,0,27
FuseP,3,0,3
Mix,1,0,1
SBrkr,777,557,1334
Total,895,564,1459


Again, most of the houses are the same

In [435]:
# "SBrkr" is the Normal tier; every other electrical system, and the row with a missing value, is Cheap.
houses["Electrical_Recode"] = np.where(houses["Electrical"] == "SBrkr", "Normal", "Cheap")

In [436]:
pd.crosstab(houses["Electrical_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Electrical_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,119,7,126
Normal,777,557,1334
Total,896,564,1460


In [437]:
houses.groupby(houses["Expensive"])["1stFlrSF"].mean()

Expensive
0    1018.421875
1    1391.718085
Name: 1stFlrSF, dtype: float64

Stating the obvious here, a bigger first floor makes the house more expensive

In [438]:
houses.groupby(houses["Expensive"])["2ndFlrSF"].mean()

Expensive
0    244.120536
1    510.420213
Name: 2ndFlrSF, dtype: float64

Same for second floor

In [439]:
houses.groupby(houses["Expensive"])["LowQualFinSF"].mean()

Expensive
0    6.466518
1    4.856383
Name: LowQualFinSF, dtype: float64

This seems to concern few houses

In [440]:
houses.groupby(houses["Expensive"])["GrLivArea"].mean()

Expensive
0    1269.008929
1    1906.994681
Name: GrLivArea, dtype: float64

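This is just the sum of the first-floor, second-floor and low-quality finished areas (the group means above add up to these values), so it overlaps with those variables.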

In [441]:
pd.crosstab(houses["BsmtFullBath"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtFullBath,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,584,272,856
1,304,284,588
2,7,8,15
3,1,0,1
Total,896,564,1460


One basement bathroom makes the house slightly more expensive

In [442]:
pd.crosstab(houses["BsmtHalfBath"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BsmtHalfBath,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,834,544,1378
1,61,19,80
2,1,1,2
Total,896,564,1460


Not useful

In [443]:
pd.crosstab(houses["FullBath"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
FullBath,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6,3,9
1,603,47,650
2,283,485,768
3,4,29,33
Total,896,564,1460


2 bathrooms makes the house more expensive

In [444]:
pd.crosstab(houses["HalfBath"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
HalfBath,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,652,261,913
1,236,299,535
2,8,4,12
Total,896,564,1460


One half bathroom is a nice addition, but doesn't that just indicate the presence of a second floor?

In [445]:
pd.crosstab(houses["BedroomAbvGr"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
BedroomAbvGr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3,3,6
1,32,18,50
2,264,94,358
3,484,320,804
4,95,118,213
5,12,9,21
6,6,1,7
8,0,1,1
Total,896,564,1460


More bedrooms is more expensive

In [446]:
pd.crosstab(houses["KitchenAbvGr"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
KitchenAbvGr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0,1
1,834,558,1392
2,59,6,65
3,2,0,2
Total,896,564,1460


Not useful

In [447]:
pd.crosstab(houses["KitchenQual"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
KitchenQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,11,89,100
Fa,38,1,39
Gd,198,388,586
TA,649,86,735
Total,896,564,1460


A good quality kitchen makes the house more expensive

In [448]:
# "Gd" -> Expensive, "Ex" -> Very expensive; the remaining grades stay Normal.
# NOTE(review): this label set (Normal / Expensive / Very expensive) differs from the
# Cheap / Normal / Expensive labels used by the other *_Recode columns - confirm it is intended.
kitchen_tier = {"Gd": "Expensive", "Ex": "Very expensive"}
houses["KitchenQual_Recode"] = houses["KitchenQual"].map(kitchen_tier).fillna("Normal")

In [449]:
pd.crosstab(houses["KitchenQual_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
KitchenQual_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Expensive,198,388,586
Normal,687,87,774
Very expensive,11,89,100
Total,896,564,1460


In [450]:
pd.crosstab(houses["TotRmsAbvGrd"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
TotRmsAbvGrd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,1,0,1
3,17,0,17
4,89,8,97
5,237,38,275
6,293,109,402
7,152,177,329
8,71,116,187
9,16,59,75
10,14,33,47
11,3,15,18


Very useful information

In [451]:
pd.crosstab(houses["Functional"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Functional,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Maj1,10,4,14
Maj2,5,0,5
Min1,27,4,31
Min2,31,3,34
Mod,10,5,15
Sev,1,0,1
Typ,812,548,1360
Total,896,564,1460


Most of the houses are the same, so not very useful

In [452]:
pd.crosstab(houses["Fireplaces"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Fireplaces,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,566,124,690
1,283,367,650
2,46,69,115
3,1,4,5
Total,896,564,1460


The presence of a fireplace makes the house more expensive

In [453]:
pd.crosstab(houses["FireplaceQu"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
FireplaceQu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,1,23,24
Fa,23,10,33
Gd,146,234,380
Po,20,0,20
TA,140,173,313
Total,330,440,770


The value is missing when the house has no fireplace; it should be treated as its own "NA" (no fireplace) category. Ex and Gd quality make a house even more expensive, though the difference is not that large.

In [454]:
pd.crosstab(houses["GarageType"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
GarageType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2Types,5,1,6
Attchd,410,460,870
Basment,15,4,19
BuiltIn,22,66,88
CarPort,9,0,9
Detchd,356,31,387
Total,817,562,1379


Very useful information

In [455]:
# "Attchd" -> Normal, "BuiltIn" -> Expensive; every other garage type, and rows with a
# missing GarageType, -> Cheap.
houses["GarageType_Recode"] = np.select(
    [houses["GarageType"] == "Attchd", houses["GarageType"] == "BuiltIn"],
    ["Normal", "Expensive"],
    default="Cheap",
)

In [456]:
pd.crosstab(houses["GarageType_Recode"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
GarageType_Recode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cheap,464,38,502
Expensive,22,66,88
Normal,410,460,870
Total,896,564,1460


In [457]:
houses.groupby(houses["Expensive"])["GarageYrBlt"].mean()

Expensive
0    1968.614443
1    1992.886121
Name: GarageYrBlt, dtype: float64

A more recent garage means a more expensive house. Probably too correlated to when the house was built. To be investigated further.

In [458]:
pd.crosstab(houses["GarageFinish"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
GarageFinish,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fin,105,247,352
RFn,178,244,422
Unf,534,71,605
Total,817,562,1379


Very useful information

In [459]:
pd.crosstab(houses["GarageCars"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
GarageCars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,79,2,81
1,355,14,369
2,445,379,824
3,15,166,181
4,2,3,5
Total,896,564,1460


The more cars a garage can fit, the more expensive the house

In [460]:
houses.groupby(houses["Expensive"])["GarageArea"].mean()

Expensive
0    385.017857
1    612.721631
Name: GarageArea, dtype: float64

Bigger garage = more expensive

In [461]:
pd.crosstab(houses["GarageQual"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
GarageQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,2,1,3
Fa,44,4,48
Gd,6,8,14
Po,3,0,3
TA,762,549,1311
Total,817,562,1379


Not useful

In [462]:
pd.crosstab(houses["GarageCond"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
GarageCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,2,0,2
Fa,34,1,35
Gd,7,2,9
Po,7,0,7
TA,767,559,1326
Total,817,562,1379


Not useful

In [463]:
pd.crosstab(houses["PavedDrive"], houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
PavedDrive,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,80,10,90
P,29,1,30
Y,787,553,1340
Total,896,564,1460


Most of the houses are the same

In [464]:
# Average WoodDeckSF for each value of the Expensive flag
houses.groupby("Expensive")["WoodDeckSF"].mean()

Expensive
0     66.685268
1    138.026596
Name: WoodDeckSF, dtype: float64

Significant impact

In [465]:
# Average OpenPorchSF for each value of the Expensive flag
houses.groupby("Expensive")["OpenPorchSF"].mean()

Expensive
0    29.714286
1    73.581560
Name: OpenPorchSF, dtype: float64

Significant impact

In [466]:
# Average EnclosedPorch for each value of the Expensive flag
houses.groupby("Expensive")["EnclosedPorch"].mean()

Expensive
0    27.654018
1    12.898936
Name: EnclosedPorch, dtype: float64

Here it is the less expensive houses that have the larger enclosed porch area on average.

In [467]:
# Average 3SsnPorch for each value of the Expensive flag
houses.groupby("Expensive")["3SsnPorch"].mean()

Expensive
0    1.802455
1    5.962766
Name: 3SsnPorch, dtype: float64

Significant impact

In [468]:
# Average ScreenPorch for each value of the Expensive flag
houses.groupby("Expensive")["ScreenPorch"].mean()

Expensive
0    12.386161
1    19.310284
Name: ScreenPorch, dtype: float64

Useful

In [469]:
# Average PoolArea for each value of the Expensive flag
houses.groupby("Expensive")["PoolArea"].mean()

Expensive
0    1.178571
1    5.269504
Name: PoolArea, dtype: float64

In [470]:
# PoolQC level counts split by the Expensive flag, with row/column totals
pd.crosstab(index=houses["PoolQC"], columns=houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
PoolQC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,0,2,2
Fa,0,2,2
Gd,2,1,3
Total,2,5,7


Only 7 houses have a pool, so we will not use this information. We might even exclude houses with a pool if their prices turn out to be too different.

In [471]:
# Fence level counts split by the Expensive flag, with row/column totals
pd.crosstab(index=houses["Fence"], columns=houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
Fence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GdPrv,40,19,59
GdWo,49,5,54
MnPrv,138,19,157
MnWw,10,1,11
Total,237,44,281


Too many missing values

In [472]:
# MiscFeature level counts split by the Expensive flag, with row/column totals
pd.crosstab(index=houses["MiscFeature"], columns=houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
MiscFeature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Gar2,1,1,2
Othr,2,0,2
Shed,37,12,49
TenC,0,1,1
Total,40,14,54


Most of the time no miscellaneous feature

In [473]:
# Average MiscVal for each value of the Expensive flag
houses.groupby("Expensive")["MiscVal"].mean()

Expensive
0    48.341518
1    35.780142
Name: MiscVal, dtype: float64

Not relevant

In [474]:
# MoSold counts split by the Expensive flag, with row/column totals
pd.crosstab(index=houses["MoSold"], columns=houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
MoSold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,37,21,58
2,29,23,52
3,69,37,106
4,95,46,141
5,132,72,204
6,160,93,253
7,144,90,234
8,68,54,122
9,31,32,63
10,57,32,89


It seems risky to link the price of a house with the month in which it is sold

In [475]:
# YrSold counts split by the Expensive flag, with row/column totals
pd.crosstab(index=houses["YrSold"], columns=houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
YrSold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006,194,120,314
2007,197,132,329
2008,189,115,304
2009,207,131,338
2010,109,66,175
Total,896,564,1460


Maybe an inflation factor to take into account. We might try to build different models per year to see if there is any difference.

In [476]:
# Average SalePrice for each sale year
houses.groupby("YrSold")["SalePrice"].mean()

YrSold
2006    182549.458599
2007    186063.151976
2008    177360.838816
2009    179432.103550
2010    177393.674286
Name: SalePrice, dtype: float64

The mean sale price drops in 2008 (from about 186k in 2007 to about 177k) and stays at that lower level through 2010; we will have to take this into account.

In [477]:
# SaleType level counts split by the Expensive flag, with row/column totals
pd.crosstab(index=houses["SaleType"], columns=houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
SaleType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
COD,35,8,43
CWD,2,2,4
Con,0,2,2
ConLD,7,2,9
ConLI,3,2,5
ConLw,3,2,5
New,23,99,122
Oth,3,0,3
WD,820,447,1267
Total,896,564,1460


A new home is more expensive

In [478]:
# SaleCondition level counts split by the Expensive flag, with row/column totals
pd.crosstab(index=houses["SaleCondition"], columns=houses["Expensive"], margins=True, margins_name="Total")

Expensive,0,1,Total
SaleCondition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abnorml,81,20,101
AdjLand,4,0,4
Alloca,8,4,12
Family,16,4,20
Normal,762,436,1198
Partial,25,100,125
Total,896,564,1460


Partial probably means the house is new. Let's check it.

In [479]:
# Cross-tabulate SaleCondition against SaleType to see how the two variables overlap
pd.crosstab(index=houses["SaleCondition"], columns=houses["SaleType"], margins=True, margins_name="Total")

SaleType,COD,CWD,Con,ConLD,ConLI,ConLw,New,Oth,WD,Total
SaleCondition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Abnorml,24,1,0,2,1,0,0,3,70,101
AdjLand,0,0,0,0,0,0,0,0,4,4
Alloca,0,0,0,0,0,0,0,0,12,12
Family,0,1,0,0,0,0,0,0,19,20
Normal,19,2,2,6,4,5,0,0,1160,1198
Partial,0,0,0,1,0,0,122,0,2,125
Total,43,4,2,9,5,5,122,3,1267,1460


Our hypothesis was correct

# FILL THE MISSING VALUES

In [480]:
# Categorical columns whose NaN is replaced by an explicit placeholder level instead of
# being imputed with the column mode by the generic fill that follows.
# NOTE(review): presumably NaN in these columns means the house has no fireplace /
# basement / garage — confirm against the data description.
placeholder_fill = {
    'FireplaceQu': "No",
    'BsmtQual': "No",
    'BsmtCond': "No",
    'BsmtFinType1': "No",
    # A second fill of this column with "None" used to follow; it could never match
    # anything because no NaN is left after the "No" fill, so it has been removed.
    'BsmtFinType2': "No",
    'GarageType': "NA",
}

# A single frame-level fillna replaces the chained `houses[col].fillna(..., inplace=True)`
# calls, which operate on a temporary Series and do not update the frame under pandas
# Copy-on-Write.
houses = houses.fillna(value=placeholder_fill)

def fill_all_missing_values(data):
    """Fill every remaining NaN in ``data``, modifying the frame in place.

    Columns with a NumPy numeric dtype (any integer or float width) are filled
    with the column mean. Every other column is filled with its most frequent
    value (the first entry returned by ``Series.mode``).

    Columns without missing values are left untouched, and a non-numeric
    column that contains only missing values is skipped because it has no
    mode to fill with.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. Modified in place.

    Returns
    -------
    None
    """
    for col in data.columns:
        column = data[col]
        if not column.isna().any():
            continue

        # Match every NumPy numeric width, not only the literal 'float64'/'int64';
        # extension dtypes keep going through the mode branch as before.
        is_numeric = isinstance(column.dtype, np.dtype) and np.issubdtype(column.dtype, np.number)
        if is_numeric:
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # All values are missing: there is nothing sensible to fill with.
                continue
            fill_value = modes.iloc[0]

        # Assign the filled column back instead of calling fillna(inplace=True) on
        # `data[col]`, which may act on a temporary copy and leave `data` unchanged.
        data[col] = column.fillna(fill_value)


# Impute every NaN still left in `houses` after the targeted fills above
# (the function modifies the frame it is given and returns nothing).
fill_all_missing_values(houses)

# DROP UNWANTED DATA

# Columns considered not worth keeping.
# NOTE(review): 'FireplaceQu' is listed here but is still selected in `vars_to_keep`
# and ordinally encoded further down — dropping it from `houses` would break those cells.
drop_col = ['Id', 'Alley', 'PoolQC', 'MiscFeature', 'Fence', 'MoSold', 'YrSold',
            'LandSlope',
            'LowQualFinSF', 'Condition1', 'Condition2', 'Heating',
             'Exterior2nd', 'Functional',
             'FireplaceQu', 'GarageQual', 'GarageCond', 'OverallCond'
           ]

# NOTE(review): `train_data` is not defined in the visible part of this notebook; every
# other cell works on `houses`. This snippet also has no execution prompt, so it was
# presumably never run — confirm whether it should be removed or retargeted.
train_data.drop(drop_col, axis=1, inplace=True)

# SELECTING RELEVANT VARIABLES

In [481]:
# Feature columns fed to the model. The order is significant: it becomes the column
# order of the feature matrix built from this list.
vars_to_keep = [
    # Lot and location
    "MSSubClass_Recode", "MSZoning_Recode", "LotFrontage", "LotArea",
    "LotShape_Recode", "LandContour_Recode", "LotConfig_Recode", "Neighborhood_Recode",
    # Building style, quality and age
    "HouseStyle_Recode", "OverallQual", "YearBuilt", "YearRemodAdd",
    # Exterior and foundation
    "Exterior1st_Recode", "MasVnrType_Recode", "MasVnrArea", "ExterQual_Recode",
    "Foundation_Recode",
    # Basement
    "BsmtQual_Recode", "BsmtExposure_Recode", "BsmtFinType1_Recode",
    "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF",
    # Heating, air conditioning, electrical
    "HeatingQC_Recode", "CentralAir", "Electrical_Recode",
    # Living area
    "1stFlrSF", "2ndFlrSF", "GrLivArea",
    # Bathrooms, bedrooms, kitchen, rooms
    "BsmtFullBath", "FullBath", "HalfBath",
    "BedroomAbvGr", "KitchenQual_Recode", "TotRmsAbvGrd",
    # Fireplaces
    "Fireplaces", "FireplaceQu",
    # Garage
    "GarageType_Recode", "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea",
    # Deck and porches
    "WoodDeckSF", "OpenPorchSF", "3SsnPorch", "ScreenPorch",
]

# PREPROCESSING

In [482]:
from sklearn.preprocessing import OrdinalEncoder

# Ordered levels (lowest code first) for every column that is ordinally encoded.
# Keys are listed in the order in which the columns are encoded.
ordinal_categories = {
    "MSSubClass_Recode": ['Cheap', 'Normal', 'Expensive'],
    "MSZoning_Recode": ['Cheap', 'Normal', 'Expensive'],
    "LotShape_Recode": ['Normal', 'Expensive'],
    "LandContour_Recode": ['Cheap', 'Normal', 'Expensive'],
    "LotConfig_Recode": ['Cheap', 'Normal', 'Expensive'],
    "Neighborhood_Recode": ['Cheap', 'Medium', 'Expensive'],
    "HouseStyle_Recode": ['Cheap', 'Expensive'],
    "Exterior1st_Recode": ['Cheap', 'Expensive'],
    "MasVnrType_Recode": ['Cheap', 'Normal', 'Expensive'],
    "ExterQual_Recode": ['Cheap', 'Expensive'],
    "Foundation_Recode": ['Cheap', 'Expensive'],
    "BsmtQual_Recode": ['Cheap', 'Normal', 'Expensive'],
    "BsmtExposure_Recode": ['Cheap', 'Normal', 'Expensive'],
    "BsmtFinType1_Recode": ['Cheap', 'Expensive'],
    "HeatingQC_Recode": ['Cheap', 'Normal', 'Expensive'],
    "CentralAir": ['N', 'Y'],
    "Electrical_Recode": ['Cheap', 'Normal'],
    "KitchenQual_Recode": ['Normal', 'Expensive', 'Very expensive'],
    "FireplaceQu": ['No', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    "GarageType_Recode": ['Cheap', 'Normal', 'Expensive'],
    "GarageFinish": ['Unf', 'RFn', 'Fin'],
}

# Names of the ordinally encoded columns, derived from the mapping so the two cannot
# drift apart (the hand-written list spelled "FireplaceQu" as "FirePlaceQu").
ordinal_col = list(ordinal_categories)

for col, levels in ordinal_categories.items():
    # Passing the levels explicitly makes the codes follow the intended ranking rather
    # than alphabetical order; with the encoder's default settings, fitting fails on a
    # value that is not in `levels`.
    OE = OrdinalEncoder(categories=[levels])
    # fit_transform returns an (n_rows, 1) array; flatten it to a plain column.
    houses[col] = OE.fit_transform(houses[[col]]).ravel()




In [353]:
# NOTE(review): stale cell — its execution count (353) predates the encoding cell, and the
# Y/N counts shown were taken before CentralAir was ordinally encoded; re-run or remove.
houses["CentralAir"].value_counts()

Y    1365
N      95
Name: CentralAir, dtype: int64

In [483]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Feature matrix restricted to the selected variables; the target is the sale price.
X = houses[vars_to_keep]
y = houses["SalePrice"]

# Hold out 30% of the rows for evaluation, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=24
)

# Depth-limited random forest; fit() returns the estimator, so construction and
# fitting are chained.
random_for_reg = RandomForestRegressor(
    max_depth=3, n_jobs=-1, random_state=32
).fit(X_train, y_train)

# Score on the held-out rows.
random_regressor_score = random_for_reg.score(X_test, y_test)

In [484]:
# Value returned by random_for_reg.score on the 30% hold-out split
# (R² for scikit-learn regressors).
random_regressor_score

0.7347900619407632