###### Importing libraries:

In [271]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

###### Reading in both test and train files

In [126]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [127]:
# Check data frame sizes:
train.shape  # 1460 by 81

(1460, 81)

In [128]:
test.shape   # 1459 by 80 - why only 80 columns?

(1459, 80)

In [129]:
# saving sample size as N for train and N2 for test:
N = train.shape[0]
N

1460

In [130]:
N2 = test.shape[0]
N2

1459

In [131]:
# Because test, unlike train, doesn't have the very last variable - SalePrice
# list(train)
# list(test)

In [132]:
# Checking column types:
# train.dtypes   # 81 columns

In [133]:
# test.dtypes   # 80 columns

#### MISSING DATA

In [134]:
# How many missings in each column?
misscounts = np.sum(train.isnull(), axis=0)/N*100
print(misscounts[40:].round(2))
# LotFrontage      17.7%
# Alley            93.8% - not really missings, NA = no alley access to property
# MasVnrType        0.5% (8 houses)
# MasVnrArea        0.5% (8 houses)
# BsmtQual          2.5%
# BsmtCond          2.5%
# BsmtExposure      2.6%
# BsmtFinType1      2.5%
# BsmtFinType2      2.6%
# Electrical        0.1% (1 house)
# FireplaceQu      47.3%
# GarageType        5.5%
# GarageYrBlt       5.5%
# GarageFinish      5.5%
# GarageQual        5.5%
# GarageCond        5.5%
# PoolQC           99.5%
# Fence            80.8%
# MiscFeature      96.3%

HeatingQC         0.00
CentralAir        0.00
Electrical        0.07
1stFlrSF          0.00
2ndFlrSF          0.00
LowQualFinSF      0.00
GrLivArea         0.00
BsmtFullBath      0.00
BsmtHalfBath      0.00
FullBath          0.00
HalfBath          0.00
BedroomAbvGr      0.00
KitchenAbvGr      0.00
KitchenQual       0.00
TotRmsAbvGrd      0.00
Functional        0.00
Fireplaces        0.00
FireplaceQu      47.26
GarageType        5.55
GarageYrBlt       5.55
GarageFinish      5.55
GarageCars        0.00
GarageArea        0.00
GarageQual        5.55
GarageCond        5.55
PavedDrive        0.00
WoodDeckSF        0.00
OpenPorchSF       0.00
EnclosedPorch     0.00
3SsnPorch         0.00
ScreenPorch       0.00
PoolArea          0.00
PoolQC           99.52
Fence            80.75
MiscFeature      96.30
MiscVal           0.00
MoSold            0.00
YrSold            0.00
SaleType          0.00
SaleCondition     0.00
SalePrice         0.00
dtype: float64


In [135]:
misscounts2 = np.sum(test.isnull(), axis=0)/N2*100
print(misscounts2[:40].round(2))
# In addition to the above, has missings in:
# MSZoning          0.27% - 4 houses
# Utilities         0.14% - 2 houses
# Exterior1st       0.07% - 1 house
# Exterior2nd       0.07%
# BsmtFinSF1        0.07%
# BsmtFinSF2        0.07%
# BsmtUnfSF         0.07%
# TotalBsmtSF       0.07%
# BsmtFullBath      0.14%
# BsmtHalfBath      0.14%
# KitchenQual       0.07%
# Functional        0.14%
# GarageCars        0.07%
# GarageArea        0.07%
# SaleType          0.07%

Id               0.00
MSSubClass       0.00
MSZoning         0.27
LotFrontage     15.56
LotArea          0.00
Street           0.00
Alley           92.67
LotShape         0.00
LandContour      0.00
Utilities        0.14
LotConfig        0.00
LandSlope        0.00
Neighborhood     0.00
Condition1       0.00
Condition2       0.00
BldgType         0.00
HouseStyle       0.00
OverallQual      0.00
OverallCond      0.00
YearBuilt        0.00
YearRemodAdd     0.00
RoofStyle        0.00
RoofMatl         0.00
Exterior1st      0.07
Exterior2nd      0.07
MasVnrType       1.10
MasVnrArea       1.03
ExterQual        0.00
ExterCond        0.00
Foundation       0.00
BsmtQual         3.02
BsmtCond         3.08
BsmtExposure     3.02
BsmtFinType1     2.88
BsmtFinSF1       0.07
BsmtFinType2     2.88
BsmtFinSF2       0.07
BsmtUnfSF        0.07
TotalBsmtSF      0.07
Heating          0.00
dtype: float64


#### Checking number of levels for categorical predictors

###### MSSubClass: The building class (originally integer):

Built dummies forthe levels with the highest incidence

In [136]:
# Observations in each level of MSSubClass (% of total):
# (train.MSSubClass.value_counts().sort_values(ascending = False)/N*100).round(2)

In [137]:
# test dataset has one extra value (150) with only one observation:
# (test.MSSubClass.value_counts().sort_values(ascending = False)/N*100).round(2)

In [138]:
# Build dummies for some classes (with somewhat decent incidence levels):
myclasses = [20, 60, 50, 120, 30, 70, 160, 80, 90, 190, 85]

In [139]:
for cl in myclasses:
    forname = 'class_' + str(cl)
    train[forname] = 0
    test[forname] = 0
    train.loc[train.MSSubClass == cl, forname] = 1
    test.loc[test.MSSubClass == cl, forname] = 1

In [140]:
pd.crosstab(train.class_190, train.MSSubClass)

MSSubClass,20,30,40,45,50,60,70,75,80,85,90,120,160,180,190
class_190,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,536,69,4,12,144,299,60,16,58,20,52,87,63,10,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,30


In [141]:
pd.crosstab(test.class_160, test.MSSubClass)

MSSubClass,20,30,40,45,50,60,70,75,80,85,90,120,150,160,180,190
class_160,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,543,70,2,6,143,276,68,7,60,28,57,95,1,0,7,31
1,0,0,0,0,0,0,0,0,0,0,0,0,0,65,0,0


In [142]:
print(train.shape)
print(test.shape)

(1460, 92)
(1459, 91)


###### MSZoning: The general zoning classification (currently object)

Built dummies for levels RL (Residential Low Density), RM (Residential Medium Density), FV (Floating Village Residential)?

In [143]:
# Observations in each level of MSZoning (%):
# (train.MSZoning.value_counts().sort_values(ascending = False)/N*100).round(2)

In [144]:
# Variable MSZoning
# (test.MSZoning.value_counts().sort_values(ascending = False)/N*100).round(2)

In [145]:
# Creating dummies based on MSZoning and dropping two of them (with too few observations):
temp = pd.get_dummies(train.MSZoning, prefix = "zone", drop_first = True)
temp.drop('zone_RH', axis=1, inplace = True)
train = pd.concat([train, temp], axis = 1)

In [146]:
temp = pd.get_dummies(test.MSZoning, prefix = "zone", drop_first = True)
temp.drop('zone_RH', axis=1, inplace = True)
test = pd.concat([test, temp], axis = 1)

In [147]:
print(train.shape)
print(test.shape)

(1460, 95)
(1459, 94)


###### Street: Type of road access to property - IGNORE THIS VAR!

99.6% are Paved and 0.41 (only 6 houses) have unpaved street
LETS NOT USE THIS VARIABLE


In [148]:
# Observations in each level of Street (%):
(train.Street.value_counts().sort_values(ascending = False)/N*100).round(2)

Pave    99.59
Grvl     0.41
Name: Street, dtype: float64

In [149]:
(test.Street.value_counts().sort_values(ascending = False)/N2*100).round(2)

Pave    99.59
Grvl     0.41
Name: Street, dtype: float64

###### Alley: Type of alley access to property - replaced missings with 'None'!

And created 2 dummies - alley_paved & alley_gravel

In [150]:
# Observations in each level of Alley (%):
(train.Alley.value_counts().sort_values(ascending = False)/N*100).round(2)

Grvl    3.42
Pave    2.81
Name: Alley, dtype: float64

In [151]:
(test.Alley.value_counts().sort_values(ascending = False)/N2*100).round(2)

Grvl    4.80
Pave    2.54
Name: Alley, dtype: float64

In [152]:
# Replacing NAs with 'None' in both data frames:
train['Alley'].fillna('None', inplace=True)
test['Alley'].fillna('None', inplace=True)

In [153]:
# Build a dummy for Gravel and for Paved Alley
train['alley_gravel'] = 0
train.loc[train.Alley == 'Grvl', 'alley_gravel'] = 1
train['alley_paved'] = 0
train.loc[train.Alley == 'Pave', 'alley_paved'] = 1

In [154]:
test['alley_gravel'] = 0
test.loc[test.Alley == 'Grvl', 'alley_gravel'] = 1
test['alley_paved'] = 0
test.loc[test.Alley == 'Pave', 'alley_paved'] = 1

In [155]:
print(train.shape)
print(test.shape)

(1460, 97)
(1459, 96)


In [156]:
# pd.crosstab(train.Alley, train.alley_gravel)  # It worked

In [157]:
# pd.crosstab(train.Alley, train.alley_paved)  # It worked

###### LotShape: General shape of property

Build only one dummy - LotShapeReg - for Regular lot size (63%)

In [158]:
# Observations in each level of LotShape (%):
# (train.LotShape.value_counts().sort_values(ascending = False)/N*100).round(2)

In [159]:
# (test.LotShape.value_counts().sort_values(ascending = False)/N*100).round(2)

In [160]:
# Creating new variable - dummy for LotShape = 'Reg'
train['LotShapeReg'] = 0
train.loc[train.LotShape=='Reg', 'LotShapeReg'] = 1
pd.crosstab(train.LotShape, train.LotShapeReg)  # It worked

LotShapeReg,0,1
LotShape,Unnamed: 1_level_1,Unnamed: 2_level_1
IR1,484,0
IR2,41,0
IR3,10,0
Reg,0,925


In [161]:
test['LotShapeReg'] = 0
test.loc[test.LotShape=='Reg', 'LotShapeReg'] = 1
pd.crosstab(test.LotShape, test.LotShapeReg)  # It worked

LotShapeReg,0,1
LotShape,Unnamed: 1_level_1,Unnamed: 2_level_1
IR1,484,0
IR2,35,0
IR3,6,0
Reg,0,934


In [162]:
print(train.shape)
print(test.shape)

(1460, 98)
(1459, 97)


###### LandContour: Flatness of the property

Created 3 dummies for 3 types of countour

In [163]:
# Observations in each level of LandContour (%):
# (train.LandContour.value_counts().sort_values(ascending = False)/N*100).round(2)

In [164]:
# (test.LandContour.value_counts().sort_values(ascending = False)/N*100).round(2)

In [165]:
# Creating dummies based on LandCountour and dropping one of them (with few observations):
temp = pd.get_dummies(train.LandContour, prefix = "slope")
temp.drop('slope_Low', axis=1, inplace = True)
train = pd.concat([train, temp], axis = 1)

In [166]:
temp = pd.get_dummies(test.LandContour, prefix = "slope")
temp.drop('slope_Low', axis=1, inplace = True)
test = pd.concat([test, temp], axis = 1)

In [167]:
print(train.shape)
print(test.shape)

(1460, 101)
(1459, 100)


###### Utilities: Type of utilities available - IGNORE THIS VAR! (no variance)

In [168]:
# Observations in each level of Utilities (%):
(train.Utilities.value_counts().sort_values(ascending = False)/N*100).round(2)

AllPub    99.93
NoSeWa     0.07
Name: Utilities, dtype: float64

###### LotConfig: Lot configuration

Created 3 dummy variables

In [169]:
# Observations in each level of LotConfig (%):
# (train.LotConfig.value_counts().sort_values(ascending = False)/N*100).round(2)

In [170]:
# (test.LotConfig.value_counts().sort_values(ascending = False)/N*100).round(2)

In [171]:
# Creating dummies based on LotConfig and dropping some of them (with few observations):
temp = pd.get_dummies(train.LotConfig, prefix = "lotconfig")
temp.drop('lotconfig_FR2', axis=1, inplace = True)
temp.drop('lotconfig_FR3', axis=1, inplace = True)
train = pd.concat([train, temp], axis = 1)

In [172]:
temp = pd.get_dummies(test.LotConfig, prefix = "lotconfig")
temp.drop('lotconfig_FR2', axis=1, inplace = True)
temp.drop('lotconfig_FR3', axis=1, inplace = True)
test = pd.concat([test, temp], axis = 1)

In [173]:
print(train.shape)
print(test.shape)

(1460, 104)
(1459, 103)


###### LandSlope: Slope of property

Created a new dummy: LandSlope_Gentle

In [174]:
# Observations in each level of LandSlope (%):
# (train.LandSlope.value_counts().sort_values(ascending = False)/N*100).round(2)

In [175]:
# (test.LandSlope.value_counts().sort_values(ascending = False)/N*100).round(2)

In [176]:
# Creating new variable - dummy for LandSlope = 'Gtl'
train['LandSlope_Gentle'] = 0
train.loc[train.LandSlope == 'Gtl', 'LandSlope_Gentle'] = 1
pd.crosstab(train.LandSlope, train.LandSlope_Gentle)  # It worked

LandSlope_Gentle,0,1
LandSlope,Unnamed: 1_level_1,Unnamed: 2_level_1
Gtl,0,1382
Mod,65,0
Sev,13,0


In [177]:
test['LandSlope_Gentle'] = 0
test.loc[test.LandSlope == 'Gtl', 'LandSlope_Gentle'] = 1
pd.crosstab(test.LandSlope, test.LandSlope_Gentle)  # It worked

LandSlope_Gentle,0,1
LandSlope,Unnamed: 1_level_1,Unnamed: 2_level_1
Gtl,0,1396
Mod,60,0
Sev,3,0


In [178]:
print(train.shape)
print(test.shape)

(1460, 105)
(1459, 104)


###### Neighborhood: Physical locations within Ames city limits - bilt dummies for most

In [179]:
# Observations in each level of Neighborhood (%):
# (train.Neighborhood.value_counts().sort_values(ascending = False)/N*100).round(2)

In [180]:
# (test.Neighborhood.value_counts().sort_values(ascending = False)/N*100).round(2)

In [181]:
temp = pd.get_dummies(train.Neighborhood, prefix = "nbh")
temp.drop('nbh_Blueste', axis=1, inplace = True)
temp.drop('nbh_NPkVill', axis=1, inplace = True)
temp.drop('nbh_Veenker', axis=1, inplace = True)
temp.drop('nbh_BrDale', axis=1, inplace = True)
temp.drop('nbh_MeadowV', axis=1, inplace = True)
temp.drop('nbh_Blmngtn', axis=1, inplace = True)
train = pd.concat([train, temp], axis = 1)

In [182]:
# Creating dummies and dropping some:
temp = pd.get_dummies(test.Neighborhood, prefix = "nbh")
temp.drop('nbh_Blueste', axis=1, inplace = True)
temp.drop('nbh_NPkVill', axis=1, inplace = True)
temp.drop('nbh_Veenker', axis=1, inplace = True)
temp.drop('nbh_BrDale', axis=1, inplace = True)
temp.drop('nbh_MeadowV', axis=1, inplace = True)
temp.drop('nbh_Blmngtn', axis=1, inplace = True)
test = pd.concat([test, temp], axis = 1)

In [183]:
pd.crosstab(train.nbh_Timber,train.Neighborhood)

Neighborhood,Blmngtn,Blueste,BrDale,BrkSide,ClearCr,CollgCr,Crawfor,Edwards,Gilbert,IDOTRR,...,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker
nbh_Timber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,17,2,16,58,28,150,51,100,79,37,...,41,77,113,25,74,59,86,25,0,11
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,38,0


In [184]:
print(train.shape)
print(test.shape)

(1460, 124)
(1459, 123)


###### Condition1 & Condition2: Proximity to various conditions


Created 5 dummies that start with 'prox_'

In [185]:
# Observations in each level of Condition1 (%):
# (train.Condition1.value_counts().sort_values(ascending = False)/N*100).round(2)

In [186]:
# Observations in each level of Condition2 (%):
# (train.Condition2.value_counts().sort_values(ascending = False)/N*100).round(2)

In [187]:
# Creating dummies based on those 2 columns simultaneously:

train['prox_norm'] = 0
train['prox_feeder'] = 0
train['prox_artery'] = 0
train['prox_rr'] = 0
train['prox_positive'] = 0

test['prox_norm'] = 0
test['prox_feeder'] = 0
test['prox_artery'] = 0
test['prox_rr'] = 0
test['prox_positive'] = 0

In [188]:
train.loc[(train.Condition1 == 'Norm')|(train.Condition2 == 'Norm'), 'prox_norm'] = 1
test.loc[(test.Condition1 == 'Norm')|(test.Condition2 == 'Norm'), 'prox_norm'] = 1

In [189]:
train.loc[(train.Condition1 == 'Feedr')|(train.Condition2 == 'Feedr'), 'prox_feeder'] = 1
test.loc[(test.Condition1 == 'Feedr')|(test.Condition2 == 'Feedr'), 'prox_feeder'] = 1

In [190]:
train.loc[(train.Condition1 == 'Artery')|(train.Condition2 == 'Artery'), 'prox_artery'] = 1
test.loc[(test.Condition1 == 'Artery')|(test.Condition2 == 'Artery'), 'prox_artery'] = 1

In [191]:
train.loc[(train.Condition1 == 'PosN')|(train.Condition2 == 'PosN'), 'prox_positive'] = 1
train.loc[(train.Condition1 == 'PosA')|(train.Condition2 == 'PosA'), 'prox_positive'] = 1

test.loc[(test.Condition1 == 'PosN')|(test.Condition2 == 'PosN'), 'prox_positive'] = 1
test.loc[(test.Condition1 == 'PosA')|(test.Condition2 == 'PosA'), 'prox_positive'] = 1

In [192]:
train.loc[(train.Condition1.str.startswith('RR', na=False))|(train.Condition2.str.startswith('RR', na=False)), 'prox_rr'] = 1
test.loc[(test.Condition1.str.startswith('RR', na=False))|(test.Condition2.str.startswith('RR', na=False)), 'prox_rr'] = 1

In [193]:
# train[['Condition1','Condition2','prox_norm', 'prox_feeder','prox_artery','prox_positive','prox_rr']]

In [194]:
print(train.shape)
print(test.shape)

(1460, 129)
(1459, 128)


###### BldgType: Type of dwelling

###### Created 4 dummies that start with bldgtype_

In [195]:
# Observations in each level of BldgType (%):
# (train.BldgType.value_counts().sort_values(ascending = False)/N*100).round(2)

In [196]:
# (test.BldgType.value_counts().sort_values(ascending = False)/N*100).round(2)

In [197]:
# Fixing one value
train.loc[train.BldgType == 'Twnhs', 'BldgType'] = 'TwnhsI'
test.loc[test.BldgType == 'Twnhs', 'BldgType'] = 'TwnhsI'

In [198]:
# Building dummies
temp = pd.get_dummies(train.BldgType, prefix = "bldgtype")
temp.drop('bldgtype_2fmCon', axis=1, inplace = True)
train = pd.concat([train, temp], axis = 1)

In [199]:
temp = pd.get_dummies(test.BldgType, prefix = "bldgtype")
temp.drop('bldgtype_2fmCon', axis=1, inplace = True)
test = pd.concat([test, temp], axis = 1)

In [200]:
print(train.shape)
print(test.shape)

(1460, 133)
(1459, 132)


In [201]:
# test.columns

###### HouseStyle: Style of dwelling

Created 5 dummies that start with 'style_'

In [202]:
# Observations in each level of HouseStyle (%):
(train.HouseStyle.value_counts().sort_values(ascending = False)/N*100).round(2)

1Story    49.73
2Story    30.48
1.5Fin    10.55
SLvl       4.45
SFoyer     2.53
1.5Unf     0.96
2.5Unf     0.75
2.5Fin     0.55
Name: HouseStyle, dtype: float64

In [203]:
(test.HouseStyle.value_counts().sort_values(ascending = False)/N*100).round(2)
# Notice -test has one fewer category than train (has no 2.5Fin)

1Story    51.03
2Story    29.25
1.5Fin    10.96
SLvl       4.32
SFoyer     3.15
2.5Unf     0.89
1.5Unf     0.34
Name: HouseStyle, dtype: float64

In [204]:
# Fix some values:

train.loc[train.HouseStyle == '1.5Fin', 'HouseStyle'] = '1p5Fin'
train.loc[train.HouseStyle == '2.5Unf', 'HouseStyle'] = '2p5Unf'
train.loc[train.HouseStyle == '1.5Unf', 'HouseStyle'] = '1p5Unf'
train.loc[train.HouseStyle == '2.5Fin', 'HouseStyle'] = '2p5Fin'

test.loc[test.HouseStyle == '1.5Fin', 'HouseStyle'] = '1p5Fin'
test.loc[test.HouseStyle == '2.5Unf', 'HouseStyle'] = '2p5Unf'
test.loc[test.HouseStyle == '1.5Unf', 'HouseStyle'] = '1p5Unf'

In [205]:
# Building dummies
temp = pd.get_dummies(train.HouseStyle, prefix = "style")
temp.drop('style_2p5Unf', axis=1, inplace = True)
temp.drop('style_1p5Unf', axis=1, inplace = True)
temp.drop('style_2p5Fin', axis=1, inplace = True)
train = pd.concat([train, temp], axis = 1)

In [206]:
temp = pd.get_dummies(test.HouseStyle, prefix = "style")
temp.drop('style_2p5Unf', axis=1, inplace = True)
temp.drop('style_1p5Unf', axis=1, inplace = True)
test = pd.concat([test, temp], axis = 1)

In [207]:
print(train.shape)
print(test.shape)

(1460, 138)
(1459, 137)


###### OverallQual & OverallCond

In [208]:
# (train.OverallQual.value_counts()/N*100).round(2)
# (train.OverallCond.value_counts()/N*100).round(2)

###### YearBuilt & YearRemodAdd

In [209]:
# year = (train.YearBuilt.value_counts()/N*100).round(2)
# with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
#     print(year)
# year = (train.YearRemodAdd.value_counts()/N*100).round(2)
# with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
#     print(year)


###### RoofStyle: Type of roof

Created 2 dummies - roof_hip and roof_gable

In [210]:
# Observations in each level of RoofStyle (%):
# (train.RoofStyle.value_counts().sort_values(ascending = False)/N*100).round(2)

In [211]:
# (test.RoofStyle.value_counts().sort_values(ascending = False)/N*100).round(2)

In [212]:
# Creating 2 dummies:
train['roof_gable'] = 0
train['roof_hip'] = 0
train.loc[(train.RoofStyle == 'Gable'), 'roof_gable'] = 1
train.loc[(train.RoofStyle == 'Hip'), 'roof_hip'] = 1
# pd.crosstab(train.roof_hip, train.RoofStyle)

In [213]:
test['roof_gable'] = 0
test['roof_hip'] = 0
test.loc[(test.RoofStyle == 'Gable'), 'roof_gable'] = 1
test.loc[(test.RoofStyle == 'Hip'), 'roof_hip'] = 1

In [214]:
print(train.shape)
print(test.shape)

(1460, 140)
(1459, 139)


###### RoofMatl: Roof material - IGNORE, NO VARIANCE

In [215]:
# Observations in each level of RoofMatl (%):
(train.RoofMatl.value_counts().sort_values(ascending = False)/N*100).round(2)

CompShg    98.22
Tar&Grv     0.75
WdShngl     0.41
WdShake     0.34
Membran     0.07
Roll        0.07
Metal       0.07
ClyTile     0.07
Name: RoofMatl, dtype: float64

In [216]:
(test.RoofMatl.value_counts().sort_values(ascending = False)/N*100).round(2)

CompShg    98.77
Tar&Grv     0.82
WdShake     0.27
WdShngl     0.07
Name: RoofMatl, dtype: float64

###### Exterior1st: Exterior covering on house

In [217]:
print(train.shape)
print(test.shape)

(1460, 140)
(1459, 139)


In [94]:
# Observations in each level of Exterior1st (%):
# (train.Exterior1st.value_counts().sort_values(ascending = False)/N*100).round(2)

In [95]:
# (train.Exterior2nd.value_counts().sort_values(ascending = False)/N*100).round(2)

In [96]:
# (test.Exterior1st.value_counts().sort_values(ascending = False)/N*100).round(2)

In [97]:
# (test.Exterior2nd.value_counts().sort_values(ascending = False)/N*100).round(2)

In [218]:
# Fixing 3 values - eliminating spaces:
train.Exterior1st = train.Exterior1st.str.replace('\s+', '')
train.Exterior2nd = train.Exterior2nd.str.replace('\s+', '')
test.Exterior1st = test.Exterior1st.str.replace('\s+', '')
test.Exterior2nd = test.Exterior2nd.str.replace('\s+', '') 

In [99]:
# train = train.loc[:, ~train.columns.str.startswith('ext_')]
# test = test.loc[:, ~test.columns.str.startswith('ext_')]

In [100]:
# test['Exterior1st'].isnull().sum()    # Has one missing value
# train['Exterior1st'].isnull().sum()   # Has no missing values

In [219]:
# Build dummies:
temp = pd.get_dummies(train.Exterior1st, prefix = "ext")
train = pd.concat([train, temp], axis = 1)

In [220]:
temp = pd.get_dummies(test.Exterior1st, prefix = "ext")
test = pd.concat([test, temp], axis = 1)

In [221]:
for i in list(train.Exterior1st.unique()):
    forname = 'ext_' + str(i)
    train.loc[train.Exterior2nd == i, forname] = 1

In [222]:
for i in list(test.Exterior1st.unique()):
    forname = 'ext_' + str(i)
    test.loc[test.Exterior2nd == i, forname] = 1

In [235]:
train.loc[:, train.columns.str.startswith('ext_')].mean() * 100

ext_AsbShng     1.575342
ext_BrkFace     3.493151
ext_CemntBd     4.178082
ext_HdBoard    16.164384
ext_MetalSd    15.205479
ext_Plywood    10.547945
ext_Stucco      2.123288
ext_VinylSd    35.410959
ext_WdSdng     15.479452
ext_WdShing     1.780822
dtype: float64

In [236]:
test.loc[:, test.columns.str.startswith('ext_')].mean() * 100

ext_AsbShng     1.644962
ext_BrkFace     2.673064
ext_CemntBd     4.455106
ext_HdBoard    15.695682
ext_MetalSd    16.312543
ext_Plywood    10.349554
ext_Stucco      1.782042
ext_VinylSd    35.435230
ext_WdSdng     15.284441
ext_WdShing     2.056203
dtype: float64

Deleting exterial material columns we don't need

In [225]:
train.drop('ext_ImStucc', axis=1, inplace = True)

In [226]:
test.drop('ext_nan', axis=1, inplace = True)

In [229]:
train.drop('ext_AsphShn', axis=1, inplace = True)
test.drop('ext_AsphShn', axis=1, inplace = True)

In [230]:
train.drop('ext_BrkComm', axis=1, inplace = True)
test.drop('ext_BrkComm', axis=1, inplace = True)

In [231]:
train.drop('ext_CBlock', axis=1, inplace = True)
test.drop('ext_CBlock', axis=1, inplace = True)

In [234]:
train.drop('ext_Stone', axis=1, inplace = True)

In [113]:
# train.drop('ext_CBlock', axis=1, inplace = True)
# test.drop('ext_CBlock', axis=1, inplace = True)

In [237]:
print(train.shape)
print(test.shape)

(1460, 150)
(1459, 149)


###### MasVnrType: Masonry veneer type

Created 2 dummies - for stone & brick face

In [238]:
# Observations in each level of MasVnrType (%):
(train.MasVnrType.value_counts().sort_values(ascending = False)/N*100).round(2)

None       59.18
BrkFace    30.48
Stone       8.77
BrkCmn      1.03
Name: MasVnrType, dtype: float64

In [239]:
(test.MasVnrType.value_counts().sort_values(ascending = False)/N*100).round(2)

None       60.14
BrkFace    29.73
Stone       8.29
BrkCmn      0.68
Name: MasVnrType, dtype: float64

In [240]:
# Creating 2 dummies:
train['veneer_brickface'] = 0
train['veneer_stone'] = 0
train.loc[(train.MasVnrType == 'BrkFace'), 'veneer_brickface'] = 1
train.loc[(train.MasVnrType == 'Stone'), 'veneer_stone'] = 1
# pd.crosstab(train.veneer_stone, train.MasVnrType)

MasVnrType,BrkCmn,BrkFace,None,Stone
veneer_brickface,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,15,0,864,128
1,0,445,0,0


In [242]:
test['veneer_brickface'] = 0
test['veneer_stone'] = 0
test.loc[(test.MasVnrType == 'BrkFace'), 'veneer_brickface'] = 1
test.loc[(test.MasVnrType == 'Stone'), 'veneer_stone'] = 1
# pd.crosstab(test.veneer_brickface, test.MasVnrType)

MasVnrType,BrkCmn,BrkFace,None,Stone
veneer_brickface,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10,0,878,121
1,0,434,0,0


###### MasVnrArea: Masonry veneer area in square feet

Check distribution

In [245]:
train.MasVnrArea.describe()

count    1452.000000
mean      103.685262
std       181.066207
min         0.000000
25%         0.000000
50%         0.000000
75%       166.000000
max      1600.000000
Name: MasVnrArea, dtype: float64

###### ExterQual: Exterior material quality

Recoded into numeric 'exterior_quality':
Fa = Fair = 0
TA = Average = 1
Gd = Good = 2
Ex = Excellent = 3


In [249]:
# (train.ExterQual.value_counts().sort_values(ascending = False)/N*100).round(2)

In [250]:
# (test.ExterQual.value_counts().sort_values(ascending = False)/N*100).round(2)

In [251]:
train['exterior_quality'] = 0
train.loc[(train.ExterQual == 'TA'), 'exterior_quality'] = 1
train.loc[(train.ExterQual == 'Gd'), 'exterior_quality'] = 2
train.loc[(train.ExterQual == 'Ex'), 'exterior_quality'] = 3
pd.crosstab(train.ExterQual, train.exterior_quality)

exterior_quality,0,1,2,3
ExterQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ex,0,0,0,52
Fa,14,0,0,0
Gd,0,0,488,0
TA,0,906,0,0


In [252]:
test['exterior_quality'] = 0
test.loc[(test.ExterQual == 'TA'), 'exterior_quality'] = 1
test.loc[(test.ExterQual == 'Gd'), 'exterior_quality'] = 2
test.loc[(test.ExterQual == 'Ex'), 'exterior_quality'] = 3
pd.crosstab(test.ExterQual, test.exterior_quality)

exterior_quality,0,1,2,3
ExterQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ex,0,0,0,55
Fa,21,0,0,0
Gd,0,0,491,0
TA,0,892,0,0


###### ExterCond: Evaluates the present condition of the material on the exterior

Recoded into numeric 'exterior_condition': Po = Poor = 0, Fa = Fair = 1 TA = Average = 2 Gd = Good = 3 Ex = Excellent = 4

In [257]:
# print((train.ExterCond.value_counts().sort_values(ascending = False)/N*100).round(2))
# print((test.ExterCond.value_counts().sort_values(ascending = False)/N*100).round(2))

In [258]:
train['exterior_condition'] = 0
train.loc[(train.ExterCond == 'Fa'), 'exterior_condition'] = 1
train.loc[(train.ExterCond == 'TA'), 'exterior_condition'] = 2
train.loc[(train.ExterCond == 'Gd'), 'exterior_condition'] = 3
train.loc[(train.ExterCond == 'Ex'), 'exterior_condition'] = 4
pd.crosstab(train.ExterCond, train.exterior_condition)

exterior_condition,0,1,2,3,4
ExterCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ex,0,0,0,0,3
Fa,0,28,0,0,0
Gd,0,0,0,146,0
Po,1,0,0,0,0
TA,0,0,1282,0,0


In [259]:
test['exterior_condition'] = 0
test.loc[(test.ExterCond == 'Fa'), 'exterior_condition'] = 1
test.loc[(test.ExterCond == 'TA'), 'exterior_condition'] = 2
test.loc[(test.ExterCond == 'Gd'), 'exterior_condition'] = 3
test.loc[(test.ExterCond == 'Ex'), 'exterior_condition'] = 4
pd.crosstab(test.ExterCond, test.exterior_condition)

exterior_condition,0,1,2,3,4
ExterCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ex,0,0,0,0,9
Fa,0,39,0,0,0
Gd,0,0,0,153,0
Po,2,0,0,0,0
TA,0,0,1256,0,0


In [274]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities',
       ...
       'ext_MetalSd', 'ext_Plywood', 'ext_Stucco', 'ext_VinylSd', 'ext_WdSdng',
       'ext_WdShing', 'veneer_brickface', 'veneer_stone', 'exterior_quality',
       'exterior_condition'],
      dtype='object', length=154)

In [279]:
plt.hist(train['exterior_quality'])
# Not working - where is the plot?

(array([  14.,    0.,    0.,  906.,    0.,    0.,  488.,    0.,    0.,   52.]),
 array([ 0. ,  0.3,  0.6,  0.9,  1.2,  1.5,  1.8,  2.1,  2.4,  2.7,  3. ]),
 <a list of 10 Patch objects>)

In [276]:
plt.hist(train['exterior_condition'])

(array([  1.00000000e+00,   0.00000000e+00,   2.80000000e+01,
          0.00000000e+00,   0.00000000e+00,   1.28200000e+03,
          0.00000000e+00,   1.46000000e+02,   0.00000000e+00,
          3.00000000e+00]),
 array([ 0. ,  0.4,  0.8,  1.2,  1.6,  2. ,  2.4,  2.8,  3.2,  3.6,  4. ]),
 <a list of 10 Patch objects>)

In [278]:
train[['exterior_quality','exterior_condition']].corr()  # Correlation is just 0.009, good!

Unnamed: 0,exterior_quality,exterior_condition
exterior_quality,1.0,0.009184
exterior_condition,0.009184,1.0


In [284]:
import numpy as np
np.histogram(train.exterior_quality)

(array([ 14,   0,   0, 906,   0,   0, 488,   0,   0,  52], dtype=int64),
 array([ 0. ,  0.3,  0.6,  0.9,  1.2,  1.5,  1.8,  2.1,  2.4,  2.7,  3. ]))

###### Foundation: Type of foundation

In [285]:
print((train.Foundation.value_counts().sort_values(ascending = False)/N*100).round(2))
print((test.Foundation.value_counts().sort_values(ascending = False)/N*100).round(2))

PConc     44.32
CBlock    43.42
BrkTil    10.00
Slab       1.64
Stone      0.41
Wood       0.21
Name: Foundation, dtype: float64
PConc     45.27
CBlock    41.16
BrkTil    11.30
Slab       1.71
Stone      0.34
Wood       0.14
Name: Foundation, dtype: float64


In [286]:
# Building dummies and dropping some:
temp = pd.get_dummies(train.Foundation, prefix = "found")
temp.drop('found_Stone', axis=1, inplace = True)
temp.drop('found_Wood', axis=1, inplace = True)
train = pd.concat([train, temp], axis = 1)

In [287]:
temp = pd.get_dummies(test.Foundation, prefix = "found")
temp.drop('found_Stone', axis=1, inplace = True)
temp.drop('found_Wood', axis=1, inplace = True)
test = pd.concat([test, temp], axis = 1)

In [290]:
print(pd.crosstab(train.found_PConc, train.Foundation))
print(pd.crosstab(train.found_CBlock, train.Foundation))
print(pd.crosstab(train.found_BrkTil, train.Foundation))
print(pd.crosstab(train.found_Slab, train.Foundation))

Foundation   BrkTil  CBlock  PConc  Slab  Stone  Wood
found_PConc                                          
0               146     634      0    24      6     3
1                 0       0    647     0      0     0
Foundation    BrkTil  CBlock  PConc  Slab  Stone  Wood
found_CBlock                                          
0                146       0    647    24      6     3
1                  0     634      0     0      0     0
Foundation    BrkTil  CBlock  PConc  Slab  Stone  Wood
found_BrkTil                                          
0                  0     634    647    24      6     3
1                146       0      0     0      0     0
Foundation  BrkTil  CBlock  PConc  Slab  Stone  Wood
found_Slab                                          
0              146     634    647     0      6     3
1                0       0      0    24      0     0


In [None]:
print(pd.crosstab(test.found_PConc, test.Foundation))
print(pd.crosstab(test.found_CBlock, test.Foundation))
print(pd.crosstab(test.found_BrkTil, test.Foundation))
print(pd.crosstab(test.found_Slab, test.Foundation))

In [291]:
print(train.shape)
print(test.shape)

(1460, 158)
(1459, 157)


###### BsmtQual: Evaluates the height of the basement

Created quantitative variable bsmt_height

In [294]:
# Replacing NAs with 'None' in both data frames:
train['BsmtQual'].fillna('NoBasement', inplace=True)
test['BsmtQual'].fillna('NoBasement', inplace=True)

In [296]:
print((train.BsmtQual.value_counts().sort_values(ascending = False)/N*100).round(2))
print((test.BsmtQual.value_counts().sort_values(ascending = False)/N*100).round(2))

TA            44.45
Gd            42.33
Ex             8.29
NoBasement     2.53
Fa             2.40
Name: BsmtQual, dtype: float64
TA            43.42
Gd            40.48
Ex             9.38
Fa             3.63
NoBasement     3.01
Name: BsmtQual, dtype: float64


In [297]:
train['bsmt_height'] = 0
train.loc[(train.BsmtQual == 'Fa'), 'bsmt_height'] = 1
train.loc[(train.BsmtQual == 'TA'), 'bsmt_height'] = 2
train.loc[(train.BsmtQual == 'Gd'), 'bsmt_height'] = 3
train.loc[(train.BsmtQual == 'Ex'), 'bsmt_height'] = 4
pd.crosstab(train.BsmtQual, train.bsmt_height)

bsmt_height,0,1,2,3,4
BsmtQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ex,0,0,0,0,121
Fa,0,35,0,0,0
Gd,0,0,0,618,0
NoBasement,37,0,0,0,0
TA,0,0,649,0,0


In [298]:
test['bsmt_height'] = 0
test.loc[(test.BsmtQual == 'Fa'), 'bsmt_height'] = 1
test.loc[(test.BsmtQual == 'TA'), 'bsmt_height'] = 2
test.loc[(test.BsmtQual == 'Gd'), 'bsmt_height'] = 3
test.loc[(test.BsmtQual == 'Ex'), 'bsmt_height'] = 4
pd.crosstab(test.BsmtQual, test.bsmt_height)

bsmt_height,0,1,2,3,4
BsmtQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ex,0,0,0,0,137
Fa,0,53,0,0,0
Gd,0,0,0,591,0
NoBasement,44,0,0,0,0
TA,0,0,634,0,0


In [300]:
print(train.shape)
print(test.shape)

(1460, 159)
(1459, 158)


###### BsmtCond: Evaluates the general condition of the basement

In [302]:
# Replacing NAs with 'None' in both data frames:
train['BsmtCond'].fillna('NoBasement', inplace=True)
test['BsmtCond'].fillna('NoBasement', inplace=True)

In [303]:
print((train.BsmtCond.value_counts().sort_values(ascending = False)/N*100).round(2))
print((test.BsmtCond.value_counts().sort_values(ascending = False)/N*100).round(2))

TA            89.79
Gd             4.45
Fa             3.08
NoBasement     2.53
Po             0.14
Name: BsmtCond, dtype: float64
TA            88.70
Fa             4.04
Gd             3.90
NoBasement     3.08
Po             0.21
Name: BsmtCond, dtype: float64


In [304]:
train['bsmt_condition'] = 0
train.loc[(train.BsmtCond == 'Po'), 'bsmt_condition'] = 1
train.loc[(train.BsmtCond == 'Fa'), 'bsmt_condition'] = 2
train.loc[(train.BsmtCond == 'TA'), 'bsmt_condition'] = 3
train.loc[(train.BsmtCond == 'Gd'), 'bsmt_condition'] = 4
pd.crosstab(train.BsmtCond, train.bsmt_condition)

bsmt_condition,0,1,2,3,4
BsmtCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fa,0,0,45,0,0
Gd,0,0,0,0,65
NoBasement,37,0,0,0,0
Po,0,2,0,0,0
TA,0,0,0,1311,0


In [305]:
test['bsmt_condition'] = 0
test.loc[(test.BsmtCond == 'Po'), 'bsmt_condition'] = 1
test.loc[(test.BsmtCond == 'Fa'), 'bsmt_condition'] = 2
test.loc[(test.BsmtCond == 'TA'), 'bsmt_condition'] = 3
test.loc[(test.BsmtCond == 'Gd'), 'bsmt_condition'] = 4
pd.crosstab(test.BsmtCond, test.bsmt_condition)

bsmt_condition,0,1,2,3,4
BsmtCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fa,0,0,59,0,0
Gd,0,0,0,0,57
NoBasement,45,0,0,0,0
Po,0,3,0,0,0
TA,0,0,0,1295,0


In [306]:
print(train.shape)
print(test.shape)

(1460, 160)
(1459, 159)


In [307]:
train[['bsmt_height','bsmt_condition']].corr()  # Correlation is 0.536

Unnamed: 0,bsmt_height,bsmt_condition
bsmt_height,1.0,0.535957
bsmt_condition,0.535957,1.0


###### BsmtExposure: Refers to walkout or garden level walls

In [310]:
# Replacing NAs with 'None' in both data frames:
train['BsmtExposure'].fillna('NoBasement', inplace=True)
test['BsmtExposure'].fillna('NoBasement', inplace=True)

In [311]:
print((train.BsmtExposure.value_counts().sort_values(ascending = False)/N*100).round(2))
print((test.BsmtExposure.value_counts().sort_values(ascending = False)/N*100).round(2))

No            65.27
Av            15.14
Gd             9.18
Mn             7.81
NoBasement     2.60
Name: BsmtExposure, dtype: float64
No            65.14
Av            13.49
Gd             9.73
Mn             8.56
NoBasement     3.01
Name: BsmtExposure, dtype: float64


In [312]:
train['bsmt_exposure'] = 0
train.loc[(train.BsmtExposure == 'Mn'), 'bsmt_exposure'] = 1
train.loc[(train.BsmtExposure == 'Av'), 'bsmt_exposure'] = 2
train.loc[(train.BsmtExposure == 'Gd'), 'bsmt_exposure'] = 3
pd.crosstab(train.BsmtExposure, train.bsmt_exposure)

bsmt_exposure,0,1,2,3
BsmtExposure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Av,0,0,221,0
Gd,0,0,0,134
Mn,0,114,0,0
No,953,0,0,0
NoBasement,38,0,0,0


In [313]:
test['bsmt_exposure'] = 0
test.loc[(test.BsmtExposure == 'Mn'), 'bsmt_exposure'] = 1
test.loc[(test.BsmtExposure == 'Av'), 'bsmt_exposure'] = 2
test.loc[(test.BsmtExposure == 'Gd'), 'bsmt_exposure'] = 3
pd.crosstab(test.BsmtExposure, test.bsmt_exposure)

bsmt_exposure,0,1,2,3
BsmtExposure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Av,0,0,197,0
Gd,0,0,0,142
Mn,0,125,0,0
No,951,0,0,0
NoBasement,44,0,0,0


In [315]:
print(train.shape)
print(test.shape)

(1460, 161)
(1459, 160)


###### BsmtFinType1: Rating of basement finished area

Created 2 quantitative vars: bsmt_finished1 and bsmt_finished2

In [319]:
# Replacing NAs with 'None' in both data frames:
train['BsmtFinType1'].fillna('NoBasement', inplace=True)
train['BsmtFinType2'].fillna('NoBasement', inplace=True)
test['BsmtFinType1'].fillna('NoBasement', inplace=True)
test['BsmtFinType2'].fillna('NoBasement', inplace=True)

In [322]:
# print((train.BsmtFinType1.value_counts().sort_values(ascending = False)/N*100).round(2))
# print((train.BsmtFinType2.value_counts().sort_values(ascending = False)/N*100).round(2))

In [323]:
# print((test.BsmtFinType1.value_counts().sort_values(ascending = False)/N*100).round(2))
# print((test.BsmtFinType2.value_counts().sort_values(ascending = False)/N*100).round(2))

In [325]:
train['bsmt_finished1'] = 0
train.loc[(train.BsmtFinType1 == 'Unf'), 'bsmt_finished1'] = 1
train.loc[(train.BsmtFinType1 == 'LwQ'), 'bsmt_finished1'] = 1
train.loc[(train.BsmtFinType1 == 'BLQ'), 'bsmt_finished1'] = 2
train.loc[(train.BsmtFinType1 == 'Rec'), 'bsmt_finished1'] = 2
train.loc[(train.BsmtFinType1 == 'ALQ'), 'bsmt_finished1'] = 3
train.loc[(train.BsmtFinType1 == 'GLQ'), 'bsmt_finished1'] = 4
pd.crosstab(train.BsmtFinType1, train.bsmt_finished1)

bsmt_finished1,0,1,2,3,4
BsmtFinType1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALQ,0,0,0,220,0
BLQ,0,0,148,0,0
GLQ,0,0,0,0,418
LwQ,0,74,0,0,0
NoBasement,37,0,0,0,0
Rec,0,0,133,0,0
Unf,0,430,0,0,0


In [326]:
train['bsmt_finished2'] = 0
train.loc[(train.BsmtFinType2 == 'Unf'), 'bsmt_finished2'] = 1
train.loc[(train.BsmtFinType2 == 'LwQ'), 'bsmt_finished2'] = 1
train.loc[(train.BsmtFinType2 == 'BLQ'), 'bsmt_finished2'] = 2
train.loc[(train.BsmtFinType2 == 'Rec'), 'bsmt_finished2'] = 2
train.loc[(train.BsmtFinType2 == 'ALQ'), 'bsmt_finished2'] = 3
train.loc[(train.BsmtFinType2 == 'GLQ'), 'bsmt_finished2'] = 4
pd.crosstab(train.BsmtFinType2, train.bsmt_finished1)

bsmt_finished1,0,1,2,3,4
BsmtFinType2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALQ,0,9,6,0,4
BLQ,0,4,12,15,2
GLQ,0,7,5,2,0
LwQ,0,0,21,15,10
NoBasement,37,0,0,0,1
Rec,0,8,15,22,9
Unf,0,476,222,166,392


In [327]:
test['bsmt_finished1'] = 0
test.loc[(test.BsmtFinType1 == 'Unf'), 'bsmt_finished1'] = 1
test.loc[(test.BsmtFinType1 == 'LwQ'), 'bsmt_finished1'] = 1
test.loc[(test.BsmtFinType1 == 'BLQ'), 'bsmt_finished1'] = 2
test.loc[(test.BsmtFinType1 == 'Rec'), 'bsmt_finished1'] = 2
test.loc[(test.BsmtFinType1 == 'ALQ'), 'bsmt_finished1'] = 3
test.loc[(test.BsmtFinType1 == 'GLQ'), 'bsmt_finished1'] = 4
pd.crosstab(test.BsmtFinType1, test.bsmt_finished1)

bsmt_finished1,0,1,2,3,4
BsmtFinType1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALQ,0,0,0,209,0
BLQ,0,0,121,0,0
GLQ,0,0,0,0,431
LwQ,0,80,0,0,0
NoBasement,42,0,0,0,0
Rec,0,0,155,0,0
Unf,0,421,0,0,0


In [328]:
test['bsmt_finished2'] = 0
test.loc[(test.BsmtFinType2 == 'Unf'), 'bsmt_finished2'] = 1
test.loc[(test.BsmtFinType2 == 'LwQ'), 'bsmt_finished2'] = 1
test.loc[(test.BsmtFinType2 == 'BLQ'), 'bsmt_finished2'] = 2
test.loc[(test.BsmtFinType2 == 'Rec'), 'bsmt_finished2'] = 2
test.loc[(test.BsmtFinType2 == 'ALQ'), 'bsmt_finished2'] = 3
test.loc[(test.BsmtFinType2 == 'GLQ'), 'bsmt_finished2'] = 4
pd.crosstab(test.BsmtFinType2, test.bsmt_finished2)

bsmt_finished2,0,1,2,3,4
BsmtFinType2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALQ,0,0,0,33,0
BLQ,0,0,35,0,0
GLQ,0,0,0,0,20
LwQ,0,41,0,0,0
NoBasement,42,0,0,0,0
Rec,0,0,51,0,0
Unf,0,1237,0,0,0


In [329]:
print(train.shape)
print(test.shape)

(1460, 163)
(1459, 162)


###### Basement Square Footage - create a new var - share of finished

In [331]:
train['BsmtFinishedShare'] = 1 - train['BsmtUnfSF']/train['TotalBsmtSF']
train.loc[train.TotalBsmtSF == 0, 'BsmtFinishedShare'] = 0

In [338]:
test['BsmtFinishedShare'] = 1 - test['BsmtUnfSF']/test['TotalBsmtSF']
test.loc[test.TotalBsmtSF == 0, 'BsmtFinishedShare'] = 0

In [335]:
# train[['BsmtUnfSF','TotalBsmtSF','BsmtFinishedShare']]

In [339]:
train['BsmtFinishedShare'].corr(train['TotalBsmtSF'])  # correlation is 0.17

0.17388520502907454

In [None]:
train['BsmtFinishedShare'].corr(train['TotalBsmtSF'])  # correlation is 0.17

In [344]:
test['BsmtUnfSF'].corr(test['TotalBsmtSF'])  # correlation is 0.41

0.40902319772896112

In [343]:
plt.hist(train.TotalBsmtSF)

(array([ 121.,  907.,  372.,   52.,    3.,    4.,    0.,    0.,    0.,    1.]),
 array([    0.,   611.,  1222.,  1833.,  2444.,  3055.,  3666.,  4277.,
         4888.,  5499.,  6110.]),
 <a list of 10 Patch objects>)

###### Heating: Type of heating - IGNORE, too little variance

In [345]:
print((train.Heating.value_counts().sort_values(ascending = False)/N*100).round(2))
print((test.Heating.value_counts().sort_values(ascending = False)/N*100).round(2))

GasA     97.81
GasW      1.23
Grav      0.48
Wall      0.27
OthW      0.14
Floor     0.07
Name: Heating, dtype: float64
GasA    99.04
GasW     0.62
Grav     0.14
Wall     0.14
Name: Heating, dtype: float64


###### HeatingQC: Heating quality and condition

Created quantitative variable 'heating_quality'

In [346]:
print((train.HeatingQC.value_counts().sort_values(ascending = False)/N*100).round(2))
print((test.HeatingQC.value_counts().sort_values(ascending = False)/N*100).round(2))

Ex    50.75
TA    29.32
Gd    16.51
Fa     3.36
Po     0.07
Name: HeatingQC, dtype: float64
Ex    51.51
TA    29.38
Gd    15.96
Fa     2.95
Po     0.14
Name: HeatingQC, dtype: float64


In [350]:
train['heating_quality'] = 0
train.loc[train.HeatingQC == 'Fa', 'heating_quality'] = 1
train.loc[train.HeatingQC == 'TA', 'heating_quality'] = 2
train.loc[train.HeatingQC == 'Gd', 'heating_quality'] = 3
train.loc[train.HeatingQC == 'Ex', 'heating_quality'] = 4
pd.crosstab(train.HeatingQC, train.heating_quality)

heating_quality,0,1,2,3,4
HeatingQC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ex,0,0,0,0,741
Fa,0,49,0,0,0
Gd,0,0,0,241,0
Po,1,0,0,0,0
TA,0,0,428,0,0


In [354]:
test['heating_quality'] = 0
test.loc[test.HeatingQC == 'Fa', 'heating_quality'] = 1
test.loc[test.HeatingQC == 'TA', 'heating_quality'] = 2
test.loc[test.HeatingQC == 'Gd', 'heating_quality'] = 3
test.loc[test.HeatingQC == 'Ex', 'heating_quality'] = 4
pd.crosstab(test.HeatingQC, test.heating_quality)

heating_quality,0,1,2,3,4
HeatingQC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ex,0,0,0,0,752
Fa,0,43,0,0,0
Gd,0,0,0,233,0
Po,2,0,0,0,0
TA,0,0,429,0,0


In [355]:
print(train.shape)
print(test.shape)

(1460, 165)
(1459, 164)


###### CentralAir: Central air conditioning

In [356]:
print((train.CentralAir.value_counts().sort_values(ascending = False)/N*100).round(2))
print((test.CentralAir.value_counts().sort_values(ascending = False)/N*100).round(2))

Y    93.49
N     6.51
Name: CentralAir, dtype: float64
Y    93.01
N     6.92
Name: CentralAir, dtype: float64


In [358]:
# Creating a dummy:
train['central_air'] = 0
train.loc[train.CentralAir == 'Y', 'central_air'] = 1
pd.crosstab(train.CentralAir, train.central_air)

central_air,0,1
CentralAir,Unnamed: 1_level_1,Unnamed: 2_level_1
N,95,0
Y,0,1365


In [359]:
test['central_air'] = 0
test.loc[test.CentralAir == 'Y', 'central_air'] = 1
pd.crosstab(test.CentralAir, test.central_air)

central_air,0,1
CentralAir,Unnamed: 1_level_1,Unnamed: 2_level_1
N,101,0
Y,0,1358


###### Electrical: Electrical system

In [360]:
print((train.Electrical.value_counts().sort_values(ascending = False)/N*100).round(2))
print((test.Electrical.value_counts().sort_values(ascending = False)/N*100).round(2))

SBrkr    91.37
FuseA     6.44
FuseF     1.85
FuseP     0.21
Mix       0.07
Name: Electrical, dtype: float64
SBrkr    91.58
FuseA     6.44
FuseF     1.58
FuseP     0.34
Name: Electrical, dtype: float64


In [361]:
# Creating 2 dummies:
train['elec_sbrkr'] = 0
train.loc[train.Electrical == 'SBrkr', 'elec_sbrkr'] = 1
train['elec_fuseA'] = 0
train.loc[train.Electrical == 'FuseA', 'elec_fuseA'] = 1

In [362]:
test['elec_sbrkr'] = 0
test.loc[test.Electrical == 'SBrkr', 'elec_sbrkr'] = 1
test['elec_fuseA'] = 0
test.loc[test.Electrical == 'FuseA', 'elec_fuseA'] = 1

In [363]:
print(train.shape)
print(test.shape)

(1460, 168)
(1459, 167)


In [364]:
misscounts = np.sum(train.isnull(), axis=0)/N*100
# print(misscounts[:40].round(2))
print(misscounts[40:].round(2))

HeatingQC              0.00
CentralAir             0.00
Electrical             0.07
1stFlrSF               0.00
2ndFlrSF               0.00
LowQualFinSF           0.00
GrLivArea              0.00
BsmtFullBath           0.00
BsmtHalfBath           0.00
FullBath               0.00
HalfBath               0.00
BedroomAbvGr           0.00
KitchenAbvGr           0.00
KitchenQual            0.00
TotRmsAbvGrd           0.00
Functional             0.00
Fireplaces             0.00
FireplaceQu           47.26
GarageType             5.55
GarageYrBlt            5.55
GarageFinish           5.55
GarageCars             0.00
GarageArea             0.00
GarageQual             5.55
GarageCond             5.55
PavedDrive             0.00
WoodDeckSF             0.00
OpenPorchSF            0.00
EnclosedPorch          0.00
3SsnPorch              0.00
                      ...  
roof_gable             0.00
roof_hip               0.00
ext_AsbShng            0.00
ext_BrkFace            0.00
ext_CemntBd         

In [None]:
# Creating dummies based on MSZoning and dropping two of them (with too few observations):
temp = pd.get_dummies(train.MSZoning, prefix = "zone", drop_first = True)
temp.drop('zone_RH', axis=1, inplace = True)
train = pd.concat([train, temp], axis = 1)