## Data cleaning and preprocessing ##

In [1]:
import pandas as pd

In [2]:
houses = pd.read_csv('../data/train.csv')
houses.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Let's drop the Id column since it doesn't give important information.

In [3]:
houses = houses.drop(labels='Id', axis=1)
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [4]:
houses.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
#Separate columns into numerical data and categorical data

houses_numerical_cols = houses.select_dtypes(include='number')

houses_categorical_cols = houses.select_dtypes(exclude='number')


In [6]:
houses_categorical_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 43 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       1460 non-null   object
 1   Street         1460 non-null   object
 2   Alley          91 non-null     object
 3   LotShape       1460 non-null   object
 4   LandContour    1460 non-null   object
 5   Utilities      1460 non-null   object
 6   LotConfig      1460 non-null   object
 7   LandSlope      1460 non-null   object
 8   Neighborhood   1460 non-null   object
 9   Condition1     1460 non-null   object
 10  Condition2     1460 non-null   object
 11  BldgType       1460 non-null   object
 12  HouseStyle     1460 non-null   object
 13  RoofStyle      1460 non-null   object
 14  RoofMatl       1460 non-null   object
 15  Exterior1st    1460 non-null   object
 16  Exterior2nd    1460 non-null   object
 17  MasVnrType     588 non-null    object
 18  ExterQual      1460 non-null

In [7]:
houses_categorical_cols.columns[houses_categorical_cols.isna().any()]

Index(['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',
       'Fence', 'MiscFeature'],
      dtype='object')

For a lot of these categories a missing value means that feature is not there so we can just fill that category with 'None' or something similar.

List of columns for which missing value means no feature:
`Alley`, `MasVnrType`, `BsmtQual`, `BsmtCond`, `BsmtExposure`,
`BsmtFinType1`, `BsmtFinType2`, `FireplaceQu`, `GarageType`, 
`GarageFinish`, `GarageQual`, `GarageCond`, `PoolQC`,
`Fence`, `MiscFeature`

List of columns for which missing values might mean something else:
`Electrical`,

For all the categories on the first list we will just fill the missing values with 'None'. The remaining category will be dealt with separately.

In [8]:
missing_features_cols = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 
'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',
'Fence', 'MiscFeature']

for feature in missing_features_cols:
    houses[feature] = houses[feature].fillna('None')



As seen below, there is only one house without Electrical information. I propose that we use either the neighborhood or the year in which it was built/remodeled to choose which category does it belong to.

In [9]:
houses['Electrical'][houses['Electrical'].isna()==True] # This gives only house 1379

mysterious_house = houses.iloc[1379] 

print(mysterious_house['Neighborhood'], mysterious_house['YearBuilt'], mysterious_house['YearRemodAdd'])

Timber 2006 2007


In [10]:
print(houses.groupby('Neighborhood')['Electrical'].get_group('Timber').describe() )
print(houses.groupby('YearBuilt')['Electrical'].get_group(2006).describe())
print(houses.groupby('YearRemodAdd')['Electrical'].get_group(2007).describe())

count        37
unique        2
top       SBrkr
freq         36
Name: Electrical, dtype: object
count        66
unique        1
top       SBrkr
freq         66
Name: Electrical, dtype: object
count        75
unique        1
top       SBrkr
freq         75
Name: Electrical, dtype: object


Seems like the most reasonable choice is to assign 'SBrkr'.

In [11]:
houses['Electrical'] = houses['Electrical'].fillna('SBrkr')

In [12]:
houses_categorical_cols = houses.select_dtypes(exclude='number')
houses_categorical_cols.columns[houses_categorical_cols.isna().any()]

Index([], dtype='object')

Now we are done filling all the missing categorical data.

## Impute Missing Data for Numerical Values

In [13]:
houses_numerical_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 37 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   LotArea        1460 non-null   int64  
 3   OverallQual    1460 non-null   int64  
 4   OverallCond    1460 non-null   int64  
 5   YearBuilt      1460 non-null   int64  
 6   YearRemodAdd   1460 non-null   int64  
 7   MasVnrArea     1452 non-null   float64
 8   BsmtFinSF1     1460 non-null   int64  
 9   BsmtFinSF2     1460 non-null   int64  
 10  BsmtUnfSF      1460 non-null   int64  
 11  TotalBsmtSF    1460 non-null   int64  
 12  1stFlrSF       1460 non-null   int64  
 13  2ndFlrSF       1460 non-null   int64  
 14  LowQualFinSF   1460 non-null   int64  
 15  GrLivArea      1460 non-null   int64  
 16  BsmtFullBath   1460 non-null   int64  
 17  BsmtHalfBath   1460 non-null   int64  
 18  FullBath

Question: How do we deal with the different missing nummerical values? 

LotFrontage shows the length of the adjacent street, maybe we should use the average/median(?)

MasVnrArea shows the area of the masonry work, here a missing value indicates zero area so we fill it out with zeros

GarageYrBlt shows the year in which the garage was built, here a missing value indicates no garage but I don't know if putting year 0 is too much of a penalty. The value here should somehow make a house with a garage more expensive than a house without one. Ideas?

Update: I learned there is a thing called sklearn.imputer. Maybe we can check there for options.
Update 2: I think we should use the k nearest neighbors imputer from sklearn. The question is which neighbors to consider, or which columns. https://scikit-learn.org/stable/auto_examples/impute/plot_missing_values.html#sphx-glr-auto-examples-impute-plot-missing-values-py

We can use LotArea, LotConfig, 1stFlrSF to regress on 

In [14]:
houses['MasVnrArea'] = houses['MasVnrArea'].fillna(0)


In [15]:
houses['LotFrontage'].describe()

count    1201.000000
mean       70.049958
std        24.284752
min        21.000000
25%        59.000000
50%        69.000000
75%        80.000000
max       313.000000
Name: LotFrontage, dtype: float64

In [16]:
#This failed at first because 'LotConfig' is categorical, maybe we should've dealt with those first.
#I added some cells above to deal with categorical data first.
#Now let's try to encode the categorical data into numerical labels to able to do regression.

from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()

houses['LotConfig_labels'] = labelencoder.fit_transform(houses['LotConfig'])


from sklearn.impute import KNNImputer

imputer = KNNImputer()
houses[['LotFrontage','LotArea', 'LotConfig_labels', '1stFlrSF']] = imputer.fit_transform(houses[['LotFrontage','LotArea', 'LotConfig_labels', '1stFlrSF']])



In [17]:
houses['LotFrontage'].describe()

count    1460.000000
mean       71.100000
std        23.855761
min        21.000000
25%        60.000000
50%        70.000000
75%        82.000000
max       313.000000
Name: LotFrontage, dtype: float64

The regression seems to have worked since it didnt change quantitites by too much.

Now there's only one more variable to clean: `GarageYrBlt`

In [18]:
houses[houses['GarageYrBlt'].isna()]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotConfig_labels
39,90,RL,65.0,6040.0,Pave,,Reg,Lvl,AllPub,Inside,...,,,,0,6,2008,WD,AdjLand,82000,4.0
48,190,RM,33.0,4456.0,Pave,,Reg,Lvl,AllPub,Inside,...,,,,0,6,2009,New,Partial,113000,4.0
78,90,RL,72.0,10778.0,Pave,,Reg,Lvl,AllPub,Inside,...,,,,0,4,2010,WD,Normal,136500,4.0
88,50,C (all),105.0,8470.0,Pave,,IR1,Lvl,AllPub,Corner,...,,MnPrv,,0,10,2009,ConLD,Abnorml,85000,0.0
89,20,RL,60.0,8070.0,Pave,,Reg,Lvl,AllPub,Inside,...,,,,0,8,2007,WD,Normal,123600,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,70,RM,50.0,5250.0,Pave,Pave,Reg,Lvl,AllPub,Inside,...,,,,0,12,2008,WD,Normal,122000,4.0
1407,20,RL,66.0,8780.0,Pave,,IR1,Lvl,AllPub,Corner,...,,MnPrv,,0,3,2009,WD,Normal,112000,0.0
1449,180,RM,21.0,1533.0,Pave,,Reg,Lvl,AllPub,Inside,...,,,,0,8,2006,WD,Abnorml,92000,4.0
1450,90,RL,60.0,9000.0,Pave,,Reg,Lvl,AllPub,FR2,...,,,,0,9,2009,WD,Normal,136000,2.0


In [19]:
houses[houses['GarageYrBlt'].isna()].groupby('LotConfig').describe()

Unnamed: 0_level_0,MSSubClass,MSSubClass,MSSubClass,MSSubClass,MSSubClass,MSSubClass,MSSubClass,MSSubClass,LotFrontage,LotFrontage,...,SalePrice,SalePrice,LotConfig_labels,LotConfig_labels,LotConfig_labels,LotConfig_labels,LotConfig_labels,LotConfig_labels,LotConfig_labels,LotConfig_labels
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
LotConfig,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Corner,13.0,46.538462,25.11512,20.0,30.0,50.0,50.0,90.0,13.0,81.292308,...,112000.0,141000.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CulDSac,1.0,90.0,,90.0,90.0,90.0,90.0,90.0,1.0,35.0,...,127500.0,127500.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
FR2,3.0,53.333333,35.118846,20.0,35.0,50.0,70.0,90.0,3.0,62.666667,...,125500.0,136000.0,3.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0
Inside,64.0,77.5,63.044454,20.0,30.0,50.0,90.0,190.0,64.0,57.15,...,124750.0,200500.0,64.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0


It is likely that the `GarageYrBlt` is the same as the `YearBuilt`, and looking at the results below, confirms this theory for most of the rows. Therefore, it is safe to impute the missing values of `GarageYrBlt` with `YearBuilt`

In [20]:
houses[['YearBuilt', 'GarageYrBlt']]

Unnamed: 0,YearBuilt,GarageYrBlt
0,2003,2003.0
1,1976,1976.0
2,2001,2001.0
3,1915,1998.0
4,2000,2000.0
...,...,...
1455,1999,1999.0
1456,1978,1978.0
1457,1941,1941.0
1458,1950,1950.0


In [21]:
#This is the proportion of houses for which 'YearBuilt' and 'GarageYrBlt' agree.

houses[houses['YearBuilt']==houses['GarageYrBlt']].shape[0]/1460

0.7458904109589041

In [22]:
houses['GarageYrBlt'] = houses['GarageYrBlt'].fillna(houses['YearBuilt'])

Looking at the info of the Dataframe below, all columns now have no missing values

In [23]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MSSubClass        1460 non-null   int64  
 1   MSZoning          1460 non-null   object 
 2   LotFrontage       1460 non-null   float64
 3   LotArea           1460 non-null   float64
 4   Street            1460 non-null   object 
 5   Alley             1460 non-null   object 
 6   LotShape          1460 non-null   object 
 7   LandContour       1460 non-null   object 
 8   Utilities         1460 non-null   object 
 9   LotConfig         1460 non-null   object 
 10  LandSlope         1460 non-null   object 
 11  Neighborhood      1460 non-null   object 
 12  Condition1        1460 non-null   object 
 13  Condition2        1460 non-null   object 
 14  BldgType          1460 non-null   object 
 15  HouseStyle        1460 non-null   object 
 16  OverallQual       1460 non-null   int64  


Above, we see that `MoSold` is an integer type, but we want it to be categorical instead

In [24]:
houses['MoSold'] = houses['MoSold'].astype('object')
houses['YrSold'] = houses['YrSold'].astype('object')
houses['MSSubClass'] = houses['MSSubClass'].astype('object')


That's a good observation about the month being categorical, maybe the year in which it was sold is similar.

Does this also apply to the year in which things were built? 

I think there are some other features that are stored as numbers but should be categorical, for example MSSubClass, the number represents a category, not a measure of anything.

In [25]:
houses.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SalePrice,LotConfig_labels
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,71.1,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,443.639726,46.549315,567.240411,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,180921.19589,3.019178
std,23.855761,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,456.098091,161.319273,441.866955,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,79442.502883,1.622634
min,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34900.0,0.0
25%,60.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,223.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,129975.0,2.0
50%,70.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,477.5,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,163000.0,4.0
75%,82.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,0.0,808.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,214000.0,4.0
max,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,755000.0,4.0


In [26]:
pd.set_option('display.max_columns', None)
houses.describe(include='object')

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
count,1460,1460,1460,1460.0,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460.0,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460.0,1460,1460,1460,1460,1460,1460.0,1460.0,1460.0,1460,1460,1460,1460
unique,15,5,2,3.0,4,4,2,5,3,25,9,8,5,8,6,8,15,16,4.0,4,5,6,5,5,5,7,7,6,5,2,5,4,7,6.0,7,4,6,6,3,4.0,5.0,5.0,12,5,9,6
top,20,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,TA,TA,No,Unf,Unf,GasA,Ex,Y,SBrkr,TA,Typ,,Attchd,Unf,TA,TA,Y,,,,6,2009,WD,Normal
freq,536,1151,1454,1369.0,925,1311,1459,1052,1382,225,1260,1445,1220,726,1141,1434,515,504,872.0,906,1282,647,649,1311,953,430,1256,1428,741,1365,1335,735,1360,690.0,870,605,1311,1326,1340,1453.0,1179.0,1406.0,253,338,1267,1198


## Convert Categorical Variables to Dummy Variables

In [27]:
houses_categorical_cols = houses.select_dtypes(exclude='number')

In [28]:
#Uncomment this to see the plots

# import matplotlib.pyplot as plt


# for col in houses_categorical_cols:
      
#     houses.groupby(col)['SalePrice'].median().plot(kind='bar',figsize=(2,2)) 
#     plt.show()

# for col in houses_categorical_cols:
#     houses.plot(x=col,y='SalePrice', kind='scatter',figsize=(2,2))
#     plt.show()

Based on the plots above we can see which categories seem to have rankings, for example it's obvious that the condition of the house or the condition or the kitchen can be ordered and have impact on the price of the house. So we can use Label encoding for those. The neighborhood does not have this property so maybe it's better to use One-Hot encoding. The other option is to rank the neighborhoods (or whatever other feature) by the height of the bar in each graph and use that to say that some neighborhoods are more desirable than others. We have to balance out the integrity of the data with having too many dimensions.

In [29]:
columns_to_process = ["MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope",
                        "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", 
                        "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond", "Foundation", "BsmtQual", 
                        "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "CentralAir", 
                        "Electrical", "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", 
                        "GarageCond", "PavedDrive", "PoolQC", "Fence", "MiscFeature", "MoSold", "SaleType", "SaleCondition"]

#Categories with obvious rankings:
#Street, Utilities, ExterQual, ExterCond, BsmtQual, BsmtCond, BsmtExposure(?), Heating(?), 
#HeatingQC, CentralAir, Electrical, KitchenQual, FireplaceQu, GarageFinish, GarageQual, 
#GarageCond, PavedDrive, PoolQC


columns_with_rankings = ['Street', 'Utilities', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'Heating', 
                          'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual', 
                          'GarageCond', 'PavedDrive', 'PoolQC']

columns_for_dummies = [col for col in columns_to_process if col not in columns_with_rankings]

houses = pd.get_dummies(houses, columns=columns_for_dummies, prefix=columns_for_dummies, drop_first = True, dtype=int)   

#Maybe for now:

#labelencoder = LabelEncoder()
#houses_categorical_cols = houses.select_dtypes(exclude='number')

#for col in columns_with_rankings:

#   houses[col] = labelencoder.fit_transform(houses[col])




In [30]:
#Aternatively:
from sklearn.preprocessing import OrdinalEncoder

columns_with_rankings = ['Street', 'Utilities', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                         'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'FireplaceQu', 
                         'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Heating']

#Street

ordinalencoder = OrdinalEncoder(categories=[['Grvl','Pave']])
houses['Street'] = ordinalencoder.fit_transform(houses['Street'].values.reshape(-1,1))

#Utilities

ordinalencoder = OrdinalEncoder(categories=[['ELO','NoSeWa','NoSewr','AllPub']])
houses['Utilities'] = ordinalencoder.fit_transform(houses['Utilities'].values.reshape(-1,1))

#ExterQual, ExterCond, HeatingQC, KitchenQual,

ordinalencoder = OrdinalEncoder(categories=[['Po','Fa','TA','Gd','Ex']])
houses['ExterQual'] = ordinalencoder.fit_transform(houses['ExterQual'].values.reshape(-1,1))
houses['ExterCond'] = ordinalencoder.fit_transform(houses['ExterCond'].values.reshape(-1,1))
houses['KitchenQual'] = ordinalencoder.fit_transform(houses['KitchenQual'].values.reshape(-1,1))
houses['HeatingQC'] = ordinalencoder.fit_transform(houses['HeatingQC'].values.reshape(-1,1))

#BsmtQual, BsmtCond, FireplaceQu, GarageQual, GarageCond

ordinalencoder = OrdinalEncoder(categories=[['None','Po','Fa','TA','Gd','Ex']])
houses['BsmtQual'] = ordinalencoder.fit_transform(houses['BsmtQual'].values.reshape(-1,1))
houses['BsmtCond'] = ordinalencoder.fit_transform(houses['BsmtCond'].values.reshape(-1,1))
houses['FireplaceQu'] = ordinalencoder.fit_transform(houses['FireplaceQu'].values.reshape(-1,1))
houses['GarageQual'] = ordinalencoder.fit_transform(houses['GarageQual'].values.reshape(-1,1))
houses['GarageCond'] = ordinalencoder.fit_transform(houses['GarageCond'].values.reshape(-1,1))



#BsmtExposure

ordinalencoder = OrdinalEncoder(categories=[['None','No','Mn','Av','Gd']])
houses['BsmtExposure'] = ordinalencoder.fit_transform(houses['BsmtExposure'].values.reshape(-1,1))

#CentralAir

ordinalencoder = OrdinalEncoder(categories=[['N','Y']])
houses['CentralAir'] = ordinalencoder.fit_transform(houses['CentralAir'].values.reshape(-1,1))

#Electrical

ordinalencoder = OrdinalEncoder(categories=[['Mix','FuseP','FuseF','FuseA','SBrkr']])
houses['Electrical'] = ordinalencoder.fit_transform(houses['Electrical'].values.reshape(-1,1))

#GarageFinish

ordinalencoder = OrdinalEncoder(categories=[['None','Unf','RFn','Fin']])
houses['GarageFinish'] = ordinalencoder.fit_transform(houses['GarageFinish'].values.reshape(-1,1))

#PavedDrive

ordinalencoder = OrdinalEncoder(categories=[['N','P','Y']])
houses['PavedDrive'] = ordinalencoder.fit_transform(houses['PavedDrive'].values.reshape(-1,1))

#PoolQC

ordinalencoder = OrdinalEncoder(categories=[['None','Fa','TA','Gd','Ex']])
houses['PoolQC'] = ordinalencoder.fit_transform(houses['PoolQC'].values.reshape(-1,1))

#Heating

originalencoder = OrdinalEncoder(categories=[['GasA', 'GasW', 'Grav', 'Wall', 'OthW', 'Floor']])
houses['Heating'] = originalencoder.fit_transform(houses['Heating'].values.reshape(-1,1))

In [31]:
houses.KitchenAbvGr.describe()

count    1460.000000
mean        1.046575
std         0.220338
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: KitchenAbvGr, dtype: float64

In [32]:
houses.shape

(1460, 228)

Now, we need to replace `True` values with 1 and `False` values with 0

Edit: This is not needed anymore since we can ask the get_dummy function to give us integers instead of bools

In [33]:
#houses = houses.replace({True: 1, False: 0})

In [34]:
houses.info

<bound method DataFrame.info of      MSSubClass  LotFrontage  LotArea  Street  Utilities  OverallQual  \
0            60         65.0   8450.0     1.0        3.0            7   
1            20         80.0   9600.0     1.0        3.0            6   
2            60         68.0  11250.0     1.0        3.0            7   
3            70         60.0   9550.0     1.0        3.0            7   
4            60         84.0  14260.0     1.0        3.0            8   
...         ...          ...      ...     ...        ...          ...   
1455         60         62.0   7917.0     1.0        3.0            6   
1456         20         85.0  13175.0     1.0        3.0            6   
1457         70         66.0   9042.0     1.0        3.0            7   
1458         20         68.0   9717.0     1.0        3.0            5   
1459         20         75.0   9937.0     1.0        3.0            5   

      OverallCond  YearBuilt  YearRemodAdd  MasVnrArea  ExterQual  ExterCond  \
0          

### Handle Outliers

In [35]:
from sklearn.preprocessing import MinMaxScaler #scale features

scaler = MinMaxScaler()

In [36]:
houses_array = scaler.fit_transform(houses.select_dtypes(include='number'))
houses_array

array([[0.15068493, 0.0334198 , 1.        , ..., 0.        , 1.        ,
        0.        ],
       [0.20205479, 0.03879502, 1.        , ..., 0.        , 1.        ,
        0.        ],
       [0.1609589 , 0.04650728, 1.        , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.15410959, 0.03618687, 1.        , ..., 0.        , 1.        ,
        0.        ],
       [0.1609589 , 0.03934189, 1.        , ..., 0.        , 1.        ,
        0.        ],
       [0.18493151, 0.04037019, 1.        , ..., 0.        , 1.        ,
        0.        ]])

In [37]:
houses[houses.select_dtypes(include='number').columns] = pd.DataFrame(houses_array, columns=houses.select_dtypes(include='number').columns, index=houses.select_dtypes(include='number').index)
houses

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Utilities,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MiscVal,YrSold,SalePrice,LotConfig_labels,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Alley_None,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_None,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_None,BsmtFinType2_Rec,BsmtFinType2_Unf,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MoSold_2,MoSold_3,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,0.150685,0.033420,1.0,1.0,0.666667,0.500,0.949275,0.883333,0.122500,0.666667,0.50,0.8,0.75,0.25,0.125089,0.000000,0.064212,0.140098,0.0,1.00,1.0,1.00,0.119780,0.413559,0.0,0.259231,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.500000,0.000000,0.0,0.949275,0.666667,0.50,0.386460,0.6,0.6,1.0,0.000000,0.111517,0.000000,0.0,0.0,0.0,0.0,0.00000,2008,0.241078,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,20,0.202055,0.038795,1.0,1.0,0.555556,0.875,0.753623,0.433333,0.000000,0.333333,0.50,0.8,0.75,1.00,0.173281,0.000000,0.121575,0.206547,0.0,1.00,1.0,1.00,0.212942,0.000000,0.0,0.174830,0.000000,0.5,0.666667,0.0,0.375,0.333333,0.333333,0.333333,0.333333,0.6,0.753623,0.666667,0.50,0.324401,0.6,0.6,1.0,0.347725,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,2007,0.203583,0.5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,60,0.160959,0.046507,1.0,1.0,0.666667,0.500,0.934783,0.866667,0.101250,0.666667,0.50,0.8,0.75,0.50,0.086109,0.000000,0.185788,0.150573,0.0,1.00,1.0,1.00,0.134465,0.419370,0.0,0.273549,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.333333,0.333333,0.6,0.934783,0.666667,0.50,0.428773,0.6,0.6,1.0,0.000000,0.076782,0.000000,0.0,0.0,0.0,0.0,0.00000,2008,0.261908,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,70,0.133562,0.038561,1.0,1.0,0.666667,0.500,0.311594,0.333333,0.000000,0.333333,0.50,0.6,1.00,0.25,0.038271,0.000000,0.231164,0.123732,0.0,0.75,1.0,1.00,0.143873,0.366102,0.0,0.260550,0.333333,0.0,0.333333,0.0,0.375,0.333333,0.666667,0.416667,0.333333,0.8,0.913043,0.333333,0.75,0.452750,0.6,0.6,1.0,0.000000,0.063985,0.492754,0.0,0.0,0.0,0.0,0.00000,2006,0.145952,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,60,0.215753,0.060576,1.0,1.0,0.777778,0.500,0.927536,0.833333,0.218750,0.666667,0.50,0.8,0.75,0.75,0.116052,0.000000,0.209760,0.187398,0.0,1.00,1.0,1.00,0.186095,0.509927,0.0,0.351168,0.333333,0.0,0.666667,0.5,0.500,0.333333,0.666667,0.583333,0.333333,0.6,0.927536,0.666667,0.75,0.589563,0.6,0.6,1.0,0.224037,0.153565,0.000000,0.0,0.0,0.0,0.0,0.00000,2008,0.298709,0.5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,0.140411,0.030929,1.0,1.0,0.555556,0.500,0.920290,0.833333,0.000000,0.333333,0.50,0.8,0.75,0.25,0.000000,0.000000,0.407962,0.155974,0.0,1.00,1.0,1.00,0.142038,0.336077,0.0,0.247362,0.000000,0.0,0.666667,0.5,0.375,0.333333,0.333333,0.416667,0.333333,0.6,0.920290,0.666667,0.50,0.324401,0.6,0.6,1.0,0.000000,0.073126,0.000000,0.0,0.0,0.0,0.0,0.00000,2007,0.194556,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1456,20,0.219178,0.055505,1.0,1.0,0.555556,0.625,0.768116,0.633333,0.074375,0.333333,0.50,0.8,0.75,0.25,0.139972,0.110583,0.252140,0.252373,0.0,0.50,1.0,1.00,0.399036,0.000000,0.0,0.327619,0.333333,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.416667,0.666667,0.6,0.768116,0.333333,0.50,0.352609,0.6,0.6,1.0,0.407235,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,2010,0.243161,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1457,70,0.154110,0.036187,1.0,1.0,0.666667,1.000,0.500000,0.933333,0.000000,1.000000,0.75,0.6,1.00,0.25,0.048724,0.000000,0.375428,0.188543,0.0,1.00,1.0,1.00,0.195961,0.557869,0.0,0.377920,0.000000,0.0,0.666667,0.0,0.500,0.333333,0.666667,0.583333,0.666667,0.8,0.500000,0.666667,0.25,0.177715,0.6,0.6,1.0,0.000000,0.109689,0.000000,0.0,0.0,0.0,0.0,0.16129,2010,0.321622,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1458,20,0.160959,0.039342,1.0,1.0,0.444444,0.625,0.565217,0.766667,0.000000,0.333333,0.50,0.6,0.75,0.50,0.008682,0.698100,0.000000,0.176432,0.0,0.75,1.0,0.75,0.170721,0.000000,0.0,0.140166,0.333333,0.0,0.333333,0.0,0.250,0.333333,0.666667,0.250000,0.000000,0.0,0.565217,0.333333,0.25,0.169252,0.6,0.6,1.0,0.427071,0.000000,0.202899,0.0,0.0,0.0,0.0,0.00000,2010,0.148903,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [38]:
houses.describe()

Unnamed: 0,LotFrontage,LotArea,Street,Utilities,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MiscVal,SalePrice,LotConfig_labels,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Alley_None,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_None,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_None,BsmtFinType2_Rec,BsmtFinType2_Unf,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MoSold_2,MoSold_3,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,0.171575,0.04308,0.99589,0.999315,0.566591,0.571918,0.719332,0.581096,0.064448,0.465297,0.52089,0.697808,0.733733,0.407534,0.078604,0.03158,0.242826,0.173065,0.007808,0.786301,0.934932,0.972432,0.190139,0.168035,0.010218,0.222582,0.141781,0.028767,0.521689,0.191438,0.358305,0.348858,0.503881,0.376484,0.204338,0.365068,0.757301,0.571918,0.441781,0.333554,0.562055,0.561781,0.928082,0.10997,0.085302,0.039772,0.006712,0.031377,0.003738,0.003253,0.002806,0.202779,0.754795,0.044521,0.010959,0.788356,0.149315,0.937671,0.028082,0.028082,0.006849,0.633562,0.034247,0.024658,0.897945,0.064384,0.032192,0.00274,0.720548,0.044521,0.008904,0.00137,0.010959,0.039726,0.019178,0.10274,0.034932,0.068493,0.05411,0.025342,0.011644,0.033562,0.15411,0.006164,0.05,0.028082,0.05274,0.077397,0.017123,0.050685,0.040411,0.058904,0.017123,0.026027,0.007534,0.055479,0.863014,0.005479,0.013014,0.007534,0.017808,0.00137,0.003425,0.00411,0.989726,0.000685,0.00137,0.000685,0.000685,0.00137,0.021233,0.035616,0.029452,0.078082,0.009589,0.49726,0.005479,0.007534,0.304795,0.025342,0.044521,0.781507,0.007534,0.19589,0.004795,0.00137,0.982192,0.000685,0.000685,0.000685,0.007534,0.003425,0.00411,0.000685,0.00137,0.034247,0.000685,0.041781,0.152055,0.000685,0.150685,0.073973,0.00137,0.017123,0.35274,0.141096,0.017808,0.002055,0.004795,0.017123,0.000685,0.041096,0.141781,0.006849,0.146575,0.000685,0.09726,0.003425,0.017808,0.345205,0.134932,0.026027,0.304795,0.59726,0.087671,0.434247,0.443151,0.016438,0.00411,0.002055,0.10137,0.286301,0.050685,0.025342,0.091096,0.294521,0.022603,0.009589,0.031507,0.026027,0.036986,0.860274,0.003425,0.021233,0.023288,0.010274,0.000685,0.931507,0.59589,0.013014,0.060274,0.006164,0.265068,0.055479,0.036986,0.107534,0.007534,0.807534,0.963014,0.00137,0.033562,0.000685,0.035616,0.072603,0.096575,0.139726,0.173288,0.160274,0.083562,0.043151,0.060959,0.05411,0.040411,0.00274,0.00137,0.006164,0.003425,0.003425,0.083562,0.002055,0.867808,0.00274,0.008219,0.013699,0.820548,0.085616
std,0.081698,0.046653,0.063996,0.026171,0.153666,0.1391,0.218862,0.34409,0.112957,0.191427,0.087763,0.175296,0.13804,0.266848,0.080811,0.109443,0.189155,0.071801,0.061236,0.239875,0.246731,0.098664,0.088708,0.211394,0.085005,0.098998,0.17297,0.119376,0.183639,0.251443,0.101972,0.073446,0.221253,0.135449,0.214889,0.362175,0.190629,0.29761,0.186829,0.150779,0.14458,0.143937,0.248296,0.146253,0.121126,0.110723,0.057711,0.116161,0.054441,0.051015,0.032008,0.110321,0.405659,0.206319,0.104145,0.408614,0.356521,0.241835,0.165264,0.165264,0.082505,0.481996,0.181924,0.155132,0.302824,0.245519,0.17657,0.052289,0.448884,0.206319,0.093973,0.036999,0.104145,0.195382,0.137198,0.303723,0.183669,0.252677,0.226311,0.157217,0.107313,0.18016,0.361177,0.078298,0.21802,0.165264,0.22359,0.267312,0.129775,0.219429,0.196989,0.235526,0.129775,0.159271,0.086502,0.228992,0.343951,0.073846,0.113372,0.086502,0.132299,0.036999,0.05844,0.063996,0.100873,0.026171,0.036999,0.026171,0.026171,0.036999,0.144209,0.185395,0.169128,0.268393,0.097486,0.500164,0.073846,0.086502,0.460478,0.157217,0.206319,0.413365,0.086502,0.397021,0.0691,0.036999,0.132299,0.026171,0.026171,0.026171,0.086502,0.05844,0.063996,0.026171,0.036999,0.181924,0.026171,0.200157,0.359197,0.026171,0.357864,0.261816,0.036999,0.129775,0.477986,0.34824,0.132299,0.045299,0.0691,0.129775,0.026171,0.19858,0.348945,0.082505,0.353803,0.026171,0.296413,0.05844,0.132299,0.475598,0.341767,0.159271,0.460478,0.490617,0.282913,0.495827,0.496928,0.127198,0.063996,0.045299,0.301921,0.452187,0.219429,0.157217,0.287844,0.455983,0.148684,0.097486,0.174743,0.159271,0.188793,0.346821,0.05844,0.144209,0.150867,0.100873,0.026171,0.252677,0.490887,0.113372,0.238075,0.078298,0.441521,0.228992,0.188793,0.309897,0.086502,0.394372,0.188793,0.036999,0.18016,0.026171,0.185395,0.259572,0.29548,0.346821,0.378625,0.366986,0.276824,0.203266,0.239337,0.226311,0.196989,0.052289,0.036999,0.078298,0.05844,0.05844,0.276824,0.045299,0.338815,0.052289,0.090317,0.116277,0.383862,0.279893
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.133562,0.029229,1.0,1.0,0.444444,0.5,0.594203,0.283333,0.0,0.333333,0.5,0.6,0.75,0.25,0.0,0.0,0.095462,0.130237,0.0,0.5,1.0,1.0,0.125746,0.0,0.0,0.149868,0.0,0.0,0.333333,0.0,0.25,0.333333,0.333333,0.25,0.0,0.0,0.630435,0.333333,0.25,0.235896,0.6,0.6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13203,0.5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,0.167808,0.038227,1.0,1.0,0.555556,0.5,0.731884,0.733333,0.0,0.333333,0.5,0.8,0.75,0.25,0.067948,0.0,0.204409,0.162275,0.0,1.0,1.0,1.0,0.172786,0.0,0.0,0.212886,0.0,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.333333,0.333333,0.4,0.768116,0.666667,0.5,0.338505,0.6,0.6,1.0,0.0,0.045704,0.0,0.0,0.0,0.0,0.0,0.0,0.177892,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,0.208904,0.04815,1.0,1.0,0.666667,0.625,0.927536,0.9,0.102656,0.666667,0.5,0.8,0.75,0.5,0.126196,0.0,0.34589,0.21248,0.0,1.0,1.0,1.0,0.2426,0.352542,0.0,0.271807,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.416667,0.333333,0.8,0.934783,0.666667,0.5,0.406206,0.6,0.6,1.0,0.196033,0.124314,0.0,0.0,0.0,0.0,0.0,0.0,0.248715,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [39]:
houses.to_csv('../data/cleaned_data.csv')