# House Prices: Advanced Regression Techniques

<p><a name="sections"></a></p>


## Sections

- <a href="#description">Description</a><br>
- <a href="#executive_summary">Executive Summary</a><br>
- <a href="#data_import_and_cleaning">Data Import and Cleaning</a><br>
- <a href="#Feature Engineering (Advanced)">Feature Engineering (Advanced)</a><br>
- <a href="#data_cleaning_and_eda">Data Cleaning and EDA</a><br>
- <a href="#models">Models</a><br>
    - <a href="#lasso">Lasso</a><br>
    - <a href="#decision_tree">Decision Tree</a><br>
    - <a href="#bagging">Bagging</a><br>
    - <a href="#random_forest">Random Forest</a><br>
    - <a href="#boosting">Boosting</a><br>
    - <a href="#xgboost">XGBoost</a><br>
- <a href="#submission">Submission</a><br>

<p><a name="description"></a></p>

## Description

<p><a name="executive_summary"></a></p>

## Executive Summary

<p><a name="data_import_and_cleaning"></a></p>

## Data Import and Cleaning

**Import libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from scipy.stats import skew
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV,ElasticNet, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error
# from sklearn.model_selection import RepeatedKFold
# from sklearn.neighbors import KNeighborsRegressor
# import xgboost as XGB
# from sklearn.model_selection import RepeatedKFold #why it doesn't work???
# from feature_selection import VarianceReduction

In [2]:
# Record the interpreter version that produced the outputs in this notebook.
import sys
print(sys.version)

3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)]


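**Note:** if `from sklearn.model_selection import RepeatedKFold` fails with `ImportError: cannot import name 'RepeatedKFold'`, the installed scikit-learn predates the release that introduced it (0.19).

Solution (Windows):

`conda config --append channels conda-forge`

`conda install scikit-learn=0.19.2`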

**read training data**

In [3]:
# Load the training set, report its shape, then order the columns
# alphabetically so train and test can share one column layout.
train = pd.read_csv("train.csv")
print(f"train : {train.shape}")
alphabetical_columns = sorted(train.columns)
train = train.reindex(columns=alphabetical_columns)
train.head(1)

train : (1460, 81)


Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,,3,1Fam,TA,No,706,0,...,WD,0,Pave,8,856,AllPub,0,2003,2003,2008


In [4]:
# Load the test set and report its raw shape.
test = pd.read_csv("test.csv")
print(f"test : {test.shape}")
# Re-index against *train's* columns: besides sorting, this adds the columns
# test lacks (SalePrice) as all-NaN so both frames line up for concatenation.
train_layout = sorted(train.columns)
test = test.reindex(columns=train_layout)
test.head(1)

test : (1459, 80)


Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,896,0,0,,2,1Fam,TA,No,468.0,144.0,...,WD,120,Pave,5,882.0,AllPub,140,1961,1961,2010


In [5]:
# Check for duplicates: compare the number of rows with the number of distinct Ids.
idsTotal = len(train)
idsUnique = len(set(train["Id"]))
idsDupli = idsTotal - idsUnique
print(f"There are {idsDupli} duplicate IDs for {idsTotal} total entries")
# Keep the target as a one-column DataFrame; later cells re-attach it by row index.
target_var = train[["SalePrice"]]

There are 0 duplicate IDs for 1460 total entries


In [6]:
# combine train and test sets into one frame with a fresh 0..n-1 row index
data = pd.concat([train, test], axis=0, ignore_index=True)
# spot-check one row from each part plus the combined shape
print(data.loc[[0, 2000], "Id"], data.shape)

0          1
2000    2001
Name: Id, dtype: int64 (2919, 81)


In [7]:
# From here on `train` holds the predictors of the combined train+test frame.
train = data.drop(columns=["SalePrice"])

## Data Types

In [8]:
# Differentiate numerical features (minus the target) and categorical features
# by dtype: object columns are categorical, everything else is numerical.
train_cat = train.select_dtypes(include=["object"])
train_num = train.select_dtypes(exclude=["object"])
categorical_features = train_cat.columns
numerical_features = train_num.columns
print(f"Numerical features : {len(numerical_features)}")
print(f"Categorical features : {len(categorical_features)}")

Numerical features : 37
Categorical features : 43


In [9]:
# List the columns of each dtype group that contain at least one missing value.
for label, df in (("Numerical", train_num), ("Categorical", train_cat)):
    na_columns = df.columns[df.isnull().any()].values
    print(f"{label} features w/ NAs: {na_columns}")

Numerical features w/ NAs: ['BsmtFinSF1' 'BsmtFinSF2' 'BsmtFullBath' 'BsmtHalfBath' 'BsmtUnfSF'
 'GarageArea' 'GarageCars' 'GarageYrBlt' 'LotFrontage' 'MasVnrArea'
 'TotalBsmtSF']
Categorical features w/ NAs: ['Alley' 'BsmtCond' 'BsmtExposure' 'BsmtFinType1' 'BsmtFinType2'
 'BsmtQual' 'Electrical' 'Exterior1st' 'Exterior2nd' 'Fence' 'FireplaceQu'
 'Functional' 'GarageCond' 'GarageFinish' 'GarageQual' 'GarageType'
 'KitchenQual' 'MSZoning' 'MasVnrType' 'MiscFeature' 'PoolQC' 'SaleType'
 'Utilities']


## Missing value imputation

In [10]:
# Handle missing values for features where median/mean or most common value
# doesn't make sense. Some entries are redundant (the column has no NAs) but
# are kept so the rules read as one complete table.
#
# Every fill is written as `train[col] = train[col].fillna(...)`. The earlier
# `train[col].fillna(..., inplace=True)` form is chained in-place assignment,
# which no longer updates `train` once pandas Copy-on-Write is active.

# Categorical columns: NA replaced by an explicit label.
# NOTE(review): "None" is one of read_csv's default NA markers on pandas >= 2.0,
# so the Alley / MasVnrType labels do not survive a plain CSV round trip there —
# confirm that downstream reads restrict NA parsing.
categorical_na_fill = {
    # Alley : data description says NA means "no alley access"
    "Alley": "None",
    # BsmtQual etc : data description says NA for basement features is "no basement"
    "BsmtCond": "No",
    "BsmtExposure": "No",
    "BsmtFinType1": "No",
    "BsmtFinType2": "No",
    "BsmtQual": "No",
    # Fence : data description says NA means "no fence"
    "Fence": "No",
    # FireplaceQu : data description says NA means "no fireplace"
    "FireplaceQu": "No",
    # Functional : data description says NA means typical
    "Functional": "Typ",
    # GarageType etc : data description says NA for garage features is "no garage"
    "GarageType": "No",
    "GarageFinish": "No",
    "GarageQual": "No",
    "GarageCond": "No",
    # HeatingQC : NA most likely means typical
    "HeatingQC": "TA",
    # KitchenQual : NA most likely means typical
    "KitchenQual": "TA",
    # LotShape : NA most likely means regular
    "LotShape": "Reg",
    # MasVnrType : NA most likely means no veneer
    "MasVnrType": "None",
    # MiscFeature : data description says NA means "no misc feature"
    "MiscFeature": "No",
    # PavedDrive : NA most likely means not paved
    "PavedDrive": "N",
    # PoolQC : data description says NA means "no pool"
    "PoolQC": "No",
    # SaleCondition : NA most likely means normal sale
    "SaleCondition": "Normal",
    # Utilities : NA most likely means all public utilities
    "Utilities": "AllPub",
    # Electrical : NA most likely means SBrkr
    "Electrical": "SBrkr",
}

# Numerical columns where NA is taken to mean "none of this feature", i.e. 0.
zero_na_fill = [
    # NA Bsmt means 0
    "BsmtFinSF1", "BsmtFinSF2", "TotalBsmtSF", "BsmtUnfSF", "BsmtHalfBath", "BsmtFullBath",
    "EnclosedPorch",   # NA most likely means no enclosed porch
    "Fireplaces",
    "GarageArea", "GarageCars",
    "HalfBath",        # NA most likely means no half baths above grade
    "KitchenAbvGr",    # NA most likely means 0
    "LotFrontage",     # NA most likely means no lot frontage
    "MasVnrArea",
    "MiscVal",
    "OpenPorchSF",     # NA most likely means no open porch
    "PoolArea",
    "ScreenPorch",     # NA most likely means no screen porch
    "TotRmsAbvGrd",    # NA most likely means 0
]

for col, label in categorical_na_fill.items():
    train[col] = train[col].fillna(label)

for col in zero_na_fill:
    train[col] = train[col].fillna(0)

# GarageYrBlt: a missing garage year falls back to the house's construction year ...
garage_year_missing = train["GarageYrBlt"].isnull()
train.loc[garage_year_missing, "GarageYrBlt"] = train.loc[garage_year_missing, "YearBuilt"]
# ... and so does a garage year later than 2020, treated as a data-entry error.
garage_year_implausible = train["GarageYrBlt"] > 2020
train.loc[garage_year_implausible, "GarageYrBlt"] = train.loc[garage_year_implausible, "YearBuilt"]

# fill w/ mode (the most frequent value over the combined train+test rows)
for col in ["Exterior1st", "Exterior2nd", "SaleType"]:
    train[col] = train[col].fillna(train[col].mode()[0])

# MSZoning: use the most frequent zoning within the same neighborhood.
train["MSZoning"] = train.groupby("Neighborhood")["MSZoning"].transform(
    lambda zoning: zoning.fillna(zoning.mode()[0])
)

In [11]:
# Some numerical features are actually really categories: recode the
# dwelling-class codes as "SC<code>" and the sale month numbers as abbreviations.
subclass_codes = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190]
month_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
train = train.replace({
    "MSSubClass": {code: "SC" + str(code) for code in subclass_codes},
    "MoSold": dict(enumerate(month_labels, start=1)),
})


In [12]:
# non-missing dataset: re-split by dtype (MSSubClass / MoSold are now strings)
# and confirm that imputation left no missing values in either group.
categorical_features = train.select_dtypes(include = ["object"]).columns
numerical_features = train.select_dtypes(exclude = ["object"]).columns
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
# The second check used to repeat the numerical count; report the categorical one instead.
print("NAs for categorical features in train : " + str(train_cat.isnull().values.sum()))

Numerical features : 35
Categorical features : 45
NAs for numerical features in train : 0
Remaining NAs for numerical features in train : 0


In [13]:
# Checkpoint the imputed data. Frames are aligned on the row index, so rows
# without a target_var entry get an empty SalePrice.
imputed_frames = [target_var, train_num, train_cat]
pd.concat(imputed_frames, axis=1).to_csv("dataset_1_0.csv", index=False)

In [14]:
# Reload the imputed checkpoint. Only blank cells are parsed as NA: on
# pandas >= 2.0 the default NA markers include "None", which would turn the
# imputed "None" labels (Alley, MasVnrType) back into missing values.
train = pd.read_csv('dataset_1_0.csv', keep_default_na=False, na_values=[''])
# Drop the very large houses (GrLivArea >= 4500) and renumber the rows.
# NOTE(review): the filter runs on the combined train+test frame (2919 -> 2916
# rows in the recorded output), so it can also remove test rows — confirm the
# submission step accounts for that.
train = train[train.GrLivArea < 4500].reset_index(drop=True)
print(np.shape(train))
train.to_csv('dataset1_1_delete_outliers.csv', index=False)

(2916, 81)


In [15]:
#2 add features
# Reload the outlier-free checkpoint. Only blank cells are parsed as NA so the
# imputed "None" labels are not read back as missing (pandas >= 2.0 treats
# "None" as NA by default).
train = pd.read_csv('dataset1_1_delete_outliers.csv', keep_default_na=False, na_values=[''])
# Target values exist only for the original training Ids (Id <= idsTotal).
target_var = train.loc[train.Id <= idsTotal, ['SalePrice']]
train = train.drop(['SalePrice'], axis=1)
categorical_features = train.select_dtypes(include = ["object"]).columns
numerical_features = train.select_dtypes(exclude = ["object"]).columns
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
# The second check used to repeat the numerical count; report the categorical one instead.
print("NAs for categorical features in train : " + str(train_cat.isnull().values.sum()))

Numerical features : 35
Categorical features : 45
NAs for numerical features in train : 0
Remaining NAs for numerical features in train : 0


In [16]:
#Add features
# Transform Time variables: for every year-like column except YrSold, add
# "<column>Age" = sale year minus that year.
for token in ("Year", "Yr"):
    cols = [name for name in train_num.columns if token in name and "YrSold" not in name]
    print(cols)
    for name in cols:
        train[name + "Age"] = train["YrSold"] - train[name]
# After the ages are computed the sale year is treated as a category.
train["YrSold"] = train["YrSold"].astype(str)

['YearBuilt', 'YearRemodAdd']
['GarageYrBlt']


In [17]:
#add total; not redundant when taking log
#From Peter
basement_finished = train["BsmtFinSF1"] + train["BsmtFinSF2"]
train["TotalBsmtSF"] = basement_finished + train["BsmtUnfSF"]
train["TotalSF"] = train["TotalBsmtSF"] + train["1stFlrSF"] + train["2ndFlrSF"]
#From Zhongjin (Gene)
# a half bath counts as 0.5 of a bath
train["BsmtTotalBath"] = train["BsmtFullBath"] + (0.5 * train["BsmtHalfBath"])
train["TotalBath"] = train["FullBath"] + (0.5 * train["HalfBath"])
train["BsmtFinSF"] = train["BsmtFinSF1"] + train["BsmtFinSF2"]
# Total SF for porch
train["AllPorchSF"] = (
    train["OpenPorchSF"] + train["EnclosedPorch"]
    + train["3SsnPorch"] + train["ScreenPorch"]
)

In [18]:
# Add a log1p-transformed copy ("<column>_log") of every area / value style
# column and remember the source columns in logvar_list.
logvar_list = []
for s in ['SF', 'Area', 'Val', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'LotFrontage']:
    # Columns are matched by substring. Skipping names that already end in
    # "_log" keeps the cell idempotent: re-running it would otherwise match the
    # generated columns and create "<column>_log_log".
    cols = [col for col in train.columns
            if s in col
            and 'SFoyer' not in col
            and '_absent' not in col
            and not col.endswith('_log')]
    print(cols)
    logvar_list.extend(cols)
    for col in cols:
        train[col + '_log'] = np.log1p(train[col])

['1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'LowQualFinSF', 'OpenPorchSF', 'TotalBsmtSF', 'WoodDeckSF', 'TotalSF', 'BsmtFinSF', 'AllPorchSF']
['GarageArea', 'GrLivArea', 'LotArea', 'MasVnrArea', 'PoolArea']
['MiscVal']
['3SsnPorch']
['EnclosedPorch']
['ScreenPorch']
['LotFrontage']


In [19]:
# Checkpoint the frame, including the age, total and log features added so far.
train.to_csv('dataset2_log.csv',index=False)

In [20]:
# add dummies for numeric values=0: one 0/1 "<column>_absent" indicator for
# every numerical column that contains at least one zero.
is_zero = train_num.eq(0)
cols = train_num.columns[is_zero.any()]
train_num_absent = is_zero[cols].astype(int).add_suffix("_absent")
print(f"Number of numerical variables: {len(train_num.columns)}")
print(f"Number of dummies for 0s: {len(train_num_absent.columns)}")

Number of numerical variables: 35
Number of dummies for 0s: 24


In [21]:
# add dummies for extreme numeric values
# start from an empty frame that shares train's row index; the indicator columns are added to it below
df_dummy=pd.DataFrame(index=train.index)
#create dummy for rooms 12 or above
def More_Than_12_Rooms(q):
    """Return 1 when ``q`` is 12 or more, otherwise 0."""
    return 1 if q >= 12 else 0
    
df_dummy["12_rooms_plus_dummy"] = train["TotRmsAbvGrd"].map(More_Than_12_Rooms)

#create dummy for 2 or more kitchens above ground
def More_Than_2_kitchen(q):
    """Return 1 when ``q`` is 2 or more, otherwise 0."""
    return 1 if q >= 2 else 0
    
df_dummy["2_kitchens_plus_dummy"] = train["KitchenAbvGr"].map(More_Than_2_kitchen)

#create dummy for 5 or more bedrooms above ground
def More_Than_5_bedroom(q):
    """Return 1 when ``q`` is 5 or more, otherwise 0."""
    return 1 if q >= 5 else 0
    
df_dummy["5_bedroom_plus_dummy"] = train["BedroomAbvGr"].map(More_Than_5_bedroom)

#create dummy for 1 or fewer bedrooms above ground
def Less_Than_1_bedroom(q):
    """Return 1 when ``q`` is 1 or less, otherwise 0."""
    return 1 if q <= 1 else 0
    
df_dummy["1_bedroom_less_dummy"] = train["BedroomAbvGr"].map(Less_Than_1_bedroom)

def More_Than_4_car(q):
    """Return 1 when ``q`` is 4 or more, otherwise 0."""
    return 1 if q >= 4 else 0
df_dummy["4_garage_more_dummy"] = train["GarageCars"].map(More_Than_4_car)
# preview the indicator frame
df_dummy.head(10)

Unnamed: 0,12_rooms_plus_dummy,2_kitchens_plus_dummy,5_bedroom_plus_dummy,1_bedroom_less_dummy,4_garage_more_dummy
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
5,0,0,0,1,0
6,0,0,0,0,0
7,0,0,0,0,0
8,0,1,0,0,0
9,0,1,0,0,0


In [22]:
# Put the target back in front of the features and append both indicator
# frames column-wise (aligned on the row index), then checkpoint the result.
feature_frames = [target_var, train, df_dummy, train_num_absent]
train = pd.concat(feature_frames, axis=1)
train.to_csv("dataset2_numeric_dummy.csv", index=False)

In [23]:
# Preview the assembled frame: target, features, then the indicator columns.
train.head()

Unnamed: 0,SalePrice,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,KitchenAbvGr_absent,LotFrontage_absent,LowQualFinSF_absent,MasVnrArea_absent,MiscVal_absent,OpenPorchSF_absent,PoolArea_absent,ScreenPorch_absent,TotalBsmtSF_absent,WoodDeckSF_absent
0,208500.0,856,854,0,3,706.0,0.0,1.0,0.0,150.0,...,0,0,1,0,1,0,1,1,0,1
1,181500.0,1262,0,0,3,978.0,0.0,0.0,1.0,284.0,...,0,0,1,1,1,1,1,1,0,0
2,223500.0,920,866,0,3,486.0,0.0,1.0,0.0,434.0,...,0,0,1,0,1,0,1,1,0,1
3,140000.0,961,756,0,3,216.0,0.0,1.0,0.0,540.0,...,0,0,1,1,1,0,1,1,0,1
4,250000.0,1145,1053,0,4,655.0,0.0,1.0,0.0,490.0,...,0,0,1,0,1,0,1,1,0,0


In [24]:
# Load the per-Id comparable-price features and add their product as an
# interaction column.
comp = pd.read_csv("dataset2_compPrice.csv").assign(
    compPriceXcompIndex=lambda frame: frame["compPrice"] * frame["compIndex"]
)
comp.head()

Unnamed: 0,Id,compPrice,compIndex,compPriceXcompIndex
0,1,202980.0,37.2,7550856.0
1,2,206791.8,32.4,6700054.32
2,3,180480.0,37.2,6713856.0
3,4,189220.0,31.0,5865820.0
4,5,206150.0,36.4,7503860.0


In [25]:
# Attach the comparable-price features by Id (default inner join) and checkpoint.
train = train.merge(comp, on="Id")
train.to_csv("dataset2_addfeatures.csv", index=False)

In [26]:
# Row/column count after the merge with the comparable-price features.
train.shape

(2916, 143)

In [250]:
#3.1, 3.2, and 3.3 are independent of each other; use the same following input!!!
# Reload the feature-engineered checkpoint. Only blank cells are parsed as NA
# so the imputed "None" labels are not read back as missing (pandas >= 2.0
# treats "None" as NA by default).
train = pd.read_csv('dataset2_addfeatures.csv', keep_default_na=False, na_values=[''])
# The CSV round trip reads YrSold back as a number; make it categorical again.
train['YrSold'] = train['YrSold'].astype(str)
# Target values exist only for the original training Ids (Id <= idsTotal).
target_var = train.loc[train.Id <= idsTotal, ['SalePrice']]
train = train.drop(['SalePrice'], axis=1)
categorical_features = train.select_dtypes(include = ["object"]).columns
numerical_features = train.select_dtypes(exclude = ["object"]).columns
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
# The second check used to repeat the numerical count; report the categorical one instead.
print("NAs for categorical features in train : " + str(train_cat.isnull().values.sum()))

Numerical features : 96
Categorical features : 46
NAs for numerical features in train : 0
Remaining NAs for numerical features in train : 0


In [251]:
# One-hot encode every categorical column and show the resulting shape.
train_cat_dum = pd.get_dummies(train_cat)
train_cat_dum.shape

(2916, 297)

In [232]:
#3.1 encode categorical variable to numeric
# Graeme has edited this to reflect the data visualizations!!!
#
# Shared Po..Ex quality scale; the per-column value for "No" (feature absent)
# differs and is set explicitly on each column that has one.
quality_scale = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
ordinal_maps = {
    "BsmtCond": {"No": 2, **quality_scale},
    "BsmtExposure": {"No": 0, "Mn": 1, "Av": 2, "Gd": 3},
    "BsmtFinType1": {"No": 0, "Unf": 1, "LwQ": 1, "Rec": 1, "BLQ": 1,
                     "ALQ": 1, "GLQ": 2},
    "BsmtQual": {"No": 2, **quality_scale},
    "ExterCond": dict(quality_scale),
    "ExterQual": dict(quality_scale),
    "FireplaceQu": {"No": 1, **quality_scale},
    "GarageCond": {"No": 0, **quality_scale},
    "GarageQual": {"No": 2, **quality_scale},
    "HeatingQC": dict(quality_scale),
    "KitchenQual": dict(quality_scale),
    "LandContour": {"Bnk": 0, "Lvl": 1, "Low": 2, "HLS": 2},
    "LotShape": {"IR3": 1, "IR2": 1, "IR1": 1, "Reg": 2},
    "PavedDrive": {"N": 0, "P": 1, "Y": 2},
    "PoolQC": {"No": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4},
}
# BsmtFinType2, Functional and LandSlope have no mapping here (theirs were
# commented out), so those columns stay as strings.
train = train.replace(ordinal_maps)

encoded_with_target = pd.concat([target_var, train], axis=1)
encoded_with_target.to_csv("dataset3_1.csv", index=False)


In [233]:
# One-hot encode whatever columns are still non-numeric after the ordinal encoding.
X=pd.get_dummies(train)

In [234]:
# Inspect the indicator column generated for dwelling class SC150.
X.MSSubClass_SC150

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
2886    0
2887    0
2888    0
2889    0
2890    0
2891    0
2892    0
2893    0
2894    0
2895    0
2896    0
2897    0
2898    0
2899    0
2900    0
2901    0
2902    0
2903    0
2904    0
2905    0
2906    0
2907    0
2908    0
2909    0
2910    0
2911    0
2912    0
2913    0
2914    0
2915    0
Name: MSSubClass_SC150, Length: 2916, dtype: uint8

In [None]:
# 3.2 Peter's code for clustering of categories
# Placeholder: this cell contains no clustering code; it only writes the current frame out.
train.to_csv('dataset3_2.csv',index=False)


In [181]:
# 3.3 Zhongjin(Gene)'s code for removing duplicates and variables causing multicollinearity (done)

In [201]:
# check pair-wise high correlations: report every numerical feature whose
# strongest correlation with any *other* column is (numerically) 1.
train_cat_dum = pd.get_dummies(train_cat)
corr_mat = pd.concat([train[numerical_features], train_cat_dum], axis=1).corr()
# Zero the self-correlations on an explicit copy; writing through
# `corr_mat.values` relies on it being a writable view, which is not guaranteed.
corr_values = corr_mat.values.copy()
np.fill_diagonal(corr_values, 0)
corr_mat = pd.DataFrame(corr_values, index=corr_mat.index, columns=corr_mat.columns)
for col in numerical_features:
    # rank the other columns by absolute correlation with `col`, once
    ranked = corr_mat[[col]].abs().sort_values(col, ascending=False)
    # compare with a tight tolerance: a true duplicate can come out as 0.9999999999999999
    if np.isclose(ranked.iloc[0, 0], 1.0, rtol=0, atol=1e-9):
        print('Duplicates', ranked.iloc[:2, :])

Duplicates                 Fireplaces_absent
FireplaceQu_No           1.000000
Fireplaces               0.900818
Duplicates                    GarageArea_absent
GarageCars_absent            1.00000
GarageFinish_No              0.99667
Duplicates                    GarageCars_absent
GarageArea_absent            1.00000
GarageFinish_No              0.99667
Duplicates                  TotalBsmtSF_absent
BsmtFinType1_No            1.000000
BsmtFinType2_No            0.993555


In [202]:
# drop duplicates: these indicators are perfectly correlated with another column
duplicate_indicators = [
    "Fireplaces_absent",
    "GarageArea_absent",
    "GarageCars_absent",
    "TotalBsmtSF_absent",
]
train = train.drop(columns=duplicate_indicators)

In [203]:
# Checkpoint the de-duplicated features with the target re-attached in front.
nodup_with_target = pd.concat([target_var, train], axis=1)
nodup_with_target.to_csv("dataset4_nodup.csv", index=False)

In [204]:
# check high R2 variables; all have been addressed by the feature engineering above
# Regress each remaining column on all the others; an R^2 of 1 means the
# column is an exact linear combination of the rest.
from sklearn import linear_model
ols = linear_model.LinearRegression()
# Leave out the raw columns that have a *_log copy and the categorical columns.
# A list is used to select the columns: indexing a DataFrame with a set is
# rejected by pandas >= 2.0 and gives an arbitrary column order.
excluded = set(logvar_list) | set(categorical_features)
df = train[[col for col in train.columns if col not in excluded]]
for col in df.columns:
    x = df.drop(columns=col)   # predictors must be 2-D: every other column
    y = df[col]
    ols.fit(x, y)
    r2 = ols.score(x, y)       # score: the R^2 of the fitted model
    # tolerance instead of == 1: an exact fit can score 0.9999999999999999
    if np.isclose(r2, 1.0, rtol=0, atol=1e-9):
        print(col + ' R^2: %.2f' % r2)

BsmtFullBath R^2: 1.00
BsmtHalfBath R^2: 1.00
YearBuiltAge R^2: 1.00
YearBuilt R^2: 1.00
FullBath R^2: 1.00
GarageYrBlt R^2: 1.00
BsmtTotalBath R^2: 1.00
GarageYrBltAge R^2: 1.00
YearRemodAdd R^2: 1.00
YearRemodAddAge R^2: 1.00
TotalBath R^2: 1.00
HalfBath R^2: 1.00


In [205]:
# Drop one member of each exactly collinear group reported by the R^2 check.
train = train.drop(columns=['BsmtHalfBath', 'HalfBath', 'YearBuilt', 'GarageYrBlt', 'YearRemodAdd'])
#verify multicolinearity is addressed (the loop should print nothing)
# Select columns with a list: indexing a DataFrame with a set is rejected by
# pandas >= 2.0 and gives an arbitrary column order.
excluded = set(logvar_list) | set(categorical_features)
df = train[[col for col in train.columns if col not in excluded]]
for col in df.columns:
    x = df.drop(columns=col)   # predictors must be 2-D: every other column
    y = df[col]
    ols.fit(x, y)
    r2 = ols.score(x, y)       # score: the R^2 of the fitted model
    # tolerance instead of == 1: an exact fit can score 0.9999999999999999
    if np.isclose(r2, 1.0, rtol=0, atol=1e-9):
        print(col + ' R^2: %.2f' % r2)

pd.concat([target_var, train], axis=1).to_csv('dataset4_nodup_nomulcol.csv', index=False)