In [198]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

%matplotlib inline

In [199]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_squared_error, r2_score


from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [200]:
# Load the encoded training and test sets from the local datasets folder
train, test = (
    pd.read_csv('./datasets/train_encoded.csv'),
    pd.read_csv('./datasets/test_encoded.csv'),
)

In [201]:
# Let pandas render up to 100 columns before it starts truncating wide frames
pd.options.display.max_columns = 100

### Dimensionality Reduction

#### Low Variance Method

Variance measures the spread of the data: it tells us how far the points lie from the mean. A low variance means the values are all similar, so the variable will have little impact on the target variable. For this section, we set a variance threshold of 0.009. If the variance of a variable is less than the threshold, we can safely drop that variable. Note that the low variance method is only applied to numerical data.

In [202]:
# Keep just the numeric columns; variance is only computed for these

num = train.select_dtypes(include="number")

In [203]:
# Compute each numeric column's variance, order it from largest to smallest,
# and keep only the entries that fall below the 0.009 threshold

low_var_list = (
    num.var()
    .sort_values(ascending=False)
    .loc[lambda variances: variances < 0.009]
)

In [204]:
# Display the features whose variance is below the 0.009 threshold
low_var_list

neighborhood_veenker    0.008232
sale_type_conld         0.008232
exterior_2nd_brk cmn    0.008232
neighborhood_npkvill    0.008232
roof_matl_tar&grv       0.007271
house_style_2.5unf      0.006789
ms_zoning_rh            0.006789
roof_style_flat         0.006307
mas_vnr_type_brkcmn     0.006307
house_style_1.5unf      0.005825
roof_style_gambrel      0.005825
condition_1_posa        0.005825
exterior_2nd_imstucc    0.005342
condition_2_feedr       0.005342
garage_type_carport     0.005342
sale_type_cwd           0.004859
lot_config_fr3          0.004375
roof_style_mansard      0.003406
street_pave             0.003406
street_grvl             0.003406
sale_type_conli         0.003406
heating_wall            0.002921
house_style_2.5fin      0.002921
exterior_2nd_stone      0.002921
condition_1_rrnn        0.002921
neighborhood_blueste    0.002921
utilities               0.002439
roof_matl_wdshngl       0.002435
heating_grav            0.002435
foundation_stone        0.002435
sale_type_

#### Feature selection/Engineering

In [205]:
# Pool the sale-type categories that have few observations into one feature
# so the combined group is large enough to be statistically meaningful

rare_sale_types = [
    'sale_type_conlw', 'sale_type_conli', 'sale_type_con',
    'sale_type_conld', 'sale_type_cwd', 'sale_type_oth',
]
# skipna=False: a missing value in any column yields NaN, as element-wise addition would
train['combined_sale_type'] = train[rare_sale_types].sum(axis=1, skipna=False)

In [206]:
# Build the same pooled sale-type feature on the test set
rare_sale_types = [
    'sale_type_conlw', 'sale_type_conli', 'sale_type_con',
    'sale_type_conld', 'sale_type_cwd', 'sale_type_oth',
]
# skipna=False: a missing value in any column yields NaN, as element-wise addition would
test['combined_sale_type'] = test[rare_sale_types].sum(axis=1, skipna=False)

In [207]:
# Pool the house-style categories that have few observations into one feature
# so the combined group is large enough to be statistically meaningful

rare_house_styles = ['house_style_1.5unf', 'house_style_2.5fin', 'house_style_2.5unf']
# skipna=False: a missing value in any column yields NaN, as element-wise addition would
train['combined_house_style'] = train[rare_house_styles].sum(axis=1, skipna=False)

In [208]:
# Build the same pooled house-style feature on the test set
rare_house_styles = ['house_style_1.5unf', 'house_style_2.5fin', 'house_style_2.5unf']
# skipna=False: a missing value in any column yields NaN, as element-wise addition would
test['combined_house_style'] = test[rare_house_styles].sum(axis=1, skipna=False)

In [209]:
# Pool the roof-style categories that have few observations into one feature
# so the combined group is large enough to be statistically meaningful

rare_roof_styles = [
    'roof_style_flat', 'roof_style_gambrel',
    'roof_style_mansard', 'roof_style_shed',
]
# skipna=False: a missing value in any column yields NaN, as element-wise addition would
train['combined_roof_style'] = train[rare_roof_styles].sum(axis=1, skipna=False)

In [210]:
# Build the same pooled roof-style feature on the test set
rare_roof_styles = [
    'roof_style_flat', 'roof_style_gambrel',
    'roof_style_mansard', 'roof_style_shed',
]
# skipna=False: a missing value in any column yields NaN, as element-wise addition would
test['combined_roof_style'] = test[rare_roof_styles].sum(axis=1, skipna=False)

In [211]:
# Remove every feature flagged as low variance (< 0.009) from both datasets

low_var_drop_list = list(low_var_list.index)
train = train.drop(columns=low_var_drop_list)
# errors='ignore' skips any flagged column that is absent from the test set
test = test.drop(columns=low_var_drop_list, errors='ignore')

In [212]:
# Preview the first five entries of the low-variance series (sorted by variance, descending)
low_var_list.head()

neighborhood_veenker    0.008232
sale_type_conld         0.008232
exterior_2nd_brk cmn    0.008232
neighborhood_npkvill    0.008232
roof_matl_tar&grv       0.007271
dtype: float64

In [213]:
# Training-set dimensions after dropping the low-variance features
train.shape

(2049, 163)

The training set is now reduced to 163 columns.

#### Comparing test and train datasets

In [214]:
# Test-set dimensions after dropping the low-variance features
test.shape

(876, 170)

In [215]:
# Training-set dimensions, shown for comparison with the test set
train.shape

(2049, 163)

In [216]:
# List the columns that exist in the training set but are missing from the test set
[col for col in train.columns if col not in test.columns]

['saleprice']

In [217]:
# List the columns that exist in the test set but are missing from the training set
[col for col in test.columns if col not in train.columns]

['roof_matl_metal',
 'roof_matl_roll',
 'exterior_1st_precast',
 'exterior_2nd_other',
 'exterior_2nd_precast',
 'mas_vnr_type_cblock',
 'heating_floor',
 'sale_type_vwd']

In [218]:
# Drop features not in train data.
# The list is derived from the two frames rather than hard-coded, so it cannot
# drift out of sync with the encoded data, and re-running the cell is harmless
# (a second run finds nothing left to drop instead of raising KeyError).
test_only_cols = [col for col in test.columns if col not in train.columns]
test = test.drop(columns=test_only_cols)

In [219]:
# Persist the reduced training set, leaving the row index out of the file
train.to_csv(path_or_buf='./datasets/train_final.csv', index=False)

In [220]:
# Persist the reduced test set, leaving the row index out of the file
test.to_csv(path_or_buf='./datasets/test_final.csv', index=False)