In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, balanced_accuracy_score, r2_score, mean_squared_log_error
from pandas.plotting import scatter_matrix
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.feature_selection import SelectKBest, f_regression, chi2, f_classif

#### __1. Exploring the data__

In [2]:
train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/house-prices-advanced-regression-techniques/" + 
                                            "train.csv"))

In [3]:
train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
train_data['OverallQual'].value_counts()

5     397
6     374
7     319
8     168
4     116
9      43
3      20
10     18
2       3
1       2
Name: OverallQual, dtype: int64

#### __2. Dealing with missing values__

In [5]:
features = list(train_data)
X = train_data[features]
y = train_data['SalePrice']

In [6]:
# check for numeric features train_data
num_features = []
cat_features = []
for feature in X:
    if X[feature].dtypes == np.int or X[feature].dtypes == np.float:
        num_features.append(feature)
    else:
        cat_features.append(feature)

In [7]:
# impute using only numerical features
imp = IterativeImputer(max_iter = 10, random_state = 42)
imp.fit(X[num_features])
X[num_features] = imp.transform(X[num_features])

In [8]:
# impute using only categorical features
imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
X[cat_features] = imp.fit_transform(X[cat_features].astype(str))

In [9]:
# split dataframe into numeric and categorical data
X_num = X.drop(cat_features, axis = 1)
X_cat = X.drop(num_features, axis = 1)

#### __3. Feature Selection__

In [10]:
# saleprice correlation matrix
k_num = round(len(X_num.columns) / 2)
corrmat = X_num.corr()
X_num_fs = corrmat.nlargest(k_num, 'SalePrice')['SalePrice'].index
# plt.figure(figsize = (15, 15))
# # plot heat map
# g = sns.heatmap(X_num[X_num_fs].corr(),annot = True,cmap = "RdYlGn")

In [11]:
for i, feature in enumerate(X_num_fs):
    print('{} {}'.format(i, feature))

0 SalePrice
1 OverallQual
2 GrLivArea
3 GarageCars
4 GarageArea
5 TotalBsmtSF
6 1stFlrSF
7 FullBath
8 TotRmsAbvGrd
9 YearBuilt
10 GarageYrBlt
11 YearRemodAdd
12 MasVnrArea
13 Fireplaces
14 BsmtFinSF1
15 LotFrontage
16 WoodDeckSF
17 2ndFlrSF
18 OpenPorchSF


In [12]:
# check for multicollinearity
# if two features are strongly correlated with each other (>= 0.7) 
# the feature with the lower correlation with the target variable is dropped
multicorr = {}
k = len(corrmat)
for feature in corrmat:
    i = 1
    if feature != 'SalePrice':
        while i < k - 1:
            if corrmat[feature][i] >= 0.7 and feature != corrmat.index[i]:
                print('{}: {} {}'.format(feature, corrmat.index[i], corrmat[feature][i]))
                multicorr[feature] = corrmat.index[i], corrmat[feature][i]
            i = i + 1

YearBuilt: GarageYrBlt 0.8356304739037884
TotalBsmtSF: 1stFlrSF 0.8195299750050355
1stFlrSF: TotalBsmtSF 0.8195299750050355
GrLivArea: TotRmsAbvGrd 0.8254893743088377
TotRmsAbvGrd: GrLivArea 0.8254893743088377
GarageYrBlt: YearBuilt 0.8356304739037884
GarageCars: GarageArea 0.8824754142814603
GarageArea: GarageCars 0.8824754142814603


In [13]:
# delete duplicates
corr_scores = []
for feature in list(multicorr.keys()):
    if multicorr[feature][1] in corr_scores:
        del multicorr[feature]
    else:
        corr_scores.append(multicorr[feature][1])
        
multicorr

{'YearBuilt': ('GarageYrBlt', 0.8356304739037884),
 'TotalBsmtSF': ('1stFlrSF', 0.8195299750050355),
 'GrLivArea': ('TotRmsAbvGrd', 0.8254893743088377),
 'GarageCars': ('GarageArea', 0.8824754142814603)}

In [14]:
# remove the feature with the lower correlation coefficient (pearson)
dropped_features = [] 
for feature1, feature2 in multicorr.items():
    if corrmat['SalePrice'][feature1] < corrmat['SalePrice'][feature2[0]]:
        dropped_features.append(feature1)
    else:
        dropped_features.append(feature2[0])

dropped_features

['GarageYrBlt', '1stFlrSF', 'TotRmsAbvGrd', 'GarageArea']

In [15]:
print('SalePrice, YearBuilt: {}'.format(corrmat['SalePrice']['YearBuilt']))
print('SalePrice, GarageYrBlt: {}'.format(corrmat['SalePrice']['GarageYrBlt']))
print('SalePrice, TotalBsmtSF: {}'.format(corrmat['SalePrice']['TotalBsmtSF']))
print('SalePrice, 1stFlrSF: {}'.format(corrmat['SalePrice']['1stFlrSF']))
print('SalePrice, GrLivArea: {}'.format(corrmat['SalePrice']['GrLivArea']))
print('SalePrice, TotRmsAbvGrd: {}'.format(corrmat['SalePrice']['TotRmsAbvGrd']))
print('SalePrice, GarageCars: {}'.format(corrmat['SalePrice']['GarageCars']))
print('SalePrice, GarageArea: {}'.format(corrmat['SalePrice']['GarageArea']))

SalePrice, YearBuilt: 0.5228973328794967
SalePrice, GarageYrBlt: 0.5188878777720517
SalePrice, TotalBsmtSF: 0.6135805515591944
SalePrice, 1stFlrSF: 0.6058521846919166
SalePrice, GrLivArea: 0.7086244776126511
SalePrice, TotRmsAbvGrd: 0.5337231555820238
SalePrice, GarageCars: 0.640409197258349
SalePrice, GarageArea: 0.6234314389183598


In [16]:
# drop the features from X_num dataframe
for feature in X_num:
    if feature in dropped_features:
        X_num = X_num.drop(feature, axis = 1)

In [17]:
drop_multicoll_features = dropped_features
drop_multicoll_features

['GarageYrBlt', '1stFlrSF', 'TotRmsAbvGrd', 'GarageArea']

In [18]:
drop_corr_features = X_num.columns.difference(X_num_fs)

In [20]:
X_num.drop(X_num.columns.difference(X_num_fs), 1, inplace = True)

In [22]:
ord_data = ['OverallQual']
X_num = pd.get_dummies(X_num, columns = ord_data, drop_first = True)

In [23]:
# encoding categorical features
enc = OrdinalEncoder()
enc.fit(X_cat)
X_cat_enc = enc.transform(X_cat)

In [24]:
# feature selection on categorical data
k_cat = round(len(X_cat.columns) / 2)
fs = SelectKBest(f_classif, k_cat)
fs.fit(X_cat_enc, y) # save!!
X_cat_fs = fs.transform(X_cat_enc)
X_cat_enc = pd.DataFrame(X_cat_fs)



In [25]:
# for i in range(len(fs.scores_)):
#	print('Feature %d: %f' % (i, fs.scores_[i]))

In [26]:
df_cat = pd.DataFrame(X_cat_enc, index = list(range(len(X.index))))
df_num = pd.DataFrame(X_num, index = list(range(len(X.index))))

In [27]:
df_num.index[3]

3

In [28]:
X = pd.concat([df_cat, df_num], axis = 1, sort = False)
X = X.drop(['SalePrice'], axis = 1)

#### __4. Building the model__

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [30]:
# hyperparameter tuning
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [31]:
# Create the random grid
rf_random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [32]:
# rand_rf = RandomForestRegressor()
# rf_random = RandomizedSearchCV(estimator = rand_rf, param_distributions = random_grid, 
#                               n_iter = 100, cv = 7, verbose = 2, n_jobs = -1, random_state = 42)
# rf_random.fit(X_train, y_train)

In [33]:
# rf_random.best_params_

In [34]:
rf = RandomForestRegressor(n_estimators = 800, min_samples_split = 2, min_samples_leaf = 1, 
      max_features = 'log2', max_depth = 70, bootstrap = False)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_depth=70, max_features='log2',
                      n_estimators=800)

#### __5. Quantifying the quality of prediction____

In [35]:
y_predict = rf.predict(X_test)

In [36]:
print("R2 Score: ", r2_score(y_test, y_predict))
print("RMSLE", math.sqrt(mean_squared_log_error(y_test, y_predict)))

R2 Score:  0.8932816550127339
RMSLE 0.1498755357726837


#### __6. Submission (Kaggle)__

In [37]:
test_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/house-prices-advanced-regression-techniques/" + 
                                            "test.csv"))

In [38]:
def submission(df, fs, drop_multicoll_features, drop_corr_features):
    features = list(df)
    X = df[features]
    
    # check for numeric features train_data
    num_features = []
    cat_features = []
    for feature in X:
        if X[feature].dtypes == np.int or X[feature].dtypes == np.float:
            num_features.append(feature)
        else:
            cat_features.append(feature)
            
    # impute using only numerical features
    imp = IterativeImputer(max_iter = 10, random_state = 42)
    imp.fit(X[num_features])
    X[num_features] = imp.transform(X[num_features])
    
    # impute using only categorical features
    imp = SimpleImputer(strategy = 'most_frequent')
    X[cat_features] = imp.fit_transform(X[cat_features])
    
    # split dataframe into numeric and categorical data
    X_num = X.drop(cat_features, axis = 1)
    X_cat = X.drop(num_features, axis = 1)
    
    # drop features
    X_num = X_num.drop(drop_multicoll_features, axis = 1) 
    X_num.drop(drop_corr_features, 1, inplace = True)
        
    ord_data = ['OverallQual']
    X_num = pd.get_dummies(X_num, columns = ord_data, drop_first = True)
    
    # encoding categorical features
    enc = OrdinalEncoder()
    enc.fit(X_cat)
    X_cat_enc = enc.transform(X_cat)
    
    # feature selection on categorical data
    k_cat = round(len(X_cat.columns) / 2)
    X_cat_fs = fs.transform(X_cat_enc)
    X_cat_enc = pd.DataFrame(X_cat_fs)
    
    df_cat = pd.DataFrame(X_cat_enc, index = list(range(len(X.index))))
    df_num = pd.DataFrame(X_num, index = list(range(len(X.index))))
    
    X = pd.concat([df_cat, df_num], axis = 1, sort = False)
    
    return X

In [39]:
y_predict = rf.predict(submission(test_data, fs, drop_multicoll_features, drop_corr_features))

In [40]:
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': y_predict})
output.describe()

Unnamed: 0,Id,SalePrice
count,1459.0,1459.0
mean,2190.0,182325.628783
std,421.321334,67090.564919
min,1461.0,85728.07
25%,1825.5,135438.5525
50%,2190.0,162001.075
75%,2554.5,211232.41375
max,2919.0,469687.95875


In [41]:
output.to_csv('data/house_submission.csv', index = False)
print("Successful")

Successful
