In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
# Import train dataset
df = pd.read_csv('train.csv')
df = df.drop('SalePrice', axis=1)

In [24]:
# Check columns with missing data
df[df.columns[df.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [3]:
##### Repalce missing data
# Impute missing value for 'Electrical' by SBrkr since it has the highest frequency
df.loc[df['Electrical'].isnull(), 'Electrical'] = 'SBrkr'

# MasVnrType MasVnrType have the same rows of missing value, None is the most frequent value for MasVnrType
# From my guessing, Impute 0.0 for missing value in column MasVnrArea
df.loc[df['MasVnrType'].isnull(), 'MasVnrType'] = 'None'
df.loc[df['MasVnrArea'].isnull(), 'MasVnrArea'] = 0.0

# Impute column BsmtExposure, index=948 with value 'No', since it has basement
df.loc[948, 'BsmtExposure'] = 'No'

# Impute 'NO_bsme' to column related besement is missing, since the area of basement is 0
for i in ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']:
    df.loc[df[i].isnull() == True, i] = 'NO_bsmt'

# Impute 'NO_garage' to column related garage is missing, since the area of garage is 0
for i in ['GarageType','GarageFinish','GarageQual','GarageCond']:
    df.loc[df[i].isnull() == True, i] = 'NO_garage'

df.loc[df['GarageYrBlt'].isnull( )== True, 'GarageYrBlt'] = 0

# When FireplaceQu has missing value, the value of column Fireplaces is 0. Means there is no Fireplaces in the house
# Impute FireplaceQu with 'No_fireplace'
df.loc[df['FireplaceQu'].isnull() == True, 'FireplaceQu'] = 'NO_fireplace'

# Impute Fence with 'No_fence'
df.loc[df['Fence'].isnull() == True, 'Fence'] = 'No_fence'

# I will drop column Alley, MiscFeature, PoolQC since it has over 90% missing value
df = df.drop(['Alley'], axis=1)
df = df.drop(['MiscFeature'], axis=1)
df = df.drop(['PoolQC'], axis=1)

# Impute missing value of LotFrontage column
df.loc[df['LotFrontage'].isnull() == True, 'LotFrontage'] = df['LotArea']**(1/4)

In [4]:
# chenge object value to int
dict1 = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1}
df['ExterQual'] = df['ExterQual'].map(dict1)
df['ExterCond'] = df['ExterCond'].map(dict1)
df['HeatingQC'] = df['HeatingQC'].map(dict1)
df['KitchenQual'] = df['KitchenQual'].map(dict1)

bsmt_dict = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'NO_bsmt':0}
df['BsmtQual'] = df['BsmtQual'].map(bsmt_dict)
df['BsmtCond'] = df['BsmtCond'].map(bsmt_dict)
df['BsmtExposure'] = df['BsmtExposure'].map(bsmt_dict)

Garage_dict = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1,'NO_garage':0}
df['GarageQual'] = df['GarageQual'].map(Garage_dict)
df['GarageCond'] = df['GarageCond'].map(Garage_dict)

GarageFinish_dict = {'Fin':3, 'RFn':2, 'Unf':1,'NO_garage':0}
df['GarageFinish'] = df['GarageFinish'].map(GarageFinish_dict)

FireplaceQu_dict = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1,'NO_fireplace':0}
df['FireplaceQu'] = df['FireplaceQu'].map(FireplaceQu_dict)

centralair_dict = {'Y':1, 'N':0}
df['CentralAir'] = df['CentralAir'].map(centralair_dict)

PavedDrive_dict = {'Y':3, 'P':2, 'N':1}
df['PavedDrive'] = df['PavedDrive'].map(PavedDrive_dict)

Fence_dict = {'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1,'No_fence':0}
df['Fence'] = df['Fence'].map(Fence_dict)

BsmtFinType_dict = {'GLQ':6,'ALQ':5,'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1,'NO_bsmt':0}
df['BsmtFinType1'] = df['BsmtFinType1'].map(BsmtFinType_dict)
df['BsmtFinType2'] = df['BsmtFinType2'].map(BsmtFinType_dict)

In [5]:
df['SaleCondition'] = df['SaleCondition'].map(lambda x: 1 if x=='Abnorml' else 0)

In [6]:
df['SaleCondition'].value_counts()

0    1359
1     101
Name: SaleCondition, dtype: int64

In [7]:
df_dummy = pd.get_dummies(df)

In [8]:
# Find the correlation between SaleCondition and features and sort the correlation value decending
corr = df_dummy.corr()[['SaleCondition']].sort_values('SaleCondition', ascending=False)

In [9]:
# Find features that has larger than 0.07 correlation value with SalePrice
high_corr = corr[abs(corr['SaleCondition']) > 0.07]
high_corr = high_corr.drop(['SaleCondition'])

In [10]:
high_corr = high_corr.drop(['Utilities_NoSeWa','Functional_Sev','Exterior1st_Stone','Electrical_Mix'])

In [11]:
X = df_dummy[high_corr.index]
y = df['SaleCondition']

Xs = StandardScaler().fit_transform(X)

In [12]:
# KNN 
baseline = 1 - np.mean(y)
knn = KNeighborsClassifier()
knn.fit(Xs, y)
print(baseline)
print(cross_val_score(knn, Xs, y, cv=5).mean())

0.9308219178082192
0.927394553098


In [13]:
clf = svm.SVC(C=0.01, kernel='poly')
clf.fit(Xs, y)
cross_val_score(clf, Xs, y, cv=5).mean()

0.93219848851233711

In [34]:
dtc = DecisionTreeClassifier(max_depth=100, random_state=0)
dtc.fit(Xs, y)
cross_val_score(dtc, Xs, y, cv=5).mean()

0.86782871177480025

In [32]:
rfc = RandomForestClassifier(max_depth=200, n_estimators=200)
rfc.fit(Xs, y)
cross_val_score(rfc, Xs, y, cv=5).mean()

0.92328731777374085

In [17]:
bagger = BaggingClassifier(dtc)
cross_val_score(bagger, Xs, y, cv=5).mean()

0.92192449984327296

In [36]:
vf = VotingClassifier(estimators=[('svm',clf), ('rfc',rfc), ('bagger',bagger)], voting='hard')

In [37]:
cross_val_score(vf, Xs, y, cv=5).mean()

0.92671434509083017

In [5]:
# Support Vector Machine
clf = svm.SVC(kernel='poly')
gamma_range = np.logspace(-5, 2, 10)
c_range = np.logspace(-3, 2, 10)
kernel_range = ['sigmoid','linear','poly']
grid_dict = dict(gamma=gamma_range, C=c_range, kernel=kernel_range)
#grid_search = GridSearchCV(clf, grid_dict, verbose=1, n_jobs=-1)
#grid_search.fit(Xs, y)

In [14]:
# Import test dataset
test = pd.read_csv('test.csv')

In [15]:
# change columns name of test data
test.columns = [i.replace(' ','') for i in test.columns]
test.columns = [i.replace('/','') for i in test.columns]

In [16]:
# Impute missing value for 'Electrical' by SBrkr since it has the highest frequency
test.loc[test['Electrical'].isnull(), 'Electrical'] = 'SBrkr'

# MasVnrType MasVnrType have the same rows of missing value, None is the most frequent value for MasVnrType
# From my guessing, Impute 0.0 for missing value in column MasVnrArea
test.loc[test['MasVnrType'].isnull(), 'MasVnrType'] = 'None'
test.loc[test['MasVnrArea'].isnull(), 'MasVnrArea'] = 0.0

# Impute 'NO_bsme' to column related besement is missing, since the area of basement is 0
for i in ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']:
    test.loc[test[i].isnull() == True, i] = 'NO_bsmt'

# Impute 'NO_garage' to column related garage is missing, since the area of garage is 0
for i in ['GarageType','GarageFinish','GarageQual','GarageCond']:
    test.loc[test[i].isnull() == True, i] = 'NO_garage'

test.loc[test['GarageYrBlt'].isnull( )== True, 'GarageYrBlt'] = 0

# When FireplaceQu has missing value, the value of column Fireplaces is 0. Means there is no Fireplaces in the house
# Impute FireplaceQu with 'No_fireplace'
test.loc[test['FireplaceQu'].isnull() == True, 'FireplaceQu'] = 'NO_fireplace'

# Impute Fence with 'No_fence'
test.loc[test['Fence'].isnull() == True, 'Fence'] = 'No_fence'

# I will drop column Alley, MiscFeature, PoolQC since it has over 90% missing value
test = test.drop(['Alley'], axis=1)
test = test.drop(['MiscFeature'], axis=1)
test = test.drop(['PoolQC'], axis=1)

# Impute missing value of LotFrontage column
test.loc[test['LotFrontage'].isnull() == True, 'LotFrontage'] = test['LotArea']**(1/4)

In [17]:
# chenge object value to int
dict1 = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1}
test['ExterQual'] = test['ExterQual'].map(dict1)
test['ExterCond'] = test['ExterCond'].map(dict1)
test['HeatingQC'] = test['HeatingQC'].map(dict1)
test['KitchenQual'] = test['KitchenQual'].map(dict1)

bsmt_dict = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'NO_bsmt':0}
test['BsmtQual'] = test['BsmtQual'].map(bsmt_dict)
test['BsmtCond'] = test['BsmtCond'].map(bsmt_dict)
test['BsmtExposure'] = test['BsmtExposure'].map(bsmt_dict)

Garage_dict = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1,'NO_garage':0}
test['GarageQual'] = test['GarageQual'].map(Garage_dict)
test['GarageCond'] = test['GarageCond'].map(Garage_dict)

GarageFinish_dict = {'Fin':3, 'RFn':2, 'Unf':1,'NO_garage':0}
test['GarageFinish'] = test['GarageFinish'].map(GarageFinish_dict)

FireplaceQu_dict = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1,'NO_fireplace':0}
test['FireplaceQu'] = test['FireplaceQu'].map(FireplaceQu_dict)

centralair_dict = {'Y':1, 'N':0}
test['CentralAir'] = test['CentralAir'].map(centralair_dict)

PavedDrive_dict = {'Y':3, 'P':2, 'N':1}
test['PavedDrive'] = test['PavedDrive'].map(PavedDrive_dict)

Fence_dict = {'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1,'No_fence':0}
test['Fence'] = test['Fence'].map(Fence_dict)

BsmtFinType_dict = {'GLQ':6,'ALQ':5,'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1,'NO_bsmt':0}
test['BsmtFinType1'] = test['BsmtFinType1'].map(BsmtFinType_dict)
test['BsmtFinType2'] = test['BsmtFinType2'].map(BsmtFinType_dict)

In [18]:
test1_col = set([i[:i.index('_')] for i in high_corr.index if '_' in i])

In [19]:
test1 = test[list(test1_col)]

In [20]:
test1_dummy = pd.get_dummies(test1)

In [21]:
test_dummy = pd.concat([test, test1_dummy], axis=1)

In [23]:
test_dummy = test_dummy.rename(columns = {'SaleType_WD ':'SaleType_WD'})

In [24]:
set(high_corr.index) - set(test_dummy.columns)

set()

In [25]:
X_test = test_dummy[high_corr.index]
Xs_test = StandardScaler().fit_transform(X_test)

In [26]:
y_predict = pd.DataFrame(clf.predict(Xs_test))
y_predict.columns = ['Sale Condition']

In [27]:
y_predict.mean()

Sale Condition    0.002275
dtype: float64

In [28]:
pd.concat([test[['Id']], y_predict], axis=1).to_csv("classification_out1.csv")