# EDA Kaggle Competition: Team Integreat

In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.plotly as py
from scipy import stats
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

### Loading test and training datasets

In [2]:
# Read the competition's train and test splits from the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Report the dimensions of each frame, test split first.
for frame in (test, train):
    print(frame.shape)

(1459, 80)
(1460, 81)


### Combine test and train

In [3]:
# Set the 'Id' columns aside before they are removed (test_ID is reused when
# building the submission files).
train_ID, test_ID = train['Id'], test['Id']

# 'Id' is only a row identifier and cannot be used as a model feature, so remove it
# from both frames.
for frame in (train, test):
    frame.drop(columns='Id', inplace=True)

# Separate the target from the predictors.
Y_train = train['SalePrice']
X_train = train.drop(columns='SalePrice')
X_test = test.copy()

# Stack train predictors on top of test predictors so that the same preprocessing
# can be applied to both in one pass.
all_data = pd.concat([X_train, X_test], ignore_index=True)

In [4]:
from preprocess import impute, Encoder, Skewness

# Apply the project's preprocessing steps in order; each step receives the
# previous step's output.
for step in (impute, Encoder, Skewness):
    all_data = step(all_data)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,TotalPorchSF,TotalSF,TotalBath,MSZoning * Neighborhood,BsmtUnfSF / TotalBsmtSF
0,60,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,...,0,2,2008,WD,Normal,61,2566.0,4.0,RL*CollgCr,0.175234
1,20,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,...,0,5,2007,WD,Normal,298,2524.0,3.0,RL*Veenker,0.22504
2,60,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,...,0,9,2008,WD,Normal,42,2706.0,4.0,RL*CollgCr,0.471739
3,70,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,...,0,2,2006,WD,Abnorml,307,2473.0,2.0,RL*Crawfor,0.714286
4,60,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,...,0,12,2008,WD,Normal,276,3343.0,4.0,RL*NoRidge,0.427948


## Split the combined data back into train and test sets

In [None]:
# all_data was built as [X_train rows, then X_test rows], so cutting at the number
# of training rows recovers the two sets.
# NOTE(review): assumes the preprocessing steps keep row count and order -- confirm.
trainset = X_train.shape[0]
X_train_preprocessed, X_test_preprocessed = all_data[:trainset], all_data[trainset:]

## ML models

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the preprocessed features.
linear = LinearRegression()
linear.fit(X_train_preprocessed, Y_train)

# Predict the test set and write it out in the Id / SalePrice submission layout.
pred = linear.predict(X_test_preprocessed)
submission = pd.DataFrame({'Id': test_ID, 'SalePrice': pred})
submission.to_csv('linear_no1.csv', index=False)

# Training-set score. It has to be the cell's last expression for the notebook
# to display it; anywhere else the value is computed and thrown away.
linear.score(X_train_preprocessed, Y_train)

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic-net regression on the preprocessed features.
elastic = ElasticNet(alpha = 1, l1_ratio = 0.5)
elastic.fit(X_train_preprocessed, Y_train)

# Predict the test set and write it out in the Id / SalePrice submission layout.
pred = elastic.predict(X_test_preprocessed)
submission = pd.DataFrame({'Id': test_ID, 'SalePrice': pred})
submission.to_csv('elastic_no1.csv', index=False)

# Training-set score, kept as the last expression so the notebook displays it.
# (An incomplete trailing `elastic.` would be a SyntaxError and stop the whole
# cell from running.)
elastic.score(X_train_preprocessed, Y_train)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fix the seed so the fitted tree, and therefore the submission file, is
# reproducible across kernel restarts.
tree = DecisionTreeRegressor(random_state=1)
tree.fit(X_train_preprocessed, Y_train)

# Predict the test set and write it out in the Id / SalePrice submission layout.
pred = tree.predict(X_test_preprocessed)
submission = pd.DataFrame({'Id': test_ID, 'SalePrice':pred})
submission.to_csv('tree.csv', index=False)

# Training-set score, kept as the last expression so the notebook displays it.
tree.score(X_train_preprocessed, Y_train)

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

# Two-stage pipeline: SelectFromModel fits the elastic-net estimator (prefit=False)
# and keeps the features it selects, then a decision tree is trained on those.
# The tree is seeded so the submission is reproducible across kernel restarts.
pipe_tree = make_pipeline(SelectFromModel(elastic, prefit=False, threshold=None),
                        DecisionTreeRegressor(random_state=1))
pipe_tree.fit(X_train_preprocessed, Y_train)

# Predict the test set and write it out in the Id / SalePrice submission layout.
pred = pipe_tree.predict(X_test_preprocessed)
submission = pd.DataFrame({'Id': test_ID, 'SalePrice':pred})
submission.to_csv('elastic_tree.csv', index=False)

# Training-set score of the whole pipeline (displayed as the cell output).
pipe_tree.score(X_train_preprocessed, Y_train)

In [None]:
from sklearn.svm import SVR

# Support-vector regression with a linear kernel. The fitted estimator is bound to
# the name `svm`.
svm = SVR(kernel='linear')
svm.fit(X_train_preprocessed, Y_train)

# Predict the test set and write it out in the Id / SalePrice submission layout.
pred = svm.predict(X_test_preprocessed)
submission = pd.DataFrame({'Id': test_ID, 'SalePrice':pred})
submission.to_csv('svm_no1.csv', index=False)

In [None]:
# Skewness of every numeric column in the combined feature frame, sorted so the
# horizontal bars read from smallest to largest.
# NOTE(review): `skewed_feat_vals` is not defined by any earlier cell, so it is
# computed here. `all_data` has already been through the preprocessing cell, so
# this shows the skewness after preprocessing -- TODO confirm that is the intended view.
skewed_feat_vals = all_data.select_dtypes(include=[np.number]).skew().sort_values()

plt.rcParams['figure.figsize'] = (20, 20)
skewed_feat_vals.plot(kind = "barh")
plt.title("Skewness of the Continuous Numerical Features in the Data")
plt.show()

In [None]:
# StandardScaler must actually be imported; with the import commented out the
# cell raises NameError on a fresh kernel.
from sklearn.preprocessing import StandardScaler

# Columns to standardize.
columns_transform = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','LowQualFinSF','GrLivArea','GarageArea','PoolArea']

# Work on copies so the preprocessed frames stay available unchanged.
X_train_std = X_train_preprocessed.copy()
X_test_std = X_test_preprocessed.copy()

std = StandardScaler()

# Fit the scaler on the training rows only, then apply the same statistics to the
# test rows. Assigning whole columns (rather than writing through .loc) lets the
# columns take the scaler's float dtype even if they were integer-typed before.
X_train_std[columns_transform] = std.fit_transform(X_train_preprocessed[columns_transform])
X_test_std[columns_transform] = std.transform(X_test_preprocessed[columns_transform])

print(X_train_std.shape)
print(X_test_std.shape)

In [None]:
# SalePrice is a continuous target, so the forest must be a regressor; a
# classifier would treat every distinct price as a separate class label.
from sklearn.ensemble import RandomForestRegressor

# Column labels, in the same order as the columns the forest is trained on.
feat_labels = X_train_std.columns

# Seeded so the feature importances read in later cells are reproducible.
forest = RandomForestRegressor(n_estimators = 500, random_state=1)
forest.fit(X_train_std, Y_train)

In [None]:
# Importance score of each feature from the fitted forest, together with the
# column positions that sort those scores in ascending order.
importances = forest.feature_importances_
indices = importances.argsort()
indices

In [None]:
# Print one line per feature -- position, column name (left-aligned in 30 chars)
# and importance -- in the order given by `indices`.
# The float conversion must be exactly "%f": any letters following it are emitted
# literally after the number.
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))

In [None]:
# Horizontal bar chart of the importance of every column.
from matplotlib import cm

# One colour per bar: derive the count from the data instead of hard-coding it.
n_features = X_train_std.shape[1]
color = cm.inferno_r(np.linspace(.4,.8, n_features))

plt.figure(figsize=(20,20))
plt.title('Feature Importance')

# Bars are drawn in `indices` order, so the tick labels must be reordered with the
# same `indices`; otherwise each bar is labelled with the wrong column name.
x = plt.barh(range(n_features), importances[indices], align='center', color=color)
x = plt.yticks(range(n_features), feat_labels[indices], rotation = 0, size=12)
x = plt.ylim([-1, n_features])

In [None]:
# Feature selection driven by the already-fitted forest. threshold=None leaves the
# cut-off at SelectFromModel's default (presumably the mean importance -- verify
# against the sklearn docs for this estimator type).
from sklearn.feature_selection import SelectFromModel
importances2 = forest.feature_importances_
indices2 = np.argsort(importances2)[::-1]

sfm = SelectFromModel(forest, prefit=True, threshold=None)
X_selected = sfm.transform(X_train_std)

# List as many features as were selected, most important first.
for rank, idx in enumerate(indices2[:X_selected.shape[1]], start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances2[idx]))
    
#plt.figure(figsize=(20,20))
#plt.title('Feature Importance')

#x = plt.barh(range(X_selected.shape[1]), importances[indices], align='center', color=color)
#x = plt.yticks(range(X_selected.shape[1]), feat_labels, rotation = 0, size=15)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from math import sqrt

# SalePrice is a continuous target, so use the regression variant of
# k-nearest-neighbours; a classifier would treat every distinct price as its own
# class label.
model = KNeighborsRegressor(n_neighbors = 38) # default value for n_neighbors is 5

# Train on the standardized training features.
model.fit(X_train_std, Y_train)

# Training-set score (displayed as the cell output).
model.score(X_train_std, Y_train)

### Scatterplot for selected numerical columns

In [None]:
# Investigate pairwise relationships between a few numerical columns and SalePrice.
cols = ['LotArea','OverallQual','OverallCond','TotalBsmtSF','GarageArea', 'SalePrice']

# seaborn is imported as `sn` in this notebook, and pairplot's facet-size argument
# is `height` (the older `size` keyword is no longer accepted by current seaborn).
sn.pairplot(train[cols], height=2)

#SalePrice right-skewed - log or box cox transformation
#TotalBsmtSF and GarageArea normally distributed

#Correlations SalePrice: TotalBsmtSF, GarageArea, OverallQual
#Multicollinearity: OverallQual and TotalBsmtSF, TotalBsmtSF and GarageArea

In [None]:
# Correlation matrix of the selected columns (variables as rows after the transpose).
cor = np.corrcoef(train[cols].values.T)

# seaborn is imported as `sn` in this notebook.
sn.set(font_scale=1.5)
heat = sn.heatmap(cor, cbar=True,
                  annot=True,
                  square=True,
                  fmt='.2f',
                  annot_kws={'size':15},
                  yticklabels=cols,
                  xticklabels=cols)

#SalePrice correlate with: GarageArea, TotalSF