In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import seaborn as sns
import os
# Show which data files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training and test sets.
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')
# Untouched copy of the test set; its Id column is used later for the submission file.
# Copying the frame avoids reading the same CSV from disk a second time.
test_data_copy = test_data.copy()
n_rows_train = train_data.shape[0]
# Select the target by name rather than by position so the code does not
# silently pick the wrong column if the column order ever changes.
Y_train = train_data['SalePrice']

# Stack train and test so that imputation and encoding are applied consistently.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
# and also works on older pandas versions.
total_data = pd.concat([train_data, test_data], sort=False)
# Id is an identifier and SalePrice is the target; neither is a feature.
total_data = total_data.drop(columns=['Id', 'SalePrice'])
print(test_data.shape)

In [None]:
#correlation matrix
# Pearson correlation is only defined for numeric columns. Restricting the frame
# explicitly keeps this working on pandas >= 2.0, where corr() no longer drops
# non-numeric columns silently and raises on string data instead.
corrmat = train_data.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
# Pairwise scatter plots of the target against a handful of features.
# Purely exploratory: nothing computed here is used by later cells.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
pair_grid = sns.pairplot(train_data.loc[:, cols], height=2.5)
plt.show()

In [None]:
#Missing data treatment
# Remove the target from train_data by name. errors='ignore' makes the cell safe
# to re-run: the positional form iloc[:, :-1] would strip one more feature column
# every time the cell is executed again.
train_data = train_data.drop(columns=['SalePrice'], errors='ignore')

# Count missing values per column once and derive both summaries from it.
null_counts = total_data.isnull().sum()
total_missing_values = null_counts.sort_values(ascending=False)
percentage_missing_data = (100 * (null_counts / len(total_data))).sort_values(ascending=False)
missing_data = pd.concat(
    [total_missing_values, percentage_missing_data],
    axis=1,
    keys=['total_missing_values', 'percentage_missing_data'],
)
missing_data.head(20)

In [None]:
#missing value treatment
# Categorical features where a missing value is treated as "the house does not
# have this feature" (pool, alley, fence, fireplace, garage, basement, masonry
# veneer). They are filled with the explicit category "None".
absent_feature_columns = (
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    "GarageCond", "GarageQual", "GarageFinish", "GarageType",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "MasVnrType",
)
for col in absent_feature_columns:
    total_data[col] = total_data[col].fillna("None")

# LotFrontage is continuous: fill each gap with the median LotFrontage of the
# houses in the same Neighborhood.
total_data["LotFrontage"] = total_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

# Numeric companions of the "absent feature" categories above: no garage,
# basement or masonry veneer means the corresponding size/count is 0.
zero_fill_columns = (
    "GarageYrBlt", "GarageCars", "GarageArea",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "BsmtFullBath", "BsmtHalfBath",
    "MasVnrArea",
)
for col in zero_fill_columns:
    total_data[col] = total_data[col].fillna(0)

# Categorical features that have no "not available" category: fill the few gaps
# with the most frequent value of the column. (MSZoning used to be filled twice
# by a duplicated line; once is enough.)
mode_fill_columns = ("MSZoning", "KitchenQual", "Exterior1st", "Exterior2nd", "SaleType")
for col in mode_fill_columns:
    total_data[col] = total_data[col].fillna(total_data[col].mode()[0])

# Missing Functional is filled with the literal category "Typ"
# (presumably "typical" - confirm against the data description).
total_data["Functional"] = total_data["Functional"].fillna("Typ")

# Per the original author's note, Utilities has the same value for all
# observations except 3, so the column is dropped. errors='ignore' keeps the
# cell re-runnable after the column is already gone.
total_data = total_data.drop(columns=['Utilities'], errors='ignore')

total_data['MSSubClass'] = total_data['MSSubClass'].fillna("None")

In [None]:
# Fill missing Electrical entries with the most frequent category of the column.
most_common_electrical = total_data['Electrical'].mode()[0]
total_data['Electrical'] = total_data['Electrical'].fillna(most_common_electrical)

In [None]:
# Some numerically coded columns are really categories: the overall condition
# rating and the year / month of sale. Cast them to strings so that later
# steps treat them as categorical values.
for column_name in ('OverallCond', 'YrSold', 'MoSold'):
    total_data[column_name] = total_data[column_name].astype(str)

# The building class code (MSSubClass) is likewise a category, not a quantity.
total_data['MSSubClass'] = total_data['MSSubClass'].map(str)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are replaced by integer codes (one code per distinct value).
col = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley',
    'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
)

# A fresh encoder is fitted per column on the combined train+test values, so
# both parts of the data share the same code for the same category.
for c in col:
    lb = LabelEncoder()
    total_data[c] = lb.fit_transform(total_data[c])

In [None]:
# Sanity check: rows = train + test, columns = features after cleaning.
print(f'Shape all_data: {total_data.shape}')

In [None]:
from scipy.stats import skew

# Reduce skewness of the target and of the numeric features with log1p.
Y_train1 = np.log1p(Y_train)

# Number of training rows, taken from the target so it does not depend on
# whatever train_data currently holds (train_data is reassigned below).
n_rows_train = len(Y_train)

# Numeric features are all columns whose dtype is not "object".
features = total_data.dtypes[total_data.dtypes != "object"].index

# Skewness of every numeric feature (missing values ignored).
skewed_feats = total_data[features].apply(lambda x: skew(x.dropna()))
# Only features with skewness above 0.75 (i.e. right-skewed ones) are transformed.
skewed_feats = skewed_feats[skewed_feats > 0.75].index
total_data[skewed_feats] = np.log1p(total_data[skewed_feats])

# Slice train/test AFTER the transform so both frames reflect the final
# features; slicing before the assignment above left stale, untransformed
# snapshots.
train_data = total_data.iloc[:n_rows_train, :]
test_data = total_data.iloc[n_rows_train:, :]



In [None]:
# The log transform was applied to the skewed numeric features as well, but only
# the target is plotted here: first the raw SalePrice, then log1p(SalePrice),
# which looks much closer to a normal distribution.
# The y axis shows density estimates; pass kde=False to get raw frequencies.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; histplot(..., kde=True)
# is the replacement once the environment's seaborn version is confirmed.
chart1, ax1 = plt.subplots()
chart2, ax2 = plt.subplots()
for target_values, axis in ((Y_train, ax1), (Y_train1, ax2)):
    sns.distplot(target_values, norm_hist=False, ax=axis)

In [None]:
# One-hot encode the remaining categorical (string) columns; numeric columns
# pass through unchanged.
total_data_oh = pd.get_dummies(data=total_data)
total_data_oh.head(n=5)

In [None]:
# Split the encoded frame back into the labelled part (X) and the Kaggle test part.
X = total_data_oh.iloc[:n_rows_train, :]
test_data = total_data_oh.iloc[n_rows_train:, :]
print(X.shape)

# Hold out 30% of the labelled rows for validation. A fixed random_state makes
# the split, and therefore every downstream result, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y_train1, test_size=0.3, shuffle=True, random_state=42)

# PCA (5 components) was tried at this point but the models performed worse
# with it, so it is not applied.

# Scale the data. The scaler is fitted on the training split only and then
# applied to the hold-out split and the Kaggle test set, so no information
# from those sets leaks into the scaling parameters.
scaler = RobustScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))
test_data = pd.DataFrame(scaler.transform(test_data))
x_train.head()

In [None]:
# Sanity check: the log-transformed target has one entry per training row.
target_shape = Y_train1.shape
print(target_shape)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

# Cross-validation is a resampling procedure used to evaluate machine learning
# models on a limited data sample. It only estimates how a particular model
# performs on different held-out folds; it does not by itself say whether the
# model is the best one. Final predictions are still made with model.fit and
# model.predict.

# Number of cross-validation folds.
n_folds = 5


def rmse_cv(model):
    """Return the per-fold RMSE of `model` under shuffled K-fold cross-validation.

    The model is evaluated on the full labelled feature frame `X` against the
    log-transformed target `Y_train1` (both module-level names).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor; cross_val_score clones and
        fits it once per fold.

    Returns
    -------
    numpy.ndarray
        Array of length `n_folds` with the RMSE of each fold.
    """
    # Pass the KFold splitter itself. Calling .get_n_splits() on it returns the
    # plain integer n_folds, which made cross_val_score fall back to an
    # unshuffled split and silently discarded shuffle=True / random_state=42.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, X, Y_train1, scoring="neg_mean_squared_error", cv=kf)
    # The scorer returns negated MSE; flip the sign before taking the root.
    return np.sqrt(-neg_mse)

In [None]:
# Modelling: a small multi-layer perceptron regressor with four hidden layers.
# It is only constructed here; fitting happens elsewhere, if at all.
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(
    hidden_layer_sizes=(10, 6, 5, 3),
    activation='relu',
    alpha=0.0001,
    max_iter=1000,
    solver='lbfgs',
)


In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(y_pred, y_test):
    """Return the root mean squared error between predictions and true values.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_test : array-like
        Ground-truth values.

    Returns
    -------
    float
        Square root of the mean squared error.
    """
    mse = mean_squared_error(y_test, y_pred)
    return sqrt(mse)

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear regression fitted on the scaled training split.
# The target is log1p(SalePrice), so predictions are on the log scale.
# A hold-out check would be: rmse(lasso.predict(x_test), y_test)
lasso_alpha = 0.0005
lasso = Lasso(alpha=lasso_alpha)
lasso.fit(X=x_train, y=y_train)

In [None]:
import xgboost as xgb

# Gradient-boosted trees with the library's default hyper-parameters, fitted
# on the scaled training split against the log-transformed target.
# A hold-out check would be: rmse(xgb_model.predict(x_test), y_test)
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X=x_train, y=y_train)

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net (90% L1 / 10% L2 penalty mix) fitted on the scaled training
# split against the log-transformed target.
# A hold-out check would be: rmse(elastic_net_model.predict(x_test), y_test)
elastic_net_model = ElasticNet(
    alpha=0.0005,
    l1_ratio=.9,
    random_state=3,
)
elastic_net_model.fit(X=x_train, y=y_train)

In [None]:
# Predict on the scaled Kaggle test set. The models were trained on
# log1p(SalePrice), so expm1 maps their output back to the price scale.
log_pred_elastic_net = elastic_net_model.predict(test_data)
log_pred_lasso = lasso.predict(test_data)
y_pred_elastic_net_test_data = np.expm1(log_pred_elastic_net)
y_pred_lasso_test_data = np.expm1(log_pred_lasso)

In [None]:
# Final prediction: weighted blend of the two linear models
# (70% Lasso, 30% ElasticNet).
pred = 0.7*y_pred_lasso_test_data + 0.3*y_pred_elastic_net_test_data

In [None]:
# Build the submission file: one row per test house with its predicted price.
# The identifier column is named "Id", matching the column read from test.csv;
# the previous lowercase "id" header did not match it.
solution = pd.DataFrame({"Id": test_data_copy["Id"], "SalePrice": pred})
solution.to_csv("housing_pricefinal.csv", index=False)

In [None]:
#y_pred_lasso_test_data