In [None]:
"""
House Prices: Advanced Regression Techniques
Impute Missing Values
Data Distribution
Data Transformation from numerical to categorical column values
Label Encoding some categorical variables that may contain some information in their ordering set.
Getting Dummy Variables for categorical features
"""

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import skew
from sklearn.linear_model import Ridge,RidgeCV,ElasticNet,LassoCV,LassoLarsCV
from sklearn.model_selection import cross_val_score

In [None]:
# Directory holding the Kaggle "House Prices" train.csv / test.csv files.
# Defined once so the notebook can be pointed at another copy of the data by
# editing a single line instead of every read_csv call.
DATA_DIR = "C:/Users/320061150/HealthCare_Components_Task/Kaggle/house-prices-advanced-regression-techniques"

trainData = pd.read_csv(DATA_DIR + "/train.csv")
testData = pd.read_csv(DATA_DIR + "/test.csv")


In [None]:
"""
Concatenating from MSSubClass - SaleCondition. Excluding ID and SalePrice column
"""
# Stack the train and test feature columns (MSSubClass through SaleCondition)
# into one frame so both sets go through identical preprocessing.
# ignore_index=True gives the combined frame a fresh 0..n-1 row index; by
# default concat keeps each input's own row labels, so labels from the two
# frames can repeat in the result.
feature_columns = slice('MSSubClass', 'SaleCondition')
allData = pd.concat(
    (trainData.loc[:, feature_columns], testData.loc[:, feature_columns]),
    ignore_index=True,
)
   
"""
Data preprocessing:
We're not going to do anything fancy here:

First I'll transform the skewed numeric features by taking log(feature + 1) - this will make the features more normal
Create Dummy variables for the categorical features
Replace the numeric missing values (NaN's) with the mean of their respective columns
"""

# Plot the sale price next to its log1p transform (one histogram per column).
plt.rcParams['figure.figsize'] = (12.0, 6.0)
prices = (
    trainData[["SalePrice"]]
    .rename(columns={"SalePrice": "price"})
    .assign(**{"log(price + 1)": lambda frame: np.log1p(frame["price"])})
)
prices.hist()


In [None]:

# Log-transform the target so the model is fit on log(1 + SalePrice).
# NOTE: this overwrites trainData["SalePrice"], and the steps below overwrite
# allData, so running this cell twice on the same kernel applies the
# transforms twice. Reload the data before re-running it.
trainData["SalePrice"] = np.log1p(trainData["SalePrice"])

# Select numeric feature columns by dtype family instead of "anything that is
# not object": text columns are not guaranteed to be stored as object dtype,
# and a non-object text column would otherwise be handed to skew() below.
numeric_feats = allData.select_dtypes(include=[np.number]).columns

# Skewness is measured on the training rows only, ignoring missing values;
# columns with skew above 0.75 are treated as skewed.
skewed_feats = trainData[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

# log1p the skewed columns, turn the categorical columns into dummy
# variables, then fill remaining NaNs with each column's mean. The mean is
# taken over allData, i.e. over the train and test rows together.
allData[skewed_feats] = np.log1p(allData[skewed_feats])
allData = pd.get_dummies(allData)
allData = allData.fillna(allData.mean())


In [None]:
# allData was built as train rows followed by test rows, so a positional cut
# at the number of training rows separates the two again.
n_train_rows = trainData.shape[0]
X_train = allData.iloc[:n_train_rows]
X_test = allData.iloc[n_train_rows:]
y = trainData["SalePrice"]

In [None]:
"""
Models
Now we are going to use regularized linear regression models from the scikit-learn module. I'm going to try both l_1 (Lasso) and l_2 (Ridge) regularization. I'll also define a function that returns the cross-validation RMSE so we can evaluate our models and pick the best tuning parameter.
"""
def rmse_cv(model):
    """Return the cross-validated RMSE scores of ``model``.

    Runs 5-fold cross-validation of ``model`` on the notebook-level
    ``X_train`` / ``y`` with the "neg_mean_squared_error" scorer, then
    converts the negated MSE scores into RMSE values.

    Parameters
    ----------
    model : estimator
        Estimator passed straight to ``cross_val_score``.

    Returns
    -------
    The square root of the negated scores returned by ``cross_val_score``
    (one value per fold); call ``.mean()`` on it for a single summary number.
    """
    neg_mse_scores = cross_val_score(
        model, X_train, y, scoring="neg_mean_squared_error", cv=5
    )
    return np.sqrt(-neg_mse_scores)

In [None]:
"""
Lasso Regression
The lasso performs even better.
"""
# Fit the Lasso, letting LassoCV pick alpha from the candidate list.
model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, y)

# Report the cross-validated RMSE explicitly. A bare expression is only
# echoed when it is the last statement of a cell, so without print() this
# score would be computed and then thrown away.
print("Lasso CV RMSE: " + str(rmse_cv(model_lasso).mean()))

# The Lasso also does feature selection: coefficients of features it deems
# unimportant are set to zero. Count how many features were kept vs. dropped.
coef = pd.Series(model_lasso.coef_, index=X_train.columns)
n_kept = (coef != 0).sum()
n_dropped = (coef == 0).sum()
print("Lasso picked " + str(n_kept) + " variables and elimated the other " + str(n_dropped) + " variables")

"""
Identifying the important coefficients
"""

# Keep the ten smallest and the ten largest coefficients (ascending sort)
# and draw them as a horizontal bar chart.
coef_sorted = coef.sort_values()
imp_coef = pd.concat([coef_sorted.head(10), coef_sorted.tail(10)])
plt.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot.barh()
plt.title("Coefficients in the Lasso Model")

# The model was fit on log1p(SalePrice), so expm1 maps the predictions back
# to the original price scale.
lasso_preds = pd.DataFrame({'SalePrice': np.expm1(model_lasso.predict(X_test))})
lasso_preds['Id'] = testData['Id']

# Put Id first. Column selection returns a new frame, so the result has to be
# assigned back; selecting without assigning leaves the frame (and therefore
# the CSV) in SalePrice, Id order.
lasso_preds = lasso_preds[['Id', 'SalePrice']]
lasso_preds.to_csv('sample_submission.csv', index=False)