In [None]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
import os

# hide warnings
import warnings
warnings.filterwarnings('ignore')

#Setting the columns that can be displayed to max as 500 and rows to 300
pd.set_option('display.max_columns', 500)
pd.options.display.max_rows = 300

In [None]:
# reading the dataset
housing_orig = pd.read_csv("train.csv")

In [None]:
# Creating a deep copy of the data set to work on, so housing_orig stays untouched
housing = housing_orig.copy(deep= True)

In [None]:
#to have the look at the data
housing.head()

In [None]:
#Statistical description of the dataset
housing.describe()

In [None]:
#To understand the columns, shape and data types of various columns
housing.info()

In [None]:
#To understand the shape 
housing.shape

In [None]:
#to find the total no of rows that are null for each column in the dataset
housing.isnull().sum().sort_values(ascending=False)

In [None]:
# Checking the percentage of null values in the dataframe 'housing' column-wise
round(100*(housing.isnull().sum()/len(housing.index)), 2).sort_values(ascending=False)

In [None]:
#Checking the unique values for the columns
housing['PoolQC'].value_counts()

In [None]:
#Checking the unique values for the columns
housing['MiscFeature'].value_counts()

In [None]:
#Checking the unique values for the columns
housing['Alley'].value_counts()

In [None]:
housing.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis =1, inplace = True)

In [None]:
# Checking the percentage of null values in the dataframe 'housing' column-wise
round(100*(housing.isnull().sum()/len(housing.index)), 2).sort_values(ascending=False)

In [None]:
#Checking the values and their distributions of the 'FireplaceQu'
housing['FireplaceQu'].value_counts()

In [None]:
housing['FireplaceQu'] = housing['FireplaceQu'].replace(np.nan, 'NA')

In [None]:
#Checking the values and their distributions of the 'FireplaceQu'
housing['FireplaceQu'].value_counts()

In [None]:
housing['LotFrontage'].describe()

In [None]:
# Box plot of LotFrontage to check for outliers before imputing missing values.
# The column is passed by keyword: seaborn >= 0.12 no longer accepts x/y as
# positional arguments, so the positional form raises a TypeError there.
sns.boxplot(x='LotFrontage', data=housing)

In [None]:
housing['LotFrontage'] = housing['LotFrontage'].fillna(housing['LotFrontage'].median())

In [None]:
housing['GarageYrBlt'].value_counts()

In [None]:
# Reference year used below to turn the year columns into "age" features.
# The module is imported under its full name: the short alias `d` is reused
# further down the notebook for an unrelated integer.
# NOTE: the value depends on when the notebook is run, so the derived ages
# shift by a constant from one calendar year to the next.
import datetime
current_year = datetime.datetime.now().year

In [None]:
# Converting the year the garage was built into the garage's age in years, relative to current_year.
# Rows with a missing GarageYrBlt stay NaN here; they are imputed in a later cell.
housing['GarageYrBlt'] = current_year-housing['GarageYrBlt']

In [None]:
 # Box plot of the garage age to check for outliers; the column is passed as x= because seaborn >= 0.12 rejects it positionally.
sns.boxplot(x='GarageYrBlt', data=housing)

In [None]:
# The box plot above is taken to show no outliers, so missing garage ages are filled with the column mean.
# NOTE(review): the missing values presumably belong to houses without a garage - confirm that a mean age is the intended fill.
housing['GarageYrBlt'] = housing['GarageYrBlt'].fillna(housing['GarageYrBlt'].mean())

In [None]:
# Calculating how many years ago (relative to current_year) the house was sold
housing['YrSold'] = current_year-housing['YrSold']

In [None]:
# Calculating the age of the house: years between the year it was built and current_year
housing['YearBuilt'] = current_year-housing['YearBuilt']

In [None]:
# Calculating the number of years between the year recorded in YearRemodAdd and current_year
# (going by the column name this is the remodel/addition year, not the build year)
housing['YearRemodAdd'] = current_year-housing['YearRemodAdd']

In [None]:
housing['GarageType'].value_counts()

In [None]:
#Replacing NaN values to NA which indicates that the property doesnt have a garage.
housing['GarageType'] = housing['GarageType'].replace(np.nan, 'NA')

In [None]:
housing['GarageFinish'].value_counts()

In [None]:
#Replacing NaN values to NA which indicates that the property doesnt have a garage.
housing['GarageFinish'] = housing['GarageFinish'].replace(np.nan, 'NA')

In [None]:
housing['GarageQual'].value_counts()

In [None]:
#Replacing NaN values to NA which indicates that the property doesnt have a garage.
housing['GarageQual'] = housing['GarageQual'].replace(np.nan, 'NA')

In [None]:
#Replacing NaN values to NA which indicates that the property doesnt have a garage.
housing['GarageCond'] = housing['GarageCond'].replace(np.nan, 'NA')

In [None]:
housing['BsmtExposure'].value_counts()

In [None]:
# Replacing NaN values with 'NA'. This is a basement column, so a missing value presumably means the property has no basement (not "no garage").
# NOTE(review): the earlier claim that 'NA' is also the mode is not established here - check it against the value_counts() output above.
housing['BsmtExposure'] = housing['BsmtExposure'].replace(np.nan, 'NA')

In [None]:
housing['BsmtFinType2'].value_counts()

In [None]:
# Replacing NaN values with 'NA'. This is a basement column, so a missing value presumably means the property has no basement (not "no garage").
housing['BsmtFinType2'] = housing['BsmtFinType2'].replace(np.nan, 'NA')

In [None]:
housing['BsmtFinType1'].value_counts()

In [None]:
# Replacing NaN values with 'NA'. This is a basement column, so a missing value presumably means the property has no basement (not "no garage").
housing['BsmtFinType1'] = housing['BsmtFinType1'].replace(np.nan, 'NA')

In [None]:
housing['BsmtCond'].value_counts()

In [None]:
# Replacing NaN values with 'NA'. This is a basement column, so a missing value presumably means the property has no basement (not "no garage").
housing['BsmtCond'] = housing['BsmtCond'].replace(np.nan, 'NA')

In [None]:
housing['BsmtQual'].value_counts()

In [None]:
# Replacing NaN values with 'NA'. This is a basement column, so a missing value presumably means the property has no basement (not "no garage").
housing['BsmtQual'] = housing['BsmtQual'].replace(np.nan, 'NA')

In [None]:
 # Box plot of MasVnrArea to check for outliers; the column is passed as x= because seaborn >= 0.12 rejects it positionally.
sns.boxplot(x='MasVnrArea', data=housing)

In [None]:
housing['MasVnrArea'] = housing['MasVnrArea'].fillna(housing['MasVnrArea'].median())

In [None]:
housing['MasVnrArea'].isnull().sum()

In [None]:
housing['MasVnrType'].value_counts()

In [None]:
housing['MasVnrType'].mode()

In [None]:
#Replacing it with the mode i.e. the None
housing['MasVnrType'] = housing['MasVnrType'].replace(np.nan, 'None')

In [None]:
housing['MasVnrType'].isnull().sum()

In [None]:
housing['Electrical'].value_counts()

In [None]:
#Replacing it with the mode i.e. the SBrkr
housing['Electrical'] = housing['Electrical'].replace(np.nan, 'SBrkr')

In [None]:
# Checking the percentage of null values in the dataframe 'housing' column-wise
round(100*(housing.isnull().sum()/len(housing.index)), 2).sort_values(ascending=False)

In [None]:
# Checking whether 'SalePrice' is roughly normally distributed before regressing on it.
# histplot(kde=True, stat='density') draws the same density histogram + KDE curve
# that distplot used to; distplot has been deprecated since seaborn 0.11.
sns.histplot(housing['SalePrice'], kde=True, stat='density')

In [None]:
housing.columns

In [None]:
#Applying the log transformation technique on the SalePrice column to convert into a normal distributed data
housing['log_value'] = np.log(housing['SalePrice'])

In [None]:
# Distribution of the log-transformed sale price (density histogram + KDE).
# histplot replaces distplot, which has been deprecated since seaborn 0.11.
sns.histplot(housing['log_value'], kde=True, stat='density')

In [None]:
# Converting MSSubClass to object dtype: the column is read in as an integer, but it is actually a categorical variable
housing['MSSubClass'] = housing['MSSubClass'].astype(object)

In [None]:
housing.info()

In [None]:
housing.head()

In [None]:
housing_orig.columns

In [None]:
#Dropping the columns
housing.drop(['Heating','SaleCondition', 'CentralAir', 'LowQualFinSF', 'KitchenAbvGr', 'Functional', 'SaleType', 'MoSold', 'MiscVal', 'PoolArea','Scree

In [None]:
housing.columns

In [None]:
# all numeric (float and int) variables in the dataset
housing_numeric = housing.select_dtypes(include=['float64', 'int64'])
housing_numeric.head()

In [None]:
housing['BsmtFullBath'].value_counts()

In [None]:
housing['BsmtHalfBath'].value_counts()

In [None]:
# Pairwise correlations between the numeric columns
cor = housing_numeric.corr()

# Draw the correlation matrix as an annotated heatmap on a 16x8 inch figure
heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 8))
sns.heatmap(data=cor, annot=True, cmap="YlGnBu", ax=heatmap_ax)
plt.show()

In [None]:
cor*100

In [None]:
housing.columns

In [None]:
# Predictor columns kept for modelling
feature_columns = [
    'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
    'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'MasVnrArea', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC',
    '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
    'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
    'GarageFinish', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'YrSold',
]

# X holds the predictors, y the log-transformed sale price
X = housing.loc[:, feature_columns]
y = housing.loc[:, 'log_value']

In [None]:
# creating dummy variables for categorical variables

# subset all categorical variables
housing_categorical = X.select_dtypes(include=['object'])
housing_categorical.head()

In [None]:
# convert into dummies
housing_dummies = pd.get_dummies(housing_categorical, drop_first=True)
housing_dummies.head()

In [None]:
# drop categorical variables 
X = X.drop(list(housing_categorical.columns), axis=1)

In [None]:
# concat dummy variables with X
X = pd.concat([X, housing_dummies], axis=1)

In [None]:
# Standardising every feature with sklearn's scale()
from sklearn.preprocessing import scale

# scale() returns a plain numpy array, so the column names AND the row index
# are put back explicitly. Keeping the original index keeps X aligned with y
# label-for-label (the previous version silently reset the index to 0..n-1).
# NOTE(review): scaling is done on the full data set before the train/test
# split, so the test rows contribute to the scaling statistics.
cols = X.columns
X = pd.DataFrame(scale(X), columns=cols, index=X.index)
X.columns

In [None]:
# Split into train (70%) and test (30%) sets with a fixed random_state.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20, and this
# notebook already imports GridSearchCV from model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.7,
                                                    test_size = 0.3, random_state=100)

In [None]:
# Candidate regularisation strengths for the ridge search
ridge_alphas = [0.0001, 0.001, 0.01, 0.05, 0.1,
                0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0,
                4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000]
params = dict(alpha=ridge_alphas)

ridge = Ridge()

# 5-fold grid search over the alphas, scored on negative mean absolute error
folds = 5
model_cv = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv.fit(X_train, y_train)

In [None]:
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
cv_results

In [None]:
# plotting mean test and train scores against alpha
# The alphas are cast to float, not int: an integer cast truncates every
# alpha below 1 to 0, which stacks all of those points at x = 0.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')

# plotting
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper right')
plt.show()

In [None]:
print("\n The best estimator across ALL searched params:\n",
          model_cv.best_estimator_)
print("\n The best score across ALL searched params:\n",
          model_cv.best_score_)
print("\n The best parameters across ALL searched params:\n",
          model_cv.best_params_)

In [None]:
# Fitting ridge regression on the training set with the alpha picked from the grid search.
# NOTE(review): alpha is hard-coded to 100 - confirm it matches model_cv.best_params_['alpha'] whenever the search is re-run.
alpha = 100
ridge = Ridge(alpha=alpha)

ridge.fit(X_train, y_train)
ridge.coef_

In [None]:
# predict for the training dataset
y_train_pred = ridge.predict(X_train)
print('The training accuracy is:')
print(metrics.r2_score(y_true=np.exp(y_train), y_pred=np.exp(y_train_pred)))

In [None]:
# predict for the test dataset
y_test_pred = ridge.predict(X_test)
print('The testing accuracy is:')
print(metrics.r2_score(y_true=np.exp(y_test), y_pred=np.exp(y_test_pred)))

In [None]:
# model coefficients, paired with the feature each one belongs to.
# ridge.coef_ holds one value per column of X and does NOT include the
# intercept (that is stored separately in ridge.intercept_), so the column
# names are used as they are. Prepending "constant" to the names shifted
# every label by one position and dropped the last feature from the listing.
cols = X.columns
model_parameters = list(ridge.coef_)
list(zip(cols, model_parameters))

In [None]:
# (feature name, coefficient) pairs for the ridge model.
# ridge.coef_ has one entry per column of X and no intercept slot (the
# intercept is in ridge.intercept_), so no "constant" label is prepended:
# doing so paired every coefficient with the previous feature's name.
cols = X.columns
model_parameters = list(ridge.coef_)
ridge_list = list(zip(cols, model_parameters))

In [None]:
# Names of the predictors whose ridge coefficient is non-zero
final_pred_ridge = [name for name, coefficient in ridge_list if coefficient != 0]
print('Number of predictors selected by optimal alpha for ridge are:{0}'.format(len(final_pred_ridge)))

In [None]:
alpha_double = 200
ridge_double = Ridge(alpha=alpha_double)

ridge_double.fit(X_train, y_train)
ridge_double.coef_

In [None]:
# predict
y_train_pred_double = ridge_double.predict(X_train)
print(metrics.r2_score(y_true=y_train, y_pred=y_train_pred_double))

In [None]:
rsquare = metrics.r2_score(y_true=y_train, y_pred=y_train_pred_double)
rssbytss = 1-rsquare
rssbytss

In [None]:
y_test_pred = ridge_double.predict(X_test)
print(metrics.r2_score(y_true=y_test, y_pred=y_test_pred))

In [None]:
# (feature name, coefficient) pairs for the ridge model fitted with alpha_double.
# ridge_double.coef_ has one entry per column of X and no intercept slot (the
# intercept is in ridge_double.intercept_), so no "constant" label is
# prepended: doing so paired every coefficient with the previous feature's name.
cols = X.columns
model_parameters = list(ridge_double.coef_)
ridge_doble_list = list(zip(cols, model_parameters))

In [None]:
ridge_doble_list

In [None]:
# Names of the predictors whose coefficient is non-zero in the doubled-alpha ridge model
final_pred_ridge_double = [name for name, coefficient in ridge_doble_list if coefficient != 0]
print('Number of predictors selected by double the optimal alpha for ridge are:{0}'.format(len(final_pred_ridge_double)))

In [None]:
# list of alphas to tune
params = {'alpha': [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01]}


lasso = Lasso()

# cross validation
model_lasso_cv = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            

model_lasso_cv.fit(X_train, y_train) 

In [None]:
# Tabulating the lasso search results.
# They must come from model_lasso_cv; reading model_cv here put the ridge
# results under a lasso name, so the plot built from this frame showed ridge.
cv_results_lasso = pd.DataFrame(model_lasso_cv.cv_results_)
cv_results_lasso.head()

In [None]:
# plotting mean test and train scoes with alpha 
cv_results_lasso['param_alpha'] = cv_results_lasso['param_alpha'].astype('float32')

# plotting
plt.plot(cv_results_lasso['param_alpha'], cv_results_lasso['mean_train_score'])
plt.plot(cv_results_lasso['param_alpha'], cv_results_lasso['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')

plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper right')
plt.show()

In [None]:
print("\n The best estimator across ALL searched params:\n",
          model_lasso_cv.best_estimator_)
print("\n The best score across ALL searched params:\n",
          model_lasso_cv.best_score_)
print("\n The best parameters across ALL searched params:\n",
          model_lasso_cv.best_params_)

In [None]:
alpha_lasso =0.001

lasso = Lasso(alpha=alpha_lasso)
        
lasso.fit(X_train, y_train) 

In [None]:
lasso.coef_

In [None]:
# (feature name, coefficient) pairs for the lasso model.
# lasso.coef_ has one entry per column of X and no intercept slot (the
# intercept is in lasso.intercept_), so no "constant" label is prepended:
# doing so paired every coefficient with the previous feature's name.
cols = X.columns
model_parameters = list(lasso.coef_)
lasso_list = list(zip(cols, model_parameters))

In [None]:
lasso_list

In [None]:
#List of all predictors with non zero co-efficients
# Iterates over lasso_list itself: the previous loop took its bound from `a`,
# a list that is only created in a later cell, so a fresh run raised NameError.
for name, coefficient in lasso_list:
    if coefficient != 0:
        print(name)

In [None]:
lm = Lasso(alpha=0.001)
lm.fit(X_train, y_train)

# predict
y_train_pred = lm.predict(X_train)
print('The training accuracy is:')
print(metrics.r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = lm.predict(X_test)
print('The test accuracy is:')
print(metrics.r2_score(y_true=y_test, y_pred=y_test_pred))

In [None]:
# Names of the predictors whose lasso coefficient is non-zero.
# The message now names lasso (it previously read "optimaloptimal alpha for ridge").
final_pred_lasso = [name for name, coefficient in lasso_list if coefficient != 0]
print('Number of predictors selected by optimal alpha for lasso are:{0}'.format(len(final_pred_lasso)))

In [None]:
# Lasso refitted with alpha=0.002, double the 0.001 used for `lm`
lm_double = Lasso(alpha=0.002)
lm_double.fit(X_train, y_train)

# predict
y_train_pred_double = lm_double.predict(X_train)
print('The training accuracy is:')
print(metrics.r2_score(y_true=y_train, y_pred=y_train_pred_double))
# The test predictions must come from lm_double as well: the previous version
# called lm.predict here and so reported the test score of the alpha=0.001 model.
y_test_pred_double = lm_double.predict(X_test)
print('The test accuracy is:')
print(metrics.r2_score(y_true=y_test, y_pred=y_test_pred_double))

In [None]:
# (feature name, coefficient) pairs for the doubled-alpha lasso model.
# lm_double.coef_ has one entry per column of X and no intercept slot (the
# intercept is in lm_double.intercept_), so no "constant" label is prepended:
# doing so paired every coefficient with the previous feature's name.
cols = X.columns
model_parameters = list(lm_double.coef_)
a = list(zip(cols, model_parameters))

#List of all predictors with non zero co-efficients
final_pred = [name for name, coefficient in a if coefficient != 0]
print('Number of predictors selected by double the optimal alpha for lasso are:{0}'.format(len(final_pred)))

In [None]:
a

In [None]:
d = len(final_pred)

In [None]:
d

In [None]:
n = len(X)

In [None]:
X_train_new = X_train.drop(['2ndFlrSF', 'MSZoning_RH', 'LotArea', 'MSZoning_RL', 'MSSubClass_190'], axis=1)

In [None]:
X_test_new = X_test.drop(['2ndFlrSF', 'MSZoning_RH', 'LotArea', 'MSZoning_RL', 'MSSubClass_190'], axis=1)

In [None]:
X_train_new.columns

In [None]:
model_lasso_cv.fit(X_train_new, y_train)

In [None]:
cv_results_lasso = pd.DataFrame(model_lasso_cv.cv_results_)

In [None]:
# plotting mean test and train scoes with alpha 
cv_results_lasso['param_alpha'] = cv_results_lasso['param_alpha'].astype('float32')

# plotting
plt.plot(cv_results_lasso['param_alpha'], cv_results_lasso['mean_train_score'])
plt.plot(cv_results_lasso['param_alpha'], cv_results_lasso['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')

plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper right')
plt.show()

In [None]:
print("\n The best estimator across ALL searched params:\n",
          model_lasso_cv.best_estimator_)
print("\n The best score across ALL searched params:\n",
          model_lasso_cv.best_score_)
print("\n The best parameters across ALL searched params:\n",
          model_lasso_cv.best_params_)

In [None]:
alpha_lasso =0.001

lasso = Lasso(alpha=alpha_lasso)
        
lasso.fit(X_train_new, y_train) 

In [None]:
# (feature name, coefficient) pairs for the lasso refitted on the reduced feature set.
# The names come from X_train_new - the columns this model was fitted on -
# rather than X, which still contains the dropped columns. No "constant"
# label is prepended either: lasso.coef_ has no intercept slot (the intercept
# is in lasso.intercept_), so the extra label shifted every name by one.
cols = X_train_new.columns
model_parameters = list(lasso.coef_)
lasso_list = list(zip(cols, model_parameters))

In [None]:
lm = Lasso(alpha=0.001)
lm.fit(X_train_new, y_train)

# predict
y_train_pred = lm.predict(X_train_new)
print('The training accuracy is:')
print(metrics.r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred = lm.predict(X_test_new)
print('The test accuracy is:')
print(metrics.r2_score(y_true=y_test, y_pred=y_test_pred))

In [None]:
lasso_list

In [None]:
# Names of the predictors with a non-zero coefficient in the lasso refitted on X_train_new.
# The message no longer says "double the optimal alpha": that text was copied
# from the alpha=0.002 cell, while this lasso_list comes from the reduced-feature refit.
final_lasso_new_pred = [name for name, coefficient in lasso_list if coefficient != 0]
print('Number of predictors selected by lasso on the reduced feature set are:{0}'.format(len(final_lasso_new_pred)))