In [None]:
# Importing Libraries and Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

# Read the training CSV from the working directory and preview its first five rows.
train = pd.read_csv('train.csv')
train.head(n=5)

In [None]:
# Count the missing entries in every column of `train`.
train_null = train.isna().sum()
train_null

In [None]:
# Remove the four columns the author identified as having more than 1000 nulls,
# then show the resulting (rows, columns).
sparse_columns = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
train = train.drop(columns=sparse_columns)
train.shape

In [None]:
# Collect the non-numeric columns and print each column name followed by its
# number of distinct values (unique() counts NaN as a value).
train_Object_DT = train.select_dtypes(exclude=[np.number])
for column_name, column_values in train_Object_DT.items():
    print(column_name + "\t" + "\t" + str(len(column_values.unique())))

In [None]:
# One-hot encode a hand-picked subset of the categorical columns.
# drop_first=True drops one level per column so the dummies are not perfectly collinear.
# dtype=int keeps the dummy columns numeric: pandas >= 2.0 emits bool dummies by default,
# and the select_dtypes(include=[np.number]) step that follows would silently discard them.
dummy_columns = [
    'MSZoning', 'Street', 'Utilities', 'BldgType', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'CentralAir', 'Electrical', 'KitchenQual',
    'GarageType', 'SaleType', 'SaleCondition',
]
train = pd.get_dummies(data=train, columns=dummy_columns, drop_first=True, dtype=int)
train.head(20)

In [None]:
# Keep only the numeric columns; any remaining non-numeric column is dropped before modelling.
train = train.select_dtypes(include='number')
train

In [None]:
# List the columns that still contain at least one null, with their null counts.
has_null = train.isnull().any()
null_columns = train.columns[has_null]
train[null_columns].isnull().sum()

In [None]:
#Converting column with Year data to Age for better results
def age(x, reference_year=2020):
    """Return how many years before ``reference_year`` the year ``x`` falls.

    Args:
        x: A calendar year (a scalar, or any object that supports being
            subtracted from an int, such as a numpy array or pandas Series).
        reference_year: Year to measure from. Defaults to 2020, the value
            that used to be hard-coded.

    Returns:
        ``reference_year - x``. NaN inputs stay NaN.
    """
    return reference_year - x
# Derive one age column from each year column via age().
# The source year columns are left in place by this cell.
age_column_sources = {
    'Year_Built_Age': 'YearBuilt',
    'Year_GarageBuilt_Age': 'GarageYrBlt',
    'Age_After_Remodel': 'YearRemodAdd',
    'Age_After_Sold': 'YrSold',
}
for age_column, year_column in age_column_sources.items():
    train[age_column] = train[year_column].apply(age)
train.head()

In [None]:
# Drop the raw year columns now that their age equivalents exist.
# LotFrontage and MasVnrArea are removed in the same step.
# NOTE(review): the reason is not stated; presumably their missing values — confirm.
columns_to_drop = ['YearBuilt', 'GarageYrBlt', 'YearRemodAdd', 'YrSold', 'LotFrontage', 'MasVnrArea']
train = train.drop(columns=columns_to_drop)

In [None]:
# Descriptive statistics for the columns that remain in `train`.
train.describe()

In [None]:
# Impute every remaining null with the mean of its own column.
# DataFrame.mean() skips NaN by default, and fillna() matches the resulting
# Series to the frame by column name, so each column gets its own mean.
# A plain fillna is used instead of a column-wise groupby because
# groupby(..., axis=1) is deprecated in recent pandas releases.
train = train.fillna(train.mean())
train.head(20)

In [None]:
# Compute the skewness of the target (stored, not displayed) and plot its raw distribution.
sale_price = train['SalePrice']
train_skew = sale_price.skew()
plt.hist(sale_price)
plt.show()

In [None]:
# Take the natural log of the target and plot the histogram of the transformed values.
train_skew_log = np.log(train['SalePrice'])
plt.hist(train_skew_log)
plt.show()

In [None]:
# Heatmap of the pairwise correlations between all columns of `train`,
# on a fixed [-1, 1] colour scale centred at 0.
plt.figure(figsize=(28, 28))
full_palette = sns.diverging_palette(20, 220, n=200)
corr1 = sns.heatmap(
    train.corr(),
    vmin=-1,
    vmax=1,
    center=0,
    cmap=full_palette,
    square=True,
)

In [None]:
# Names of the columns whose absolute correlation with SalePrice is at least 0.50.
# SalePrice itself is part of the result.
correlation_temp = train.corr()
strong_with_price = correlation_temp["SalePrice"].abs() >= 0.50
correlation_fin = correlation_temp.index[strong_with_price]
correlation_fin

In [None]:
# Heatmap restricted to the columns in correlation_fin, with cell borders drawn.
plt.figure(figsize=(25, 25))
subset_palette = sns.diverging_palette(20, 220, n=200)
corr2 = sns.heatmap(
    train[correlation_fin].corr(),
    vmin=-1,
    vmax=1,
    center=0,
    cmap=subset_palette,
    square=True,
    linewidth=1,
)

In [None]:
# Five largest correlations with SalePrice, in descending order.
# SalePrice's correlation with itself (1.0) is included in the five.
corr = train.corr()
top_correlations = corr['SalePrice'].sort_values(ascending=False).head(5)
print(top_correlations, '\n')

In [None]:
# Median SalePrice for each OverallQual level, printed and drawn as a bar chart.
rel_OverallQual = train.pivot_table(values='SalePrice', index='OverallQual', aggfunc=np.median)
print(rel_OverallQual)
rel_OverallQual.plot.bar()
plt.xlabel('Overall Quality')
plt.ylabel('Sale Price')
plt.xticks(rotation=0)  # keep the quality labels horizontal
plt.show()

In [None]:
# Median SalePrice per distinct GrLivArea value, then log(SalePrice) against
# GrLivArea to look for outliers.
rel_GrLivArea = train.pivot_table(index='GrLivArea', values='SalePrice', aggfunc=np.median)
print(rel_GrLivArea)
# The log target is computed from the current `train` rather than taken from the
# earlier train_skew_log variable: that series is not updated when rows are
# filtered out of `train`, so its length can stop matching train['GrLivArea'].
plt.scatter(x=train['GrLivArea'], y=np.log(train['SalePrice']))
plt.xlabel('Living Area')
plt.ylabel('Sale Price')
plt.show()

In [None]:
# Keep only the rows with GrLivArea below 4000, then redraw log(SalePrice) against GrLivArea.
# `train` is rebound here, so every later cell sees the filtered rows.
living_area_ok = train['GrLivArea'] < 4000
train = train.loc[living_area_ok]
plt.scatter(x=train['GrLivArea'], y=np.log(train['SalePrice']))
plt.xlabel('Living Area')
plt.ylabel('Sale Price')
plt.show()

In [None]:
# Median SalePrice for each GarageCars value, printed and drawn as a bar chart.
rel_GarageCars = train.pivot_table(values='SalePrice', index='GarageCars', aggfunc=np.median)
print(rel_GarageCars)
rel_GarageCars.plot.bar()
plt.xlabel('Garage Cars')
plt.ylabel('Sale Price')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

In [None]:
# Median SalePrice per distinct GarageArea value, then log(SalePrice) against
# GarageArea to look for outliers.
rel_GarageArea = train.pivot_table(values='SalePrice', index='GarageArea', aggfunc=np.median)
print(rel_GarageArea)
garage_area = train['GarageArea']
log_sale_price = np.log(train['SalePrice'])
plt.scatter(x=garage_area, y=log_sale_price)
plt.xlabel('Garage Area')
plt.ylabel('Sale Price')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Keep only the rows with GarageArea below 1100, then redraw log(SalePrice) against GarageArea.
# `train` is rebound here, so every later cell sees the filtered rows.
garage_area_ok = train['GarageArea'] < 1100
train = train.loc[garage_area_ok]
plt.scatter(x=train['GarageArea'], y=np.log(train['SalePrice']))
plt.xlabel('Garage Area')
plt.ylabel('Sale Price')
plt.show()

In [None]:
# Target y: natural log of SalePrice.
# Features X: every column of `train` except SalePrice and Id.
y = np.log(train['SalePrice'])
X = train.drop(columns=['SalePrice', 'Id'])

In [None]:
# Hold out 35% of the rows (fixed random_state so the split is repeatable),
# fit an ordinary least-squares model on the rest, and report R^2 on the hold-out set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.35, random_state=0)
regression = LinearRegression()
regression.fit(X_train, y_train)
# A stray LinearRegression(..., normalize=False) expression (a pasted output repr)
# was removed here: it built an unused object and `normalize` is no longer an
# accepted argument in current scikit-learn releases.
y_pred = regression.predict(X_test)
score = r2_score(y_test, y_pred)
score

In [None]:
# Recursive feature elimination down to 10 features, fitted on the full X, y.
# The selector gets its own name so the imported RFE class is not overwritten
# (rebinding RFE to an instance makes any later RFE(...) call fail), and
# n_features_to_select is passed by keyword because current scikit-learn
# releases do not accept it positionally.
rfe_top10 = RFE(regression, n_features_to_select=10)
X_RFE = rfe_top10.fit_transform(X, y)
# Refit the shared `regression` object on the reduced feature matrix.
regression.fit(X_RFE, y)

print(rfe_top10.support_)  # boolean mask over X's columns: True = kept
print(rfe_top10.ranking_)  # rank 1 marks the kept features

In [None]:
# Search for the number of RFE-selected features that gives the highest hold-out R^2.
# Every count from 1 up to the number of columns in X is tried.
# Re-import so this cell works even if the name RFE was rebound earlier in the session.
from sklearn.feature_selection import RFE

# The split does not depend on the feature count, so it is done once, before the loop.
# Note: this rebinds X_train / X_test / y_train / y_test (25% hold-out) for later cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

nof_list = np.arange(1, X.shape[1] + 1)
high_score = float('-inf')  # R^2 can be negative, so 0 is not a safe starting floor
# Variable to store the optimum number of features
nof = 0
score_list = []
for n_features in nof_list:
    model = LinearRegression()
    # n_features_to_select must be passed by keyword in current scikit-learn releases.
    rfe = RFE(model, n_features_to_select=n_features)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    if score > high_score:
        high_score = score
        nof = n_features

print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

In [None]:
# Names of the 10 features RFE keeps when fitted on the full X, y.
# Re-import so this cell works even if the name RFE was rebound earlier in the session.
from sklearn.feature_selection import RFE
cols = list(X.columns)
model = LinearRegression()
# n_features_to_select must be passed by keyword in current scikit-learn releases.
rfe = RFE(model, n_features_to_select=10)
X_rfe = rfe.fit_transform(X, y)
model.fit(X_rfe, y)
# support_ is a boolean mask over the columns; index it by column name and keep the True entries.
temp = pd.Series(rfe.support_, index=cols)
selected_features_rfe = temp[temp].index
print(selected_features_rfe)

In [None]:
# RidgeCV with its default alpha candidates, fitted on the full X, y.
# The printed score is R^2 on the same data used for fitting, not a hold-out score.
Rreg = RidgeCV()
Rreg.fit(X, y)
ridge_cv_alpha = Rreg.alpha_
ridge_cv_r2 = Rreg.score(X, y)
print("Best alpha with RidgeCV: %f" % ridge_cv_alpha)
print("Best score with RidgeCV: %f" % ridge_cv_r2)
# Coefficients sorted ascending and drawn as horizontal bars.
# NOTE(review): no feature scaling is applied in the visible cells, so coefficient
# sizes depend on each feature's units — confirm before reading them as importance.
coef = pd.Series(Rreg.coef_, index=X.columns)
imp_coef = coef.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (15.0, 30.0)  # global default; affects later figures too
imp_coef.plot.barh()
plt.title("Feature importance using Ridge Model")
In [None]:
# LassoCV with its default settings, fitted on the full X, y.
# The printed score is R^2 on the same data used for fitting, not a hold-out score.
Lreg = LassoCV()
Lreg.fit(X, y)
lasso_cv_alpha = Lreg.alpha_
lasso_cv_r2 = Lreg.score(X, y)
print("Best alpha with LassoCV: %f" % lasso_cv_alpha)
print("Best score with LassoCV: %f" % lasso_cv_r2)
# Coefficients sorted ascending and drawn as horizontal bars.
# NOTE(review): no feature scaling is applied in the visible cells, so coefficient
# sizes depend on each feature's units — confirm before reading them as importance.
coef = pd.Series(Lreg.coef_, index=X.columns)
imp_coef = coef.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (5.0, 30.0)  # global default; affects later figures too
imp_coef.plot.barh()
plt.title("Feature importance using Lasso Model")

In [None]:
# Fit Ridge for alpha = 10**-3 ... 10**2 and report hold-out R^2 and mean squared error.
# X_train / X_test / y_train / y_test are whatever the most recent train_test_split call produced.
for exponent in range(-3, 3):
    alpha = 10 ** exponent
    rm = Ridge(alpha=alpha)
    ridge_model = rm.fit(X_train, y_train)
    preds_ridge = ridge_model.predict(X_test)
    ridge_sweep_r2 = ridge_model.score(X_test, y_test)
    ridge_sweep_mse = mean_squared_error(y_test, preds_ridge)

    print('Ridge Regularization with alpha = {}'.format(alpha))
    print('Model Score: %f' % ridge_sweep_r2)
    print('Mean Square Error: %f' % ridge_sweep_mse, '\n')

In [None]:
# Fit Lasso for alpha = 10**-3 ... 10**2 and report hold-out R^2 and mean squared error.
# X_train / X_test / y_train / y_test are whatever the most recent train_test_split call produced.
for exponent in range(-3, 3):
    alpha = 10 ** exponent
    lrm = Lasso(alpha=alpha)
    lasso_model = lrm.fit(X_train, y_train)
    preds_lasso = lasso_model.predict(X_test)
    lasso_sweep_r2 = lasso_model.score(X_test, y_test)
    lasso_sweep_mse = mean_squared_error(y_test, preds_lasso)

    print('Lasso Regularization with alpha = {}'.format(alpha))
    print('Model Score: %f' % lasso_sweep_r2)
    print('Mean Square Error: %f' % lasso_sweep_mse, '\n')