In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [None]:
# Load the competition's training and test CSV files from the Kaggle input directory.
train=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# EDA

In [None]:
# 'Id' is a row identifier, not a feature, so drop it from both frames.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError.
train=train.drop(columns='Id',errors='ignore')
test=test.drop(columns='Id',errors='ignore')

In [None]:
# Numeric part: every column whose dtype is not object.
train_n=train.loc[:,train.dtypes!='O'].copy()
test_n=test.loc[:,test.dtypes!='O'].copy()

# Categorical part: every column stored with object dtype.
train_c=train.loc[:,train.dtypes=='O'].copy()
test_c=test.loc[:,test.dtypes=='O'].copy()

# **EDA on Numerical Features**


**1.1 Removing non-linear and co-linear features**

In [None]:
# Pairwise correlation between all columns of train_n (SalePrice included).
corrmat=train_n.corr()
fig,ax=plt.subplots(figsize=(12,12))
# The colour scale is capped at 0.8 (vmax); every cell is annotated with its value.
sns.heatmap(corrmat,vmax=.8, square=True,ax=ax,annot=True, fmt='.2f', annot_kws={'size': 6})

In [None]:
# Names of the n columns with the largest correlation to SalePrice.
# SalePrice itself is part of corrmat, so it is included in this selection.
n=15
top15_cols=corrmat.nlargest(n,'SalePrice')['SalePrice'].index

In [None]:
# Drop the first entry of top15_cols. nlargest sorts in descending order, so that
# entry is SalePrice itself (self-correlation 1.0); the remaining 14 are features.
top14_cols=top15_cols[1:]
top14_cols

In [None]:
# Correlation heatmap restricted to the top-15 columns, to make co-linear pairs easier to spot.
corrmat_top15=train_n[top15_cols].corr()
fig1,ax1=plt.subplots(figsize=(8,8))
sns.heatmap(corrmat_top15,vmax=.8, square=True,ax=ax1,annot=True, fmt='.2f', annot_kws={'size': 12})

--> Co-linear:

So we will take only one feature from each pair.

* GarageCars and GarageArea (I would keep GarageArea)
* TotalBsmtSF and 1stFlrSF (I would keep TotalBsmtSF)
* TotRmsAbvGrd and GrLivArea (I would keep GrLivArea)

I made this decision based on the scatter plots below.

In [None]:
# One row per co-linear pair: left and right panels show the two candidate features,
# each plotted against SalePrice (SalePrice on the x axis, as in the original layout).
colinear_pairs=[('GarageCars','GarageArea'),
                ('TotalBsmtSF','1stFlrSF'),
                ('TotRmsAbvGrd','GrLivArea')]
fig,ax=plt.subplots(len(colinear_pairs),2,figsize=(15,15))
for pair_row,pair in enumerate(colinear_pairs):
    for pair_col,feature in enumerate(pair):
        # x/y must be passed by keyword: seaborn >= 0.12 rejects positional vectors.
        sns.scatterplot(x=train_n['SalePrice'],y=train_n[feature],ax=ax[pair_row][pair_col])
fig.tight_layout()

In [None]:
# Remove the redundant member of each co-linear pair from both numeric frames.
train_n1=train_n.drop(columns=['GarageCars','1stFlrSF','TotRmsAbvGrd'])
test_n1=test_n.drop(columns=['GarageCars','1stFlrSF','TotRmsAbvGrd'])

We have removed the co-linear features from our dataset. Now we will manually examine each remaining feature and remove the non-linear ones.

In [None]:
# top_cols=np.array(['OverallQual', 'GrLivArea', 'GarageArea','TotalBsmtSF', 'FullBath', 'YearBuilt','YearRemodAdd'])

In [None]:
# Two plots per row; the row count follows the number of columns instead of being hard-coded.
n_plot_rows=(len(train_n1.columns)+1)//2
fig,ax=plt.subplots(n_plot_rows,2,figsize=(15,60),squeeze=False)
def graph(x,y,r,c,title):
    """Scatter-plot column `x` of train_n1 against the values `y` on subplot ax[r][c].

    x     -- column name in train_n1 (also used as the x-axis label)
    y     -- vector of target values plotted on the y axis
    r, c  -- row and column of the subplot in the shared `ax` grid
    title -- accepted for call compatibility; not used by the plot
    """
    # Keyword x/y: seaborn >= 0.12 no longer accepts the vectors positionally.
    sns.scatterplot(x=train_n1[x],y=y,color=('orange'),ax=ax[r][c])
    ax[r][c].set_xlabel(x)

for r,col in enumerate(train_n1.columns):
    c=r%2
    graph(col,train['SalePrice'],r//2,c,col)

# Lay the figure out once, after all subplots are drawn, rather than once per subplot.
fig.tight_layout(pad=5.0)


In [None]:
# Features judged (by eye, from the scatter plots) not to have a linear relationship with SalePrice.
non_linear=['MSSubClass','LotArea','OverallCond','LowQualFinSF','BsmtFullBath', 'BsmtHalfBath','FullBath', 'HalfBath','BedroomAbvGr',
            'KitchenAbvGr','Fireplaces','3SsnPorch','ScreenPorch' ,'PoolArea','MiscVal','MoSold', 'YrSold']
print("total non-linear",len(non_linear)," columns")

In [None]:
# Every column of train_n1 that was not flagged as non-linear, in original column order.
# NOTE(review): this still contains the target column SalePrice — see the [:-1] slice used for the test set below.
linear_features=np.array([col for col in train_n1.columns if col not in non_linear])

We have removed some non-linear features from the dataset

**1.2 Missing Values**

In [None]:
# Per-column missing-value counts of the numeric training frame, showing only columns that have any.
train_n1.isna().sum().loc[lambda counts: counts!=0]

In [None]:
# Per-column missing-value counts of the numeric test frame, showing only columns that have any.
test_n1.isna().sum().loc[lambda counts: counts!=0]

In [None]:
# GarageYrBlt: impute with the TRAINING median in both frames, so the test set is
# transformed with statistics learned from the training data only.
garage_yr_median=train_n1['GarageYrBlt'].median()
train_n1['GarageYrBlt']=train_n1['GarageYrBlt'].fillna(garage_yr_median)
test_n1['GarageYrBlt']=test_n1['GarageYrBlt'].fillna(garage_yr_median)

# All remaining numeric gaps are filled with 0 (one frame-level call instead of a per-column loop).
train_n1=train_n1.fillna(0)
test_n1=test_n1.fillna(0)

In [None]:
# Sanity check of the selected-feature shapes.
# The test frame is indexed with linear_features[:-1], i.e. without the last entry —
# presumably SalePrice, which the test set does not have (verify the column order).
print(train_n1[linear_features].shape)
print(test_n1[linear_features[:-1]].shape)

In [None]:
# Re-check after imputation: columns of the training frame that still contain missing values.
train_n1.isna().sum().loc[lambda counts: counts!=0]

In [None]:
# Re-check after imputation: columns of the test frame that still contain missing values.
test_n1.isna().sum().loc[lambda counts: counts!=0]

**1.3 Feature Generation**

In [None]:
# fig,ax=plt.subplots(9,2,figsize=(15,40))
# def graph(x,y,r,c,title):
#     sns.scatterplot(train_n1[x],train_n1[y],color=('red'),ax=ax[r][c])
#     ax[r][c].set_ylabel(y)
#     ax[r][c].set_xlabel(x)
#     fig.tight_layout(pad=5.0)

# for r,col in enumerate(train_n1[linear_features].columns):
#     c=r%2
#     graph(col,'SalePrice',r//2,c,col)

>* The scatter plots show that many features contain a large number of zeros.
>* This suggests that many missing values were filled with zero.

In [None]:
# Combined basement feature: sum of the four basement square-footage columns.
# NOTE(review): TotalBsmtSF may already equal the sum of the other three columns, in which
# case this counts the basement area twice — confirm against the dataset description.
train_n1['Bsmt']=train_n1['BsmtFinSF1']+train_n1['BsmtFinSF2']+train_n1['BsmtUnfSF']+train_n1['TotalBsmtSF']
test_n1['Bsmt']=test_n1['BsmtFinSF1']+test_n1['BsmtFinSF2']+test_n1['BsmtUnfSF']+test_n1['TotalBsmtSF']
# Keyword x/y: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=train_n1['Bsmt'],y=train_n1['SalePrice'])

In [None]:
# Candidate feature: enclosed plus open porch area, added to both frames.
train_n1['TotalPorchSF']=train_n1['EnclosedPorch']+train_n1['OpenPorchSF']
test_n1['TotalPorchSF']=test_n1['EnclosedPorch']+test_n1['OpenPorchSF']

# Keyword x/y: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=train_n1['TotalPorchSF'],y=train_n1['SalePrice'])

However, the new TotalPorchSF feature is not very useful, so we will not use it.

In [None]:
# Display the current feature selection before it is replaced by the hand-picked list below.
linear_features

In [None]:
# Hand-picked final numeric feature list. It replaces the earlier array, uses the combined
# 'Bsmt' column in place of the individual basement columns, and excludes SalePrice.
linear_features=['LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea','Bsmt', '2ndFlrSF', 'GrLivArea', 'GarageYrBlt',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch']

In [None]:
# Confirm that every name in linear_features exists in train_n1 (a missing name raises KeyError).
train_n1[linear_features].columns

In [None]:
# Shape of the selected training features.
train_n1[linear_features].shape

In [None]:
# Shape of the selected test features; the column count should match the training frame.
test_n1[linear_features].shape

**1.4 Outliers**

In [None]:
# Two plots per row; the row count follows the number of selected features instead of being hard-coded.
n_plot_rows=(len(linear_features)+1)//2
fig,ax=plt.subplots(n_plot_rows,2,figsize=(15,30),squeeze=False)
def graph(x,y,r,c,title):
    """Draw a regression plot of train_n1[x] against train_n1[y] on subplot ax[r][c].

    x, y  -- column names in train_n1 (also used as the axis labels)
    r, c  -- row and column of the subplot in the shared `ax` grid
    title -- accepted for call compatibility; not used by the plot
    """
    # Keyword x/y: seaborn >= 0.12 no longer accepts the vectors positionally.
    sns.regplot(x=train_n1[x],y=train_n1[y],color=('green'),ax=ax[r][c])
    ax[r][c].set_ylabel(y)
    ax[r][c].set_xlabel(x)

for r,col in enumerate(train_n1[linear_features].columns):
    c=r%2
    graph(col,'SalePrice',r//2,c,col)

# Lay the figure out once, after all subplots are drawn, rather than once per subplot.
fig.tight_layout(pad=5.0)

>Outliers:

     LotFrontage > 300 
     MasVnrArea > 1200
     Bsmt > 12000
     GrLivArea > 4600
     GarageArea > 1200
     SalePrice > 700000

In [None]:
# For each column named in the outlier list, take the rows holding its k largest values.
# k is hard-coded per column — presumably the number of points above the thresholds
# read off the plots; verify against the plots if the data changes.
# NOTE: `c` is also used as a loop variable in the plotting cells and is rebound here.
a=train_n1['LotFrontage'].sort_values(ascending=False).head(2)
b=train_n1['MasVnrArea'].sort_values(ascending=False).head(2)
c=train_n1['Bsmt'].sort_values(ascending=False).head(1)
d=train_n1['GrLivArea'].sort_values(ascending=False).head(2)
e=train_n1['GarageArea'].sort_values(ascending=False).head(3)
f=train_n1['SalePrice'].sort_values(ascending=False).head(2)

In [None]:
# Inspect the two largest MasVnrArea values and their row labels.
b

In [None]:
# Row labels of all outlier candidates, concatenated in the order a, b, c, d, e, f.
# Duplicate labels are kept, exactly as with successive single appends.
index=a.index.append([b.index,c.index,d.index,e.index,f.index])

In [None]:
# Remove the outlier rows and renumber the remaining rows from 0.
# This cell is not idempotent: after the reset, the same labels point at different rows.
train_n1=train_n1.drop(labels=index,axis=0).reset_index(drop=True)

>We have removed the outliers manually.

In [None]:
# fig,ax=plt.subplots(7,2,figsize=(15,30))
# def graph(x,y,r,c,title):
#     sns.regplot(train_n1[x],train_n1[y],color=('green'),ax=ax[r][c])
#     ax[r][c].set_ylabel(y)
#     ax[r][c].set_xlabel(x)
#     fig.tight_layout(pad=5.0)

# for r,col in enumerate(train_n1[linear_features].columns):
#     c=r%2
#     graph(col,'SalePrice',r//2,c,col)

In [None]:
# Shapes after outlier removal: the full numeric frame and the selected-feature view.
print(train_n1.shape)
print(train_n1[linear_features].shape)

**1.5 Skewness of the features**

In [None]:
# Target vector and numeric design matrix, taken from the outlier-free training frame.
Train_Y_n=train_n1['SalePrice']
Train_X_n=train_n1.loc[:,linear_features]

In [None]:
# Skewness of each selected numeric feature.
Train_X_n.skew()

In [None]:
# Distribution of the target before (left) and after (right) a log1p transform.
# sns.distplot is deprecated and scheduled for removal; histplot with kde=True and
# stat='density' is the documented replacement for a histogram with a density curve.
fig,ax=plt.subplots(1,2,figsize=(20,5))
sns.histplot(x=Train_Y_n,kde=True,stat='density',ax=ax[0])
sns.histplot(x=np.log1p(Train_Y_n),kde=True,stat='density',ax=ax[1])
# fig.tight_layout()

In [None]:
# Model target: log(1 + SalePrice). Predictions are mapped back with np.expm1 before submission.
train_y=np.log1p(Train_Y_n)

**1.6 scaling**

In [None]:
# Standardise the numeric features. The scaler is fitted on the training data only and
# then applied unchanged to the test data, so both sets share the same mean/variance.
# (Calling fit_transform on the test set would re-fit the scaler and put the test
# features on a different scale from the one the model is trained on.)
scaler=StandardScaler()
final_train_n=pd.DataFrame(scaler.fit_transform(Train_X_n),columns=Train_X_n.columns)
final_test_n=pd.DataFrame(scaler.transform(test_n1[linear_features]),columns=Train_X_n.columns)

In [None]:
# Preview the scaled training features.
final_train_n.head()

In [None]:
# Preview the scaled test features.
final_test_n.head()

# **EDA on Categorical Features**

In [None]:
# Rebuild the categorical (object-dtype) frames from the raw data, so this section starts from a clean copy.
train_c=train.loc[:,train.dtypes=='O'].copy()
test_c=test.loc[:,test.dtypes=='O'].copy()

In [None]:
# Shape of the categorical training frame, then its columns with missing values (most missing first).
print(train_c.shape)
train_c.isnull().sum()[train_c.isnull().sum()!=0].sort_values(ascending=False)

In [None]:
# Categorical test columns with missing values, most missing first.
test_c.isna().sum().loc[lambda counts: counts!=0].sort_values(ascending=False)

Drop the columns with more than 10% missing values (the first five columns listed above).

In [None]:
# Keep only the columns that have at least 90% non-missing values.
train_c=train_c.dropna(axis='columns',thresh=len(train_c)*0.9)

In [None]:
# Same 90% non-missing rule, evaluated on the test frame's own missing-value counts.
test_c=test_c.dropna(axis='columns',thresh=len(test_c)*0.9)

In [None]:
# Forward-fill the sparsely missing categorical columns with the previous row's value.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated in pandas 2.1+.
train_c['Electrical']=train_c['Electrical'].ffill()
for col in ['SaleType','KitchenQual','Exterior1st','Exterior2nd',
            'Functional','Utilities','MSZoning']:
    test_c[col]=test_c[col].ffill()


In [None]:
# Categorical missing values
for col in train_c.columns:
    train_c[col] = train_c[col].fillna("None")
for col in test_c.columns:
    test_c[col] = test_c[col].fillna("None")

In [None]:
# Number of categorical columns that still contain missing values (expected to be 0 for both).
print("Train null values : ",len(train_c.isnull().sum()[train_c.isnull().sum()!=0]))
print("Test null values : ",len(test_c.isnull().sum()[test_c.isnull().sum()!=0]))

In [None]:
# Shape of the cleaned categorical training frame.
train_c.shape

In [None]:
# Shape of the cleaned categorical test frame; the column count should match train_c.
test_c.shape

In [None]:
# train_c_labeled=pd.DataFrame()
# test_c_labeled=pd.DataFrame()

# for col in train_c.columns:
#     le = preprocessing.LabelEncoder()
#     train_c_labeled[col]=le.fit_transform(train_c[col])
#     test_c_labeled[col]=le.fit_transform(test_c[col])

In [None]:
# Stack train and test categoricals (train rows first) so one-hot encoding yields the same columns for both.
# NOTE: this rebinds `c`, which earlier cells used for a loop variable and an outlier Series.
c = pd.concat((train_c, test_c), sort=False).reset_index(drop=True)

In [None]:
# One-hot encode every categorical column of the combined frame.
c1=pd.get_dummies(c)

In [None]:
# The first len(train_c) rows of the encoded frame are the training rows.
final_train_c=c1.iloc[:train_c.shape[0]]
# Drop the same outlier row labels (`index`, built in the numeric section) that were removed
# from train_n1, then renumber so the rows line up with final_train_n.
final_train_c=final_train_c.drop(index).reset_index(drop=True)
final_train_c.shape

In [None]:
# The remaining rows of the encoded frame are the test rows; renumber them from 0.
final_test_c=c1.iloc[train_c.shape[0]:].reset_index(drop=True)
final_test_c.shape

In [None]:
# The row count here should equal final_train_c's row count before the two are merged.
final_train_n.shape

# Merging training and testing dataset

In [None]:
# Join the scaled numeric features and the one-hot categorical features row by row (matching on index).
x_train=pd.merge(final_train_n,final_train_c,left_index=True,right_index=True).reset_index(drop=True)
x_test=pd.merge(final_test_n,final_test_c,left_index=True,right_index=True).reset_index(drop=True)

In [None]:
# Shape of the merged training design matrix.
x_train.shape

In [None]:
# Preview the merged test design matrix.
x_test.head()

In [None]:
# Training target: the log1p-transformed SalePrice computed above.
y_train=train_y

# Splitting data into training and testing data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled training data for evaluation; random_state=0 makes the split reproducible.
# Note the naming: X_test/Y_test are the hold-out part of the training data, while x_test is the Kaggle test set.
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train,test_size = .3, random_state=0)

# **Regularization and Evaluation**

In [None]:
# GridSearchCV is a class inside sklearn.model_selection. The previous
# `import sklearn.model_selection as GridSearchCV` bound the whole module to that
# name, so calling GridSearchCV(...) would fail with "'module' object is not callable".
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

ridge=Ridge()
# Optional alpha search, kept for reference (uses np.sqrt so no extra import is needed):
# parameters= {'alpha':[x for x in range(1,101)]}

# ridge_reg=GridSearchCV(ridge, param_grid=parameters, scoring='neg_mean_squared_error')
# ridge_reg.fit(X_train,Y_train)
# print("The best value of Alpha is: ",ridge_reg.best_params_)
# print("The best score achieved with Alpha=11 is: ",np.sqrt(-ridge_reg.best_score_))
# ridge_pred=np.sqrt(-ridge_reg.best_score_)

In [None]:
# Evaluation model: fitted on the 70% training split ONLY, so the RMSE on the
# hold-out split is an honest estimate. (Fitting on the full x_train and then
# scoring X_test, which is a subset of x_train, leaks the hold-out rows into training.)
ridge_eval=Ridge(alpha=15)
ridge_eval.fit(X_train,Y_train)
y_pred_train=ridge_eval.predict(X_train)
y_pred_test=ridge_eval.predict(X_test)

print('Root Mean Square Error train = ' + str(np.sqrt(mean_squared_error(Y_train, y_pred_train))))
print('Root Mean Square Error test = ' + str(np.sqrt(mean_squared_error(Y_test, y_pred_test))))

# Final model for the submission: same hyper-parameter, refitted on all labelled data.
ridge_mod=Ridge(alpha=15)
ridge_mod.fit(x_train,y_train)

In [None]:
# Predict log1p(SalePrice) for the Kaggle test set.
y_test=ridge_mod.predict(x_test)

In [None]:
# Undo the log1p transform applied to the target to get prices back in the original units.
final_y_test=np.expm1(y_test)

In [None]:
# Inspect the predicted sale prices.
final_y_test

In [None]:
# 'Id' was dropped from the test frame earlier, so the ids are taken from the sample submission file.
# This assumes sample_submission.csv lists the rows in the same order as test.csv — verify.
sample=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
submission=pd.DataFrame({"Id":sample['Id'],
                         "SalePrice":final_y_test})
submission.to_csv('submission.csv',index=False)

In [None]:
# Preview the submission file contents.
submission.head()

# XGBoost Regression

In [None]:
# from xgboost.sklearn import XGBRegressor

In [None]:
# xg_reg = XGBRegressor()
# xgparam_grid= {'learning_rate' : [0.01],'n_estimators':[2000, 3460, 4000],
#                                     'max_depth':[3], 'min_child_weight':[3,5],
#                                     'colsample_bytree':[0.5,0.7],
#                                     'reg_alpha':[0.0001,0.001,0.01,0.1,10,100],
#                                    'reg_lambda':[1,0.01,0.8,0.001,0.0001]}

# xg_grid=GridSearchCV(xg_reg, param_grid=xgparam_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# xg_grid.fit(X_train,Y_train)
# print(xg_grid.best_estimator_)
# print(xg_grid.best_score_)

In [None]:
# xgb= XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
#              importance_type='gain', learning_rate=0.01, max_delta_step=0,
#              max_depth=3, min_child_weight=0, missing=None, n_estimators=4000,
#              n_jobs=1, nthread=None, objective='reg:squarederror', random_state=0,
#              reg_alpha=0.0001, reg_lambda=0.01, scale_pos_weight=1, seed=None,
#              silent=None, subsample=1, verbosity=1)
# xgmod=xgb.fit(X_train,Y_train)
# xg_pred=xgmod.predict(X_test)


In [None]:
# print('Root Mean Square Error test = ' + str(np.sqrt(mean_squared_error(Y_test, xg_pred))))