In [None]:
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt

In [None]:
# Read the training CSV, using its 'Id' column as the row index.
train_csv_path = "/house-prices-advanced-regression-techniques/train.csv"
df = pd.read_csv(train_csv_path, index_col='Id')
# Leave the frame as the last expression so the cell displays it.
df

# Data Preprocessing

## Dealing with NaN Values

In [None]:
# Map each column with more than 500 missing values to its missing-value count.
missing_per_col = np.array(df.isna().sum())
nan_cols = {
    name: count
    for name, count in zip(df.columns, missing_per_col)
    if count > 500
}
nan_cols

These columns have too many missing values so we can't use them:

'Alley': 1369,

'MasVnrType': 872,

'FireplaceQu': 690,

'PoolQC': 1453,

'Fence': 1179,

'MiscFeature': 1406

In [None]:
# Remove the columns collected in nan_cols from df, in place.
df.drop(columns=list(nan_cols), inplace=True)

In [None]:
# Display df as it stands after the high-missing-value columns were dropped.
df

In [None]:
# Inspect the dtype pandas assigned to the 'MSZoning' column.
df.dtypes['MSZoning']

In [None]:
# Map every column that still contains missing values to its missing-value count.
nan_rows = {
    name: count
    for name, count in zip(df.columns, np.array(df.isna().sum()))
    if count > 0
}
nan_rows

In [None]:
# Of the columns with missing values, keep the names of those with object dtype.
cols_with_nan = list(nan_rows)
nan_obj_col = df[cols_with_nan].select_dtypes(include='object').columns

In [None]:
# Display the object-dtype columns that contain missing values.
nan_obj_col

In [None]:
# Fill missing values in each object column with that column's most frequent value.
for col in nan_obj_col:
    # value_counts() sorts by descending frequency, so the first label is the most common one.
    most_common = df[col].value_counts().index[0]
    df[col] = df[col].fillna(most_common)

In [None]:
# Recompute the columns that still have missing values now that the object columns are filled.
remaining_missing = np.array(df.isna().sum())
nan_rows = {
    name: count
    for name, count in zip(df.columns, remaining_missing)
    if count > 0
}
nan_rows

In [None]:
# Fill the missing values of every column still listed in nan_rows with the column mean.
for col in nan_rows:
    col_mean = df[col].mean()
    df[col] = df[col].fillna(col_mean)

In [None]:
# Total number of missing cells across the whole frame (sum over columns of per-column counts).
df.isna().sum().sum()

Hence, no null values are left.

## Dealing with duplicates

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

Hence, no duplicated rows

## Applying Label Encoder

In [None]:
# Names of all object-dtype columns; these are the ones label-encoded later.
obj_cols = df.select_dtypes(include = 'object').columns

In [None]:
# Display the object-dtype column names.
obj_cols

In [None]:
# Plot one histogram per object-dtype column on a 3-column grid.
n_grid_cols = 3
# Derive the number of rows from the number of columns instead of hard-coding 13,
# so plt.subplot never receives an index beyond the grid when the column set changes.
n_grid_rows = max(1, int(np.ceil(len(obj_cols) / n_grid_cols)))
plt.figure(figsize=(15, 5 * n_grid_rows))
for i, col in enumerate(obj_cols):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    plt.hist(df[col], bins=20, edgecolor='black')
    plt.title(col)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label-encode every object column and keep the fitted encoder of each column in `les`.
les = {}
for col in obj_cols:
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df[col])
    # The integer codes are shifted by 0.5 before being stored, so the column becomes float.
    df[col] = codes + 0.5
    les[col] = encoder

In [None]:
# Every column that was not label-encoded is treated as numerical.
numerical_col = [col for col in df.columns if col not in obj_cols]
print(numerical_col)

The last column (`SalePrice`) is the prediction target, so it is removed from the list of numerical feature columns.

In [None]:
# Exclude the prediction target from the numerical feature list.
# Filtering by name (rather than slicing off the last element) keeps this cell idempotent:
# re-running it no longer removes one more real feature each time.
numerical_col = [col for col in numerical_col if col != 'SalePrice']

Outlier removal is performed after the data visualization below.

# Data Visualization

In [None]:
# Histogram (20 bins) of every column listed in numerical_col, drawn on one 20x20 figure.
df[numerical_col].hist(bins = 20,figsize = (20,20))

In [None]:
# Hand-picked list of columns that are box-plotted in the next cell.
# NOTE(review): the name suggests these are the continuous features — confirm the selection
# (e.g. 'MSSubClass' and 'MoSold' are included).
continous_cols = ['MSSubClass','LotFrontage','LotArea','YearBuilt','TotalBsmtSF','BsmtUnfSF',
                  'BsmtFinSF2','BsmtFinSF1','YearRemodAdd','MasVnrArea','1stFlrSF', '2ndFlrSF',
                  'LowQualFinSF','GrLivArea', 'GarageYrBlt','GarageArea','WoodDeckSF','OpenPorchSF',
                  'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','MoSold']

In [None]:
import seaborn as sns

# Draw a separate box plot for each column in continous_cols.
for col in continous_cols:
    _fig, box_ax = plt.subplots()
    box_ax.boxplot(df[col])
    box_ax.set_title(col)
    plt.show()

# Outlier Detection and Removal

In [None]:
# Columns on which z-score based outlier detection is performed.
outlier_detection_on_col = ['MSSubClass','LotFrontage','LotArea','YearBuilt','TotalBsmtSF','BsmtUnfSF',
                  'BsmtFinSF2','BsmtFinSF1','MasVnrArea','1stFlrSF', '2ndFlrSF',
                  'LowQualFinSF','GrLivArea', 'GarageYrBlt','GarageArea','WoodDeckSF','OpenPorchSF',
                  'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']
# A value is an outlier when it lies more than `threshold` standard deviations from its column mean.
threshold = 3
means = np.mean(df[outlier_detection_on_col], axis=0)
std = np.std(df[outlier_detection_on_col], axis=0)
z_scores = np.abs((df[outlier_detection_on_col] - means) / std)
# Collect the index LABELS (the 'Id' values) of rows with at least one outlying column.
# np.where(...)[0] would return 0-based row positions (repeated once per outlying cell);
# df.drop interprets its argument as labels of the 'Id' index, so positions would remove
# the wrong rows or raise KeyError for a position that is not an existing Id.
outliers = z_scores.index[(z_scores > threshold).any(axis=1)]
# Number of distinct rows flagged as outliers.
print(len(outliers))

In [None]:
# Drop the rows referenced by `outliers` from df, in place.
# Note: DataFrame.drop matches these values against the index labels, not row positions.
df.drop(index=outliers, inplace=True)

In [None]:
# Display df after the outlier rows were dropped.
df

# Correlation 

In [None]:
# Heatmap of the pairwise correlation matrix of all columns in df.
corr_matrix = df.corr()
_, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_matrix, cmap='coolwarm', ax=ax)
plt.show()

# Mutual Information

In [None]:
from sklearn.feature_selection import mutual_info_regression 

In [None]:
# Estimate the mutual information between each feature column and SalePrice (seeded for repeatability).
mi_features = df.drop(columns=['SalePrice'])
mi_target = df['SalePrice']
mutual_info = mutual_info_regression(mi_features, mi_target, random_state=42)

In [None]:
# Pair each column name (all columns of df except the last) with its mutual-information score,
# wrapping each score in a one-element list so the dict can become a one-row DataFrame.
dc = {col: [val] for col, val in zip(df.columns[:-1], mutual_info)}

In [None]:
# Turn the {column: [score]} mapping into a one-row DataFrame with one column per feature.
dc = pd.DataFrame(dc)

In [None]:
# Names of the features whose mutual information with SalePrice lies within [-0.01, 0.01].
low_mi_mask = (mutual_info >= -0.01) & (mutual_info <= 0.01)
less_mutual_info_col = dc.columns[low_mi_mask]

In [None]:
# Display the low-mutual-information column names.
less_mutual_info_col

In [None]:
# Names of the columns whose correlation with SalePrice lies within [-0.025, 0.025].
corr_with_price = df.corr()['SalePrice']
weak_corr_mask = (corr_with_price >= -0.025) & (corr_with_price <= 0.025)
df.loc[:, weak_corr_mask].columns

Now select the columns that have both a low correlation with SalePrice (absolute value at most 0.025) and low mutual information with SalePrice (at most 0.01).

In [None]:
# Columns selected for removal (low correlation and low mutual information with SalePrice).
# NOTE(review): this list is hard-coded, presumably copied from the two outputs above —
# re-check it if the data or preprocessing changes.
remov_col = ['Street','BsmtHalfBath','MiscVal','YrSold']

In [None]:
# Remove the columns listed in remov_col from df, in place.
df.drop(columns=remov_col, inplace=True)

In [None]:
# Display df after the low-information columns were removed.
df

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every column of df into the NumPy array X.
# This includes the target: a later cell reads X[:, -1] back as 'SalePrice'.
# NOTE(review): the scaler is fitted on all rows before the train/test split — confirm this is intended.
sc = StandardScaler()
X = sc.fit_transform(df)

In [None]:
# Fit a separate scaler on the target only, so scaled predictions can be mapped back to prices.
scy = StandardScaler()
# StandardScaler expects X of shape (n_samples, n_features). Wrapping the Series in a list
# produced a (1, n_samples) input, i.e. a single sample with one "feature" per house.
# Selecting with a list of column names yields the intended (n_samples, 1) frame.
scy.fit(df[['SalePrice']])

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the standardized features (all columns of X except the last) onto 24 principal components.
pca = PCA(n_components=24)
scaled_features = X[:, :-1]
scaled_target = X[:, -1]
x = pca.fit_transform(scaled_features, scaled_target)
# Build a frame of the components plus the standardized target in a 'SalePrice' column.
pca_df = pd.DataFrame(x)
pca_df['SalePrice'] = scaled_target
# Correlation of every component with the target.
pca_df.corr()['SalePrice']

In [None]:
# Heatmap of the correlation matrix of the principal components and the target.
pca_corr = pca_df.corr()
_, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(pca_corr, cmap='bone', ax=ax)
plt.show()

# Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the split, and therefore every metric reported below,
# reproducible across runs; 42 matches the seed used for mutual_info_regression.
X_train, X_test, y_train, y_test = train_test_split(
    pca_df.drop('SalePrice', axis=1),
    pca_df['SalePrice'],
    test_size=0.2,
    random_state=42,
)

# Linear Regression

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split.
lr = LinearRegression()
lr.fit(X_train,y_train)

In [None]:
# Score the linear model on the held-out split.
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE : ", mse)
print("R2 : ", r2)

# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
 
# Expand the training features into all polynomial terms up to degree 4, then fit a linear model on them.
# NOTE(review): X_train holds the 24 PCA components, so degree 4 yields C(28, 4) = 20475 features —
# likely far more than there are training rows; check for overfitting.
poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X_train)
 
lin2 = LinearRegression()
lin2.fit(X_poly, y_train)

In [None]:
# Score the polynomial model on the held-out split, using the same feature expansion as in training.
X_test_poly = poly.transform(X_test)
y_pred = lin2.predict(X_test_poly)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE : ", mse)
print("R2 : ", r2)

# Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with regularization strength alpha = 1, fitted on the training split.
rr = Ridge(alpha = 1)
rr.fit(X_train,y_train)

In [None]:
# Score the ridge model on the held-out split.
y_pred = rr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE : ", mse)
print("R2 : ", r2)

# Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with regularization strength alpha = 1, fitted on the training split.
# NOTE(review): y_train is the standardized SalePrice, so alpha = 1 may be strong enough to
# shrink every coefficient to zero — inspect lar.coef_ and consider a smaller alpha.
lar = Lasso(alpha = 1)
lar.fit(X_train,y_train)

In [None]:
# Score the lasso model on the held-out split.
y_pred = lar.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE : ", mse)
print("R2 : ", r2)

# ARDRegression

In [None]:
from sklearn.linear_model import ARDRegression

# ARD regression with default hyperparameters, fitted on the training split.
ardr = ARDRegression()
ardr.fit(X_train,y_train)

In [None]:
# Score the ARD model on the held-out split.
y_pred = ardr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE : ", mse)
print("R2 : ", r2)