# Start

In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import scipy.stats as stats

In [56]:
df = pd.read_csv('House_Pred_train.csv')
df = df.drop(axis=1, columns='Id')

In [57]:
for col in df.columns:
    percent = ((df[col].isnull().sum())/len(df))*100
    if percent>0:
        (f'{col} \t {percent}')

Dropping Columns with Higher Missing Values

In [58]:
drop_cols = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
df = df.drop(axis=1, columns=drop_cols)

## NUMERICAL DF AND CATEGORIAL DF

In [59]:
numerical_df = df.select_dtypes(include=np.number)
categorial_df = df.select_dtypes(exclude=np.number)

# FEATURE ENGINNERING

### HNADLING NULL VALUES

#####                          NUMERICAL DF

In [60]:
missing = numerical_df.dropna().sample(numerical_df['GarageYrBlt'].isnull().values.sum())['GarageYrBlt']
missing.index = numerical_df[numerical_df.GarageYrBlt.isnull()].index
numerical_df.loc[numerical_df[numerical_df.GarageYrBlt.isnull()].index, 'GarageYrBlt'] = missing

In [61]:
missing = numerical_df.dropna().sample(numerical_df['LotFrontage'].isnull().values.sum())['LotFrontage']
missing.index = numerical_df[numerical_df.LotFrontage.isnull()].index
numerical_df.loc[numerical_df[numerical_df.LotFrontage.isnull()].index, 'LotFrontage'] = missing

In [62]:
missing = numerical_df.dropna().sample(numerical_df['MasVnrArea'].isnull().values.sum())['MasVnrArea']
missing.index = numerical_df[numerical_df.MasVnrArea.isnull()].index
numerical_df.loc[numerical_df[numerical_df.MasVnrArea.isnull()].index, 'MasVnrArea'] = missing

In [63]:
numerical_df.isnull().values.any()

False

##### CATEGORICAL DF

In [64]:
def handle_categorical_missing(datafram, column):
    missing = datafram.dropna().sample(datafram[column].isnull().values.sum())[column]
    missing.index = datafram[datafram[column].isnull()].index
    datafram.loc[datafram[datafram[col].isnull()].index, column] = missing

In [65]:
for col in ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType','GarageFinish','GarageQual','GarageCond']:
    handle_categorical_missing(categorial_df, col)

## Handling Outliers

In [66]:
def handling_outlier(dataframe, feature):
    median = dataframe[feature].median()
    quantile_1, quantile_3 = np.quantile(dataframe[feature], [0.25,0.75])
    IQR = quantile_3 - quantile_1
    lower_bound = quantile_1 - (1.5*IQR)
    upper_bound = quantile_3 + (1.5*IQR)
    dataframe.loc[dataframe[dataframe[feature]<lower_bound].index, feature] = lower_bound
    dataframe.loc[dataframe[dataframe[feature]>upper_bound].index, feature] = upper_bound

In [67]:
for col in numerical_df[:-1]:
    handling_outlier(numerical_df, col)

In [68]:
numerical_df.drop(columns=['KitchenAbvGr', 'EnclosedPorch', '3SsnPorch',
                           'ScreenPorch', 'PoolArea', 'MiscVal'], axis=1, inplace=True)

### Convert categorial into numerical

In [69]:
from sklearn.preprocessing import OrdinalEncoder

column_names = categorial_df.columns

ordinal_encoder = OrdinalEncoder()
categorial_df = ordinal_encoder.fit_transform(categorial_df)
categorial_df = pd.DataFrame(data=categorial_df, columns=column_names)

In [70]:
categorial_df.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,3.0,1.0,3.0,3.0,0.0,4.0,0.0,5.0,2.0,2.0,...,4.0,2.0,6.0,1.0,1.0,4.0,4.0,2.0,8.0,4.0
1,3.0,1.0,3.0,3.0,0.0,2.0,0.0,24.0,1.0,2.0,...,4.0,3.0,6.0,1.0,1.0,4.0,4.0,2.0,8.0,4.0
2,3.0,1.0,0.0,3.0,0.0,4.0,0.0,5.0,2.0,2.0,...,4.0,2.0,6.0,1.0,1.0,4.0,4.0,2.0,8.0,4.0
3,3.0,1.0,0.0,3.0,0.0,0.0,0.0,6.0,2.0,2.0,...,4.0,2.0,6.0,5.0,2.0,4.0,4.0,2.0,8.0,0.0
4,3.0,1.0,0.0,3.0,0.0,2.0,0.0,15.0,2.0,2.0,...,4.0,2.0,6.0,1.0,1.0,4.0,4.0,2.0,8.0,4.0


In [71]:
numerical_df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,MoSold,YrSold,SalePrice
0,60,65.0,8450.0,7,5.0,2003,2003.0,196.0,706.0,0,...,8,0.0,2003.0,2.0,548.0,0,61,2.0,2008,208500.0
1,20,80.0,9600.0,6,7.5,1976,1976.0,0.0,978.0,0,...,6,1.0,1976.0,2.0,460.0,298,0,5.0,2007,181500.0
2,60,68.0,11250.0,7,5.0,2001,2002.0,162.0,486.0,0,...,6,1.0,2001.0,2.0,608.0,0,42,9.0,2008,223500.0
3,70,60.0,9550.0,7,5.0,1915,1970.0,0.0,216.0,0,...,7,1.0,1998.0,3.0,642.0,0,35,2.0,2006,140000.0
4,60,84.0,14260.0,8,5.0,2000,2000.0,350.0,655.0,0,...,9,1.0,2000.0,3.0,836.0,192,84,12.0,2008,250000.0


In [72]:
final_df = pd.concat([numerical_df, categorial_df], axis=1)
final_df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,60,65.0,8450.0,7,5.0,2003,2003.0,196.0,706.0,0,...,4.0,2.0,6.0,1.0,1.0,4.0,4.0,2.0,8.0,4.0
1,20,80.0,9600.0,6,7.5,1976,1976.0,0.0,978.0,0,...,4.0,3.0,6.0,1.0,1.0,4.0,4.0,2.0,8.0,4.0
2,60,68.0,11250.0,7,5.0,2001,2002.0,162.0,486.0,0,...,4.0,2.0,6.0,1.0,1.0,4.0,4.0,2.0,8.0,4.0
3,70,60.0,9550.0,7,5.0,1915,1970.0,0.0,216.0,0,...,4.0,2.0,6.0,5.0,2.0,4.0,4.0,2.0,8.0,0.0
4,60,84.0,14260.0,8,5.0,2000,2000.0,350.0,655.0,0,...,4.0,2.0,6.0,1.0,1.0,4.0,4.0,2.0,8.0,4.0


# X & Y

In [127]:
X = final_df.drop(axis=1,columns='SalePrice')
y= final_df.SalePrice

# Feature Selection

In [128]:
from sklearn.feature_selection import f_regression

In [129]:
f,p = f_regression(X,y)

In [130]:
select = pd.Series(data=p, index=X.columns).sort_values(ascending=True)

In [131]:
select.index[:25]

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
       'ExterQual', '1stFlrSF', 'BsmtQual', 'FullBath', 'KitchenQual',
       'YearBuilt', 'YearRemodAdd', 'TotRmsAbvGrd', 'GarageFinish',
       'GarageYrBlt', 'Fireplaces', 'MasVnrArea', 'LotArea', 'HeatingQC',
       'Foundation', 'GarageType', 'OpenPorchSF', 'BsmtFinSF1', 'LotFrontage',
       'WoodDeckSF'],
      dtype='object')

In [132]:
X = final_df[select.index[:25]]
y= final_df.SalePrice

In [134]:
X.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,ExterQual,1stFlrSF,BsmtQual,FullBath,KitchenQual,...,Fireplaces,MasVnrArea,LotArea,HeatingQC,Foundation,GarageType,OpenPorchSF,BsmtFinSF1,LotFrontage,WoodDeckSF
0,7,1710.0,2.0,548.0,856,2.0,856.0,2.0,2.0,2.0,...,0.0,196.0,8450.0,0.0,2.0,1.0,61,706.0,65.0,0
1,6,1262.0,2.0,460.0,1262,3.0,1262.0,2.0,2.0,3.0,...,1.0,0.0,9600.0,0.0,1.0,1.0,0,978.0,80.0,298
2,7,1786.0,2.0,608.0,920,2.0,920.0,2.0,2.0,2.0,...,1.0,162.0,11250.0,0.0,2.0,1.0,42,486.0,68.0,0
3,7,1717.0,3.0,642.0,756,3.0,961.0,3.0,1.0,2.0,...,1.0,0.0,9550.0,2.0,0.0,5.0,35,216.0,60.0,0
4,8,2198.0,3.0,836.0,1145,2.0,1145.0,2.0,2.0,2.0,...,1.0,350.0,14260.0,0.0,2.0,1.0,84,655.0,84.0,192


# Standard Scaling

In [135]:
from sklearn.preprocessing import StandardScaler

In [136]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Splitting

In [137]:
from sklearn.model_selection import train_test_split

In [138]:
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=20, test_size=0.2)

# Model

In [139]:
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV

In [140]:
xgb = XGBRegressor()
xgb.fit(x_train, y_train)
pred_xgb = xgb.predict(x_test)
print(r2_score(pred_xgb, y_test))
print(mean_squared_error(pred_xgb, y_test))

0.8374076495803677
531497646.3473684


In [141]:
rfc = RandomForestRegressor()
rfc.fit(x_train, y_train)
pred_rfc = rfc.predict(x_test)
print(r2_score(pred_rfc, y_test))
print(mean_squared_error(pred_rfc, y_test))

0.8514803259487598
490070529.8473178


In [142]:
ridge = RidgeCV(cv=4)
ridge.fit(x_train, y_train)
pred_ridge = ridge.predict(x_test)
print(r2_score(pred_ridge, y_test))
print(mean_squared_error(pred_ridge, y_test))

0.8718160301945144
456817635.8308884
