# Importing Useful libraries

In [1]:
# import libraries
import numpy       as np
import pandas      as pd
import matplotlib.pyplot as plt
import seaborn     as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from collections import Counter


In [2]:
# Load the train / test CSV files.
# NOTE(review): absolute, machine-specific paths -- adjust for your environment.
TRAIN_CSV = 'F:/Books/Machine Learning/DataSets/house-prices-advanced-regression-technique/train.csv'
TEST_CSV = 'F:/Books/Machine Learning/DataSets/house-prices-advanced-regression-technique/test.csv'
raw_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

# Private, seeded generator so the random imputations below are reproducible
# on "Restart Kernel -> Run All".
rng = np.random.RandomState(42)

# Separate the target from the features.
raw_label = raw_data['SalePrice'].copy()
raw_data = raw_data.drop(columns='SalePrice')

# Column names grouped by dtype (before any encoding).
object_features = raw_data.dtypes[raw_data.dtypes == 'object'].keys()
int_features    = raw_data.dtypes[raw_data.dtypes == 'int64'].keys()
float_features  = raw_data.dtypes[raw_data.dtypes == 'float64'].keys()

# Categorical columns that are discarded outright
# ('MiscFeature' used to be listed twice; the duplicate is removed).
drop_obj_feat = ['Street','Alley','Utilities','LandSlope','Condition2','RoofMatl','Heating',
                'BsmtCond','PoolQC','MiscFeature','HeatingQC','Functional','GarageQual','GarageCond',
                'PavedDrive','ExterCond','CentralAir','Fence','FireplaceQu','MasVnrType']

object_features_2 = object_features.drop(drop_obj_feat)

# Remaining categorical columns with missing values get the placeholder
# category 'not_given'.
null = raw_data[object_features_2].isnull().sum()
null_obj = null[null > 0].index.tolist()
raw_data.loc[:,null_obj] = raw_data.loc[:,null_obj].fillna('not_given')

# Categories that occur in fewer than 2% of the rows are merged into the
# single category 'allmix'.
split = round(raw_data.shape[0] * .02)
for i in object_features_2:
    count = raw_data[i].value_counts()
    rare = raw_data[i].isin(count[count < split].index)
    # Single .loc write: the chained form raw_data[i][mask] = ... assigns to a
    # temporary object under pandas copy-on-write and is then lost.
    raw_data.loc[rare, i] = 'allmix'

# One-hot encode the kept categorical columns, then remove their source
# columns together with the discarded categoricals.
# dtype is pinned to uint8 so the result does not depend on the pandas version
# (newer releases default to bool).
dummies = pd.get_dummies(raw_data[object_features_2], dtype='uint8')
raw_data = pd.concat([raw_data, dummies], axis=1)
raw_data = raw_data.drop(columns=list(object_features_2) + drop_obj_feat)

int_features = raw_data.select_dtypes(['int64']).columns

# Hand-picked thresholds: rows beyond them are treated as outliers.
outlier_rules = [
    ('LotArea',       '>', 35000),
    ('OverallQual',   '<', 2),
    ('OverallCond',   '<', 2),
    ('BsmtFinSF1',    '>', 2500),
    ('LowQualFinSF',  '>', 500),
    ('BsmtFullBath',  '>', 2),
    ('FullBath',      '<', 1),
    ('BedroomAbvGr',  '>', 6),
    ('KitchenAbvGr',  '>', 2),
    ('WoodDeckSF',    '>', 600),
    ('OpenPorchSF',   '>', 400),
    ('EnclosedPorch', '>', 300),
    ('ScreenPorch',   '>', 350),
    ('MiscVal',       '>', 2100),
]
outliers = []
for column, side, limit in outlier_rules:
    values = raw_data[column]
    flagged = values > limit if side == '>' else values < limit
    outliers.extend(values[flagged].index.tolist())

# Integer columns that are dropped later instead of being cleaned.
drop_int_feat = ['BsmtHalfBath', 'BsmtFinSF2', '3SsnPorch', 'PoolArea']


# Fill missing values, using statistics computed without the outliers.

# LotFrontage: draw from a normal distribution fitted to the non-outlier values
# (standard deviation shrunk by 1.5).
outlier = raw_data.LotFrontage[raw_data.LotFrontage > 200].index.tolist()
outliers.extend(outlier)
m = raw_data.LotFrontage.drop(outlier).mean()
s = raw_data.LotFrontage.drop(outlier).std()
lot_missing = raw_data.LotFrontage.isna()
randoms = rng.normal(m , s/1.5 , lot_missing.sum())
raw_data.loc[lot_missing, 'LotFrontage'] = randoms


# MasVnrArea: fill with the mean of the non-outlier values.
outlier = raw_data.MasVnrArea[raw_data.MasVnrArea > 1100].index.tolist()
outliers.extend(outlier)
m = raw_data.MasVnrArea.drop(outlier).mean()
# Re-assign the column; fillna(inplace=True) on a column accessor does not
# write back to the frame under copy-on-write.
raw_data['MasVnrArea'] = raw_data['MasVnrArea'].fillna(m)


# GarageYrBlt: draw whole years around the column mean (half the standard deviation).
garage_missing = raw_data.GarageYrBlt.isna()
x = rng.normal(round(raw_data.GarageYrBlt.mean()),
               round(raw_data.GarageYrBlt.std()/2),
               garage_missing.sum())
raw_data.loc[garage_missing, 'GarageYrBlt'] = np.round(x)


def outlierDetect(df, no_of_feature_contains = 3):
    """Return the index labels of rows that are IQR outliers in several columns.

    For every column the fences ``q1 - 1.5 * IQR`` and ``q3 + 1.5 * IQR`` are
    computed; each row outside them is flagged once for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to scan.
    no_of_feature_contains : int, default 3
        A row is returned only when it is flagged in strictly more than this
        many columns.

    Returns
    -------
    list
        Index labels of the selected rows, in order of first detection.
    """
    outlier_indices = []
    for col in df:
        values = df[col]
        # nanpercentile ignores missing values; with np.percentile a single NaN
        # turns both quartiles into NaN and the column silently flags nothing.
        q1, q3 = np.nanpercentile(values, [25, 75])
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        flagged = (values < lower) | (values > upper)
        outlier_indices.extend(values[flagged].index.tolist())

    counted_outliers = Counter(outlier_indices)
    return [k for k, v in counted_outliers.items() if v > no_of_feature_contains]


# Also flag rows that the IQR rule marks in more than 5 of the integer columns.
outliers.extend(outlierDetect(raw_data[int_features],no_of_feature_contains = 5))

# Remove every flagged row from the features and from the target.
outlier_rows = sorted(set(outliers))
raw_data = raw_data.drop(index=outlier_rows)
raw_label = raw_label.drop(index=outlier_rows)
assert raw_data.index.equals(raw_label.index), "features and labels are misaligned"

# Drop the integer columns selected for removal.
raw_data = raw_data.drop(columns=drop_int_feat)

# Numeric feature list: integer columns plus float columns, without the
# dropped columns and without the 'Id' identifier.
int_features = int_features.append(raw_data.select_dtypes('float64').columns)
int_features = int_features.drop(drop_int_feat).drop('Id')

# One-hot columns. 'bool' is included because newer pandas versions emit
# boolean dummies, which would leave a uint8-only selection silently empty.
uint8_features = raw_data.select_dtypes(['uint8', 'bool']).columns


In [None]:
raw_data.shape , raw_label.shape

In [None]:
# Read the training and the test split from disk
train_csv = 'F:/Books/Machine Learning/DataSets/house-prices-advanced-regression-technique/train.csv'
test_csv = 'F:/Books/Machine Learning/DataSets/house-prices-advanced-regression-technique/test.csv'
raw_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

In [None]:
# Columns that exist in the training data but not in the test data
# NOTE(review): presumably only the target column 'SalePrice' -- confirm against the displayed output
set(raw_data.columns).difference(test_data.columns)

In [None]:
# Separate the target column from the features
raw_label = raw_data['SalePrice']
raw_data.drop(columns=['SalePrice'], inplace=True)

In [None]:
# Shapes of the training features, the test features and the target
for dataset in (raw_data, test_data, raw_label):
    print(dataset.shape)

In [None]:
# Information about data
raw_data.info()

In [None]:
# Group the column names by dtype: object / int64 / float64
column_dtypes = raw_data.dtypes
object_features = column_dtypes[column_dtypes == 'object'].keys()
int_features    = column_dtypes[column_dtypes == 'int64'].keys()
float_features  = column_dtypes[column_dtypes == 'float64'].keys()

#raw_data.select_dtypes(include=[object]).columns 

# Number of columns in each group (counts noted by the author: 43, 34, 3)
for feature_group in (object_features, int_features, float_features):
    print(len(feature_group))

# 1. Feature Exploration, Engineering and Cleaning


## 1.1 Dealing with Object Data_Type

In [None]:
# Missing-value map of the categorical columns
fig, ax = plt.subplots(figsize=(12,10))
missing_mask = raw_data[object_features].isnull()
sns.heatmap(missing_mask, ax=ax)
plt.show()

In [None]:
# One bar chart of category frequencies per categorical column
for i in object_features:
    # Pass the column as x=: newer seaborn takes the first positional argument as `data`
    sns.countplot(x=raw_data[i])
    plt.show()

In [None]:
# Print the category frequencies of every categorical column
for i in object_features:
    frequencies = raw_data[i].value_counts()
    print(frequencies)

In [None]:
#####################################
##### Dropping  Object Feature ######
#####################################
# Street     :: 99% of the values are identical, so the column carries almost no information
# Alley      :: only 91 values are non-null; the rest are missing, so we can drop this column
# Utilities  :: 99.99% are same
# LandSlope  :: same values most
# Condition2 :: same values most
# RoofMatl   :: same values most
# Heating    :: same values most
# BsmtCond   :: same values most
# PoolQC     :: null values most
# MiscFeature:: null values most
# HeatingQC  :: mostly yes
# Functional :: not varies
# GarageQual :: not varies ## take it
# GarageCond :: same value most
# PavedDrive :: null value most
# MiscFeature:: mostly are null
# ExterCond  :: same value most
# CentralAir :: not varies
# Fence      :: not varies
# MasVnrType :: None

In [None]:
# Categorical columns that are discarded outright
drop_obj_feat = [
    'Street', 'Alley', 'Utilities', 'LandSlope', 'Condition2', 'RoofMatl', 'Heating',
    'BsmtCond', 'PoolQC', 'MiscFeature', 'HeatingQC', 'Functional', 'GarageQual', 'GarageCond',
    'PavedDrive',
    'MiscFeature',  # listed a second time in the original list; kept as is
    'ExterCond', 'CentralAir', 'Fence', 'FireplaceQu', 'MasVnrType',
]

# Categorical columns that remain after the removal
object_features_2 = object_features.drop(drop_obj_feat)

### 1.1.1 Dealing with missing values in Object DataType

In [None]:
# Names of the remaining categorical columns that contain at least one null
null = raw_data[object_features_2].isnull().sum()
null_obj = [key for key, value in zip(null.index, null.values) if value != 0]

In [None]:
null_obj  # these are only features which contains null values

In [None]:
# Category frequencies of the columns that contain missing values
for i in null_obj:
    # Pass the column as x=: newer seaborn takes the first positional argument as `data`
    sns.countplot(x=raw_data[i])
    plt.show()

In [None]:
# The mode is not a good replacement because the columns are distributed differently,
# so every missing value gets the fixed placeholder 'not_given' instead
filled_columns = raw_data.loc[:,null_obj].fillna('not_given')
raw_data.loc[:,null_obj] = filled_columns

In [None]:
raw_data[object_features_2].info()

### 1.1.2 Masking the values of Object DataTypes
#### I tried to reduce the dimensionality of the dataset. (Consider skipping this step: categories that are rare in the training data may be frequent in the test data.)

In [None]:
# Categories that occur in fewer than 2% of the rows are merged into one
# shared category value, 'allmix' (no new column is created)
split = round(raw_data.shape[0] * .02)
for i in object_features_2:
    count = raw_data[i].value_counts()
    rare = raw_data[i].isin(count[count < split].index)
    # Single .loc write: the chained form raw_data[i][mask] = ... assigns to a
    # temporary object under pandas copy-on-write and is then lost.
    raw_data.loc[rare, i] = 'allmix'

In [None]:
# Category frequencies after the rare categories were merged
for i in object_features_2:
    # Pass the column as x=: newer seaborn takes the first positional argument as `data`
    sns.countplot(x=raw_data[i])
    plt.show()

In [None]:
# Print the category frequencies of the kept categorical columns
for i in object_features_2:
    frequencies = raw_data[i].value_counts()
    print(frequencies)

### 1.1.3 One Hot Encoding

In [None]:
pd.get_dummies(raw_data[object_features_2]).head()

In [None]:
# Concatenate the one-hot columns with the remaining features.
# dtype is pinned to uint8 so the encoded columns have the same dtype on every
# pandas version (newer releases default to bool) and can later be selected
# with select_dtypes('uint8').
dummies = pd.get_dummies(raw_data[object_features_2], dtype='uint8')
raw_data = pd.concat([raw_data, dummies], axis=1)
raw_data.drop(object_features_2, inplace=True,axis=1) # source columns are now one-hot encoded
raw_data.drop(drop_obj_feat, inplace=True,axis=1) # columns judged not important

In [None]:
raw_data.head()

## 1.2 Dealing with int,float DataType

In [None]:
int_features = raw_data.select_dtypes(['int64']).columns

In [None]:
raw_data[int_features].hist(bins='auto',figsize=(15,35),layout=(13,3))
plt.show()

### 1.2.1 Dealing with Missing, Outlier of IntDataType

In [None]:
raw_data.select_dtypes('int64').info()

In [None]:
## Int datatype features not contains any missing values

In [None]:
raw_data.select_dtypes('int64').describe()

In [None]:
## Detect Int datatype outliers

In [None]:
#raw_data.LotArea[raw_data.LotArea > 35000] # drop these 18 observations
#raw_data.OverallQual[raw_data.OverallQual < 2]  # 375 , 533
## OverallCond ## seem to be important
## OverallQual ## seem to be important
## raw_data.OverallCond[raw_data.OverallCond < 2]  # 375
## yearbuild       ok
## yearremodadd    ok
#raw_data.BsmtFinSF1[raw_data.BsmtFinSF1 > 2500]   # 1298
#raw_data.BsmtFinSF2.value_counts() DROP
## BsmtfinUnfSF    ok
#raw_data.TotalBsmtSF[raw_data.TotalBsmtSF > 3500]  # 1298
#raw_data['1stFlrSF'][raw_data['1stFlrSF'] > 3500]  # 1298
# 2ndflrsf         ok
#raw_data.LowQualFinSF[raw_data.LowQualFinSF > 500]  # 88,170,185,635,1009
#raw_data.GrLivArea[raw_data.GrLivArea > 5000]       # 1298
#raw_data.BsmtFullBath[raw_data.BsmtFullBath > 2]  # 738
#BsmthalfBath   drop
#raw_data.FullBath[raw_data.FullBath < 1] # 9 observations
# Halfbath      ok
#raw_data.BedroomAbvGr[raw_data.BedroomAbvGr > 6] # 635 
#raw_data.KitchenAbvGr[raw_data.KitchenAbvGr > 2] # 48, 809
# fireplace     ok
# garage cars   ok
# garage ares   ok
#raw_data.WoodDeckSF[raw_data.WoodDeckSF > 600] # 6 observations
#raw_data.OpenPorchSF[raw_data.OpenPorchSF > 400] # 5 observations
#raw_data.EnclosedPorch[raw_data.EnclosedPorch > 300] # 5 observations
# 3SnPorsch  drop
#raw_data.ScreenPorch[raw_data.ScreenPorch > 350]  # 6 observations
# poolarea   drop
#raw_data.MiscVal[raw_data.MiscVal > 2100]  # 346,705,1230,1457
# MoSold    ok
# YrSold    ok

In [None]:
# Plot the distribution of every int64 column except 'Id' to spot outliers
int_columns = raw_data.select_dtypes('int64').drop('Id',axis=1).columns
for i in int_columns:
    sns.distplot(raw_data[i])
    plt.show()

In [None]:

# Hand-picked thresholds: (column, side, limit); rows beyond the limit are outliers
outlier_rules = [
    ('LotArea',       '>', 35000),
    ('OverallQual',   '<', 2),
    ('OverallCond',   '<', 2),
    ('BsmtFinSF1',    '>', 2500),
    ('LowQualFinSF',  '>', 500),
    ('BsmtFullBath',  '>', 2),
    ('FullBath',      '<', 1),
    ('BedroomAbvGr',  '>', 6),
    ('KitchenAbvGr',  '>', 2),
    ('WoodDeckSF',    '>', 600),
    ('OpenPorchSF',   '>', 400),
    ('EnclosedPorch', '>', 300),
    ('ScreenPorch',   '>', 350),
    ('MiscVal',       '>', 2100),
]

outliers = []
for column, side, limit in outlier_rules:
    values = raw_data[column]
    flagged = values > limit if side == '>' else values < limit
    outliers.extend(values[flagged].index.tolist())

# Integer columns that are dropped later instead of being cleaned
drop_int_feat = ['BsmtHalfBath', 'BsmtFinSF2', '3SsnPorch', 'PoolArea']


### 1.2.2. Dealing with Missing, Outlier values of Float DataTypes

In [None]:
raw_data.select_dtypes('float64').info()

In [None]:
# Only Float datatype features contains missing values
sns.heatmap(raw_data.select_dtypes('float64').isna())

In [None]:
# Distribution of every float64 column, ignoring missing values
float_columns = raw_data.select_dtypes('float64').columns
for i in float_columns:
    observed = raw_data[i].dropna()
    sns.distplot(observed)
    plt.show()

In [None]:
# Fill missing values, using statistics computed without the outliers.

# Private, seeded generator so the random imputations are reproducible.
rng = np.random.RandomState(42)

# LotFrontage: draw from a normal distribution fitted to the non-outlier values
# (standard deviation shrunk by 1.5).
outlier = raw_data.LotFrontage[raw_data.LotFrontage > 200].index.tolist()
outliers.extend(outlier)
m = raw_data.LotFrontage.drop(outlier).mean()
s = raw_data.LotFrontage.drop(outlier).std()
lot_missing = raw_data.LotFrontage.isna()
randoms = rng.normal(m , s/1.5 , lot_missing.sum())
# Single .loc write instead of chained indexing, which is lost under copy-on-write.
raw_data.loc[lot_missing, 'LotFrontage'] = randoms


# MasVnrArea: fill with the mean of the non-outlier values.
outlier = raw_data.MasVnrArea[raw_data.MasVnrArea > 1100].index.tolist()
outliers.extend(outlier)
m = raw_data.MasVnrArea.drop(outlier).mean()
# Re-assign the column; fillna(inplace=True) on a column accessor does not
# write back to the frame under copy-on-write.
raw_data['MasVnrArea'] = raw_data['MasVnrArea'].fillna(m)


# GarageYrBlt: draw whole years around the column mean (half the standard deviation).
garage_missing = raw_data.GarageYrBlt.isna()
x = rng.normal(round(raw_data.GarageYrBlt.mean()),
               round(raw_data.GarageYrBlt.std()/2),
               garage_missing.sum())
raw_data.loc[garage_missing, 'GarageYrBlt'] = np.round(x)


In [None]:
# Distribution of every float64 column after the imputation
float_columns = raw_data.select_dtypes('float64').columns
for i in float_columns:
    sns.distplot(raw_data[i])
    plt.show()

In [None]:

from collections import Counter
# Detect outlier using Interquartile rate

def outlierDetect(df, no_of_feature_contains = 3):
    """Return the index labels of rows that are IQR outliers in several columns.

    For every column the fences ``q1 - 1.5 * IQR`` and ``q3 + 1.5 * IQR`` are
    computed; each row outside them is flagged once for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to scan.
    no_of_feature_contains : int, default 3
        A row is returned only when it is flagged in strictly more than this
        many columns.

    Returns
    -------
    list
        Index labels of the selected rows, in order of first detection.
    """
    outlier_indices = []
    for col in df:
        values = df[col]
        # nanpercentile ignores missing values; with np.percentile a single NaN
        # turns both quartiles into NaN and the column silently flags nothing.
        q1, q3 = np.nanpercentile(values, [25, 75])
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        flagged = (values < lower) | (values > upper)
        outlier_indices.extend(values[flagged].index.tolist())

    counted_outliers = Counter(outlier_indices)
    return [k for k, v in counted_outliers.items() if v > no_of_feature_contains]


In [None]:
# Also flag rows that the IQR rule marks in more than 5 of the integer columns.
outliers.extend(outlierDetect(raw_data[int_features],no_of_feature_contains = 5))

# Remove every flagged row from the features and from the target.
outlier_rows = sorted(set(outliers))
raw_data = raw_data.drop(index=outlier_rows)
raw_label = raw_label.drop(index=outlier_rows)
assert raw_data.index.equals(raw_label.index), "features and labels are misaligned"

# Drop the integer columns selected for removal.
raw_data = raw_data.drop(columns=drop_int_feat)

# Numeric feature list: integer columns plus float columns, without the
# dropped columns and without the 'Id' identifier.
int_features = int_features.append(raw_data.select_dtypes('float64').columns)
int_features = int_features.drop(drop_int_feat).drop('Id')

# One-hot columns. 'bool' is included because newer pandas versions emit
# boolean dummies, which would leave a uint8-only selection silently empty.
uint8_features = raw_data.select_dtypes(['uint8', 'bool']).columns


In [None]:
# Annotated correlation matrix of the numeric features
corr = raw_data[int_features].corr()
fig, ax = plt.subplots(figsize=(16,16))
heatmap_options = dict(
    cbar=True,
    square=True,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 5},
    xticklabels=int_features,
    yticklabels=int_features,
    alpha=0.7,
    cmap='coolwarm',
)
sns.heatmap(corr, ax=ax, **heatmap_options)
plt.show()

In [3]:
from xgboost              import XGBRegressor
from sklearn.svm          import SVR
from sklearn.ensemble     import RandomForestRegressor
from sklearn.ensemble     import AdaBoostRegressor
from sklearn.ensemble     import ExtraTreesRegressor 
from sklearn.ensemble     import GradientBoostingRegressor
from sklearn.neighbors    import KNeighborsRegressor
from sklearn.metrics      import r2_score, accuracy_score, mean_squared_error
from sklearn.pipeline     import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing   import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import time

In [13]:
# Hold out 30% of the rows for evaluation.
# The 'Id' identifier is dropped by name rather than by position (iloc[:,1:]),
# so the split does not depend on the column order of the frame.
x_train, x_test, y_train, y_test = train_test_split(raw_data.drop(columns='Id'),
                                                    raw_label,
                                                    test_size=.3,
                                                    random_state=45,
                                                    shuffle=True)

In [14]:
# Standardise the numeric features; the scaler is fitted on the training part
# only and then applied to the hold-out part.
sc = StandardScaler()
new_x_train = sc.fit_transform(x_train[int_features])
new_x_test = sc.transform(x_test[int_features])

# Work on explicit copies: the frames returned by train_test_split may be
# slices of raw_data, and writing into them triggers SettingWithCopy problems.
x_train = x_train.copy()
x_test = x_test.copy()
x_train[int_features] = new_x_train
x_test[int_features] = new_x_test

In [15]:
# Baseline XGBoost regressor; the displayed value is the score on the hold-out split
model = XGBRegressor(tree_method='gpu_hist',
                     objective='reg:squarederror').fit(x_train, y_train)
model.score(x_test,y_test)

0.8869171873078638

In [None]:
# Candidate regressors, each wrapped in a single-step Pipeline
n_estimators = 200
seed = 42

# (display name, pipeline step name, estimator)
candidates = [
    ('SVR', 'SVC', SVR()),
    ('KNN', 'KNN', KNeighborsRegressor()),
    ('RF',  'RF',  RandomForestRegressor(random_state=seed, n_estimators=n_estimators)),
    ('Ada', 'Ada', AdaBoostRegressor(random_state=seed, n_estimators=n_estimators)),
    ('ET',  'ET',  ExtraTreesRegressor(random_state=seed, n_estimators=n_estimators)),
    ('GB',  'GB',  GradientBoostingRegressor(random_state=seed)),
    ('XGB', 'XGB', XGBRegressor(random_state=seed)),
]

pipelines = [(label, Pipeline([(step, estimator)]))
             for label, step, estimator in candidates]


In [None]:
# 5-fold cross-validation score of every candidate pipeline
results, names  = [], []
for name, model in pipelines:
    cv_results = cross_val_score(model, x_train, y_train, cv = 5 )
    names.append(name)
    results.append(cv_results)

# Box plot of the fold scores, one box per algorithm
fig, ax = plt.subplots(figsize=(12,8))
fig.suptitle("Algorithms comparison")
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Fold scores as a table: one row per algorithm, one column per fold
fold_labels = ['CV%d' % fold for fold in range(1, 6)]
score = pd.DataFrame(results, index=names, columns=fold_labels)
score

In [None]:
# Gradient boosting with default settings, scored on the hold-out split
gbm = GradientBoostingRegressor().fit(x_train, y_train)
gbm.score(x_test, y_test)

In [None]:
# XGBoost with default settings, scored on the hold-out split
xgbm = XGBRegressor().fit(x_train, y_train)
xgbm.score(x_test, y_test)

In [None]:
# Show the hyper-parameters of the fitted model (the method must be called;
# without parentheses only the bound method object is displayed)
xgbm.get_params()

In [None]:
# 3-fold cross-validation on the training split.
# - shuffle=True is required whenever random_state is given to KFold.
# - 'r2' replaces 'accuracy', which is a classification metric.
# NOTE(review): the original call used the undefined names lr, X and y; the
# xgbm estimator and the training split are substituted here -- confirm intent.
kfold = KFold(n_splits=3, shuffle=True, random_state=7)
result = cross_val_score(xgbm, x_train, y_train, cv=kfold, scoring='r2')
print(result.mean())

In [None]:
# Estimator and candidate values for the randomized hyper-parameter search
xgb_clf = XGBRegressor(objective='reg:squarederror')

params = dict(
    learning_rate=[0.1,0.08,0.05,0.01,0.001],
    gamma=[0.01,0.1,0.3,0.5,1,1.5,2],
    max_depth=[2,4,7,10],
    colsample_bytree=[0.3,0.6,0.8,1],
    subsample=[0.2, 0.4, 0.5, 0.6, 0.7],
    reg_alpha=[0, 0.5, 1],
    reg_lambda=[1, 1.5, 2, 3, 4.5],
    min_child_weight=[1, 3, 5, 7],
    n_estimators=[100, 250, 500, 1000],
)

# 7-fold randomized search using all available cores
xgb_rscv = RandomizedSearchCV(
    xgb_clf,
    param_distributions=params,
    cv=7,
    verbose=3,
    random_state=40,
    n_jobs=-1,
)

In [None]:
# Run the randomized search and report the best configuration and the run time
start_time = time.time()

model_xgb = xgb_rscv.fit(x_train,y_train)

print("Best: %f using %s" % (model_xgb.best_score_, model_xgb.best_params_))
# time.time() differences are in seconds, so the unit label is 's' (was 'ms')
print("Execution time: " + str((time.time() - start_time)) + ' s')

# Score of the refitted best estimator on the hold-out split
print(model_xgb.score(x_test,y_test))

In [16]:
# Hyper-parameters found by the randomized search
best_params = dict(
    colsample_bytree=0.3,
    gamma=1.5,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=1,
    n_estimators=1000,
    reg_alpha=0,
    reg_lambda=1.5,
    subsample=0.4,
)

# Refit with these settings and show the score on the hold-out split
model_xgb = XGBRegressor(tree_method='gpu_hist',
                         objective='reg:squarederror',
                         **best_params).fit(x_train, y_train)
model_xgb.score(x_test,y_test)

0.8906130403751986

In [17]:
# Root mean squared logarithmic error on the hold-out split
y_pred = model_xgb.predict(x_test)
msle = mean_squared_log_error(y_test, y_pred)
np.sqrt(msle)

0.12653935868181856