In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [10]:
df_train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
print(df_test.shape)
print(df_train.shape)

(1459, 80)
(1460, 81)


In [11]:
df = pd.concat([df_train, df_test])
df.shape

(2919, 81)

## Dealing with null Values

In [12]:
df.drop("Id", axis = 1, inplace = True)

In [13]:
null_count = df.isna().sum().to_dict()

for i in list(null_count):
     if null_count[i] <= 500:
        null_count.pop(i)

# dropping columns that have more than 500 null values

df.drop(null_count.keys(), axis = 1, inplace = True)
print(f"Dropped Columns : {null_count.keys()}")

Dropped Columns : dict_keys(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'SalePrice'])


In [14]:
g = df.columns.to_series().groupby(df.dtypes).groups
print(g)

{int64: ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'], float64: ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea'], object: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'Garag

In [15]:
# filling null object value by mode

object_feature = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
for feature in object_feature:
    df[feature].fillna(df[feature].mode(), inplace = True)

# filling null numerical values using mean
num_feature = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold','LotFrontage', 'MasVnrArea', 'GarageYrBlt']
for feature in num_feature:
    df[feature].fillna(df[feature].mean(), inplace = True)

## Converting Categorical feature into dummy variable

In [16]:
# convert category feature into dummy
df = pd.get_dummies(df, columns = object_feature, drop_first=True)
df.shape

(2919, 232)

### Scaling

In [17]:
from sklearn.preprocessing import StandardScaler

df = StandardScaler().fit_transform(df)


In [18]:
# splitting 
X_train = df[:1460]
y_train = df_train["SalePrice"]
X_test = df[1460:]

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(1460, 232)
(1460,)
(1459, 232)


## Model

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

randomforest = RandomForestRegressor(random_state = 42)
print(cross_val_score(randomforest, X_train, y_train, cv=3))

randomforest.fit(X_train, y_train)
y_predict = randomforest.predict(X_test)

[0.8849198  0.85019645 0.82748002]


In [28]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()

print(cross_val_score(gbr, X_train, y_train, cv=3))

[0.91620014 0.86916572 0.87888725]


### Hypertuning parameters

In [32]:
from sklearn.model_selection import GridSearchCV
parameters = {'learning_rate': [0.04,0.08,0.1,0,2],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [1000, 1500, 2000],
                  'max_depth'    : [4,6,8,10]
                 }
grid_GBR = GridSearchCV(estimator=gbr, param_grid = parameters, cv = 2, n_jobs=-1, verbose = 10)
grid_GBR.fit(X_train, y_train)

Fitting 2 folds for each of 240 candidates, totalling 480 fits


  8.75827306e-001  8.79369420e-001  8.61945393e-001  8.52558116e-001
  8.76444543e-001  8.75051577e-001  8.71348188e-001  8.53787921e-001
  8.51717840e-001  8.75756045e-001  8.61357687e-001  8.48704946e-001
  8.53603321e-001  8.66646444e-001  8.63766769e-001  8.60521022e-001
  8.54858011e-001  8.75015809e-001  8.66569609e-001  8.50508093e-001
  8.35083226e-001  8.67262745e-001  8.63168227e-001  8.51789382e-001
  8.37187278e-001  8.72380236e-001  8.50387664e-001  8.49571461e-001
  8.46339687e-001  8.62675892e-001  8.59438716e-001  8.51475851e-001
  8.28021257e-001  8.65862608e-001  8.71982121e-001  8.52598725e-001
  8.27034951e-001  8.64855743e-001  8.58331327e-001  8.57523737e-001
  8.22862793e-001  8.57255989e-001  8.59175632e-001  8.40681645e-001
  8.77119273e-001  8.69177351e-001  8.50118439e-001  8.50855707e-001
  8.63456323e-001  8.77248435e-001  8.66204326e-001  8.22198526e-001
  8.64254555e-001  8.71742850e-001  8.68356145e-001  8.31783470e-001
  8.52879811e-001  8.64562740e-001

GridSearchCV(cv=2, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.04, 0.08, 0.1, 0, 2],
                         'max_depth': [4, 6, 8, 10],
                         'n_estimators': [1000, 1500, 2000],
                         'subsample': [0.9, 0.5, 0.2, 0.1]},
             verbose=10)

In [34]:
grid_GBR.best_params_

{'learning_rate': 0.04, 'max_depth': 4, 'n_estimators': 1500, 'subsample': 0.5}

In [36]:
gbr = GradientBoostingRegressor(learning_rate = 0.04, max_depth = 4, n_estimators = 1500, subsample = 0.5)

print(cross_val_score(gbr, X_train, y_train, cv=3))

gbr.fit(X_train, y_train)
predict = gbr.predict(X_test)

[0.92583657 0.86581437 0.88677972]


In [23]:
########### MODEL 2 

from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator = randomforest, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=10, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3,
                   estimator=RandomForestRegressor(max_depth=30,
                                                   max_features='sqrt',
                                                   min_samples_split=5,
                                                   n_estimators=600),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1

In [24]:
rf_random.best_score_

0.8640274603786936

In [25]:
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [38]:
randomforest = RandomForestRegressor(n_estimators =  1600,
 min_samples_split = 5,
 min_samples_leaf = 1,
 max_features = 'sqrt',
 max_depth = 70,
 bootstrap = False)
print(cross_val_score(randomforest, X_train, y_train, cv=3))

[0.89524707 0.84586671 0.84884174]


In [39]:
randomforest.fit(X_train, y_train)
y_predict = randomforest.predict(X_test)

In [42]:
randomforest.feature_importances_

array([5.86614833e-03, 1.40964178e-02, 2.48301682e-02, 8.69678501e-02,
       4.32889268e-03, 4.07660957e-02, 2.12171696e-02, 1.89384813e-02,
       3.23892535e-02, 1.13296674e-03, 7.79940041e-03, 5.37464594e-02,
       4.91156974e-02, 2.87479184e-02, 5.18846354e-04, 7.24531673e-02,
       3.36345924e-03, 8.88809512e-04, 2.72003752e-02, 5.43039667e-03,
       6.24067020e-03, 1.08422247e-03, 2.47907004e-02, 1.90542696e-02,
       2.88997620e-02, 6.10372902e-02, 4.73029968e-02, 8.14758052e-03,
       1.44059089e-02, 1.23132679e-03, 2.86873434e-04, 1.94875882e-03,
       1.92268087e-03, 2.51039901e-04, 3.65196410e-03, 2.16334675e-03,
       3.98977748e-04, 6.45694330e-05, 2.68084308e-03, 2.43089874e-03,
       7.77850205e-05, 7.34989540e-04, 3.90794299e-04, 2.01119617e-03,
       7.35658414e-04, 5.45534458e-04, 1.41561130e-03, 4.74869296e-06,
       6.73632318e-04, 1.64277190e-04, 2.32963172e-05, 8.93535554e-04,
       7.91604021e-04, 3.12592276e-04, 2.51415782e-06, 1.17725090e-04,
      

## SUBMIT

In [37]:
# submit = pd.DataFrame({'Id':df_test.Id, 'SalePrice':predict})
# submit.to_csv("GRBTuned.csv", index = False)