In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf
cf.go_offline()
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")


In [3]:

# Read train.csv from the house-prices input directory into a DataFrame.
train_data=pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

In [64]:
# NOTE(review): this cell's execution count (64) is out of sequence with its
# neighbours (3 and 4), i.e. it was executed later than its position suggests.
# Confirm the notebook still works with "Restart & Run All".
test_data=pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [4]:
# Preview the first rows of the training frame.
train_data.head()

## Data Preprocessing and EDA

In [5]:
def columns_with_null_values(df):
    """Return the names of the columns of ``df`` that contain at least one null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Column labels, in ``df``'s column order, that hold one or more nulls.
        An empty frame yields an empty list.
    """
    # Inspect the frame that was passed in. The previous version read the
    # notebook-global ``train_data`` and silently ignored ``df``.
    has_null = df.isnull().any()
    return list(has_null[has_null].index)

In [6]:
# Column names returned by columns_with_null_values for the training frame.
columns=columns_with_null_values(train_data)

In [7]:
# Percentage of null values in each of those columns.
print((train_data[columns].isnull().sum()/len(train_data))*100)

In [8]:
def column_dropper(df, threshold=10):
    """Drop, in place, every column whose share of nulls exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is modified in place (as before) and also returned.
    threshold : float, optional
        Maximum tolerated percentage of null values per column. Columns with a
        strictly higher percentage are removed. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, minus the dropped columns.
    """
    # Percentage of nulls per column, computed from ``df`` itself so the
    # function no longer depends on any notebook-level frame.
    null_pct = df.isnull().sum() / len(df) * 100
    sparse_columns = list(null_pct[null_pct > threshold].index)
    # ``columns=`` replaces the positional ``axis`` argument (``drop(i, 1)``),
    # which pandas 2.0 no longer accepts.
    df.drop(columns=sparse_columns, inplace=True)
    return df
    

In [9]:
# Remove the columns that column_dropper considers too sparse.
train_data=column_dropper(train_data)

In [10]:
# Frame dimensions after the column drop.
train_data.shape

In [11]:
# Remove, in place, every row that still contains a null value.
train_data.dropna(inplace=True)

In [12]:
# Frame dimensions after the row drop.
train_data.shape

In [13]:
fig,axes=plt.subplots(figsize=(30,30))
# Correlate numeric columns only: encode() runs in a later cell, so the frame
# may still hold non-numeric columns here, and DataFrame.corr() raises on those
# in pandas >= 2.0. Drawing on ``axes`` ties the heatmap to the figure created
# above instead of relying on the implicit current axes.
sns.heatmap(train_data.select_dtypes(include="number").corr(),linewidths=2,linecolor="black",annot=True,ax=axes)

In [14]:
# Histogram of the SalePrice column.
# NOTE(review): .iplot is presumably provided by the cufflinks import at the top — confirm.
train_data["SalePrice"].iplot(kind="hist",xTitle="Price",yTitle="Frequency")

In [15]:
# One histogram per column of the frame, drawn on a 30x30 figure.
train_data.hist(figsize=(30,30))

## Selecting the best features for the target variable (SalePrice) using mutual information

In [16]:
from sklearn.preprocessing import StandardScaler, LabelEncoder 
# NOTE(review): `std` is not referenced anywhere else in the visible notebook.
std=StandardScaler()
# Shared LabelEncoder instance; encode() refers to it by this global name.
le=LabelEncoder()

In [17]:
def encode(df):
    """Label-encode every ``category``/``object`` column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose categorical columns are replaced by the integer codes
        produced by the notebook-level ``LabelEncoder`` instance ``le``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    ``le`` is re-fitted for every column. A column that cannot be encoded is
    left untouched and reported on stdout; the remaining columns are still
    processed (best-effort behaviour).
    """
    categorical = list(df.select_dtypes(include=['category', 'object']).columns)
    for feature in categorical:
        try:
            df[feature] = le.fit_transform(df[feature])
        except Exception as exc:
            # Catch Exception rather than using a bare ``except`` so that
            # kernel interrupts still propagate, and report why the column
            # failed instead of hiding the cause.
            print('Error encoding {}: {!r}'.format(feature, exc))
    return df

In [18]:
# Encode the categorical columns, then separate predictors from the target.
train_data=encode(train_data)
# ``columns=`` replaces the positional axis argument (``drop(labels, 1)``),
# which pandas 2.0 no longer accepts.
X=train_data.drop(columns=["SalePrice"])
y=train_data["SalePrice"]

In [19]:
from sklearn.feature_selection import mutual_info_classif

In [20]:
def Feature_selection(indep_var,depend_var,percentile=5,random_state=None):
    """Rank features by mutual information with the target and keep the informative ones.

    Parameters
    ----------
    indep_var : pandas.DataFrame
        Independent variables (one column per feature).
    depend_var : pandas.Series or array-like
        Dependent variable. It is treated as a continuous target.
    percentile : float, optional
        Features whose score is not strictly above this percentile of all
        scores are discarded. Defaults to 5, i.e. roughly the weakest 5 % of
        features are removed.
    random_state : int or None, optional
        Seed forwarded to the mutual-information estimator. The estimator is
        randomised, so pass a value for reproducible selections.

    Returns
    -------
    list
        Names of the retained columns, in ``indep_var``'s column order.

    Side effects
    ------------
    Prints the per-feature scores and shows a horizontal bar chart of them.
    """
    # The notebook predicts a continuous price with regressors, so use the
    # regression estimator. ``mutual_info_classif`` would treat every distinct
    # target value as its own class label.
    from sklearn.feature_selection import mutual_info_regression

    importances=mutual_info_regression(indep_var,depend_var,random_state=random_state)
    feat_imp=pd.Series(importances,index=indep_var.columns)
    print(feat_imp)
    feat_imp.plot(kind='barh',figsize=(30,30))
    plt.show()
    # Compute the cut-off once; it does not change between features.
    cutoff=np.percentile(importances,percentile)
    return list(feat_imp[feat_imp>cutoff].index)

In [21]:
# Names of the features that Feature_selection keeps for X and y.
imp_feat=Feature_selection(X,y)

In [22]:
# Restrict the feature matrix to the selected columns.
X=X[imp_feat]

In [23]:
#Fitting several vanilla models.
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [24]:
Grad_B=GradientBoostingRegressor()
XGB=XGBRegressor()
Rnd_Reg=RandomForestRegressor()
Lin_Reg=LinearRegression()
Lasso=Lasso()
Ridge=Ridge()

In [25]:
Algo_list=[Grad_B,XGB,Rnd_Reg,Lin_Reg,Lasso,Ridge]

In [26]:
from sklearn.metrics import mean_squared_error,r2_score

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
def Value_prediction(independent_data,dependent_data,list_of_algorithms):
    """Fit each estimator on a train split and score it on the held-out split.

    Parameters
    ----------
    independent_data : pandas.DataFrame
        Independent variables.
    dependent_data : pandas.Series
        Dependent variable.
    list_of_algorithms : list
        Pre-built estimator instances exposing ``fit`` and ``predict``. Each
        one is fitted in place.

    Returns
    -------
    pandas.DataFrame
        One row per estimator, indexed by its class name ("Classifiers"), with
        the columns "RMS" (root mean squared error), "R^2" and "Adj_R^2", all
        measured on the 25 % held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        independent_data, dependent_data, test_size=0.25, random_state=33)
    # Adjusted R^2 must use the sample that R^2 was computed on and the
    # predictors passed to this function. The previous version read the
    # notebook globals ``X`` and ``y`` instead.
    n_obs=len(y_test)
    n_feat=independent_data.shape[1]
    dof=n_obs-n_feat-1
    records=[]
    for cls in list_of_algorithms:
        cls.fit(X_train,y_train)
        y_pred=cls.predict(X_test)
        # Take the square root explicitly: the ``squared=False`` shortcut has
        # been removed from recent scikit-learn releases.
        rms=np.sqrt(mean_squared_error(y_test,y_pred))
        r2=r2_score(y_test,y_pred)
        # Undefined when there are not more observations than predictors.
        Adj_R2=1 - (1-r2)*(n_obs-1)/dof if dof>0 else float("nan")
        records.append({"Classifiers":cls.__class__.__name__,
                        "RMS":rms,
                        "R^2":r2,
                        "Adj_R^2":Adj_R2})
    # Build the table once from the collected rows; DataFrame.append was
    # removed in pandas 2.0.
    result_table=pd.DataFrame(records,columns=["Classifiers","RMS","R^2","Adj_R^2"])
    result_table.set_index("Classifiers",inplace=True)
    return result_table

In [29]:
# Score every estimator in Algo_list on the selected features.
table=Value_prediction(X,y,Algo_list)

In [30]:
table

## Reducing multicollinearity using the variance inflation factor (VIF)

In [31]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [32]:

from statsmodels.tools.tools import add_constant

In [33]:
def multi_collinear_features(df, threshold=10):
    """Return the columns of ``df`` whose variance inflation factor exceeds ``threshold``.

    ``variance_inflation_factor`` expects a constant in the matrix of
    explanatory variables, so ``add_constant`` from statsmodels is applied
    before the factors are computed. The helper column ``"const"`` is never
    reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Independent variables.
    threshold : float, optional
        A column is reported when its VIF is strictly greater than this value.
        Defaults to 10.

    Returns
    -------
    list
        Names of the offending columns, in column order.
    """
    df_VIF=add_constant(df)
    design=df_VIF.values
    VIF=pd.Series(
        [variance_inflation_factor(design,i) for i in range(design.shape[1])],
        index=df_VIF.columns)
    # Filter the intercept out instead of calling list.remove("const"), which
    # raised ValueError whenever the constant's own VIF was not above the
    # threshold and therefore was never in the list.
    return [name for name,vif in VIF.items() if vif>threshold and name!="const"]

In [34]:
list_of_multi_collinear_features=multi_collinear_features(X)

In [35]:
list_of_multi_collinear_features

In [36]:
X.drop(list_of_multi_collinear_features,1,inplace=True)

In [37]:
# Let's check the performance of the ML models again on the reduced feature set.
table_new=Value_prediction(X,y,Algo_list)

In [38]:
table_new

## The XGBoost hyperparameter tuning code below is adapted from [this article](https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f)

In [39]:
#Using XGBRegressor since it has the highest Adjusted R^2 score
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    # 'reg:squarederror' is the current name of the squared-error objective;
    # the alias 'reg:linear' used before is deprecated in XGBoost.
    'objective':'reg:squarederror',
    'eval_metric':'mae'
}
# Upper bound on boosting rounds; the training and CV calls in the following
# cells also pass early_stopping_rounds.
num_boost_round=999


In [40]:
# Same split settings (test_size=0.25, random_state=33) as Value_prediction uses.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

In [41]:
import xgboost

In [42]:
# Wrap both splits, with their labels, in DMatrix objects for the native xgboost API.
dtrain = xgboost.DMatrix(X_train, label=y_train)
dtest = xgboost.DMatrix(X_test, label=y_test)

In [43]:
# Train with the baseline params, evaluating on dtest under the name "Test".
# NOTE(review): early stopping monitors dtest, the same data the "Best MAE"
# below is reported on, so that score is likely optimistic — consider a
# separate validation split.
model = xgboost.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
# best_iteration is incremented by one here to print a round count.
print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

## Cross Validation with XGBoost

In [44]:
# 5-fold cross-validation on dtrain with the current params and a fixed seed (42),
# tracking MAE with early stopping.
cv_results = xgboost.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

In [45]:
# Smallest value of the 'test-mae-mean' column in the CV results.
cv_results['test-mae-mean'].min()

In [46]:
# Candidate (max_depth, min_child_weight) pairs: depths 9-11 crossed with weights 5-7.
gridsearch_params = [(depth, weight) for depth in range(9,12) for weight in range(5,8)]

In [47]:
# Grid search over (max_depth, min_child_weight): run 5-fold CV for every
# candidate pair and remember the pair with the lowest mean test MAE.
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgboost.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    # argmin() is a 0-based row index, so add 1 to report a round count,
    # consistent with the "best_iteration+1" printed after xgboost.train.
    boost_rounds = cv_results['test-mae-mean'].argmin() + 1
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [48]:
#Updating the params.
# Use the winning (max_depth, min_child_weight) pair found by the grid search
# in the previous cell instead of hard-coded numbers, so the values stay in
# sync when the data or the selected features change.
params["max_depth"], params["min_child_weight"] = best_params

In [49]:
# Candidate (subsample, colsample_bytree) pairs: 0.7 to 1.0 in steps of 0.1 on both axes.
gridsearch_params = [(sub/10., col/10.) for sub in range(7,11) for col in range(7,11)]

In [50]:
# Grid search over (subsample, colsample_bytree): run 5-fold CV for every
# candidate pair and remember the pair with the lowest mean test MAE.
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgboost.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    # argmin() is a 0-based row index, so add 1 to report a round count,
    # consistent with the "best_iteration+1" printed after xgboost.train.
    boost_rounds = cv_results['test-mae-mean'].argmin() + 1
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [51]:
# Apply the winning (subsample, colsample_bytree) pair found by the grid search
# in the previous cell instead of hard-coded numbers, so the values stay in
# sync when the data or the selected features change.
params['subsample'], params['colsample_bytree'] = best_params

In [52]:
# Search over the learning rate: run 5-fold CV for every candidate eta and
# remember the one with the lowest mean test MAE.
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run CV (no timing is performed here)
    cv_results = xgboost.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10)
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    # argmin() is a 0-based row index, so add 1 to report a round count,
    # consistent with the "best_iteration+1" printed after xgboost.train.
    boost_rounds = cv_results['test-mae-mean'].argmin() + 1
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

In [53]:
# Apply the learning rate selected by the search in the previous cell (where
# best_params holds the winning eta) instead of a hard-coded number.
params['eta'] = best_params


In [54]:
#The list of params
params

In [55]:
# Retrain with the tuned params, evaluating on dtest under the name "Test".
# NOTE(review): early stopping again monitors dtest, the same data the reported
# MAE is measured on — the figure is likely optimistic.
model = xgboost.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
# best_iteration is incremented by one here to print a round count.
print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

In [67]:
# Show the test-set columns that match the final training features.
# drop(..., inplace=True) returns None, so the previous version printed "None"
# and discarded the result; ``columns=`` also replaces the positional axis
# argument that pandas 2.0 no longer accepts.
print(test_data[imp_feat].drop(columns=list_of_multi_collinear_features))