### Import libraries

In [None]:
import pandas as pd
import sklearn
import numpy as np
import random

import xgboost
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline

# Seed Python's built-in `random` module. This does not seed NumPy; the
# estimators in this notebook are given explicit `random_state` values instead.
random.seed(10)

### Read Data

In [None]:
# Load the pre-cleaned training and test tables (paths are relative).
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('training_cleaned.csv', 'test_cleaned.csv')
)

In [None]:
# Display the column labels of the test frame (notebook cell output).
data_test.columns

### Data Processing

##### Convert categorical data to dummies

In [None]:
# Partition the training columns by dtype: non-object columns vs. object
# (categorical) columns.
data_int = data_train.select_dtypes(exclude=[object]).columns.tolist()
data_cat = data_train.select_dtypes(include=[object]).columns.tolist()

# One-hot encode the categorical columns of both frames. The column list is
# derived from the training frame and applied to the test frame as well.
data_train, data_test = (
    pd.get_dummies(frame, columns=data_cat)
    for frame in (data_train, data_test)
)

##### Making columns same in all data sets

In [None]:
# Union of the column labels of both frames. The union is sorted so that the
# order in which missing columns are appended is reproducible: iterating a
# plain set of strings can yield a different order on every interpreter run
# (string hashing is randomized per process), which would change the column
# order of the frames and of everything derived from it.
com_cols = sorted(set(data_train.columns) | set(data_test.columns))

# Every label in the union exists in at least one frame, so a label missing
# from the training frame is necessarily present in the test frame (hence
# `elif`). Missing columns are filled with 0.
for col in com_cols:
    if col not in data_train.columns:
        data_train[col] = 0
    elif col not in data_test.columns:
        data_test[col] = 0

### Feature Engineering

##### Log Transformation

In [None]:
# Apply log(1 + x) to every non-object column recorded in `data_int`, in both
# frames. The target column 'SalePrice' is skipped here.
cols_to_log = [name for name in data_int if name != 'SalePrice']
for frame in (data_train, data_test):
    for name in cols_to_log:
        frame[name] = np.log1p(frame[name])

### Train, Test and Validation split

In [None]:
# Complete training set: every column except the target as features, and the
# target kept as a single-column DataFrame.
feature_cols = [name for name in data_train.columns if name != 'SalePrice']
x_train_complete = data_train[feature_cols]
y_train_complete = data_train[['SalePrice']]

# Positional hold-out split: the first 1201 rows form the partial training
# set, the remaining rows form the validation set.
x_train, x_val = x_train_complete.iloc[:1201], x_train_complete.iloc[1201:]
y_train, y_val = y_train_complete.iloc[:1201], y_train_complete.iloc[1201:]

In [None]:
import matplotlib.pyplot

In [None]:
# Histogram of the raw SalePrice values over the complete training set.
y_train_complete.plot.hist()

In [None]:
# Histogram of log1p(SalePrice), for comparison with the raw distribution.
np.log1p(y_train_complete).plot.hist()

### Feature Selection

In [None]:
# Recursive feature elimination with 5-fold cross-validation. Features are
# ranked by a random forest and subsets are scored by negative mean absolute
# error. One feature is dropped per step, so this cell takes a long time;
# n_jobs=-1 lets the cross-validation work run in parallel, matching the
# GridSearchCV calls used for the models in this notebook.
# NOTE(review): the selector is fitted on the raw SalePrice and on all
# training rows (including the rows later used for validation), while the
# models are trained on log1p(SalePrice) — confirm this is intended.
mod = RandomForestRegressor(random_state = 42)
rfecv = RFECV(estimator=mod, step=1, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
# Pass the target as a 1-D Series; a single-column DataFrame is a column
# vector and makes the forest emit a DataConversionWarning on every fit.
rfecv.fit(x_train_complete, y_train_complete['SalePrice'])
# Names of the columns kept by the selector.
features = list(x_train_complete.columns[rfecv.support_])

In [None]:
features

### Training data

##### XGBoost

In [None]:
# Wider grid (disabled): subsample [0.8, 1.0], colsample_bytree [0.5, 0.7],
# learning_rate [0.01, 0.05, 0.1], n_estimators [100, 200, 500]; the other
# keys are the same as below.

# Single-point grid actually used for XGBoost.
params_xgb = dict(
    min_child_weight=[1],
    gamma=[0.5],
    subsample=[0.8],
    colsample_bytree=[0.7],
    max_depth=[3],
    learning_rate=[0.01],
    n_estimators=[5000],
    random_state=[0],
)

# Partial data: fitted on the partial training rows and the selected features,
# with log1p(SalePrice) as the target.
model_xgb = xgboost.XGBRegressor()
xgb_model = GridSearchCV(
    estimator=model_xgb,
    param_grid=params_xgb,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=-1,
)
xgb_model.fit(x_train[features].values, np.log1p(y_train))

# Complete data: same setup, fitted on every training row.
model_xgb_complete = xgboost.XGBRegressor()
xgb_model_complete = GridSearchCV(
    estimator=model_xgb_complete,
    param_grid=params_xgb,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=-1,
)
xgb_model_complete.fit(x_train_complete[features].values, np.log1p(y_train_complete))

##### GradientBoost

In [None]:
# Smaller grid (disabled): subsample [0.5], max_depth [3],
# learning_rate [0.05], n_estimators [100], random_state [0].

# Single-point grid for GradientBoostingRegressor. The loss is left at the
# estimator default (least squares) rather than spelled as 'ls': that alias
# was deprecated in scikit-learn 1.0 and removed in 1.2, where passing it
# makes the fit fail. The default is least squares on old and new releases.
params_gb = { 'subsample': [0.7],
              'max_depth': [3],
              'learning_rate':[0.01],
              'n_estimators':[5000],
              'random_state':[0]}

# partial data: fitted on the partial training rows and the selected features,
# with log1p(SalePrice) as the target; scored by negative MAE over 5 folds and
# refitted on all rows passed to fit().
model_gb = GradientBoostingRegressor()
gb_model = GridSearchCV(model_gb, params_gb, n_jobs=-1, cv=5, refit=True, scoring = 'neg_mean_absolute_error')
gb_model.fit(x_train[features].values,np.log1p(y_train))


# complete data: same setup, fitted on every training row.
model_gb_complete = GradientBoostingRegressor()
gb_model_complete = GridSearchCV(model_gb_complete, params_gb, n_jobs=-1, cv=5, refit=True, scoring = 'neg_mean_absolute_error')
gb_model_complete.fit(x_train_complete[features].values,np.log1p(y_train_complete))

##### RandomForest

In [None]:
# Single-point grid for the random forest.
params_rf = dict(
    max_leaf_nodes=[5],
    max_depth=[3],
    n_estimators=[5000],
    random_state=[0],
)

# Partial data: fitted on the partial training rows and the selected features,
# with log1p(SalePrice) as the target.
model_rf = RandomForestRegressor()
rf_model = GridSearchCV(
    estimator=model_rf,
    param_grid=params_rf,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=-1,
)
rf_model.fit(x_train[features].values, np.log1p(y_train))

# Complete data: same setup, fitted on every training row.
model_rf_complete = RandomForestRegressor()
rf_model_complete = GridSearchCV(
    estimator=model_rf_complete,
    param_grid=params_rf,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=-1,
)
rf_model_complete.fit(x_train_complete[features].values, np.log1p(y_train_complete))

##### DecisionTree

In [None]:
# Single-point grid for the decision tree: only the random seed is fixed.
params_dt = dict(random_state=[0])

# Partial data: fitted on the partial training rows and the selected features,
# with log1p(SalePrice) as the target.
model_dt = DecisionTreeRegressor()
dt_model = GridSearchCV(
    estimator=model_dt,
    param_grid=params_dt,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=-1,
)
dt_model.fit(x_train[features].values, np.log1p(y_train))

# Complete data: same setup, fitted on every training row.
model_dt_complete = DecisionTreeRegressor()
dt_model_complete = GridSearchCV(
    estimator=model_dt_complete,
    param_grid=params_dt,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=-1,
)
dt_model_complete.fit(x_train_complete[features].values, np.log1p(y_train_complete))

##### LASSO

In [None]:
# Single-point grid for the Lasso regression.
params_ls = dict(
    alpha=[0.5],
    max_iter=[11500],
    random_state=[0],
)

# Partial data: fitted on the partial training rows and the selected features,
# with log1p(SalePrice) as the target.
model_ls = Lasso()
ls_model = GridSearchCV(
    estimator=model_ls,
    param_grid=params_ls,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=-1,
)
ls_model.fit(x_train[features].values, np.log1p(y_train))

# Complete data: same setup, fitted on every training row.
model_ls_complete = Lasso()
ls_model_complete = GridSearchCV(
    estimator=model_ls_complete,
    param_grid=params_ls,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=-1,
)
ls_model_complete.fit(x_train_complete[features].values, np.log1p(y_train_complete))

### Prediction on validation

In [None]:
# Score the validation rows with each partial-data model. All models read the
# same selected-feature matrix, so it is extracted once.
val_matrix = x_val[features].values
pred_xgb = xgb_model.predict(val_matrix)
pred_gb = gb_model.predict(val_matrix)
pred_rf = rf_model.predict(val_matrix)
pred_dt = dt_model.predict(val_matrix)
pred_ls = ls_model.predict(val_matrix)

# Gather the actual price and every model's prediction next to the validation
# features. The models were trained on log1p(SalePrice), so predictions are
# mapped back to the price scale with expm1.
x_val_p = x_val.copy()
x_val_p['act'] = y_val
x_val_p['pred_xgb'] = np.expm1(pred_xgb)
x_val_p['pred_gb'] = np.expm1(pred_gb)
x_val_p['pred_rf'] = np.expm1(pred_rf)
x_val_p['pred_dt'] = np.expm1(pred_dt)
x_val_p['pred_ls'] = np.expm1(pred_ls)

#### MAPE is reported as a percentage. The lowest MAPE was from the Gradient Boost model.
-- MAPE on Validation Data – 8.27%  

-- MAPE on Test Data – 8.33%

##### XGBoost MAPE

In [None]:
# Validation MAPE (in percent, 2 decimals) of the XGBoost model.
mape_xgb_p = round(
    x_val_p['pred_xgb'].sub(x_val_p['act']).abs().div(x_val_p['act']).mean() * 100,
    2,
)
mape_xgb_p

##### Gradient Boost MAPE

In [None]:
# Validation MAPE (in percent, 2 decimals) of the gradient boosting model.
mape_gb_p = round(
    x_val_p['pred_gb'].sub(x_val_p['act']).abs().div(x_val_p['act']).mean() * 100,
    2,
)
mape_gb_p

##### Random Forest MAPE

In [None]:
# Validation MAPE (in percent, 2 decimals) of the random forest model.
mape_rf_p = round(
    x_val_p['pred_rf'].sub(x_val_p['act']).abs().div(x_val_p['act']).mean() * 100,
    2,
)
mape_rf_p

##### DecisionTree MAPE

In [None]:
# Validation MAPE (in percent, 2 decimals) of the decision tree model.
mape_dt_p = round(
    x_val_p['pred_dt'].sub(x_val_p['act']).abs().div(x_val_p['act']).mean() * 100,
    2,
)
mape_dt_p

##### LASSO MAPE

In [None]:
# Validation MAPE (in percent, 2 decimals) of the Lasso model.
mape_ls_p = round(
    x_val_p['pred_ls'].sub(x_val_p['act']).abs().div(x_val_p['act']).mean() * 100,
    2,
)
mape_ls_p

### Prediction on test data

In [None]:
# Load the actual test prices and order the rows by 'Id'.
sample = pd.read_csv('test_actual_price.csv').sort_values(by='Id')

In [None]:
# Score the test rows with each complete-data model. All models read the same
# selected-feature matrix, so it is extracted once.
test_matrix = data_test[features].values
pred_xgb_c = xgb_model_complete.predict(test_matrix)
pred_gb_c = gb_model_complete.predict(test_matrix)
pred_rf_c = rf_model_complete.predict(test_matrix)
pred_ls_c = ls_model_complete.predict(test_matrix)
pred_dt_c = dt_model_complete.predict(test_matrix)

# Map predictions back from the log1p scale and attach them to `sample`.
# NOTE(review): the arrays are assigned by position, so this assumes the rows
# of `data_test` are in the same order as `sample` (sorted by 'Id') — confirm.
sample['pred_xgb'] = np.expm1(pred_xgb_c)
sample['pred_gb'] = np.expm1(pred_gb_c)
sample['pred_rf'] = np.expm1(pred_rf_c)
sample['pred_ls'] = np.expm1(pred_ls_c)
sample['pred_dt'] = np.expm1(pred_dt_c)

##### XGBoost MAPE

In [None]:
# Test MAPE (in percent, 2 decimals) of the XGBoost model.
mape_xgb_c = round(
    sample['SalePrice'].sub(sample['pred_xgb']).abs().div(sample['SalePrice']).mean() * 100,
    2,
)
mape_xgb_c

##### Gradient Boost MAPE

In [None]:
# Test MAPE (in percent, 2 decimals) of the gradient boosting model.
mape_gb_c = round(
    sample['SalePrice'].sub(sample['pred_gb']).abs().div(sample['SalePrice']).mean() * 100,
    2,
)
mape_gb_c

##### Random Forest MAPE

In [None]:
# Test MAPE (in percent, 2 decimals) of the random forest model.
mape_rf_c = round(
    sample['SalePrice'].sub(sample['pred_rf']).abs().div(sample['SalePrice']).mean() * 100,
    2,
)
mape_rf_c

##### DecisionTree MAPE

In [None]:
# Test MAPE (in percent, 2 decimals) of the decision tree model.
mape_dt_c = round(
    sample['SalePrice'].sub(sample['pred_dt']).abs().div(sample['SalePrice']).mean() * 100,
    2,
)
mape_dt_c

##### LASSO MAPE

In [None]:
# Test MAPE (in percent, 2 decimals) of the Lasso model.
mape_ls_c = round(
    sample['SalePrice'].sub(sample['pred_ls']).abs().div(sample['SalePrice']).mean() * 100,
    2,
)
mape_ls_c

##### Ensemble MAPE

In [None]:
# Weighted average of the XGBoost, gradient boosting and random forest
# predictions. Each model is weighted by the INVERSE of its MAPE so that the
# more accurate models contribute more; weighting by the MAPE itself would
# give the largest say to the least accurate model.
# NOTE(review): the weights are derived from test-set errors, so the ensemble
# MAPE below is optimistic — consider using the validation MAPEs instead.
w_xgb, w_gb, w_rf = 1 / mape_xgb_c, 1 / mape_gb_c, 1 / mape_rf_c
sample['ense'] = (
    (w_xgb * sample['pred_xgb']) + (w_gb * sample['pred_gb']) + (w_rf * sample['pred_rf'])
) / (w_xgb + w_gb + w_rf)

# Test MAPE (in percent, 2 decimals) of the ensemble.
mape_en_c = round((np.mean(np.abs(sample['SalePrice'] - sample['ense']) / sample['SalePrice']))*100,2)
mape_en_c

In [None]:
# Build the submission from the gradient boosting predictions: keep 'Id',
# expose the prediction column as 'prediction', and write it without the index.
final = sample[['Id', 'pred_gb']].rename(columns={'pred_gb': 'prediction'})
final.to_csv('final_submission.csv', index=False)

### Important Notes

1. Some columns have missing values of more than 15%. Imputing such columns will lead to false predictions.
   Hence such columns are removed.
   
2. Columns which are integer/float with missing values of less than 15% are imputed with MissForest imputation algorithm.
   It is a Random Forest based imputation algorithm.
   
3. Columns which are categorical with missing values of less than 15% are imputed with Mode.

4. Then the categorical columns are converted to dummies for modelling purposes.

5. The train data is split into two parts actual train and validation.
   The validation dataset is generated to get a sense of the test data.

6. Without feature engineering or feature selection, the bagging and boosting models were built to get a sense of the MAPE values.
   The MAPE values for XGBoost, GradientBoost and RandomForest without hyperparameter tuning were approximately 18%, 15% and 20% on validation.
   
7. The continuous features and the target feature were not normally distributed. To make the data closer to normally distributed for
   better predictions, a log transformation was applied to the continuous features and the target feature. This decreased the MAPE by 3%.
   
8. Since all the features were used for prediction, the model might overfit and the predictions might not be reliable. Hence we
   used the RFECV (Recursive Feature Elimination with Cross-Validation) algorithm to eliminate the least important features and avoid
   overfitting issues. The features were reduced from 269 to 149 and the MAPE value improved significantly from around 13% to 10.5%.

9. After feature engineering and feature selection, we did hyperparameter tuning, and the GradientBoost model gave the lowest
   MAPE of 8.3%.
   
10. Though the Random Forest model outperformed the Gradient Boost model on a few occasions on the test set, this is not consistent with
    its performance on the validation set, where the GradientBoost model was consistent and trustworthy in its predictions.