# 10. Random Forest Regression

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
%matplotlib inline
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import plot_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor

## Process Normalized Data Without PCA

### Load train and test data

In [5]:
# Paths of the normalized train/test CSVs.
train_file = 'house-prices-advanced-regression-techniques/train_normalized.csv'
test_file = 'house-prices-advanced-regression-techniques/test_normalized.csv'

# Read both files (train first, then test) into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

### Train model on the train data and predict the test data

In [9]:
# Split the training frame into the feature matrix and the target column.
X = train_df.drop(['SalePrice'], axis=1)
Y = train_df['SalePrice']

# Random forest with 10,000 trees; the fixed seed keeps the fit reproducible.
# n_jobs=-1 grows the trees on all available cores. The trees themselves are
# determined by random_state, so parallelism only changes wall-clock time.
rf = RandomForestRegressor(n_estimators=10000, random_state=50, n_jobs=-1)
rf.fit(X, Y)

# Predict sale prices for the test set.
# assumes test_df carries the same feature columns as X -- TODO confirm
Y_pred = rf.predict(test_df)
print(Y_pred)
print(Y_pred.shape)

[123386.3346 156980.3259 187487.1025 ... 154023.0264 129363.2621
 236830.8291]
(1459,)


### Save results

In [10]:
# The Id column is taken from the original test CSV.
original_test_file = 'house-prices-advanced-regression-techniques/test.csv'
original_test_df = pd.read_csv(original_test_file)
id_col = original_test_df['Id']

# Place the predictions next to the Ids and write the two-column result file.
result_df = pd.concat(
    [id_col, pd.DataFrame({'SalePrice': Y_pred})],
    axis='columns',
)
result_df.to_csv('RF_reg_WO_PCA.csv', index=False)

## Improving the results by using XGBoost with Randomized Search

### Hyper-parameters and their candidate values, used as input for the randomized search

In [None]:
# A parameter grid for XGBoost
params = {'min_child_weight':[4,5],
          'gamma':[i/10.0 for i in range(3,6)],
          'subsample':[i/10.0 for i in range(6,11)],
          'colsample_bytree':[i/10.0 for i in range(6,11)],
          'max_depth': [2,3,4,5,6],
         'learning_rate': [0.01, 0.05, 0.1, 0.15, 0,2],
          'n_estimators' :[1, 10, 50, 100],
         }

### Initialize the XGBoost regressor and run the randomized search

In [None]:
# Split the training frame into the feature matrix and the target column.
X = train_df.drop(['SalePrice'], axis=1)
Y = train_df['SalePrice']

# Use a dedicated name for the estimator: binding it to `xgb` would overwrite
# the `xgboost` module alias imported at the top of the notebook.
xgb_estimator = XGBRegressor(objective='reg:squarederror')

# RandomizedSearchCV samples a subset of the combinations in `params`;
# seeding it makes the sampled candidates (and so best_params_) repeatable.
grid = RandomizedSearchCV(xgb_estimator, params, random_state=50)
grid.fit(X, Y)
print(grid.best_params_)

### Train the XGBoost again, this time with the best hyper-parameters

In [None]:
# Refit on the full training set with the hyper-parameters chosen by the search.
# A dedicated name avoids overwriting the `xgboost` module alias `xgb`.
xgb_best = XGBRegressor(objective='reg:squarederror', **grid.best_params_)
xgb_best.fit(X, Y)

# Predict with the model fitted in this cell; `xg_reg` is only created later,
# in the PCA section, so it must not be used here.
Y_pred = xgb_best.predict(test_df)

### Save results

In [None]:
# The Id column is taken from the original test CSV.
original_test_file = 'house-prices-advanced-regression-techniques/test.csv'
original_test_df = pd.read_csv(original_test_file)
id_col = original_test_df['Id']

# Place the predictions next to the Ids and write the two-column result file.
result_df = pd.concat(
    [id_col, pd.DataFrame({'SalePrice': Y_pred})],
    axis='columns',
)
result_df.to_csv('XGBoost_rand_search_WO_PCA.csv', index=False)

## Process Normalized Data With PCA

### Load train and test data

In [None]:
# Paths of the dimension-reduced train/test CSVs.
train_file = 'house-prices-advanced-regression-techniques/train_dim_reduced.csv'
test_file = 'house-prices-advanced-regression-techniques/test_dim_reduced.csv'

# Read both files (train first, then test) into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

### Train model on the train data and predict the test data

In [None]:
# Separate the target column from the feature columns.
Y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice'])

# XGBoost regressor with fixed, hand-set hyper-parameters.
xg_reg = XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)
xg_reg.fit(X, Y)

# Predict the test set, then show the predictions followed by their shape.
Y_pred = xg_reg.predict(test_df)
print(Y_pred, Y_pred.shape, sep='\n')

### Save results

In [None]:
# The Id column is taken from the original test CSV.
original_test_file = 'house-prices-advanced-regression-techniques/test.csv'
original_test_df = pd.read_csv(original_test_file)
id_col = original_test_df['Id']

# Place the predictions next to the Ids and write the two-column result file.
result_df = pd.concat(
    [id_col, pd.DataFrame({'SalePrice': Y_pred})],
    axis='columns',
)
result_df.to_csv('XGBoost_reg_W_PCA.csv', index=False)