## Setting up the Notebook

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from src.train import train_and_get_tree_model, train_and_get_forest, train_and_get_gbr, grid_search

Using TensorFlow backend.


In [2]:
%matplotlib inline

## Load Data

In [3]:
def load_data(path="./data/preprocessed_data_task3.csv", target="pred",
              test_size=0.2, random_state=2021):
    """Read the preprocessed CSV and return a reproducible train/test split.

    Every column except ``target`` is used as a feature. The target is
    removed *by name* rather than by position, so the label can never end up
    inside the feature matrix when the CSV column order changes.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    target : str
        Name of the label column (default ``"pred"``).
    test_size : float
        Fraction of rows held out for testing, forwarded to
        ``train_test_split``.
    random_state : int
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_train, y_train, x_test, y_test)``. Note that this order differs
        from the ``(X_train, X_test, y_train, y_test)`` order that
        ``train_test_split`` itself returns.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the CSV.
    """
    train_data = pd.read_csv(path)

    if target not in train_data.columns:
        raise KeyError(f"target column {target!r} not found in {path!r}")

    # Select features by excluding the label column explicitly; a positional
    # slice would silently leak the label if it were not the last column.
    features = train_data.drop(columns=[target]).to_numpy()
    labels = train_data[target].to_numpy()

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    # Shape summary kept so the cell output documents the split sizes.
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    return x_train, y_train, x_test, y_test

# Split once here; the training cells further down all receive these same four arrays.
x_train, y_train, x_test, y_test = load_data()

(4463, 53) (4463,) (1116, 53) (1116,)


## Grid Search for Tree-Based Models

In [12]:
# Decision tree: the only hyper-parameter swept is the maximum depth.
parameters = {
    'max_depth': [20, 40, 60, 80, 100, 120],
}
grid_search("tree", parameters, "./data/preprocessed_data_task3.csv")

(5579, 53) (5579,)
(4463, 53) (4463,) (1116, 53) (1116,)
Parameters: {'max_depth': 100} (RMSE: 0.21702585097669108)


In [9]:
# Random forest: sweep ensemble size against tree depth (3 x 3 grid).
parameters = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 15, 20],
}
grid_search("forest", parameters, "./data/preprocessed_data_task3.csv")

(5579, 53) (5579,)
(4463, 53) (4463,) (1116, 53) (1116,)
Parameters: {'max_depth': 10, 'n_estimators': 50} (RMSE: 0.15711417049327958)


In [10]:
# Gradient boosting regression: sweep number of stages against learning rate (3 x 3 grid).
parameters = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.15, 0.2],
}
grid_search("gbr", parameters, "./data/preprocessed_data_task3.csv")

(5579, 53) (5579,)
(4463, 53) (4463,) (1116, 53) (1116,)
Parameters: {'learning_rate': 0.15, 'n_estimators': 300} (RMSE: 0.13243662835001713)


## Train the Tree, Forest, and GBR Models

In [4]:
# Fit the decision tree on the shared split. The call is left as the cell's
# last expression so the returned estimator is displayed.
# NOTE(review): csv=False presumably disables writing results to a CSV file; confirm in src.train.
train_and_get_tree_model(
    x_train, y_train,
    x_test, y_test,
    csv=False,
)

0.2081674603465012
[ 0.16051895 -0.23398764  0.18898529  0.28392979  0.51186611  0.33336393
  0.20379581 -0.03052993  0.4651284   0.04736761 -0.4271479  -0.71528258
 -0.56509951  0.06232847  0.04073908]
[ 1.31736157e-01 -3.14202335e-01  2.04341022e-01  3.47937110e-01
  4.27033597e-01  2.55077420e-01  2.83759781e-01  1.29459013e-01
  4.84905491e-01  8.60934483e-05 -2.82120130e-01 -6.08311755e-01
 -1.37057501e-01 -1.16464946e-02  6.94989090e-02]


DecisionTreeRegressor(max_depth=100)

In [5]:
# Fit the random forest on the shared split. The call is left as the cell's
# last expression so the returned estimator is displayed.
# NOTE(review): csv=False presumably disables writing results to a CSV file; confirm in src.train.
train_and_get_forest(
    x_train, y_train,
    x_test, y_test,
    csv=False,
)

0.15269300546698364
[ 0.18476731 -0.31776829  0.0712733   0.37015236  0.47426182  0.26928513
  0.24582937  0.09955485  0.50020506  0.07248069 -0.41837975 -0.58730234
 -0.58010351  0.05572334  0.0632535 ]
[ 1.31736157e-01 -3.14202335e-01  2.04341022e-01  3.47937110e-01
  4.27033597e-01  2.55077420e-01  2.83759781e-01  1.29459013e-01
  4.84905491e-01  8.60934483e-05 -2.82120130e-01 -6.08311755e-01
 -1.37057501e-01 -1.16464946e-02  6.94989090e-02]


RandomForestRegressor(max_depth=10, n_estimators=50, random_state=2021)

In [6]:
# Fit the gradient boosting regressor on the shared split. The call is left as
# the cell's last expression so the returned estimator is displayed.
# NOTE(review): csv=False presumably disables writing results to a CSV file; confirm in src.train.
train_and_get_gbr(
    x_train, y_train,
    x_test, y_test,
    csv=False,
)

0.12194088275928698
[ 0.1967996  -0.28788795  0.14887663  0.19435483  0.41857429  0.2714377
  0.27015024  0.07239945  0.45194879 -0.05777614 -0.50578207 -0.65846466
 -0.47052991  0.01865495  0.10911381]
[ 1.31736157e-01 -3.14202335e-01  2.04341022e-01  3.47937110e-01
  4.27033597e-01  2.55077420e-01  2.83759781e-01  1.29459013e-01
  4.84905491e-01  8.60934483e-05 -2.82120130e-01 -6.08311755e-01
 -1.37057501e-01 -1.16464946e-02  6.94989090e-02]


GradientBoostingRegressor(learning_rate=0.15, max_features='sqrt',
                          n_estimators=300, random_state=2021)