## Setting up the Notebook

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from src.train import train_and_get_tree_model, train_and_get_forest, train_and_get_gbr

In [9]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

## Load Data

In [10]:
def load_data(path = "./data/preprocessed_data_task3.csv", target="pred",
              test_size=0.2, random_state=2021):
    """Load the preprocessed CSV and split it into train and test arrays.

    Features are every column except ``target``, selected by name. The
    previous positional slice (``iloc[:, 0:-1]``) was only correct when the
    target happened to be the last column; otherwise the target leaked into
    the feature matrix and the real last feature was silently dropped. When
    the target is the last column the result is unchanged.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    target : str
        Name of the column holding the regression target.
    test_size : float
        Fraction of rows held out for the test split.
    random_state : int
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    x_train, y_train, x_test, y_test : numpy.ndarray
        Note the order: both training arrays come first, then both test
        arrays. This differs from the order ``train_test_split`` itself
        returns.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the CSV.
    """
    train_data = pd.read_csv(path)

    if target not in train_data.columns:
        raise KeyError(f"target column {target!r} not found in {path!r}")

    # Select by name so the target can never end up inside the features,
    # regardless of where it sits in the file.
    X = train_data.drop(columns=[target]).to_numpy()
    y = train_data[target].to_numpy()

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Shape summary kept as a quick sanity check in the cell output.
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    return x_train, y_train, x_test, y_test

# Load from the default CSV path. load_data returns the arrays in the order
# (x_train, y_train, x_test, y_test) and prints their shapes.
x_train, y_train, x_test, y_test = load_data()

(9130, 74) (9130,) (2283, 74) (2283,)


## Train the decision tree, random forest, and gradient boosting (GBR) models

In [11]:
# Run the decision-tree helper from src.train; the value it returns is shown
# as the cell result (a DecisionTreeRegressor in the saved output).
# NOTE(review): the printed scalar is presumably a test-set error metric, and
# the two printed arrays presumably sample predictions followed by the
# matching ground truth (the second array is identical across all three model
# cells) — confirm in src.train. csv=False presumably skips writing a CSV — confirm.
train_and_get_tree_model(x_train, y_train, x_test, y_test, csv=False)

0.968348958159981
[-2.5477096  -2.7613489  -2.12241094 -3.92079979 -1.14277659 -0.75419831
 -4.15863781 -1.77296852 -2.78936132 -3.17429803 -2.98058252 -2.75822776
 -4.9111747  -1.85458928 -0.69927628]
[-2.73693776 -2.00569628  0.2865212  -3.39772256 -1.55085937 -0.0746939
 -3.45006244 -2.66104713 -2.78936132 -2.85666328 -2.81003202 -2.37546682
 -4.47135584 -1.88228236 -0.87941486]


DecisionTreeRegressor()

In [12]:
# Run the random-forest helper from src.train; the value it returns is shown
# as the cell result (a RandomForestRegressor in the saved output).
# NOTE(review): the printed scalar is presumably a test-set error metric, and
# the two printed arrays presumably sample predictions followed by the
# matching ground truth — confirm in src.train. csv=False presumably skips
# writing a CSV — confirm.
train_and_get_forest(x_train, y_train, x_test, y_test, csv=False)

0.7143979374309047
[-2.36290889 -2.31176092 -1.33432035 -3.93384895 -1.23433496 -0.84671134
 -3.94219161 -2.26821308 -2.72170454 -3.13789126 -2.98171655 -2.32508574
 -4.72663404 -1.89294793 -0.83026666]
[-2.73693776 -2.00569628  0.2865212  -3.39772256 -1.55085937 -0.0746939
 -3.45006244 -2.66104713 -2.78936132 -2.85666328 -2.81003202 -2.37546682
 -4.47135584 -1.88228236 -0.87941486]


RandomForestRegressor(n_estimators=200, random_state=2021)

In [13]:
# Run the gradient-boosting helper from src.train; the value it returns is
# shown as the cell result (a GradientBoostingRegressor in the saved output).
# NOTE(review): the printed scalar is presumably a test-set error metric, and
# the two printed arrays presumably sample predictions followed by the
# matching ground truth — confirm in src.train. csv=False presumably skips
# writing a CSV — confirm.
train_and_get_gbr(x_train, y_train, x_test, y_test, csv=False)

0.6702359059229837
[-2.52092848 -2.4814002  -0.87589846 -3.67290659 -1.59274065 -1.67955593
 -3.88357543 -2.47239182 -2.46624394 -3.06465181 -2.90239876 -1.93865574
 -4.01461774 -1.8390614  -0.98122661]
[-2.73693776 -2.00569628  0.2865212  -3.39772256 -1.55085937 -0.0746939
 -3.45006244 -2.66104713 -2.78936132 -2.85666328 -2.81003202 -2.37546682
 -4.47135584 -1.88228236 -0.87941486]


GradientBoostingRegressor(learning_rate=0.05, max_depth=8, max_features='sqrt',
                          min_samples_leaf=16, min_samples_split=8,
                          n_estimators=200, random_state=2021)