In [35]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error

In [2]:
load = pd.read_table("ADV1.csv", sep=',').dropna()

In [3]:
num_predictor_names = ['BG', 'BPS', 'BPD', 'HR']
cat_predictor_names = ['Activity']
target_names = ['IR', 'DR', 'Stroop']

num_X = load[num_predictor_names]
cat_X = pd.get_dummies(load[cat_predictor_names])
Y = load[target_names]

### Train-Test Split

In [4]:
selected_target_name = ['IR']
test_size=0.3

num_X_train, num_X_test, y_train, y_test = train_test_split(num_X, 
                                                            Y[selected_target_name],
                                                            test_size=test_size,
                                                            random_state=42)
cat_X_train, cat_X_test, y_train2, y_test2 = train_test_split(cat_X,
                                                              Y[selected_target_name],
                                                              test_size=test_size,
                                                              random_state=42)

### Min-Max Scaler on Numeric X

In [5]:
scaler = MinMaxScaler()
scaler.fit(num_X_train)
rescaled_num_X_train = pd.DataFrame(scaler.transform(num_X_train), columns=num_predictor_names)
rescaled_num_X_test = pd.DataFrame(scaler.transform(num_X_test), columns=num_predictor_names)

### Bind Numeric and Categorical X

In [6]:
X_train = pd.concat([rescaled_num_X_train.reset_index(drop=True), cat_X_train.reset_index(drop=True)], 
                    axis=1)
X_test = pd.concat([rescaled_num_X_test.reset_index(drop=True), cat_X_test.reset_index(drop=True)], 
                   axis=1)

### Machine Learning - Default Settings

In [7]:
score = "neg_mean_squared_error"
num_folds = 10
iterator = KFold(n_splits=num_folds, shuffle=True, random_state=42)

In [8]:
model_list = [LinearRegression(), KNeighborsRegressor(), 
              DecisionTreeRegressor(), RandomForestRegressor()]

for model in model_list:
    cv_results = cross_val_score(estimator=model, 
                                 X=X_train, y=np.array(y_train).reshape(-1,), 
                                 cv=iterator, scoring=score)
    
    print(f"{str(model)} | negMSE: {np.mean(cv_results)}")

LinearRegression() | negMSE: -233.05997180947776
KNeighborsRegressor() | negMSE: -254.98719999999997
DecisionTreeRegressor() | negMSE: -519.4300000000001
RandomForestRegressor() | negMSE: -259.522279


### Tuning - KNN Regression

In [9]:
model = KNeighborsRegressor()
model.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [10]:
%%time
param_grid = dict(n_neighbors=np.arange(1,31), 
                  weights=['uniform', 'distance'], 
                  metric=['euclidean','minkowski'])

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=score, cv=iterator)
knn_result = grid.fit(X_train, y_train)

Wall time: 6.4 s


### Tuning - Decision Tree Regression

In [11]:
model = DecisionTreeRegressor()
model.get_params().keys()

dict_keys(['ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])

In [12]:
%%time
param_grid = dict(max_depth=np.arange(1,15),
                  max_features=np.arange(1,X_train.shape[1]))

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=score, cv=iterator)
tree_result = grid.fit(X_train, y_train)

Wall time: 7.39 s


### Tuning - Random Forest Regression

In [13]:
model = RandomForestRegressor(bootstrap=True)
model.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [18]:
%%time
param_grid = dict(max_depth=np.arange(1,10),
                  max_features=np.arange(1,X_train.shape[1]),
                  max_samples=np.linspace(0.1,0.9,5))

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=score, cv=iterator)
forest_result = grid.fit(X_train, np.array(y_train).reshape(-1,))

Wall time: 6min 57s


### Performance Comparison

In [20]:
knn_result.best_params_

{'metric': 'euclidean', 'n_neighbors': 17, 'weights': 'uniform'}

In [29]:
tree_result.best_params_

{'max_depth': 2, 'max_features': 6}

In [28]:
forest_result.best_params_

{'max_depth': 2, 'max_features': 1, 'max_samples': 0.9}

In [30]:
np.max(knn_result.cv_results_['mean_test_score'])

-212.11100346020763

In [31]:
np.max(tree_result.cv_results_['mean_test_score'])

-192.0404529261839

In [32]:
np.max(forest_result.cv_results_['mean_test_score'])

-205.86504338159452