# Read datasets

In [1]:
import pandas as pd
import csv
from SupervisedLearningUtils import *

# --- Run configuration -------------------------------------------------------
# `version` selects which results file is read and written at the end of the
# notebook; `dataset_name` selects the train/test CSV pair loaded below.
version = "2.0"
dataset_name = "all_seasons_merged_mult_feature-selected"
dataset_folder = "output-datasets"
dataset_path = f"{dataset_folder}/{dataset_name}"
results_path = f"output-models/results_{version}.csv"

# Filled in by the model cells: model name -> value returned by test_model.
model_results = dict()

# --- Load the train / test splits ---------------------------------------------
# Element 0 of each loaded split is what the models are fitted on (see the
# shapes printed below); element 1 is passed to `fit` as the target.
train = read_csv_and_get_inputs_and_labels(f"{dataset_path}-train.csv")
test = read_csv_and_get_inputs_and_labels(f"{dataset_path}-test.csv")
print(train[0].shape, test[0].shape, sep="\n")

(16755, 19)
(7247, 19)


In [99]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def search_params(model_class, train, test, grid, n_iter=None):
    """Cross-validate hyper-parameters for a model, then evaluate the best setting.

    An exhaustive ``GridSearchCV`` is run when ``n_iter`` is ``None``; otherwise a
    ``RandomizedSearchCV`` samples ``n_iter`` candidates from ``grid``. Candidates
    are ranked by the ``"r2"`` scorer. The search is created with ``refit=False``,
    so a fresh ``model_class`` instance is built from the winning parameters and
    handed to ``test_model`` for the final evaluation.

    Parameters:
        model_class: Estimator class (or factory). It is called with no arguments
            for the search and with the best parameters as keyword arguments
            afterwards.
        train: Indexable training data; ``train[0]`` and ``train[1]`` are passed
            to ``fit`` as inputs and labels respectively.
        test: Test data, forwarded unchanged to ``test_model``.
        grid: Parameter grid (grid search) or parameter distributions
            (randomized search).
        n_iter: Number of sampled candidates for the randomized search, or
            ``None`` to run the exhaustive grid search.

    Returns:
        Whatever ``test_model(model, train, test)`` returns for the tuned model.
    """
    if n_iter is None:
        # Exhaustive search over every combination in `grid`.
        searcher = GridSearchCV(
            estimator=model_class(),
            scoring="r2",
            refit=False,
            param_grid=grid,
        )
    else:
        # Randomized search: only `n_iter` sampled combinations are evaluated.
        searcher = RandomizedSearchCV(
            estimator=model_class(),
            scoring="r2",
            refit=False,
            n_iter=n_iter,
            param_distributions=grid,
        )

    inputs, labels = train[0], train[1]
    searcher.fit(inputs, labels)

    ideal_params = searcher.best_params_
    print(f'Ideal parameters: {ideal_params}')

    # refit=False means the search holds no fitted estimator, so build one here.
    return test_model(model_class(**ideal_params), train, test)

# Decision Tree

In [101]:
from sklearn.tree import DecisionTreeRegressor

# Baseline: a decision tree with scikit-learn's default hyper-parameters.
# random_state is fixed so that re-running the notebook reproduces the same
# tree (and therefore the same reported scores); 42 is an arbitrary seed.
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree_results = test_model(decision_tree, train, test)
print(f"Decision Tree Regression Results: {decision_tree_results}")
model_results['decision_tree'] = decision_tree_results

Decision Tree Regression Results: ((0.9986247969099775, 0.7543883591408245), (-0.25616138593520543, 23.485300849633347))


# Random Forest Regression
An R² score of 1.0 is the best possible result.

In [125]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter search that was used while exploring this model, kept for
# reference but disabled so the cell only evaluates the chosen parameters.
# NOTE(review): the hard-coded `ideal_params` below use n_estimators=1000 and
# min_samples_split=15, which are outside the grid shown here — the grid was
# presumably edited after the search was run; confirm before re-enabling.
# NOTE(review): 'auto' is no longer accepted for max_features in recent
# scikit-learn releases — verify against the installed version.
# forest_param_search_grid = {'n_estimators':range(180, 261, 20),
#                            'max_depth': [None, 80, 100, 120, 140],
#                             'min_samples_leaf': [1, 2, 3, 5],
#                             'min_samples_split': [2, 5, 10],
#                            'max_features': ['auto', 'sqrt', 'log2']}
# random_forest_results = search_params(model_class=RandomForestRegressor, train=train, test=test, grid=forest_param_search_grid, n_iter=10)
ideal_params = {'n_estimators': 1000, 'min_samples_split': 15, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'max_depth': None}
# random_state is fixed so the bootstrap samples and feature subsets, and hence
# the reported scores, are reproducible across runs; 42 is an arbitrary seed.
ideal_model = RandomForestRegressor(**ideal_params, random_state=42)
random_forest_results = test_model(ideal_model, train, test)
print(f"Random Forest Regression: {random_forest_results}")
model_results['random_forest'] = random_forest_results


Random Forest Regression: ((0.7324492411797208, 10.522396242141225), (0.3447216052661092, 16.962362212075067))


# Gradient Boosting Regressor

In [71]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with scikit-learn's default hyper-parameters.
# random_state is fixed so the fitted ensemble, and hence the reported scores,
# are reproducible across runs; 42 is an arbitrary seed.
# (The model variable was previously misspelled `gardient_boost_regressor`.)
gradient_boost_regressor = GradientBoostingRegressor(random_state=42)
gradient_boost_regressor_results = test_model(gradient_boost_regressor, train, test)
print(f"Gradient Boost Regressor: {gradient_boost_regressor_results}")
model_results['gradient_boost_regressor'] = gradient_boost_regressor_results

Gradient Boost Regressor: ((0.41457009265948463, 15.789280096146841), (0.38556066926271126, 16.980486111278786))


# Ridge Regression
Different alpha values were tried on the `all_seasons_merged_mult` dataset; they all yield the same results.

In [127]:
from sklearn.linear_model import Ridge

# Ridge regression with regularisation strength alpha=1.
ridge_model = Ridge(alpha=1)

# Evaluate once, keeping the result both under its own name and in the shared
# `model_results` registry that the results cell writes to disk.
model_results['ridge'] = ridge_results = test_model(ridge_model, train, test)
print(f"Ridge: {ridge_results}")

Ridge: ((0.3806240534812315, 16.240599038021973), (0.3897327310573063, 16.922738908715058))


# Neural Network

In [119]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron: four hidden layers of 50 units followed by one of 10,
# trained for at most 2500 iterations.
# random_state is fixed so weight initialisation and data shuffling, and hence
# the reported scores, are reproducible across runs; 42 is an arbitrary seed.
neural_model = MLPRegressor(max_iter=2500, hidden_layer_sizes=(50, 50, 50, 50, 10), random_state=42)
neural_results = test_model(neural_model, train, test)
print(f"Neural Network Regression: {neural_results}")
model_results['neural_network'] = neural_results

Neural Network Regression: ((0.38468792256103335, 15.957295162437317), (0.32682172669428267, 17.192477067151437))


# Results

In [120]:
## Format new results
# Each entry of `model_results` is a pair of pairs; flatten ((a, b), (c, d))
# into [a, b, c, d] so it lines up with the four metric columns of the CSV.
new_formatted_results = {
    name: [*result[0], *result[1]] for name, result in model_results.items()
}

## Read in existing results
# `headers` is initialised *before* the try block: when the results file does
# not exist yet the body of the `with` never runs, and the check further down
# must still find the name defined.
headers = None
existing_results = {}
try:
    with open(results_path, 'r', newline='') as file:
        reader = csv.reader(file, delimiter=',')
        headers = next(reader, None)  # stays None for an empty file
        for row in reader:
            if not row:
                continue  # tolerate blank lines instead of failing on row[0]
            # Row layout: dataset, model, then the metric values.
            existing_results.setdefault(row[0], {})[row[1]] = row[2:]
except FileNotFoundError:
    # No results have been written for this version yet; start from scratch.
    pass

if not headers:
    headers = ['dataset', 'model', 'train_r_2', 'train_rmse', 'test_r_2', 'test_rmse']

# The entry for the current dataset replaces any previously stored entry for
# the same dataset; entries for other datasets are carried over unchanged.
formatted_results = {**existing_results, dataset_name: new_formatted_results}

## Build and validate every row BEFORE touching the file
# Opening the file for writing truncates it, so validation has to happen first;
# otherwise a ValueError would leave an empty file and lose earlier results.
# The loop variables deliberately do not reuse the names `dataset_name` /
# `model_name`, so the notebook-level `dataset_name` is not overwritten and the
# cell can be re-run safely.
output = [headers]
for result_dataset, formatted_model_results in formatted_results.items():
    for result_model, model_result in formatted_model_results.items():
        if len(headers) - 2 != len(model_result):
            raise ValueError(f'Length of headers does not match: {model_result}')
        output.append([result_dataset, result_model, *model_result])

## Write results
# Mode 'w' already truncates the file, so no explicit truncate() is needed.
with open(results_path, 'w', newline='') as file:
    csv.writer(file, delimiter=',').writerows(output)
    
        
    