# Read datasets

In [310]:
import pandas as pd
import csv
from sklearn.metrics import mean_squared_error

def read_csv_and_get_inputs_and_labels(path, label_column="ttl_pts"):
    """Load a CSV and split it into feature columns and a label vector.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like).
        label_column: Name of the column holding the regression target.

    Returns:
        A ``(features, labels)`` pair: ``features`` is a DataFrame with every
        column except ``label_column``; ``labels`` is a 1-D NumPy array of the
        values in ``label_column``.
    """
    frame = pd.read_csv(path)
    is_feature_column = frame.columns != label_column
    features = frame.loc[:, is_feature_column]
    # Select as a one-column frame, then take column 0 to get a flat vector.
    label_values = frame[[label_column]].to_numpy()[:, 0]
    return features, label_values


# Run configuration: which dataset to load and where results are stored.
version = "1.0"
dataset_folder = "output-datasets"
dataset_name = "2017_2018_2019_pca"
dataset_path = f"{dataset_folder}/{dataset_name}"
results_path = f"output-models/results_{version}.csv"

# Filled in by the model cells below: model name -> scores from test_model.
model_results = {}

# Each split is an (inputs, labels) pair; train is read before test.
train, test = (
    read_csv_and_get_inputs_and_labels(f"{dataset_path}-{split}.csv")
    for split in ("train", "test")
)
print(train[0].shape, test[0].shape, sep="\n")

(2564, 10)
(1107, 10)


In [311]:
def test_model(model, train, test):
    """Fit ``model`` on the training split and evaluate it on both splits.

    Args:
        model: Estimator exposing ``fit(X, y)``, ``score(X, y)`` and
            ``predict(X)``.
        train: ``(inputs, labels)`` pair used for fitting and evaluation.
        test: ``(inputs, labels)`` pair used for evaluation only.

    Returns:
        ``((train_score, train_rmse), (test_score, test_rmse))`` where each
        score is whatever ``model.score`` returns and each RMSE is the square
        root of ``mean_squared_error`` on that split.
    """
    train_inputs, train_labels = train
    test_inputs, test_labels = test
    model.fit(train_inputs, train_labels)

    def score_and_rmse(inputs, labels):
        # Score first, then MSE, matching the evaluation order per split.
        score = model.score(inputs, labels)
        mse = mean_squared_error(labels, model.predict(inputs))
        return score, mse ** 0.5

    train_metrics = score_and_rmse(train_inputs, train_labels)
    test_metrics = score_and_rmse(test_inputs, test_labels)
    return train_metrics, test_metrics

# Random Forest Regression
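Each result is a (score, RMSE) pair for train and then test. For the score (R²), 1.0 is best; for RMSE, lower is better.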

In [312]:
from sklearn.ensemble import RandomForestRegressor

# Training a random forest is stochastic (e.g. bootstrap sampling of rows), so
# pin random_state; otherwise every Restart & Run All reports different scores.
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_results = test_model(random_forest_model, train, test)
print(f"Random Forest Regression: {random_forest_results}")
# Record the scores so the results cell can persist them.
model_results['random_forest'] = random_forest_results

Random Forest Regression: ((0.8891863515413915, 6.565420615232065), (0.27189215576302805, 18.05239369104588))


# Ridge Regression

In [313]:
from sklearn.linear_model import Ridge

# Linear baseline with L2 regularisation strength alpha=1.
ridge_model = Ridge(alpha=1)
# Evaluate once, keeping the scores both locally and in the shared results dict.
model_results['ridge'] = ridge_results = test_model(ridge_model, train, test)
print(f"Ridge: {ridge_results}")

Ridge: ((0.25598826226395066, 17.01201751199845), (0.2950875051225834, 17.762518592950308))


# Neural Network

In [314]:
from sklearn.neural_network import MLPRegressor

# MLP training depends on random weight initialisation and batch sampling, so
# pin random_state; otherwise every Restart & Run All reports different scores.
neural_model = MLPRegressor(
    max_iter=2500,
    hidden_layer_sizes=(50, 50, 50, 50, 10),
    random_state=42,
)
neural_results = test_model(neural_model, train, test)
print(f"Neural Network Regression: {neural_results}")
# Record the scores so the results cell can persist them.
model_results['neural_network'] = neural_results

Neural Network Regression: ((0.3503263470333371, 15.896940723541293), (0.22528027047129517, 18.621267943341866))


# Results

In [315]:
# Persist this run's scores to the results CSV. Rows stored for other datasets
# are kept; all rows previously stored for the current dataset are replaced.

## Format new results
# Flatten ((train_score, train_rmse), (test_score, test_rmse)) into one list
# of four values per model.
new_formatted_results = {
    name: [*scores[0], *scores[1]] for name, scores in model_results.items()
}

## Read in existing results
# `headers` must be initialised before the `try`: when the results file does
# not exist, `open` raises before the `with` body runs, and the code below
# still needs `headers` to be defined.
headers = None
existing_results = {}
try:
    with open(results_path, 'r', newline='') as file:
        for row in csv.reader(file, delimiter=','):
            if not row:
                continue  # skip blank lines instead of failing on row[0]
            if headers is None:
                headers = row  # first non-empty row is the header
                continue
            prev_dataset_name, prev_model_name = row[0], row[1]
            existing_results.setdefault(prev_dataset_name, {})[prev_model_name] = row[2:]
except FileNotFoundError:
    pass  # first run: there are no earlier results to merge
if headers is None:
    headers = ['dataset', 'model', 'train_r_2', 'train_rmse', 'test_r_2', 'test_rmse']

formatted_results = dict(existing_results, **{dataset_name: new_formatted_results})

## Write results
# Build and validate every row BEFORE opening the file for writing: opening in
# write mode truncates the file, so a validation error raised afterwards would
# wipe the previously stored results.
# The loop variables are deliberately not called `dataset_name`/`model_name`,
# so the notebook-level `dataset_name` setting is not overwritten.
output = [headers]
for row_dataset_name, formatted_model_results in formatted_results.items():
    for row_model_name, model_result in formatted_model_results.items():
        if len(headers) - 2 != len(model_result):
            raise ValueError(f'Length of headers does not match: {model_result}')
        output.append([row_dataset_name, row_model_name, *model_result])

# 'w' already truncates the file, so no explicit truncate() is needed.
with open(results_path, 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerows(output)
    
        
    