# modeling.ipynb

## Imports

In [11]:
import csv
import logging
import math
import time
from datetime import datetime

import pandas as pd
import numpy as np
from sklearn import neighbors, model_selection, preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.neural_network import MLPRegressor
from sklearn.utils._testing import ignore_warnings

# !!! WARNING !!!

Setting `LIMITER` to less than about 50 will lead to long processing times: the notebook trains on the first `len(dataset) // LIMITER` records, so a `LIMITER` value of 1 trains the models against the entire data set.

Make sure to run `pip install -r requirements.txt` before executing this notebook.

You've been warned :) 

## Variables

In [12]:
# Down-sampling divisor applied to all models: the evaluation cells use
# len(dataset) // LIMITER observations.
LIMITER = 1000

# Input data set (already cleaned / formatted).
PATH = "../../data/clean_formatted/20210712_214459_formatted-data.csv"

# File saving the performance metrics for each model for later processing.
REPORT_PATH = "../../data/results/report.csv"

# K-Nearest Neighbor Regressor: number of neighbors.
K_VALUE = 4

# Random Forest Regressor: number of estimators.
ESTIMATORS = 150

# Multi-layer Perceptron Regressor: architecture settings.
LAYERS = 20
LAYER_SIZE = 60

## Shared Functions

In [13]:
def read_csv(path):
    ''' load the csv found at path into a pandas dataframe and return it '''
    loaded_frame = pd.read_csv(path)
    return loaded_frame

def clean_copy(data_frame):
    ''' return a copy of the data frame without its player_id column; the input frame is left untouched '''
    return data_frame.copy().drop(columns=['player_id'])

def split_data(data_frame):
    ''' separate the fs_total target column from the remaining predictor columns

    returns a (predictors, target) pair: the frame without fs_total, and the
    fs_total column on its own
    '''
    target_column = 'fs_total'
    target = data_frame[target_column]
    predictors = data_frame.drop(columns=[target_column])
    return predictors, target

def save(path, contents):
    ''' append contents (a list of values) as the last row of the csv file at path '''
    # 'a+' creates the file when it does not exist yet; every write lands at the end
    with open(path, mode='a+', newline='\n') as report_file:
        row_writer = csv.writer(report_file)
        row_writer.writerow(contents)

## Modeling

### K-Nearest Neighbors Regression (KNN)

In [14]:
def knn(description, K_VALUE, predictors, target):
    ''' uses 5-fold cross validation to evaluate a K-Nearest Neighbors regressor

    description: label stored as the first element of the returned row
    K_VALUE: number of neighbors handed to KNeighborsRegressor (inside this
        function the parameter shadows the module-level K_VALUE constant)
    predictors: predictor rows; must support integer-array row indexing
        (the evaluation cells pass numpy arrays)
    target: target values aligned with predictors, indexed the same way

    returns a list of strings: description, mean and standard deviation of the
    per-fold RMSE, MSE and R^2 score, number of records, and time of execution
    '''
    kfold = model_selection.KFold(5, shuffle=True, random_state=2)

    rmse, mse, r2 = [], [], []

    for train_idx, test_idx in kfold.split(predictors):
        predictors_train, predictors_test = predictors[train_idx], predictors[test_idx]
        target_train, target_test = target[train_idx], target[test_idx]

        knn_regression = neighbors.KNeighborsRegressor(n_neighbors=K_VALUE, weights='uniform')
        knn_regression.fit(predictors_train, target_train)

        # predict once per fold and derive both error metrics from the same residuals
        fold_mse = np.mean((knn_regression.predict(predictors_test) - target_test) ** 2)
        rmse.append(math.sqrt(fold_mse))
        mse.append(fold_mse)
        r2.append(knn_regression.score(predictors_test, target_test))

    # "%m/%d/%y" spells out the "%D" shorthand, which is not supported on every platform
    current_time = datetime.now().strftime("%m/%d/%y %H:%M:%S")
    results = [
        description,
        "{:.4f}".format(np.mean(rmse)), # rmse mean over folds
        "{:.4f}".format(np.std(rmse)), # rmse standard deviation over folds
        "{:.4f}".format(np.mean(mse)), # mse mean over folds
        "{:.4f}".format(np.std(mse)), # mse standard deviation over folds
        "{:.4f}".format(np.mean(r2)), # r2 mean over folds
        "{:.4f}".format(np.std(r2)), # r2 standard deviation over folds
        "{}".format(len(predictors)), # num records analyzed
        "{}".format(current_time) # time of execution
    ]
    return results

### Random Forest Regression (RFR)

In [15]:
def rfr(description, n_estimators, predictors, target):
    ''' uses 5-fold cross validation to evaluate a Random Forest regressor

    description: label stored as the first element of the returned row
    n_estimators: number of trees handed to RandomForestRegressor
    predictors: predictor rows; must support integer-array row indexing
        (the evaluation cells pass numpy arrays)
    target: target values aligned with predictors, indexed the same way

    returns a list of strings: description, mean and standard deviation of the
    per-fold RMSE, MSE and R^2 score, number of records, and time of execution
    '''
    kfold = model_selection.KFold(5, shuffle=True, random_state=2)

    rmse, mse, r2 = [], [], []

    for train_idx, test_idx in kfold.split(predictors):
        predictors_train, predictors_test = predictors[train_idx], predictors[test_idx]
        target_train, target_test = target[train_idx], target[test_idx]

        # seeded like the fold split so that repeated runs report the same metrics
        regressor = RandomForestRegressor(n_estimators=n_estimators, random_state=2)
        regressor.fit(predictors_train, target_train)

        # predict once per fold and derive both error metrics from the same residuals
        fold_mse = np.mean((regressor.predict(predictors_test) - target_test) ** 2)
        rmse.append(math.sqrt(fold_mse))
        mse.append(fold_mse)
        r2.append(regressor.score(predictors_test, target_test))

    # "%m/%d/%y" spells out the "%D" shorthand, which is not supported on every platform
    current_time = datetime.now().strftime("%m/%d/%y %H:%M:%S")
    results = [
        description,
        "{:.4f}".format(np.mean(rmse)), # rmse mean over folds
        "{:.4f}".format(np.std(rmse)), # rmse standard deviation over folds
        "{:.4f}".format(np.mean(mse)), # mse mean over folds
        "{:.4f}".format(np.std(mse)), # mse standard deviation over folds
        "{:.4f}".format(np.mean(r2)), # r2 mean over folds
        "{:.4f}".format(np.std(r2)), # r2 standard deviation over folds
        "{}".format(len(predictors)), # num records analyzed
        "{}".format(current_time) # time of execution
    ]
    return results

### Decision Tree Regression (DTR)

In [16]:
def dtr(description, predictors, target):
    ''' uses 5-fold cross validation to evaluate a Decision Tree regressor

    description: label stored as the first element of the returned row
    predictors: predictor rows; must support integer-array row indexing
        (the evaluation cells pass numpy arrays)
    target: target values aligned with predictors, indexed the same way

    returns a list of strings: description, mean and standard deviation of the
    per-fold RMSE, MSE and R^2 score, number of records, and time of execution
    '''
    kfold = model_selection.KFold(5, shuffle=True, random_state=2)

    rmse, mse, r2 = [], [], []

    for train_idx, test_idx in kfold.split(predictors):
        predictors_train, predictors_test = predictors[train_idx], predictors[test_idx]
        target_train, target_test = target[train_idx], target[test_idx]

        # seeded like the fold split so that repeated runs report the same metrics
        regressor = DecisionTreeRegressor(random_state=2).fit(predictors_train, target_train)

        # predict once per fold and derive both error metrics from the same residuals
        fold_mse = np.mean((regressor.predict(predictors_test) - target_test) ** 2)
        rmse.append(math.sqrt(fold_mse))
        mse.append(fold_mse)
        r2.append(regressor.score(predictors_test, target_test))

    # "%m/%d/%y" spells out the "%D" shorthand, which is not supported on every platform
    current_time = datetime.now().strftime("%m/%d/%y %H:%M:%S")
    results = [
        description,
        "{:.4f}".format(np.mean(rmse)), # rmse mean over folds
        "{:.4f}".format(np.std(rmse)), # rmse standard deviation over folds
        "{:.4f}".format(np.mean(mse)), # mse mean over folds
        "{:.4f}".format(np.std(mse)), # mse standard deviation over folds
        "{:.4f}".format(np.mean(r2)), # r2 mean over folds
        "{:.4f}".format(np.std(r2)), # r2 standard deviation over folds
        "{}".format(len(predictors)), # num records analyzed
        "{}".format(current_time) # time of execution
    ]
    return results

### Multiple Linear Regression (MLR) 

In [17]:
def mlr(description, predictors, target):
    ''' uses 5-fold cross validation to evaluate a Multiple Linear Regression model

    description: label stored as the first element of the returned row
    predictors: predictor rows; must support integer-array row indexing
        (the evaluation cells pass numpy arrays)
    target: target values aligned with predictors, indexed the same way

    returns a list of strings: description, mean and standard deviation of the
    per-fold RMSE, MSE and R^2 score, number of records, and time of execution
    '''
    kfold = model_selection.KFold(5, shuffle=True, random_state=2)

    rmse, mse, r2 = [], [], []

    for train_idx, test_idx in kfold.split(predictors):
        predictors_train, predictors_test = predictors[train_idx], predictors[test_idx]
        target_train, target_test = target[train_idx], target[test_idx]

        regressor = LinearRegression().fit(predictors_train, target_train)

        # predict once per fold and derive both error metrics from the same residuals
        fold_mse = np.mean((regressor.predict(predictors_test) - target_test) ** 2)
        rmse.append(math.sqrt(fold_mse))
        mse.append(fold_mse)
        r2.append(regressor.score(predictors_test, target_test))

    # "%m/%d/%y" spells out the "%D" shorthand, which is not supported on every platform
    current_time = datetime.now().strftime("%m/%d/%y %H:%M:%S")
    results = [
        description,
        "{:.4f}".format(np.mean(rmse)), # rmse mean over folds
        "{:.4f}".format(np.std(rmse)), # rmse standard deviation over folds
        "{:.4f}".format(np.mean(mse)), # mse mean over folds
        "{:.4f}".format(np.std(mse)), # mse standard deviation over folds
        "{:.4f}".format(np.mean(r2)), # r2 mean over folds
        "{:.4f}".format(np.std(r2)), # r2 standard deviation over folds
        "{}".format(len(predictors)), # num records analyzed
        "{}".format(current_time) # time of execution
    ]
    return results

### Multi-layer Perceptron Regression (MLP)

In [18]:
def mlp(description, arch, activation, predictors, target):
    ''' uses 5-fold cross validation to evaluate a Multi-layer Perceptron regressor

    description: label stored as the first element of the returned row
    arch: value passed to MLPRegressor as hidden_layer_sizes
    activation: value passed to MLPRegressor as activation
    predictors: predictor rows; must support integer-array row indexing
        (the evaluation cells pass numpy arrays)
    target: target values aligned with predictors, indexed the same way

    returns a list of strings: description, mean and standard deviation of the
    per-fold RMSE, MSE and R^2 score, number of records, and time of execution
    '''
    kfold = model_selection.KFold(5, shuffle=True, random_state=2)

    rmse, mse, r2 = [], [], []

    for train_idx, test_idx in kfold.split(predictors):
        predictors_train, predictors_test = predictors[train_idx], predictors[test_idx]
        target_train, target_test = target[train_idx], target[test_idx]

        mlp_regressor = MLPRegressor(
            hidden_layer_sizes=arch,
            max_iter=2000,
            activation=activation,
            random_state=2,
            solver='lbfgs')

        with ignore_warnings(category=UndefinedMetricWarning):

            # the scaler is fitted on the training fold only and then reused
            # unchanged on the held-out fold
            predictors_scaler = preprocessing.MinMaxScaler()
            predictors_train = predictors_scaler.fit_transform(predictors_train)

            mlp_regressor.fit(predictors_train, target_train)

            predictors_test = predictors_scaler.transform(predictors_test)
            target_prediction = mlp_regressor.predict(predictors_test)

            # RMSE is the square root of the MEAN OF SQUARED residuals; squaring
            # the mean residual instead lets positive and negative errors cancel
            fold_mse = np.mean((target_prediction - target_test) ** 2)
            rmse.append(math.sqrt(fold_mse))
            mse.append(fold_mse)
            r2.append(mlp_regressor.score(predictors_test, target_test))

    # "%m/%d/%y" spells out the "%D" shorthand, which is not supported on every platform
    current_time = datetime.now().strftime("%m/%d/%y %H:%M:%S")
    results = [
        description,
        "{:.4f}".format(np.mean(rmse)), # rmse mean over folds
        "{:.4f}".format(np.std(rmse)), # rmse standard deviation over folds
        "{:.4f}".format(np.mean(mse)), # mse mean over folds
        "{:.4f}".format(np.std(mse)), # mse standard deviation over folds
        "{:.4f}".format(np.mean(r2)), # r2 mean over folds
        "{:.4f}".format(np.std(r2)), # r2 standard deviation over folds
        "{}".format(len(predictors)), # num records analyzed
        "{}".format(current_time) # time of execution
    ]
    return results

## Evaluation

### Assess Best Values (K, Estimators)

In [9]:
# Parameter sweep: cross-validates KNN for K = 1..25 and Random Forest for
# n_estimators = 50..300 (step 50), appending one row per run to REPORT_PATH.

# Setup
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s ::: %(message)s")

logging.info("starting parameter sweep (KNN K, Random Forest n_estimators)")
start = time.time()

# Prepare Data
# NOTE: this is using pre cleaned / processed data set
data_frame = read_csv(PATH)
logging.info(
    "read in %s records to data frame with %s features",
    len(data_frame),
    len(data_frame.columns))

# only the first len(data_frame) // LIMITER rows are used, to keep run times manageable
size = len(data_frame) // LIMITER
logging.info("will use %s records for the parameter sweep", size)

subset = data_frame.iloc[:size]
working_subset = clean_copy(subset)
predictors, target = split_data(working_subset)

# the model functions index rows positionally, which requires numpy arrays
logging.info("converting predictors and target to numpy arrays")
predictors = predictors.to_numpy()
target = target.to_numpy()

# KNN: one cross-validated run per candidate K
for i in range(1, 26):
    logging.warning("starting KNN Regression - good luck.")
    knn_results = knn(
        "KNN Regressor, where K = " + str(i),
        i, predictors, target)

    logging.info("KNN Regression completed, results:")
    logging.info(knn_results)

    logging.info("updating reports.csv")
    save(REPORT_PATH, knn_results)

# Random Forest: one cross-validated run per candidate forest size
for i in range(50, 301, 50):
    logging.warning("starting Random Forest Regression - good luck.")
    rfr_results = rfr(
        "Random Forest Regression, where n_estimators=" + str(i),
        i, predictors, target)

    logging.info("Random Forest Regression completed, results:")
    logging.info(rfr_results)

    logging.info("updating reports.csv")
    save(REPORT_PATH, rfr_results)

logging.info("parameter sweep finished in %.2f seconds", time.time() - start)

INFO ::: starting knn.py
INFO ::: read in 200596 records to data frame with 205 features
INFO ::: will use 200 records for MLP Classifier
INFO ::: converting predictors and target to numpy arrays
INFO ::: KNN Regression completed, results:
INFO ::: ['KNN Regressor, where K = 1', '3.0416', '0.6953', '9.7350', '4.2872', '0.6790', '0.1309', '200', '12/10/21 20:00:26']
INFO ::: updating reports.csv
INFO ::: KNN Regression completed, results:
INFO ::: ['KNN Regressor, where K = 2', '3.0341', '0.8948', '10.0062', '4.9744', '0.6800', '0.1682', '200', '12/10/21 20:00:26']
INFO ::: updating reports.csv
INFO ::: KNN Regression completed, results:
INFO ::: ['KNN Regressor, where K = 3', '2.8754', '0.9360', '9.1439', '5.0326', '0.7133', '0.1556', '200', '12/10/21 20:00:26']
INFO ::: updating reports.csv
INFO ::: KNN Regression completed, results:
INFO ::: ['KNN Regressor, where K = 4', '2.8586', '0.7835', '8.7853', '4.0419', '0.7184', '0.1341', '200', '12/10/21 20:00:26']
INFO ::: updating reports

### Generate Results

In [10]:
# Final evaluation: cross-validates each of the five models once with the
# configured hyper-parameters and appends one row per model to REPORT_PATH.

# Setup
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s ::: %(message)s")

logging.info("starting model evaluation")
start = time.time()

# Prepare Data
# NOTE: this is using pre cleaned / processed data set
data_frame = read_csv(PATH)
logging.info(
    "read in %s records to data frame with %s features",
    len(data_frame),
    len(data_frame.columns))

# only the first len(data_frame) // LIMITER rows are used, to keep run times manageable
size = len(data_frame) // LIMITER
logging.info("will use %s records for model evaluation", size)

subset = data_frame.iloc[:size]
working_subset = clean_copy(subset)
predictors, target = split_data(working_subset)

# the model functions index rows positionally, which requires numpy arrays
logging.info("converting predictors and target to numpy arrays")
predictors = predictors.to_numpy()
target = target.to_numpy()

# KNN
logging.warning("starting KNN Regression - good luck.")
knn_results = knn(
    "KNN Regressor, where K = " + str(K_VALUE),
    K_VALUE, predictors, target)

logging.info("KNN Regression completed, results:")
logging.info(knn_results)

logging.info("updating reports.csv")
save(REPORT_PATH, knn_results)

# Random Forest
logging.warning("starting Random Forest Regression - good luck.")
rfr_results = rfr(
    "Random Forest Regression, where n_estimators=" + str(ESTIMATORS),
    ESTIMATORS, predictors, target)

logging.info("Random Forest Regression completed, results:")
logging.info(rfr_results)

logging.info("updating reports.csv")
save(REPORT_PATH, rfr_results)

# Decision Tree
logging.warning("starting Decision Tree Regression - good luck.")
dtr_results = dtr("Decision Tree Regression", predictors, target)

logging.info("Decision Tree Regression completed, results:")
logging.info(dtr_results)

logging.info("updating reports.csv")
save(REPORT_PATH, dtr_results)

# Multiple Linear Regression
# NOTE(review): the recorded run of this cell reports RMSE 0.0000 and R^2 1.0000
# for this model, which suggests fs_total may be an exact linear combination of
# the predictors (possible target leakage) - confirm against the data set.
logging.warning("starting Multiple Linear Regression - good luck.")
mlr_results = mlr("Multiple Linear Regression", predictors, target)

logging.info("Multiple Linear Regression completed, results:")
logging.info(mlr_results)

logging.info("updating reports.csv")
save(REPORT_PATH, mlr_results)

# Multi-layer Perceptron Regression
# hidden_layer_sizes takes one entry per hidden layer, so LAYERS layers of
# LAYER_SIZE nodes is the tuple (LAYER_SIZE, LAYER_SIZE, ...) of length LAYERS;
# (LAYERS, LAYER_SIZE) would build only two layers and contradict the description.
logging.warning("starting MLP Regressor - good luck.")
mlp_results = mlp(
    "MLP Regression using Relu: " + str(LAYERS) + " hidden layers with " + str(LAYER_SIZE) +
    " nodes per layer",
    (LAYER_SIZE,) * LAYERS, "relu", predictors, target)

logging.info("MLP Regressor completed, results:")
logging.info(mlp_results)

logging.info("updating reports.csv")
save(REPORT_PATH, mlp_results)

end = time.time()
print("executed model evaluation in :", end-start)

INFO ::: starting knn.py
INFO ::: read in 200596 records to data frame with 205 features
INFO ::: will use 200 records for MLP Classifier
INFO ::: converting predictors and target to numpy arrays
INFO ::: KNN Regression completed, results:
INFO ::: ['KNN Regressor, where K = 4', '2.8586', '0.7835', '8.7853', '4.0419', '0.7184', '0.1341', '200', '12/10/21 20:00:54']
INFO ::: updating reports.csv
INFO ::: Random Forest Regression completed, results:
INFO ::: ['Random Forest Regression, where n_estimators=150', '1.8760', '0.5084', '3.7779', '2.0806', '0.8855', '0.0283', '200', '12/10/21 20:00:56']
INFO ::: updating reports.csv
INFO ::: Decision Tree Regression completed, results:
INFO ::: ['Decision Tree Regression', '2.2244', '0.7597', '5.5250', '3.3412', '0.8044', '0.1707', '200', '12/10/21 20:00:56']
INFO ::: updating reports.csv
INFO ::: Multiple Linear Regression completed, results:
INFO ::: ['Multiple Linear Regression', '0.0000', '0.0000', '0.0000', '0.0000', '1.0000', '0.0000', '2

executed knn.py in : 20.885556936264038
