# Sensitivity to Modeling Parameters


In [None]:
# Notebook setup: enable the autoreload extension in mode 2 (re-import all
# modules before running each cell, so edits to the project source are picked
# up without restarting the kernel) and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

from pathlib import Path
home = str(Path.home())
import sys
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from experiment import read_raw_pairwise, construct_support_matrix, get_features_from_support, get_target_stability, eval_models
from sensitivity_tests import *
import utilities

In [None]:
# Seasons covered by the study: "2002" through "2018" inclusive, as strings.
years = [str(season) for season in range(2002, 2019)]

# TODO: measure sensitivity of massey & colley to S construction params

# Column-name mapping handed to read_raw_pairwise together with each raw file.
column_names = {
    "team1_name": "team1_name",
    "team1_score": "points1",
    "team2_name": "team2_name",
    "team2_score": "points2",
    "team1_select": "team1_madness",
    "team2_select": "team2_madness",
    "date": "date",
}

# Location of one season's raw CSV; filled with the home directory and the year.
season_file_template = "{}/sensitivity_study/data/MarchMadnessDataFrames/march_madness_{}.csv"

# Both threshold sweeps cover the integers 0..5; each gets its own list.
threshold_values = range(6)

config = {
    "col_mapping": column_names,
    "rankingMethods": [MasseyRankingAlgorithm(), ColleyRankingAlgorithm()],
    "correlationMethod": utilities.kendall_tau,
    # 21 evenly spaced fractions from 0.5 to 1.0.
    "fracs": np.linspace(0.5, 1.0, num=21),
    "n_restarts": 5,
    "direct_thres": list(threshold_values),
    "spread_thres": list(threshold_values),
    # 10 evenly spaced weights from 0.1 to 1.0.
    "weight_indirect": np.linspace(0.1, 1.0, num=10),
    "raw_filepaths": [season_file_template.format(home, yr) for yr in years],
    "model_list": [
        {"model": DummyRegressor(), "param_grid": {}},
        {"model": LinearRegression(), "param_grid": {'fit_intercept': [True, False]}},
    ],
}

In [None]:
# Read every raw season file once, keyed by its file path, with a progress bar.
season_paths = tqdm(config["raw_filepaths"])
games = {
    path: read_raw_pairwise(path, config["col_mapping"])
    for path in season_paths
}

In [None]:
# For each raw file (equivalent to a season / tournament / single scenario)
# and each fraction in config["fracs"], build one support matrix per
# (direct_thres, spread_thres, weight_indirect) combination, rank it with every
# ranking method, and record kendall_w(...)[1] over the resulting rankings
# (kendall_w comes from sensitivity_tests; presumably Kendall's W — confirm).
records = {"fp": [], "frac": [], "method": [], "w": []}
support_matricies = {}
rankings_by_method = {}
feature_df_list = []

# Materialize the parameter grid once; it is reused for every (file, frac) pair
# and its length gives the number of matrices built per pair.
param_grid = list(itertools.product(config["direct_thres"],
                                    config["spread_thres"],
                                    config["weight_indirect"]))
num_matrices = len(games) * len(config["fracs"]) * len(param_grid)

# A single manually-updated bar counts every support matrix. The season loop is
# deliberately not wrapped in a second tqdm, which would draw a competing bar.
with tqdm(total=num_matrices) as pbar:
    for fp in games:
        for frac in config["fracs"]:
            scenario = (fp, frac)
            support_matricies[scenario] = []
            rankings_by_method[scenario] = {
                r.__class__.__name__: [] for r in config["rankingMethods"]
            }

            for d_thresh, s_thresh, w_ind in param_grid:
                support_mat = construct_support_matrix(games[fp],
                                                       frac,
                                                       direct_thres=d_thresh,
                                                       spread_thres=s_thresh,
                                                       weight_indirect=w_ind)
                support_matricies[scenario].append(support_mat)
                # get rankings for support for all ranking methods
                for rankingMethod in config["rankingMethods"]:
                    ranking = rankingMethod.rank(support_mat.fillna(0).values)
                    rankings_by_method[scenario][rankingMethod.__class__.__name__].append(ranking)
                pbar.update(1)

            for methodName, rankings in rankings_by_method[scenario].items():
                # Store the full file path here: the following cell is what
                # turns the filename into a year. Slicing to the year in both
                # places would leave an empty string, since "2002"[-8:-4] == "".
                records["fp"].append(fp)
                records["frac"].append(frac)
                records["method"].append(methodName)
                records["w"].append(kendall_w(rankings)[1])

df = pd.DataFrame(records)

In [None]:
# Turn filename into year. The trailing four digits (optionally followed by
# ".csv") are matched instead of slicing by position, so the step also accepts
# values that are already bare years and is safe to re-run: a positional
# [-8:-4] slice of a four-character year would produce an empty string.
# Values without a trailing four-digit group become NaN.
df["fp"] = df["fp"].str.extract(r"(\d{4})(?:\.csv)?$", expand=False)

In [None]:
# Bar chart of the mean "w" per (fp, method): one group of bars per fp value,
# one bar per ranking method.
mean_w_by_method = df.groupby(["fp", "method"])["w"].mean().unstack()
mean_w_by_method.plot.bar(figsize=(12,5.5))

In [None]:
# One figure per fp value: "w" against "frac", with one line per ranking method.
method_names = df["method"].unique()
for season in df["fp"].unique():
    in_season = df.fp == season
    for method_name in method_names:
        curve = df.loc[in_season & (df.method == method_name), ["frac", "w"]]
        plt.plot(curve.frac, curve.w, label=method_name)
    plt.title("Sensitivity to Modeling Params ({})".format(season))
    plt.legend()
    plt.show()

In [None]:
# Persist the results table to the working directory, without the row index.
results_csv = "sensitivity_to_modeling_parameters.csv"
df.to_csv(results_csv, index=False)

In [None]:
# Pass the feature/target data and the models listed in config["model_list"]
# to eval_models (imported from experiment) and keep its result.
# NOTE(review): `features` and `targets` are not assigned in any visible cell
# of this notebook (and `feature_df_list` is never filled), so this cell will
# raise NameError unless they are defined elsewhere — confirm or remove.
results_dict = eval_models(features, targets, config["model_list"])

In [None]:
# Bar chart of the "MAE" entry of every model in results_dict, with one bar
# per key, labelled by that key.
model_names = list(results_dict.keys())
mae_values = [scores["MAE"] for scores in results_dict.values()]
bar_positions = list(range(len(model_names)))

plt.bar(bar_positions, mae_values)
plt.title("Mean Absolute Error of Regression Models")
plt.xlabel("Model")
plt.ylabel("Error")
plt.xticks(bar_positions, model_names)

plt.show()