# Rankability Predicting Sensitivity
## March Madness Dataset

Look for new features:</br>
    
Existing features:</br>
    Diversity of P metrics</br>
    Graph measures of tournament matrix as Laplacian</br>
    Eigenvalues of tournament matrix</br>
    
Datasets:</br>
   Lichess:</br>
       API: https://berserk.readthedocs.io/en/master/ </br>
       Look for tournaments, grab games played in that time frame and create D matrices</br>
       API is pretty simple we just need to build a scraping script.</br>
   
   Sumo:
   Data: https://data.world/cervus/sumo-results </br>
   It's literally just CSVs, so grab to PANDAS and build D from columns
   Bad news: Have to make an account to download data :( /s

## Outline of the Experiment

 1. Load in the game-by-game data for each year from 2002-2018
 2. For each year, construct multiple D matrices using increasing percentages of the games for that year.</br>_(For instance, in the simplest case, construct a D matrix using only the first half of the season, then a D matrix with the full season.)_
 3. Produce Massey rankings for each D matrix and calculate the Kendall tau between rankings from the same year.</br>_(These Kendall taus represent the amount that the ranking changed when more data was included.)_
 4. For each year, measure features of the restricted dataset (in the simple case, D constructed from 50% of the games) and create a dataset of these early-measurable features.
 5. Evaluate whether these early-measurable features can be used to predict the amount that rankings changed after including more data (Kendall taus).

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
from pathlib import Path
home = str(Path.home())
home

In [None]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [None]:
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from sensitivity_tests import *
from utilities import *
from base import *

In [None]:
# Support-map parameters handed to pyrankability when the D matrices are built.
# Per the original note: taken from the FODS paper, and may still need to be optimized.
direct_thres, spread_thres = 2, 2
weight_indirect = 0.5

# Cumulative fractions of each season's games to include.
# Going from 0.75 to 1.0 means 25% of the total data is added.
fracs = [0.75, 1.0]

# Consecutive (earlier, later) fraction pairs; the last one is the prediction target.
pairs = [(earlier, later) for earlier, later in zip(fracs, fracs[1:])]
pair_to_predict = pairs[len(pairs) - 1]

# Filled later as Ds[year][frac] -> D matrix.
Ds = {}

In [None]:
# Load one game-by-game DataFrame per season (2002 through 2018), keyed by the
# year as a string.
years = [str(season) for season in range(2002, 2019)]
games = {}
for year in years:
    games[year] = pd.read_csv("../data/MarchMadnessDataFrames/march_madness_%s.csv" % year)

# `year` still holds the last season loaded: echo it and preview that frame.
print(year)
games[year]

In [None]:
# Build Ds[year][frac]: for every season and every fraction in `fracs`, the D matrix
# computed from the chronologically first `frac` of that season's games, restricted
# to the teams flagged as March Madness participants.

# support_map_vectorized_direct_indirect_weighted implements our common approach to
# looking for evidence of direct and indirect dominance. V_count_vectorized expects a
# function with one argument, hence the anonymous wrapper. It does not depend on the
# loop variables, so it is defined once instead of once per (year, frac).
map_func = lambda linked: pyrankability.construct.support_map_vectorized_direct_indirect_weighted(
    linked, direct_thres=direct_thres, spread_thres=spread_thres, weight_indirect=weight_indirect)

for year in tqdm(games.keys()):
    season = games[year]
    Ds[year] = {}
    # Every team marked as a tournament team on either side of any game.
    madness_teams = np.unique(list(season.team1_name.loc[season.team1_madness == 1])
                              + list(season.team2_name.loc[season.team2_madness == 1]))

    # Sort by date so that slicing the first rows means "the early part of the season";
    # the date column itself is dropped once the ordering is established.
    # NOTE(review): the original filled team2_H_A_N from 'H_A_N1' (same column as
    # team 1), which looks like a copy-paste slip. This assumes the CSVs carry a
    # matching 'H_A_N2' column — confirm against the data files.
    game_df = pd.DataFrame({"team1_name": season['team1_name'],
                            "team1_score": season['points1'],
                            "team1_H_A_N": season['H_A_N1'],
                            "team2_name": season['team2_name'],
                            "team2_score": season['points2'],
                            "team2_H_A_N": season['H_A_N2'],
                            "date": season['date']
                           }).sort_values(by='date').drop('date', axis=1)
    for frac in fracs:
        # Number of games (from the start of the season) to include for this fraction.
        upper = int(len(game_df) * frac)
        game_df_sample = game_df.iloc[:upper, :]
        Ds[year][frac] = pyrankability.construct.V_count_vectorized(game_df_sample, map_func).loc[madness_teams, madness_teams]

In [None]:
# For every season, rank each D matrix with the Massey algorithm and record the
# Kendall tau between the rankings of each (earlier, later) fraction pair.
#   rankings[year] -> list of (ranking_earlier, ranking_later), one entry per pair
#   taus[year]     -> {pair: Kendall tau between the two rankings}
#   results        -> DataFrame indexed by "Year" with one column per pair
rankings = {}
taus = {}

for year in tqdm(games.keys()):
    rankings[year] = []
    taus[year] = {}
    for pair in pairs:
        D1 = Ds[year][pair[0]]
        D2 = Ds[year][pair[1]]
        # Missing entries are treated as zero before ranking.
        ranking1 = MasseyRankingAlgorithm().rank(D1.fillna(0).values)
        ranking2 = MasseyRankingAlgorithm().rank(D2.fillna(0).values)
        rankings[year].append((ranking1, ranking2))
        taus[year][pair] = kendall_tau(ranking1, ranking2)

# Build the frame in one go rather than calling DataFrame.append once per season:
# append copied the whole frame each time and no longer exists in pandas >= 2.0.
# tupleize_cols=False keeps each (earlier, later) tuple as a single flat column label
# instead of letting pandas turn the tuples into a MultiIndex.
results = pd.DataFrame(
    [[taus[year][pair] for pair in pairs] for year in taus],
    index=pd.Index(list(taus), name="Year"),
    columns=pd.Index(pairs, tupleize_cols=False),
)

In [None]:
taus

In [None]:
results

In [None]:
# Note: what do the contents of this matrix mean??
# Ds['2018'][1.0]

In [None]:
# Solver details for each season, in games.keys() order. Filled by
# get_rankability_results and read positionally by later cells.
df_details = []


def get_rankability_results(n_restarts=250):
    """Build a DataFrame of rankability features, one row per season.

    Features are collected from the first D matrix of ``pair_to_predict`` and are
    meant to be used to predict the movement of that pair (the Kendall tau of
    its rankings).

    For each year in ``games`` the D matrix (NaNs replaced by 0) is passed to
    ``pyrankability.rank.solve`` with ``method='lop'``. The returned ``k`` is
    stored, and every statistic returned by ``get_P_stats(details["P"])``
    becomes its own column.

    Side effect: the module-level ``df_details`` list is emptied and then
    refilled with the solver ``details`` of each season, in iteration order.

    Args:
        n_restarts: forwarded to the solver as ``num_random_restarts``.

    Returns:
        DataFrame with columns ``k``, ``Year``, ``Fraction`` plus one column per
        P statistic.

    Raises:
        ValueError: if a P statistic is named ``k``, ``Year`` or ``Fraction``.
    """
    # Reset first: without this, calling the function again (e.g. with another
    # n_restarts) would leave earlier entries in place, and df_details[i] would no
    # longer correspond to row i of the returned frame.
    df_details.clear()

    frac = pair_to_predict[0]
    results_temp = {"k": [], "Year": [], "Fraction": []}
    p_stat_columns = {}
    for year in tqdm(games.keys()):
        D = Ds[year][frac].fillna(0)
        k, details = pyrankability.rank.solve(D, method='lop', num_random_restarts=n_restarts, lazy=False, cont=True)
        for name, val in get_P_stats(details["P"]).items():
            p_stat_columns.setdefault(name, []).append(val)
        results_temp["k"].append(k)
        results_temp["Year"].append(year)
        results_temp["Fraction"].append(frac)
        df_details.append(details)

    for key, val in p_stat_columns.items():
        if key in results_temp:
            raise ValueError("Duplicate column name! Fix collision before moving on!")
        results_temp[key] = val

    return pd.DataFrame(results_temp)

In [None]:
# Compute the per-season rankability feature table and display it.
# NOTE(review): n_restarts=2 overrides the function default of 250 — presumably a
# quick debugging setting; confirm before relying on the resulting features.
rankability_results = get_rankability_results(n_restarts=2)
rankability_results

In [None]:
# Keep the rows computed at the earlier fraction of the target pair, index them by
# Year, and join the per-season Kendall tau columns from `results` on that index.
for_corr = rankability_results.loc[rankability_results.Fraction==pair_to_predict[0]].set_index('Year').join(results)
for_corr

In [None]:
for_corr.corr()

In [None]:
pearsonr(for_corr['k'],for_corr[pair_to_predict])

Notes Ethan 9/27/20

Determinant and trace of Ds not good features <br/>
Max and min eigenvalue not bad <br/>

Attempted betweenness centrality features, none worthwhile <br/>

`betweennesses = nx.betweenness_centrality(dsGraph)
avg_bt_centrality = np.average(np.array(list(betweennesses.values())))
var_bt_centrality = np.sqrt(np.var(np.array(list(betweennesses.values()))))
print(avg_bt_centrality, var_bt_centrality)`

Notes Ethan 10/1/20:
Feats: x_star eigenvals, min and max, matrix norm of x_star, etc.
Feature idea: Cut data in half, run alg at 25% mark and see what we would predict for .25,.5


In [None]:
# Assemble all_score_df: one row of candidate predictive features per season,
# indexed by Year. Features come from three places:
#   * eigenvalues of the D matrix built from the earlier fraction of the target pair,
#   * the rankability results / P statistics already stored in `rankability_results`,
#   * the thresholded, reordered solver matrix x* taken from `df_details`.
col_names = [
    'Year',
    '# X* frac',
    'k',
    '# X* frac top 40',
    'kendall_w',
    'p_lowerbound',
    'max_L2_dist',
    'mean_L2_dist',
    'min_tau',
    'mean_tau',
    'max_eigenval',
    'min_eigenval',
    'max_eigenval_xstar',
    'min_eigenval_xstar',
    'Pair'
]

# The pair whose earlier-fraction D matrix the features are measured on.
# It does not change per season, so it is set once rather than inside the loop.
pair_to_predict = (.75, 1.0)

# Rows are collected in a plain list and turned into a DataFrame once at the end.
# DataFrame.append copied the frame on every call, is gone in pandas >= 2.0, and
# appending mixed-type Series left every column with object dtype.
feature_rows = []
# rankability_results and df_details are read positionally, so this relies on them
# having been produced by iterating games.keys() in the same order.
for c, year in enumerate(tqdm(games.keys())):
    # the support matrix for a pair for a given year
    V = Ds[year][pair_to_predict[0]]
    vals, _ = np.linalg.eig(V.fillna(0.0).to_numpy())
    # Eigenvalues may be complex; only the real part of the extreme ones is kept.
    max_eigenval = np.real(np.max(vals))
    min_eigenval = np.real(np.min(vals))

    rresults = rankability_results.iloc[c, :]
    k = rresults['k']
    details = df_details[c]
    x = pd.DataFrame(details['x'], index=V.index, columns=V.columns)
    # Reorder rows and columns of x by ascending column sum, then threshold it.
    r = x.sum(axis=0)
    order = np.argsort(r)
    xstar = x.iloc[order, :].iloc[:, order]
    xstar.loc[:, :] = pyrankability.common.threshold_x(xstar.values)
    print(np.linalg.norm(xstar.values, "fro"), 'fro')
    vals, _ = np.linalg.eig(xstar.to_numpy())
    det_xstar = np.real(np.prod(vals))
    print("det", det_xstar)
    max_eigenval_xstar = np.real(np.max(vals))
    min_eigenval_xstar = np.real(np.min(vals))
    print(max_eigenval_xstar)
    print(min_eigenval_xstar)

    # Count strictly fractional entries (0 < value < 1) above the diagonal ...
    inxs = np.triu_indices(len(xstar), k=1)
    xstar_upper = xstar.values[inxs[0], inxs[1]]
    nfrac_upper = int(np.count_nonzero((xstar_upper > 0) & (xstar_upper < 1)))
    # ... and per column, summed over the first 40 columns of the reordered matrix.
    flat_frac = ((xstar > 0) & (xstar < 1)).sum(axis=0)
    nfrac_top_40 = flat_frac.iloc[:40].sum()

    feature_rows.append([
        year,
        nfrac_upper*2,
        k,
        nfrac_top_40,
        rresults["kendall_w"],
        rresults["p_lowerbound"],
        rresults["max_L2_dist"],
        rresults["mean_L2_dist"],
        rresults["min_tau"],
        rresults["mean_tau"],
        max_eigenval,
        min_eigenval,
        max_eigenval_xstar,
        min_eigenval_xstar,
        # The original stored the global `pair`, a leftover loop variable from the
        # Kendall tau cell, which only matched by accident; record the pair the
        # features were actually computed for.
        pair_to_predict
    ])

all_score_df = pd.DataFrame(feature_rows, columns=col_names)
all_score_df.set_index("Year", inplace=True)

In [None]:
# Keep only the rows recorded for the target pair, drop the now-constant Pair column,
# and attach the Kendall tau columns from `results` by joining on the Year index.
all_score_df = all_score_df.loc[all_score_df.Pair == pair_to_predict].drop('Pair',axis=1).join(results)

In [None]:
# By this point, all_score_df includes all features that will be used to predict the sensitivity to new information
all_score_df

In [None]:
all_score_df.to_csv("all_score_df.csv")

#### Only run if you need `all_score_df` loaded from file

In [None]:
# Reload the saved feature table and restore Year as its index.
all_score_df = pd.read_csv("all_score_df.csv").set_index("Year")

# All the pairs have become strings after being read in, so switch the in-memory
# pair identifiers to their string form as well.
pairs = list(map(str, pairs))
pair_to_predict = str(pair_to_predict)

In [None]:
all_score_df.columns
cdf = all_score_df.fillna(0.0).corr()
cdf

In [None]:
# Print the correlation (and its square) between the target pair's column and every
# column that is not itself one of the pair columns. NaNs are treated as 0.
cdf = all_score_df.fillna(0.0).corr()
print(cdf.columns)
for col in all_score_df.columns:
    if col in pairs:
        continue
    # all_score_df.plot.scatter(col, pair_to_predict, title="Final Sensitivity vs " + col)
    corr_value = cdf[pair_to_predict].loc[col]
    print("Correlation between", pair_to_predict, "and", col, corr_value, "R^2", corr_value**2)

In [None]:
import sklearn.linear_model as skl_lm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.svm import SVR
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV

# List of all features to use to predict
# (drops the quantity to be predicted)
all_feature_cols = [c for c in all_score_df.columns if c not in pairs]

def evaluate(df,pred_col=pair_to_predict,feature_cols=all_feature_cols,model=DummyRegressor(),param_grid=None):
    """Evaluate a regression model on ``pred_col`` with leave-one-out cross validation.

    The model is wrapped in a GridSearchCV, so within each leave-one-out training
    set every parameter setting in ``param_grid`` is scored with 3-fold cross
    validation and the best one is refit before predicting the held-out row.

    Args:
        df: table holding both the feature columns and the target column.
        pred_col: label of the column to predict. If the label itself is not a
            column of ``df``, its ``str()`` form is used instead (column labels
            that were tuples become strings after a CSV round trip).
        feature_cols: labels of the columns used as predictors.
        model: scikit-learn regressor to evaluate.
        param_grid: GridSearchCV parameter grid; ``None`` means no search.

    Returns:
        Series indexed by "Folds", "MAE" and "STD": the number of leave-one-out
        folds, the mean absolute error over them, and the standard deviation of
        the per-fold scores.
    """
    if param_grid is None:
        param_grid = {}
    # Accept the label as-is when it exists; otherwise fall back to the string form
    # that the original always used.
    target = pred_col if pred_col in df.columns else str(pred_col)
    y = df[target]
    X = df[feature_cols]

    # `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22 and
    # removed in 0.24, where passing it raises a TypeError.
    grid = GridSearchCV(model,param_grid,refit=True,verbose=0, cv=3, n_jobs=-1)
    scores = cross_val_score(grid, X, y, scoring="neg_mean_absolute_error", cv=LeaveOneOut(), n_jobs=1)
    return pd.Series([len(scores),np.mean(np.abs(scores)),np.std(scores)],index=["Folds","MAE","STD"])

In [None]:
# Add the tuple forms of these two pairs to `pairs`.
# NOTE(review): presumably so that the `c not in pairs` feature filters also exclude
# tuple-labelled target columns after `pairs` was stringified by the CSV-reload cell;
# re-running this cell appends duplicates — confirm intent.
pairs.append((.5, .75))
pairs.append((.75, 1.0))

Same as above but with exhaustive feature selection

Below:
Seems to be a bug with feature subset selection. All scores come out the same.

In [None]:
all_feature_cols = [c for c in all_score_df.columns if c not in pairs]


from sklearn.linear_model import LinearRegression
from itertools import chain, combinations

def evaluate_exhaustive(df,
                        pred_col=pair_to_predict,
                        feature_cols=all_feature_cols,
                        model=DummyRegressor(),
                        param_grid=None,
                        print_best_params=False,
                        max_subsets=2999):
    """Search feature subsets for the one with the lowest cross-validated MAE.

    Non-empty subsets of ``feature_cols`` are enumerated in order of increasing
    size. Each subset is scored with 3-fold cross validation (not leave-one-out)
    of a GridSearchCV-wrapped ``model``, which itself uses 3-fold cross
    validation to choose among the settings in ``param_grid``.

    Args:
        df: table holding both the feature columns and the target column.
        pred_col: label of the column to predict. If the label itself is not a
            column of ``df``, its ``str()`` form is tried instead.
        feature_cols: labels of the candidate predictor columns.
        model: scikit-learn regressor to evaluate.
        param_grid: GridSearchCV parameter grid; ``None`` means no search.
        print_best_params: accepted for backward compatibility; currently unused.
        max_subsets: evaluate at most this many subsets (the default matches the
            previously hard-coded limit); ``None`` evaluates all of them.

    Returns:
        Tuple ``(summary, scores)``. ``summary`` is a dict with "MAE" (best mean
        absolute error) and "best_feature_subset" (column labels of the best
        subset, empty if nothing was evaluated). ``scores`` maps each evaluated
        subset, as a tuple of column positions, to its mean absolute error.
    """
    # Local import: the notebook only imports chain and combinations from itertools.
    from itertools import islice

    if param_grid is None:
        param_grid = {}
    target = pred_col if pred_col in df.columns else str(pred_col)
    y = df[target]
    X = df[feature_cols]

    # Every subset size from 0 to n inclusive; the original stopped at n - 1 and so
    # could never try the full feature set. The islice start of 1 skips the empty set.
    n_features = len(X.columns)
    all_subsets = chain.from_iterable(
        combinations(range(n_features), size) for size in range(n_features + 1))
    stop = None if max_subsets is None else 1 + max_subsets
    # Materialised so the progress bar knows the total.
    candidate_subsets = list(islice(all_subsets, 1, stop))

    exhaustive = {}
    best_score = np.inf  # np.Inf was removed in NumPy 2.0
    best_features = ()
    for ps in tqdm(candidate_subsets, ascii=True):
        features = X.iloc[:, list(ps)]
        # `iid` is intentionally not passed: removed in scikit-learn 0.24.
        grid = GridSearchCV(model,param_grid,refit=True,verbose=0, cv=3, n_jobs=-1)
        exhaustive[ps] = np.mean(np.abs(cross_val_score(grid, features, y, scoring="neg_mean_absolute_error", cv=3, n_jobs=1)))
        if exhaustive[ps] < best_score:
            best_score = exhaustive[ps]
            best_features = ps

    return ({"MAE": best_score, "best_feature_subset": [X.columns[f] for f in best_features]}, exhaustive)
all_feature_cols

In [None]:
svr_model_results = evaluate(all_score_df,model=SVR(),param_grid = {'C': [1,10,100], 'epsilon': [.01, 0.1],'kernel': ['linear', 'rbf']})
svr_model_results

In [None]:
svr_model_results

In [None]:
# Ad-hoc check: 3-fold cross-validated mean absolute error using only the feature
# columns at positions 5, 6 and 7.
# NOTE(review): `model` and `param_grid` are never assigned at notebook scope in the
# visible code (they only exist as function parameters), so this cell raises
# NameError unless they were defined interactively — confirm which model was meant.
# NOTE(review): the (.5, .75) column only exists if `fracs` contained 0.5 when
# `results` was built; with fracs = [0.75, 1.] this lookup raises KeyError.
X = all_score_df[all_feature_cols]
y = all_score_df[(.5, .75)]
grid = GridSearchCV(model,param_grid,refit=True,verbose=0, cv=3, iid=True, n_jobs=-1)
np.mean(np.abs(cross_val_score(grid, X.iloc[:, [5, 6, 7]], y, scoring="neg_mean_absolute_error", cv=3, n_jobs=1)))

In [None]:
lr_model_results, lr_model_scores = evaluate_exhaustive(all_score_df,pred_col=(.5, .75),model=LinearRegression(),param_grid = {'fit_intercept': [True, False]})
lr_model_results

In [None]:
lr_model_scores

In [None]:
dummy_model_results, dummyscores = evaluate_exhaustive(all_score_df, pred_col=(.5, .75))
dummy_model_results

In [None]:
# NOTE(review): the only assignment to dummy_model_result_score is the commented-out
# line below, so evaluating the name raises NameError in a fresh kernel. The baseline
# error used by the plot further down is dummy_model_results["MAE"].
# dummy_model_result_score = {'MAE': abs(np.max(np.array([max(i) for i in dummy_model_results.values])))}
dummy_model_result_score

In [None]:
lr_model_results

In [None]:
# Bar chart comparing the best mean absolute error of the baseline (dummy) model
# with that of linear regression.
x = ['Baseline', "Linear Regression"]
maes = [summary["MAE"] for summary in (dummy_model_results, lr_model_results)]
x_pos = list(range(len(x)))

plt.bar(x_pos, maes)
plt.title("Mean Absolute Error of Regression Models")
plt.xlabel("Model")
plt.ylabel("Error")
# Label each bar position with its model name.
plt.xticks(x_pos, x)

plt.show()