# Rankability Predicting Sensitivity
## March Madness Dataset

Goal of this notebook is to analyze and visualize the results

In [11]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

In [13]:
from pathlib import Path
home = str(Path.home())
home

'/home/jupyter-pander14'

In [14]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [15]:
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from sensitivity_tests import *
from utilities import *
from base import *

In [16]:
import itertools
import joblib

In [17]:
problem_1 = joblib.load("/disk/rankability_datasets/sensitivity_study/problem_0001a.joblib.z")

In [18]:
print(problem_1["description"])


This dataset represents the sensitivity problem as defined as follows:

A practitioner wants to predict the degree to which a season of the NCAA Men’s Basketball 
will be sensitive to choice of Massey specific ranking pipeline throughout the entire season beginning at the 
half way mark. The goal is to do so without resorting to full or partial enumeration of all possible choices. 
True sensitivity of a season will be measured as the mean top 10 intersection over the cartesian product 
of algorithms and parameters.

direct_thress = [0,1,2]
spread_thress = [0,3,6]
weight_indirects = [0.25,0.5,1.]
domains_ranges = [('all','madness'),('madness','madness')]



In [19]:
problem_1['target']

Year
2002    0.835684
2003    0.840171
2004    0.860470
2005    0.849786
2006    0.847863
2007    0.869872
2008    0.855128
2009    0.863034
2010    0.867949
2011    0.842308
2012    0.805556
2013    0.804701
2014    0.857051
2015    0.862393
2016    0.837607
2017    0.840812
2018    0.845085
Name: mean_top10_intersection, dtype: float64

In [20]:
problem_1['data'].keys()

dict_keys(['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

In [21]:
problem_1['data']['2002'].keys()

dict_keys(['frac=0.5', 'frac=0.6', 'frac=0.7', 'frac=0.8', 'frac=0.9', 'frac=1.0'])

In [22]:
problem = problem_1
years = list(problem['target'].index)
remaining_games = problem['other']['remaining_games']
madness_teams = problem['other']['madness_teams']

In [23]:
# Parameters to search
direct_thress = [0,1,2]
spread_thress = [0,3,6]
weight_indirects = [0.25,0.5]
domains_ranges = [('all','madness'),('madness','madness')]

years_train = ['2002','2003','2004','2005','2006']
years_test = copy.copy(years)
for year in years_train:
    years_test.remove(year)

In [113]:
def compute(game_df,team_range,direct_thres,spread_thres,weight_indirect):
    hillside_columns = ["direct_thres","spread_thres","weight_indirect","details"]

    map_func = lambda linked: pyrankability.construct.support_map_vectorized_direct_indirect_weighted(linked,direct_thres=direct_thres,spread_thres=spread_thres,weight_indirect=weight_indirect)
    D = pyrankability.construct.V_count_vectorized(game_df,map_func).reindex(index=team_range,columns=team_range)
    k,details = pyrankability.rank.solve(D,method='hillside',lazy=False,cont=True)
    x = pd.DataFrame(details['x'],columns=D.columns,index=D.index)
    c = pd.DataFrame(pyrankability.construct.C_count(D),columns=D.columns,index=D.index)
    P = details['P']
    simple_details = {'k':k,'x':x,'c':c,'P':P,'D':D}
    hillside_ret = pd.Series([direct_thres,spread_thres,weight_indirect,simple_details],index=hillside_columns)
    return hillside_ret


In [114]:
from scipy.stats import pearsonr
def score_by_correlation(s,r):
    return pearsonr(s,r)[0]

In [152]:
def calc_score(y,score_by,direct_thres,spread_thres,weight_indirect,domain_range,top_n=10):
    parameter_string = f"{domain_range},dt={direct_thres},st={spread_thres},iw={weight_indirect}"
    values = []
    for year in y.index:
        # set the team_range
        team_range = None
        if domain_range[1] == 'madness':
            team_range = madness_teams[year]
        elif domain_range[1] == 'all':
            team_range = all_teams[year]
        elif "top" in domain_range:
            team_range = all_teams[year]
        
        knorms = []
        for frac_key in problem['data'][year].keys():
            hillside_details = compute(problem['data'][year][frac_key],team_range,direct_thres,spread_thres,weight_indirect)
            perm = np.array(hillside_details['details']['P'][0])
            C = hillside_details['details']['c'].iloc[perm,:].iloc[:,perm].iloc[:top_n,:].iloc[:,:top_n]
            n = len(C)
            kmax = (n*n-n)/2 * n
            k = np.sum(np.triu(C))
            knorms.append(k/kmax)
        knorm = np.mean(knorms)
        values.append(knorm)
    print(values)
    return pd.DataFrame([[score_by(values,y),parameter_string]],columns=columns)

In [153]:
results = pd.DataFrame(columns=["Score","Parameters"])
columns = results.columns
results.set_index('Parameters',inplace=True)

outer_keys = list(itertools.product(direct_thress,spread_thress,weight_indirects,domains_ranges))

#direct_thres,spread_thres,weight_indirect,domain_range=outer_keys[0]
#calc_score(problem['target'],score_by_correlation,direct_thres,spread_thres,weight_indirect,domain_range)
job_results = Parallel(n_jobs=-1)(delayed(calc_score)(problem['target'].loc[years_train],score_by_correlation,direct_thres,spread_thres,weight_indirect,domain_range) for direct_thres,spread_thres,weight_indirect,domain_range in outer_keys)
for jresults in job_results:
    jresults = jresults.set_index('Parameters')
    results = results.append(jresults)

In [154]:
best_parameters = results.idxmin()
results.loc[best_parameters]

Unnamed: 0_level_0,Score
Parameters,Unnamed: 1_level_1
"('all', 'madness'),dt=0,st=3,iw=0.25",-0.887119


In [155]:
best_parameters_str = list(best_parameters)[0]
best_parameters_str

"('all', 'madness'),dt=0,st=3,iw=0.25"

In [156]:
def parameter_string_to_parameters(parameter_string):
    cmd = parameter_string.split(')')[0]+")"
    fields = cmd.replace("(","").replace(")","").replace("'","").split(",")
    domain_range = tuple(fields)
    fields = parameter_string.split("),")[1].split(",")
    dt = int(fields[0].split("=")[1])
    st = int(fields[1].split("=")[1])
    iw = float(fields[2].split("=")[1])
    return domain_range,dt,st,iw

In [159]:
test_results = pd.DataFrame(columns=["Score","Parameters"])
columns = test_results.columns
test_results.set_index('Parameters',inplace=True)

domain_range,direct_thres,spread_thres,weight_indirect=parameter_string_to_parameters(best_parameters_str)
#spread_thres = 0
test_results = calc_score(problem['target'].loc[years_test],score_by_correlation,direct_thres,spread_thres,weight_indirect,domain_range)

[0.3285185185185185, 0.47888888888888886, 0.5914814814814814, 0.4114814814814814, 0.3862962962962963, 0.5459259259259258, 0.40592592592592586, 0.5322222222222223, 0.45555555555555555, 0.5748148148148148, 0.727037037037037, 0.6014814814814815]


In [160]:
test_results

Unnamed: 0,Score,Parameters
0,-0.165257,"('all', ' madness'),dt=0,st=3,iw=0.25"
