# Rankability Predicting Sensitivity
## March Madness Dataset

The goal of this notebook is to analyze and visualize the results.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

In [None]:
from pathlib import Path
home = str(Path.home())
home

In [None]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [None]:
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from sensitivity_tests import *
from utilities import *
from base import *

In [None]:
import itertools
import joblib

This dataset represents the sensitivity problem defined as follows:

A practitioner wants to predict the degree to which the ranking of a tournament from the Lichess Arena Tournaments
is likely to change after the second half of the tournament is played. The practitioner expects that rankings for traditional tournaments are more likely to change than rankings for Berserk tournaments. This is in the context of a Massey with a specific set of parameters:<br>
direct_thress = [0,1,2]<br>
spread_thress = [0,3,6]<br>
weight_indirects = [0.25,0.5,1.]<br>
domains_ranges = [...]

In [None]:
# Load the pre-built problem bundle; later cells read its "description",
# "target", "data" and "other" entries.
# NOTE(review): joblib.load unpickles the file, so only load bundles from a
# trusted source. The absolute /disk path also ties this notebook to one machine.
problem = joblib.load("/disk/rankability_datasets/sensitivity_study/problem_0002a.joblib.z")

In [None]:
print(problem["description"])

In [None]:
problem['target']

In [None]:
problem['data'].keys()

In [None]:
problem['data']['2002'].keys()

In [None]:
problem['target'].to_frame().reset_index()

In [None]:
def compute(game_df,team_range,direct_thres,spread_thres,weight_indirect):
    """Run the hillside solver for one set of games and one parameter triple.

    ``game_df`` is converted into a matrix ``D`` by
    ``pyrankability.construct.V_count_vectorized`` using a support map
    configured with ``direct_thres``, ``spread_thres`` and ``weight_indirect``.
    ``D`` is reindexed so that both axes follow ``team_range`` and is then
    handed to ``pyrankability.rank.solve`` with ``method='hillside'``.

    Returns
    -------
    pd.Series
        Indexed by ``direct_thres``, ``spread_thres``, ``weight_indirect`` and
        ``details``.  ``details`` is a dict with keys ``k`` (first value
        returned by the solver), ``x`` (the solver's ``x`` entry as a
        DataFrame labelled like ``D``), ``c`` (``C_count(D)`` as a DataFrame
        labelled like ``D``), ``P`` (the solver's ``P`` entry) and ``D``.
    """
    def support_map(linked):
        # Closes over the three tuning parameters of this call.
        return pyrankability.construct.support_map_vectorized_direct_indirect_weighted(
            linked,
            direct_thres=direct_thres,
            spread_thres=spread_thres,
            weight_indirect=weight_indirect,
        )

    D = pyrankability.construct.V_count_vectorized(game_df, support_map)
    D = D.reindex(index=team_range, columns=team_range)

    k, solver_output = pyrankability.rank.solve(D, method='hillside', lazy=False, cont=True)

    # Re-label the raw solver/count matrices with the same team labels as D.
    x_frame = pd.DataFrame(solver_output['x'], columns=D.columns, index=D.index)
    c_frame = pd.DataFrame(pyrankability.construct.C_count(D), columns=D.columns, index=D.index)

    summary = {
        'k': k,
        'x': x_frame,
        'c': c_frame,
        'P': solver_output['P'],
        'D': D,
    }
    field_names = ["direct_thres", "spread_thres", "weight_indirect", "details"]
    field_values = [direct_thres, spread_thres, weight_indirect, summary]
    return pd.Series(field_values, index=field_names)


In [None]:
from scipy.stats import pearsonr
from scipy.stats import kendalltau
def score_by_correlation(s,r):
    """Return the Pearson correlation coefficient between ``s`` and ``r``.

    Only the coefficient from ``scipy.stats.pearsonr`` is returned; the
    accompanying p-value is discarded.
    """
    coefficient, _p_value = pearsonr(s, r)
    return coefficient

In [None]:
def calc_score(y,score_by,direct_thres,spread_thres,weight_indirect,domain_range,top_n=10):
    """Score how well a normalized hillside ``k`` tracks the target ``y``.

    For every year in ``y.index`` the games stored under
    ``problem['data'][year]['frac=0.5']`` are solved with ``compute``.  The
    resulting ``c`` matrix is reordered by the first permutation in ``P``,
    cut down to its leading ``top_n`` x ``top_n`` corner, and the sum of that
    corner's upper triangle (diagonal included, as ``np.triu`` keeps it) is
    divided by ``kmax = (n*n-n)/2 * n``, where ``n`` is the corner's size.
    The per-year values are then compared with ``y`` through ``score_by``.

    Parameters
    ----------
    y : pd.Series
        Target values indexed by year; the index drives the per-year loop.
    score_by : callable
        Called as ``score_by(values, y)`` and expected to return one score.
    direct_thres, spread_thres, weight_indirect
        Forwarded unchanged to ``compute``.
    domain_range : tuple
        ``domain_range[1]`` selects the team set: ``'madness'`` uses
        ``madness_teams[year]``; ``'all'`` (or a tuple containing the element
        ``"top"``) uses ``all_teams[year]``.  Otherwise ``team_range`` stays
        ``None``.
    top_n : int, default 10
        Size of the leading corner of the reordered matrix that is scored.

    Returns
    -------
    pd.DataFrame
        A single row with columns ``Score`` and ``Parameters``.

    Notes
    -----
    Reads the module-level names ``problem``, ``madness_teams`` and
    ``all_teams``.  NOTE(review): ``all_teams`` is not assigned anywhere in
    the visible notebook - confirm it is provided before using a non-madness
    range.
    """
    # Previously this relied on a module-level ``columns`` that the notebook
    # never defines; the caller needs exactly these two labels
    # (it calls ``.set_index('Parameters')`` and plots ``Score``).
    result_columns = ["Score", "Parameters"]
    parameter_string = f"{domain_range},dt={direct_thres},st={spread_thres},iw={weight_indirect}"

    values = []
    for year in y.index:
        # Pick the set of teams the matrix is restricted to for this year.
        team_range = None
        if domain_range[1] == 'madness':
            team_range = madness_teams[year]
        elif domain_range[1] == 'all' or "top" in domain_range:
            team_range = all_teams[year]

        knorms = []
        # Only the half-season snapshot is scored; other keys of
        # problem['data'][year] are intentionally not iterated.
        for frac_key in ['frac=0.5']:
            hillside_details = compute(problem['data'][year][frac_key],team_range,direct_thres,spread_thres,weight_indirect)
            details = hillside_details['details']
            perm = np.array(details['P'][0])
            # Reorder rows and columns by the permutation, then keep the top corner.
            C = details['c'].iloc[perm, :].iloc[:, perm].iloc[:top_n, :top_n]
            n = len(C)
            kmax = (n*n-n)/2 * n
            k = np.sum(np.triu(C))
            knorms.append(k/kmax)
        values.append(np.mean(knorms))

    return pd.DataFrame([[score_by(values,y),parameter_string]],columns=result_columns)

In [None]:
# Parameter grid: every combination of thresholds, indirect weight and
# (domain, range) pair is scored once.
direct_thress = [0,1,2]
spread_thress = [0,3,6]
weight_indirects = [0.25,0.5,1.]
domains_ranges = [('all','madness'),('madness','madness')]

keys = list(itertools.product(direct_thress,spread_thress,weight_indirects,domains_ranges))

# Loop-invariant lookups, hoisted out of the sweep.  calc_score reads
# madness_teams as a module-level name, so it must exist before the first call.
remaining_games = problem['other']['remaining_games']
madness_teams = problem['other']['madness_teams']
years_train = ['2002','2003','2004','2005','2006']
targets_by_params = problem['target'].to_frame().reset_index().set_index(['direct_thres','spread_thres','weight_indirect','Domain','Range'])

# Collect one single-row frame per parameter combination and concatenate once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame every pass.
score_frames = []
for direct_thres,spread_thres,weight_indirect,domain_range in keys:
    targets = targets_by_params.loc[direct_thres,spread_thres,weight_indirect,domain_range[0],domain_range[1]]
    targets = targets.set_index('Year')['intersection_0.5_to_1.0']

    years = list(targets.index)
    # Hold out every year that is not a training year (no error if a
    # training year happens to be missing from this slice).
    years_test = [year for year in years if year not in years_train]

    test_results1 = calc_score(targets.loc[years_test],score_by_correlation,direct_thres,spread_thres,weight_indirect,domain_range)
    score_frames.append(test_results1.set_index('Parameters'))

# One row per parameter combination, indexed by 'Parameters', single column 'Score'.
test_results = pd.concat(score_frames)

In [None]:
test_results

In [None]:
test_results.plot.hist()

In [None]:
test_results.sort_values(by="Score")