# Rankability Predicting Sensitivity
## March Madness Dataset

Goal of this notebook is to analyze and visualize the results

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

In [3]:
from pathlib import Path
home = str(Path.home())
home

'/disk/home/amy'

In [4]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [5]:
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from sensitivity_tests import *
from utilities import *
from base import *

In [6]:
games={}
remaining_games={}
madness_teams={}
all_teams={}
years = ["2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018"]
for year in years:
    games[year],remaining_games[year] = read_data('../data/%steams.txt'%year,'../data/%sgames.txt'%year,'../data/%sMadnessTeams.txt'%year)
    madness_teams[year] = list(np.unique(list(games[year].team1_name.loc[games[year].team1_madness == 1]) + list(games[year].team2_name.loc[games[year].team2_madness == 1])))
    all_teams[year] = list(np.unique(list(games[year].team1_name) + list(games[year].team2_name)))
print(year)
games[year]

2018


Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
0,1,16,737011,2017-11-13,1,83,-1,69,Arkansas_St,Abilene_Chr,0,0
1,1,41,737114,2018-02-24,-1,74,1,72,Cent_Arkansas,Abilene_Chr,0,0
3,1,143,737018,2017-11-20,-1,75,1,67,Lipscomb,Abilene_Chr,1,0
4,1,143,737045,2017-12-17,1,67,-1,65,Lipscomb,Abilene_Chr,1,0
5,1,199,737056,2017-12-28,1,77,-1,74,New_Orleans,Abilene_Chr,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5535,351,318,737048,2017-12-20,1,91,-1,74,Utah_St,Youngstown_St,0,0
5536,351,339,737086,2018-01-27,1,85,-1,67,WI_Green_Bay,Youngstown_St,0,0
5537,351,340,737084,2018-01-25,1,66,-1,55,WI_Milwaukee,Youngstown_St,0,0
5538,351,347,737074,2018-01-15,1,77,-1,67,Wright_St,Youngstown_St,1,0


In [7]:
remaining_games[year]

Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
2,1,73,737130,2018-03-12,1,80,-1,73,Drake,Abilene_Chr,0,0
32,2,313,737125,2018-03-07,1,97,-1,90,UNLV,Air_Force,0,0
42,3,79,737126,2018-03-08,0,67,0,58,E_Michigan,Akron,0,0
59,4,137,737128,2018-03-10,0,86,0,63,Kentucky,Alabama,1,1
68,4,326,737135,2018-03-17,0,81,0,58,Villanova,Alabama,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5478,347,287,737133,2018-03-15,0,73,0,47,Tennessee,Wright_St,1,1
5491,348,197,737126,2018-03-08,0,85,0,75,New_Mexico,Wyoming,0,0
5496,349,92,737136,2018-03-18,0,75,0,70,Florida_St,Xavier,1,1
5498,349,234,737127,2018-03-09,0,75,0,72,Providence,Xavier,1,1


In [12]:
# Note to future self: Parameters from FODS paper but might need to be optimized
direct_thress = [0,1,2]
spread_thress = [0,3,6]
weight_indirects = [0.25,0.5,1.]
# fracs represent how much of the data to include
fracs = [0.5,0.55,0.6,1.] # 25% of total data added per step
domains_ranges = [('all','madness'),('madness','madness')]
outer_keys = list(itertools.product(domains_ranges,years))

In [9]:
import itertools
import joblib

In [14]:
loaded = joblib.load('checkpoint1.joblib.z')
colley_rankings=loaded['colley_rankings']
massey_rankings=loaded['colley_rankings']
colley_rs=loaded['colley_rs']
massey_rs=loaded['massey_rs']

In [15]:
feature_name = 'mean_top10_intersection'
def compute_score(data):
    k=10
    s = 0
    c=0
    for i1,i2 in itertools.combinations(range(len(data)),2):
        s+=len(set(data[i1][:k]).intersection(set(data[i2][:k])))/k
        c+=1
    return s/c

def results_to_frame(results,method,domain_range,year):
    t = results.to_frame()
    t.columns=[feature_name]
    t['Method'] = method
    t['Year']=year
    t['Domain']=domain_range[0]
    t['Range']=domain_range[1]
    t.reset_index(inplace=True)
    return t

ms = pd.DataFrame(columns=['frac',feature_name,'Method','Year'])
for domain_range,year in tqdm(outer_keys):
    # set the team_range
    team_range = None
    if domain_range[1] == 'madness':
        team_range = madness_teams[year]
    elif domain_range[1] == 'all':
        team_range = all_teams[year]    
    results = colley_rankings[(domain_range,year)].groupby(['frac']).apply(lambda df: compute_score(df[team_range].astype(int).values.tolist()))
    ms = ms.append(results_to_frame(results,'Colley',domain_range,year))
    results = massey_rankings[(domain_range,year)].groupby(['frac']).apply(lambda df: compute_score(df[team_range].astype(int).values.tolist()))
    ms = ms.append(results_to_frame(results,'Massey',domain_range,year))
ms

100%|██████████| 34/34 [00:02<00:00, 16.71it/s]


Unnamed: 0,frac,mean_top10_intersection,Method,Year,Domain,Range
0,0.50,0.948718,Colley,2002,all,madness
1,0.55,0.948718,Colley,2002,all,madness
2,0.60,0.933333,Colley,2002,all,madness
3,1.00,1.000000,Colley,2002,all,madness
0,0.50,0.948718,Massey,2002,all,madness
...,...,...,...,...,...,...
3,1.00,0.848718,Colley,2018,madness,madness
0,0.50,0.930769,Massey,2018,madness,madness
1,0.55,0.910256,Massey,2018,madness,madness
2,0.60,0.900000,Massey,2018,madness,madness


In [16]:
import altair as alt

ms['Method and Options']=ms['Method'] + ' '+ms['Domain']+':'+ms['Range']

alt.Chart(ms).mark_bar().encode(
    x='Method and Options',
    y=alt.Y(feature_name,scale=alt.Scale(0,1)),
    color='Method and Options'#,
    #column='Year:N'
).properties(
    width=180,
    height=180
).facet(row='Year:N',column='frac')

In [17]:
graph_data = ms.groupby(['frac','Year','Method'])['mean_top10_intersection'].mean().to_frame().reset_index()
alt.Chart(graph_data).mark_bar().encode(
    alt.X("mean_top10_intersection:Q", bin=True),
    y='count()',
)