# Rankability Predicting Sensitivity
## March Madness Dataset

Goal of this notebook is to process the data using massey and colley for a bunch of different parameters. Save those results for later processing.

In [7]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [8]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

In [9]:
from pathlib import Path
home = str(Path.home())
home

'/home/jupyter-pander14'

In [10]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [11]:
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from sensitivity_tests import *
from utilities import *
from base import *

In [12]:
games={}
remaining_games={}
madness_teams={}
all_teams={}
years = ["2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018"]
for year in years:
    games[year],remaining_games[year] = read_data('../../data/%steams.txt'%year,'../../data/%sgames.txt'%year,'../../data/%sMadnessTeams.txt'%year)
    madness_teams[year] = list(np.unique(list(games[year].team1_name.loc[games[year].team1_madness == 1]) + list(games[year].team2_name.loc[games[year].team2_madness == 1])))
    all_teams[year] = list(np.unique(list(games[year].team1_name) + list(games[year].team2_name)))
print(year)
games[year]

2018


Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
0,1,16,737011,2017-11-13,1,83,-1,69,Arkansas_St,Abilene_Chr,0,0
1,1,41,737114,2018-02-24,-1,74,1,72,Cent_Arkansas,Abilene_Chr,0,0
3,1,143,737018,2017-11-20,-1,75,1,67,Lipscomb,Abilene_Chr,1,0
4,1,143,737045,2017-12-17,1,67,-1,65,Lipscomb,Abilene_Chr,1,0
5,1,199,737056,2017-12-28,1,77,-1,74,New_Orleans,Abilene_Chr,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5535,351,318,737048,2017-12-20,1,91,-1,74,Utah_St,Youngstown_St,0,0
5536,351,339,737086,2018-01-27,1,85,-1,67,WI_Green_Bay,Youngstown_St,0,0
5537,351,340,737084,2018-01-25,1,66,-1,55,WI_Milwaukee,Youngstown_St,0,0
5538,351,347,737074,2018-01-15,1,77,-1,67,Wright_St,Youngstown_St,1,0


In [13]:
remaining_games[year]

Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
2,1,73,737130,2018-03-12,1,80,-1,73,Drake,Abilene_Chr,0,0
32,2,313,737125,2018-03-07,1,97,-1,90,UNLV,Air_Force,0,0
42,3,79,737126,2018-03-08,0,67,0,58,E_Michigan,Akron,0,0
59,4,137,737128,2018-03-10,0,86,0,63,Kentucky,Alabama,1,1
68,4,326,737135,2018-03-17,0,81,0,58,Villanova,Alabama,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5478,347,287,737133,2018-03-15,0,73,0,47,Tennessee,Wright_St,1,1
5491,348,197,737126,2018-03-08,0,85,0,75,New_Mexico,Wyoming,0,0
5496,349,92,737136,2018-03-18,0,75,0,70,Florida_St,Xavier,1,1
5498,349,234,737127,2018-03-09,0,75,0,72,Providence,Xavier,1,1


In [14]:
# Note to future self: Parameters from FODS paper but might need to be optimized
direct_thress = [0,1,2]
spread_thress = [0,3,6]
weight_indirects = [0.25,0.5,1.]
domains_ranges = [('all','madness'),('madness','madness')]

# fracs represent how much of the data to include
fracs = [0.5,0.6,0.7,0.8,0.9,1.]


In [15]:
import itertools
import joblib

In [16]:
massey_rankings = {}
colley_rankings = {}
massey_rs = {}
colley_rs = {}

outer_keys = list(itertools.product(domains_ranges,years))
for domain_range,year in tqdm(outer_keys):
    # set the team_domain
    team_domain = None
    if domain_range[0] == 'madness':
        team_domain = madness_teams[year]
    elif domain_range[0] == 'all':
        team_domain = all_teams[year]

    # set the team_range
    team_range = None
    if domain_range[1] == 'madness':
        team_range = madness_teams[year]
    elif domain_range[1] == 'all':
        team_range = all_teams[year]

    columns = ["frac","direct_thres","spread_thres","weight_indirect"]+team_range
    massey_rankings[(domain_range,year)] = pd.DataFrame(columns=columns)
    colley_rankings[(domain_range,year)] = pd.DataFrame(columns=columns)
    massey_rs[(domain_range,year)] = pd.DataFrame(columns=columns)
    colley_rs[(domain_range,year)] = pd.DataFrame(columns=columns)

    game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                            "team1_score":games[year]['points1'],
                            "team1_H_A_N": games[year]['H_A_N1'],
                            "team2_name":games[year]['team2_name'],
                            "team2_score":games[year]['points2'],
                            "team2_H_A_N": games[year]['H_A_N1'],
                            "date": games[year]['date']
                           }).sort_values(by='date').drop('date',axis=1)
    mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
    game_df = game_df.loc[mask]

    keys = list(itertools.product(fracs,direct_thress,spread_thress,weight_indirects))

    def compute(frac,direct_thres,spread_thres,weight_indirect):
        upper = int(len(game_df)*frac)
        game_df_sample = game_df.iloc[:upper,:]

        map_func = lambda linked: pyrankability.construct.colley_matrices(linked,direct_thres=direct_thres,spread_thres=spread_thres,weight_indirect=weight_indirect)
        colley_matrix,colley_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
        colley_matrix = colley_matrix.reindex(index=team_range,columns=team_range)
        colley_b = colley_b.reindex(team_range)
        mask = colley_b.isna()
        colley_b = colley_b.loc[~mask]
        colley_matrix = colley_matrix.loc[~mask,~mask]
        inxs = list(np.where(mask)[0])
        ranking1,r1 = pyrankability.construct.ranking_from_matrices(colley_matrix.fillna(0),colley_b,inxs)

        map_func = lambda linked: pyrankability.construct.massey_matrices(linked,direct_thres=direct_thres,spread_thres=spread_thres,weight_indirect=weight_indirect)
        massey_matrix,massey_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
        massey_matrix = massey_matrix.reindex(index=team_range,columns=team_range)
        massey_b = massey_b.reindex(team_range)
        mask = massey_b.isna()
        massey_b = massey_b.loc[~mask]
        massey_matrix = massey_matrix.loc[~mask,~mask]    
        inxs = list(np.where(mask)[0])    
        ranking2,r2 = pyrankability.construct.ranking_from_matrices(massey_matrix.fillna(0),massey_b,inxs)
        ranking_values1 = [frac,direct_thres,spread_thres,weight_indirect]+list(ranking1)
        ranking_values2 = [frac,direct_thres,spread_thres,weight_indirect]+list(ranking2)
        r_values1 = [frac,direct_thres,spread_thres,weight_indirect]+list(r1)
        r_values2 = [frac,direct_thres,spread_thres,weight_indirect]+list(r2)
        return pd.Series(ranking_values1,index=columns),pd.Series(ranking_values2,index=columns),pd.Series(r_values1,index=columns),pd.Series(r_values2,index=columns)

    #frac,direct_thres,spread_thres,weight_indirect = keys[0]
    #compute(frac,direct_thres,spread_thres,weight_indirect)
    results = Parallel(n_jobs=-1)(delayed(compute)(frac,direct_thres,spread_thres,weight_indirect) for frac,direct_thres,spread_thres,weight_indirect in keys)

    c = 0
    for i,key in enumerate(keys):
        frac,direct_thres,spread_thres,weight_indirect = key
        massey,colley,massey_r,colley_r = results[i]
        massey.name = c
        colley.name = c
        colley_r.name=c
        massey_r.name=c
        massey_rankings[(domain_range,year)] = massey_rankings[(domain_range,year)].append(massey)
        colley_rankings[(domain_range,year)] = colley_rankings[(domain_range,year)].append(colley)
        massey_rs[(domain_range,year)] = massey_rs[(domain_range,year)].append(massey_r)
        colley_rs[(domain_range,year)] = colley_rs[(domain_range,year)].append(colley_r)
        c+=1

100%|██████████| 34/34 [16:29<00:00,  5.73s/it]


In [18]:
a

NameError: name 'a' is not defined

In [29]:
list(colley_rankings[(domain_range,year)].set_index('frac').loc[[0.5,1.0]].reset_index().groupby(['direct_thres','spread_thres','weight_indirect']))[0]

((0.0, 0.0, 0.25),
     frac  direct_thres  spread_thres  weight_indirect  Alabama  Alcorn_St  \
 0    0.5           0.0           0.0             0.25      8.0       39.0   
 27   1.0           0.0           0.0             0.25     12.0        8.0   
 
     Arizona  Boston_College  Boston_Univ  California  ...  UNC_Wilmington  \
 0      14.0            37.0         26.0        41.0  ...            19.0   
 27     14.0            41.0         16.0        26.0  ...            61.0   
 
      USC  Utah  Valparaiso  W_Kentucky  Wake_Forest  Winthrop  Wisconsin  \
 0   20.0  59.0        60.0        28.0         48.0      35.0       11.0   
 27  35.0   1.0        59.0        20.0         48.0      13.0       11.0   
 
     Wyoming  Xavier  
 0      34.0     1.0  
 27     60.0    34.0  
 
 [2 rows x 69 columns])

In [33]:
feature_name = 'intersection_0.5_to_1.0'
def compute_score(data):
    k=10
    s = 0
    c=0
    for i1,i2 in itertools.combinations(range(len(data)),2):
        s+=len(set(data[i1][:k]).intersection(set(data[i2][:k])))/k
        c+=1
    return s/c

def results_to_frame(results,method,domain_range,year):
    t = results.to_frame()
    t.columns=[feature_name]
    t['Method'] = method
    t['Year']=year
    t['Domain']=domain_range[0]
    t['Range']=domain_range[1]
    t.reset_index(inplace=True)
    return t

ms = pd.DataFrame(columns=[feature_name,'Method','Year'])
for domain_range,year in tqdm(outer_keys):
    # set the team_range
    team_range = None
    if domain_range[1] == 'madness':
        team_range = madness_teams[year]
    elif domain_range[1] == 'all':
        team_range = all_teams[year]    
    grouped = colley_rankings[(domain_range,year)].set_index('frac').loc[[0.5,1.0]].reset_index().groupby(['direct_thres','spread_thres','weight_indirect'])
    results = grouped.apply(lambda df: compute_score(df[team_range].astype(int).values.tolist()))
    ms = ms.append(results_to_frame(results,'Colley',domain_range,year))
    grouped = massey_rankings[(domain_range,year)].set_index('frac').loc[[0.5,1.0]].reset_index().groupby(['direct_thres','spread_thres','weight_indirect'])
    results = grouped.apply(lambda df: compute_score(df[team_range].astype(int).values.tolist()))
    ms = ms.append(results_to_frame(results,'Massey',domain_range,year))
ms


  0%|          | 0/34 [00:00<?, ?it/s][A
  6%|▌         | 2/34 [00:00<00:02, 14.59it/s][A
 12%|█▏        | 4/34 [00:00<00:02, 14.39it/s][A
 18%|█▊        | 6/34 [00:00<00:01, 14.12it/s][A
 24%|██▎       | 8/34 [00:00<00:01, 13.89it/s][A
 29%|██▉       | 10/34 [00:00<00:01, 13.80it/s][A
 35%|███▌      | 12/34 [00:00<00:01, 13.70it/s][A
 41%|████      | 14/34 [00:01<00:01, 13.60it/s][A
 47%|████▋     | 16/34 [00:01<00:01, 13.48it/s][A
 53%|█████▎    | 18/34 [00:01<00:01, 13.46it/s][A
 59%|█████▉    | 20/34 [00:01<00:01, 13.48it/s][A
 65%|██████▍   | 22/34 [00:01<00:00, 13.48it/s][A
 71%|███████   | 24/34 [00:01<00:00, 13.58it/s][A
 76%|███████▋  | 26/34 [00:01<00:00, 13.51it/s][A
 82%|████████▏ | 28/34 [00:02<00:00, 13.54it/s][A
 88%|████████▊ | 30/34 [00:02<00:00, 13.57it/s][A
 94%|█████████▍| 32/34 [00:02<00:00, 13.62it/s][A
100%|██████████| 34/34 [00:02<00:00, 13.55it/s][A

Unnamed: 0,intersection_0.5_to_1.0,Method,Year,direct_thres,spread_thres,weight_indirect,Domain,Range
0,0.7,Colley,2002,0.0,0.0,0.25,all,madness
1,0.7,Colley,2002,0.0,0.0,0.50,all,madness
2,0.7,Colley,2002,0.0,0.0,1.00,all,madness
3,0.7,Colley,2002,0.0,3.0,0.25,all,madness
4,0.7,Colley,2002,0.0,3.0,0.50,all,madness
...,...,...,...,...,...,...,...,...
22,0.5,Massey,2018,2.0,3.0,0.50,madness,madness
23,0.6,Massey,2018,2.0,3.0,1.00,madness,madness
24,0.6,Massey,2018,2.0,6.0,0.25,madness,madness
25,0.5,Massey,2018,2.0,6.0,0.50,madness,madness


In [36]:
sensitivity_target = ms.set_index('Method').loc['Massey'].reset_index().set_index(['Year','direct_thres','spread_thres','weight_indirect','Domain','Range'])['intersection_0.5_to_1.0']

sensitivity_data = {}
for year in tqdm(years):
    sensitivity_data[year] = {}
    for frac in fracs:
        game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                                "team1_score":games[year]['points1'],
                                "team1_H_A_N": games[year]['H_A_N1'],
                                "team2_name":games[year]['team2_name'],
                                "team2_score":games[year]['points2'],
                                "team2_H_A_N": games[year]['H_A_N1'],
                                "date": games[year]['date']
                               }).sort_values(by='date').drop('date',axis=1)
        mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
        game_df = game_df.loc[mask]
        upper = int(len(game_df)*frac)
        game_df_sample = game_df.iloc[:upper,:]
        sensitivity_data[year][f"frac={frac}"]=game_df_sample
        
description = """
This dataset represents the sensitivity problem as defined as follows:

A practitioner wants to predict the degree to which a season of the NCAA Men’s Basketball 
will be likely to change after the second half of the season is played. This is in the context of a Massey
with a specific set of parameters

direct_thress = [0,1,2]
spread_thress = [0,3,6]
weight_indirects = [0.25,0.5,1.]
domains_ranges = [('all','madness'),('madness','madness')]
"""
joblib.dump({'description':description,'target':sensitivity_target,'data':sensitivity_data,'other':{'madness_teams':madness_teams,'remaining_games':remaining_games}},"/disk/rankability_datasets/sensitivity_study/problem_0002a.joblib.z")


  0%|          | 0/17 [00:00<?, ?it/s][A
 24%|██▎       | 4/17 [00:00<00:00, 36.80it/s][A
 47%|████▋     | 8/17 [00:00<00:00, 36.26it/s][A
 71%|███████   | 12/17 [00:00<00:00, 36.07it/s][A
 94%|█████████▍| 16/17 [00:00<00:00, 35.79it/s][A
100%|██████████| 17/17 [00:00<00:00, 35.44it/s][A

['/disk/rankability_datasets/sensitivity_study/problem_0002a.joblib.z']

In [39]:
sensitivity_target = ms.set_index('Method').loc['Colley'].reset_index().set_index(['Year','direct_thres','spread_thres','weight_indirect','Domain','Range'])['intersection_0.5_to_1.0']

sensitivity_data = {}
for year in tqdm(years):
    sensitivity_data[year] = {}
    for frac in fracs:
        game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                                "team1_score":games[year]['points1'],
                                "team1_H_A_N": games[year]['H_A_N1'],
                                "team2_name":games[year]['team2_name'],
                                "team2_score":games[year]['points2'],
                                "team2_H_A_N": games[year]['H_A_N1'],
                                "date": games[year]['date']
                               }).sort_values(by='date').drop('date',axis=1)
        mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
        game_df = game_df.loc[mask]
        upper = int(len(game_df)*frac)
        game_df_sample = game_df.iloc[:upper,:]
        sensitivity_data[year][f"frac={frac}"]=game_df_sample
        
description = """
This dataset represents the sensitivity problem as defined as follows:

A practitioner wants to predict the degree to which a season of the NCAA Men’s Basketball 
will be likely to change after the second half of the season is played. This is in the context of a Colley
with a specific set of parameters

direct_thress = [0,1,2]
spread_thress = [0,3,6]
weight_indirects = [0.25,0.5,1.]
domains_ranges = [('all','madness'),('madness','madness')]
"""
joblib.dump({'description':description,'target':sensitivity_target,'data':sensitivity_data,'other':{'madness_teams':madness_teams,'remaining_games':remaining_games}},"/disk/rankability_datasets/sensitivity_study/problem_0002b.joblib.z")


  0%|          | 0/17 [00:00<?, ?it/s][A
 24%|██▎       | 4/17 [00:00<00:00, 37.18it/s][A
 35%|███▌      | 6/17 [00:00<00:00, 27.56it/s][A
 59%|█████▉    | 10/17 [00:00<00:00, 29.52it/s][A
 82%|████████▏ | 14/17 [00:00<00:00, 30.83it/s][A
100%|██████████| 17/17 [00:00<00:00, 31.40it/s][A

['/disk/rankability_datasets/sensitivity_study/problem_0002b.joblib.z']

In [30]:
ms

Unnamed: 0,frac,mean_top10_intersection,Method,Year,Domain,Range
0,0.5,0.948718,Colley,2002,all,madness
1,0.6,0.933333,Colley,2002,all,madness
2,0.7,0.961538,Colley,2002,all,madness
3,0.8,0.953846,Colley,2002,all,madness
4,0.9,0.953846,Colley,2002,all,madness
...,...,...,...,...,...,...
1,0.6,0.794872,Massey,2018,madness,madness
2,0.7,0.858974,Massey,2018,madness,madness
3,0.8,0.769231,Massey,2018,madness,madness
4,0.9,0.830769,Massey,2018,madness,madness


In [None]:
sensitivity_target = ms.set_index('Method').loc['Massey'].reset_index().set_index([''])

groupby('Year')['mean_top10_intersection'].mean()

sensitivity_data = {}
for year in tqdm(years):
    sensitivity_data[year] = {}
    for frac in fracs:
        game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                                "team1_score":games[year]['points1'],
                                "team1_H_A_N": games[year]['H_A_N1'],
                                "team2_name":games[year]['team2_name'],
                                "team2_score":games[year]['points2'],
                                "team2_H_A_N": games[year]['H_A_N1'],
                                "date": games[year]['date']
                               }).sort_values(by='date').drop('date',axis=1)
        mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
        game_df = game_df.loc[mask]
        upper = int(len(game_df)*frac)
        game_df_sample = game_df.iloc[:upper,:]
        sensitivity_data[year][f"frac={frac}"]=game_df_sample
        
description = """
This dataset represents the sensitivity problem as defined as follows:

A practitioner wants to predict the degree to which the second half of the season of the NCAA
Men’s Basketball will change after they play the games. The practioner believes that Massey is 
the best. They feel that a direct win has to be above 1 point difference and that the spread threshold
must be 3. They also feel that indirects are 0.25 weight of a direct game. They feel like indirects
between any teams are important.

direct_thress = [0,1,2]
spread_thress = [0,3,6]
weight_indirects = [0.25,0.5,1.]
domains_ranges = [('all','madness'),('madness','madness')]
"""
joblib.dump({'description':description,'target':sensitivity_target,'data':sensitivity_data,'other':{'madness_teams':madness_teams,'remaining_games':remaining_games}},"/disk/rankability_datasets/sensitivity_study/problem_0002b.joblib.z")
