# RPLib Problem 001
## March Madness Dataset

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import itertools
import joblib

In [6]:
from pathlib import Path
home = str(Path.home())
home

'/home/jupyter-pander14'

In [7]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [8]:
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from sensitivity_tests import *
from utilities import *
from base import *

In [12]:
games={}
remaining_games={}
madness_teams={}
all_teams={}
years = ["2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018"]
for year in years:
    games[year],remaining_games[year] = read_data(f'{home}/marchmadness_study/data/%steams.txt'%year,f'{home}/marchmadness_study/data/%sgames.txt'%year,f'{home}/marchmadness_study/data/%sMadnessTeams.txt'%year)
    madness_teams[year] = list(np.unique(list(games[year].team1_name.loc[games[year].team1_madness == 1]) + list(games[year].team2_name.loc[games[year].team2_madness == 1])))
    all_teams[year] = list(np.unique(list(games[year].team1_name) + list(games[year].team2_name)))
print(year)
games[year]

2018


Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
0,1,16,737011,2017-11-13,1,83,-1,69,Arkansas_St,Abilene_Chr,0,0
1,1,41,737114,2018-02-24,-1,74,1,72,Cent_Arkansas,Abilene_Chr,0,0
3,1,143,737018,2017-11-20,-1,75,1,67,Lipscomb,Abilene_Chr,1,0
4,1,143,737045,2017-12-17,1,67,-1,65,Lipscomb,Abilene_Chr,1,0
5,1,199,737056,2017-12-28,1,77,-1,74,New_Orleans,Abilene_Chr,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5535,351,318,737048,2017-12-20,1,91,-1,74,Utah_St,Youngstown_St,0,0
5536,351,339,737086,2018-01-27,1,85,-1,67,WI_Green_Bay,Youngstown_St,0,0
5537,351,340,737084,2018-01-25,1,66,-1,55,WI_Milwaukee,Youngstown_St,0,0
5538,351,347,737074,2018-01-15,1,77,-1,67,Wright_St,Youngstown_St,1,0


In [13]:
remaining_games[year]

Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
2,1,73,737130,2018-03-12,1,80,-1,73,Drake,Abilene_Chr,0,0
32,2,313,737125,2018-03-07,1,97,-1,90,UNLV,Air_Force,0,0
42,3,79,737126,2018-03-08,0,67,0,58,E_Michigan,Akron,0,0
59,4,137,737128,2018-03-10,0,86,0,63,Kentucky,Alabama,1,1
68,4,326,737135,2018-03-17,0,81,0,58,Villanova,Alabama,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5478,347,287,737133,2018-03-15,0,73,0,47,Tennessee,Wright_St,1,1
5491,348,197,737126,2018-03-08,0,85,0,75,New_Mexico,Wyoming,0,0
5496,349,92,737136,2018-03-18,0,75,0,70,Florida_St,Xavier,1,1
5498,349,234,737127,2018-03-09,0,75,0,72,Providence,Xavier,1,1


In [233]:
# Note to future self: Parameters from FODS paper but might need to be optimized
direct_thress = [0,3.]
spread_thress = [0,3.]
weight_indirects = [0,0.25]
domains_ranges = [('all','madness')]

# fracs represent how much of the data to include
fracs = [0.5,0.6,0.7,0.8,0.9,1.]

In [247]:
massey_rankings = {}
colley_rankings = {}
massey_rs = {}
colley_rs = {}

outer_keys = list(itertools.product(domains_ranges,years))
for domain_range,year in tqdm(outer_keys):
    # set the team_domain
    team_domain = None
    if domain_range[0] == 'madness':
        team_domain = madness_teams[year]
    elif domain_range[0] == 'all':
        team_domain = all_teams[year]

    # set the team_range
    team_range = None
    if domain_range[1] == 'madness':
        team_range = madness_teams[year]
    elif domain_range[1] == 'all':
        team_range = all_teams[year]

    columns = ["frac","direct_thres","spread_thres","weight_indirect"]+team_range
    massey_rankings[(domain_range,year)] = pd.DataFrame(columns=columns)
    colley_rankings[(domain_range,year)] = pd.DataFrame(columns=columns)
    massey_rs[(domain_range,year)] = pd.DataFrame(columns=columns)
    colley_rs[(domain_range,year)] = pd.DataFrame(columns=columns)

    game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                            "team1_score":games[year]['points1'],
                            "team1_H_A_N": games[year]['H_A_N1'],
                            "team2_name":games[year]['team2_name'],
                            "team2_score":games[year]['points2'],
                            "team2_H_A_N": games[year]['H_A_N1'],
                            "date": games[year]['date']
                           }).sort_values(by='date').drop('date',axis=1)
    mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
    game_df = game_df.loc[mask]

    keys = list(itertools.product(fracs,direct_thress,spread_thress,weight_indirects))

    def compute(frac,direct_thres,spread_thres,weight_indirect,team_range,all_teams,game_df):
        upper = int(len(game_df)*frac)
        game_df_sample = game_df.iloc[:upper,:]

        map_func = lambda linked: pyrankability.construct.colley_matrices(linked,direct_thres=direct_thres,spread_thres=spread_thres,weight_indirect=weight_indirect)
        colley_matrix,colley_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
        colley_matrix = colley_matrix.reindex(index=all_teams,columns=all_teams)
        #import pdb; pdb.set_trace()
        colley_b = colley_b.reindex(all_teams)
        #mask = colley_b.isna()
        #colley_b = colley_b.loc[~mask]
        #colley_matrix = colley_matrix.loc[~mask,~mask]
        inxs = []
        for team in team_range:
            inxs.append(int(np.where(colley_b.index == team)[0][0]))
        #inxs = list(np.where(~colley_b.index.isin(team_range))[0]) #list(np.where(mask)[0])
        ranking1,r1 = pyrankability.construct.ranking_from_matrices(colley_matrix.fillna(0),colley_b.fillna(0),inxs)
        
        map_func = lambda linked: pyrankability.construct.massey_matrices(linked,direct_thres=direct_thres,spread_thres=spread_thres,weight_indirect=weight_indirect)
        massey_matrix,massey_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
        massey_matrix = massey_matrix.reindex(index=all_teams,columns=all_teams)
        massey_b = massey_b.reindex(all_teams)
        #mask = massey_b.isna()
        #massey_b = massey_b.loc[~mask]
        #massey_matrix = massey_matrix.loc[~mask,~mask]    
        #inxs = list(np.where(mask)[0])    
        #inxs = list(np.where(~massey_b.index.isin(team_range))[0]) #list(np.where(mask)[0])
        inxs = []
        for team in team_range:
            inxs.append(int(np.where(massey_b.index == team)[0][0]))
        ranking2,r2 = pyrankability.construct.ranking_from_matrices(massey_matrix.fillna(0),massey_b.fillna(0),inxs)
        
        ranking_values1 = [frac,direct_thres,spread_thres,weight_indirect]+list(ranking1)
        ranking_values2 = [frac,direct_thres,spread_thres,weight_indirect]+list(ranking2)
        r_values1 = [frac,direct_thres,spread_thres,weight_indirect]+list(r1)
        r_values2 = [frac,direct_thres,spread_thres,weight_indirect]+list(r2)
        
        return pd.Series(ranking_values1,index=columns),pd.Series(ranking_values2,index=columns),pd.Series(r_values1,index=columns),pd.Series(r_values2,index=columns)

    #frac,direct_thres,spread_thres,weight_indirect = keys[0]
    #for frac,direct_thres,spread_thres,weight_indirect in keys:
    #    compute(frac,direct_thres,spread_thres,weight_indirect,team_range,all_teams[year],game_df)
    results = Parallel(n_jobs=-1)(delayed(compute)(frac,direct_thres,spread_thres,weight_indirect,team_range,all_teams[year],game_df) for frac,direct_thres,spread_thres,weight_indirect in keys)

    c = 0
    for i,key in enumerate(keys):
        frac,direct_thres,spread_thres,weight_indirect = key
        massey,colley,massey_r,colley_r = results[i]
        massey.name = c
        colley.name = c
        colley_r.name=c
        massey_r.name=c
        massey_rankings[(domain_range,year)] = massey_rankings[(domain_range,year)].append(massey)
        colley_rankings[(domain_range,year)] = colley_rankings[(domain_range,year)].append(colley)
        massey_rs[(domain_range,year)] = massey_rs[(domain_range,year)].append(massey_r)
        colley_rs[(domain_range,year)] = colley_rs[(domain_range,year)].append(colley_r)
        c+=1
















  0%|          | 0/17 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














  6%|▌         | 1/17 [00:23<06:14, 23.38s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 12%|█▏        | 2/17 [00:44<05:39, 22.62s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 18%|█▊        | 3/17 [01:04<05:06, 21.92s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 24%|██▎       | 4/17 [01:25<04:43, 21.79s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 29%|██▉       | 5/17 [01:47<04:20, 21.73s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 35%|███▌      | 6/17 [02:12<04:08, 22.56s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 41%|████      | 7/17 [02:37<03:53, 23.38s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 47%|████▋     | 8/17 [03:02<03:36, 24.02s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 71%|███████   | 12/17 [04:46<02:07, 25

In [237]:
game_df.to_csv(f'{home}/game_df_example.csv',index=False)

## What parameters should we pick?
One way to focus our analysis is to study the parameters that result in the best accuracy when predicting future games.

In [248]:
def calc_predictability(games,r):
    numberCorrectPredictions = 0
    numGames = 0
    for i in games.index:
        team1ID = games.loc[i, "team1_name"]
        team1Score = games.loc[i, "points1"]
        team2ID = games.loc[i, "team2_name"]
        team2Score = games.loc[i, "points2"]
        
        if team1ID in r.index and team2ID in r.index:
            if team1Score > team2Score and r.loc[team1ID] > r.loc[team2ID]:
                numberCorrectPredictions += 1
            elif team2Score > team1Score and r.loc[team2ID] > r.loc[team1ID]:
                numberCorrectPredictions += 1
            elif team1Score == team2Score and r.loc[team1ID] == r.loc[team2ID]:
                numberCorrectPredictions += 1

            numGames += 1
           
    predictability = numberCorrectPredictions/numGames*100
    return predictability,numGames

In [249]:
pred_df = pd.DataFrame(columns=['frac','domain','range',"direct_thres","spread_thres","weight_indirect",'Method','Year','Predictability'])

frac = 1. # Use all the data available

keys = list(itertools.product(domains_ranges,direct_thress,spread_thress,weight_indirects,years))

c=0
for domain_range,dt,st,iw,year in keys:
    dom = domain_range[0]
    ran = domain_range[1]
    method = 'Massey'
    r = massey_rs[(domain_range,year)].set_index(["frac","direct_thres","spread_thres","weight_indirect"]).loc[(frac,dt,st,iw)]
    val = calc_predictability(remaining_games[year],r)[0]
    entry = pd.Series([frac,dom,ran,dt,st,iw,method,year,val],name=c,index=pred_df.columns)
    c+=1
    pred_df=pred_df.append(entry)

    method = 'Colley'
    r = colley_rs[(domain_range,year)].set_index(["frac","direct_thres","spread_thres","weight_indirect"]).loc[(frac,dt,st,iw)]
    val = calc_predictability(remaining_games[year],r)[0]
    entry = pd.Series([frac,dom,ran,dt,st,iw,method,year,val],name=c,index=pred_df.columns)
    c+=1
    pred_df=pred_df.append(entry)
    

In [250]:
pred_df

Unnamed: 0,frac,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,Predictability
0,1.0,all,madness,0,0,0,Massey,2002,64.044944
1,1.0,all,madness,0,0,0,Colley,2002,69.662921
2,1.0,all,madness,0,0,0,Massey,2003,67.857143
3,1.0,all,madness,0,0,0,Colley,2003,64.285714
4,1.0,all,madness,0,0,0,Massey,2004,65.591398
...,...,...,...,...,...,...,...,...,...
267,1.0,all,madness,3,3,0.25,Colley,2016,67.857143
268,1.0,all,madness,3,3,0.25,Massey,2017,54.838710
269,1.0,all,madness,3,3,0.25,Colley,2017,61.290323
270,1.0,all,madness,3,3,0.25,Massey,2018,61.797753


In [251]:
pred_df.groupby(["domain","range","frac","Method","direct_thres","spread_thres","weight_indirect"])["Predictability"].median().to_frame().reset_index().sort_values(by="Predictability")

Unnamed: 0,domain,range,frac,Method,direct_thres,spread_thres,weight_indirect,Predictability
13,all,madness,1.0,Massey,3.0,0.0,0.25,61.904762
15,all,madness,1.0,Massey,3.0,3.0,0.25,61.904762
8,all,madness,1.0,Massey,0.0,0.0,0.0,64.285714
10,all,madness,1.0,Massey,0.0,3.0,0.0,64.285714
9,all,madness,1.0,Massey,0.0,0.0,0.25,64.444444
11,all,madness,1.0,Massey,0.0,3.0,0.25,64.444444
12,all,madness,1.0,Massey,3.0,0.0,0.0,64.444444
14,all,madness,1.0,Massey,3.0,3.0,0.0,64.444444
5,all,madness,1.0,Colley,3.0,0.0,0.25,65.116279
7,all,madness,1.0,Colley,3.0,3.0,0.25,65.116279


In [252]:
sorted_df = pred_df.groupby(["domain","range","frac","Method","direct_thres","spread_thres","weight_indirect"])["Predictability"].median().to_frame().reset_index().sort_values(by="Predictability").set_index('Method')
sorted_df

Unnamed: 0_level_0,domain,range,frac,direct_thres,spread_thres,weight_indirect,Predictability
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Massey,all,madness,1.0,3.0,0.0,0.25,61.904762
Massey,all,madness,1.0,3.0,3.0,0.25,61.904762
Massey,all,madness,1.0,0.0,0.0,0.0,64.285714
Massey,all,madness,1.0,0.0,3.0,0.0,64.285714
Massey,all,madness,1.0,0.0,0.0,0.25,64.444444
Massey,all,madness,1.0,0.0,3.0,0.25,64.444444
Massey,all,madness,1.0,3.0,0.0,0.0,64.444444
Massey,all,madness,1.0,3.0,3.0,0.0,64.444444
Colley,all,madness,1.0,3.0,0.0,0.25,65.116279
Colley,all,madness,1.0,3.0,3.0,0.25,65.116279


In [253]:
best_colley = sorted_df.loc['Colley'].iloc[-1]
best_colley

domain                 all
range              madness
frac                     1
direct_thres             0
spread_thres             3
weight_indirect       0.25
Predictability     65.4762
Name: Colley, dtype: object

In [254]:
best_massey = sorted_df.loc['Massey'].iloc[-1]
best_massey

domain                 all
range              madness
frac                     1
direct_thres             3
spread_thres             3
weight_indirect          0
Predictability     64.4444
Name: Massey, dtype: object

In [255]:
best_df = pd.DataFrame(columns=best_massey.index)
best_df = best_df.append(best_massey)
best_df = best_df.append(best_colley)
best_df.index.name = "Method"
best_df = best_df.reset_index()
best_df

Unnamed: 0,Method,domain,range,frac,direct_thres,spread_thres,weight_indirect,Predictability
0,Massey,all,madness,1.0,3.0,3.0,0.0,64.444444
1,Colley,all,madness,1.0,0.0,3.0,0.25,65.47619


In [264]:
top_k = 15
feature_name = f'top{top_k}_intersection'
ms = pd.DataFrame(columns=['frac1','frac2','domain','range',"direct_thres","spread_thres","weight_indirect",'Method','Year',feature_name])

#keys = list(itertools.product(domains_ranges,direct_thress,spread_thress,weight_indirects,years))
pair_fracs = [sorted(fracs) for fracs in list(itertools.combinations(fracs,2))]

c=0
for index,row in best_df.iterrows():
    dom,ran,dt,st,iw = row.loc['domain'],row.loc['range'],row.loc['direct_thres'],row.loc['spread_thres'],row.loc['weight_indirect']
    method = row.loc['Method']
    for year in years:
        for frac1,frac2 in pair_fracs:
            if method == 'Massey':
                rankings = massey_rankings[(domain_range,year)].set_index(["direct_thres","spread_thres","weight_indirect"]).loc[(dt,st,iw)]
            elif method == 'Colley':
                rankings = colley_rankings[(domain_range,year)].set_index(["direct_thres","spread_thres","weight_indirect"]).loc[(dt,st,iw)]
                #import pdb; pdb.set_trace()
            else:
                raise Exception('Unsupported')
            rankings1 = rankings.set_index('frac').loc[frac1].T
            rankings1 = rankings1.loc[rankings1 < top_k]
            rankings2 = rankings.set_index('frac').loc[frac2].T
            rankings2 = rankings2.loc[rankings2 < top_k]
            #import pdb; pdb.set_trace()
            val = len(set(rankings1.index).intersection(set(rankings2.index)))/top_k
            entry = pd.Series([frac1,frac2,dom,ran,dt,st,iw,method,year,val],name=c,index=ms.columns)
            c+=1
            ms=ms.append(entry)

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app










In [265]:
ms.sort_values(by=feature_name)

Unnamed: 0,frac1,frac2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,top15_intersection
67,0.6,0.9,all,madness,3.0,3.0,0.00,Massey,2006,0.000000
247,0.6,0.9,all,madness,3.0,3.0,0.00,Massey,2018,0.000000
208,0.8,1.0,all,madness,3.0,3.0,0.00,Massey,2015,0.000000
246,0.6,0.8,all,madness,3.0,3.0,0.00,Massey,2018,0.066667
158,0.6,1.0,all,madness,3.0,3.0,0.00,Massey,2012,0.066667
...,...,...,...,...,...,...,...,...,...,...
58,0.8,1.0,all,madness,3.0,3.0,0.00,Massey,2005,0.600000
270,0.5,0.6,all,madness,0.0,3.0,0.25,Colley,2003,0.666667
260,0.6,0.7,all,madness,0.0,3.0,0.25,Colley,2002,0.666667
70,0.7,0.9,all,madness,3.0,3.0,0.00,Massey,2006,0.666667


In [266]:
import altair as alt

graph_df = ms.copy().reset_index()
graph_df["Interval"] = graph_df["frac1"].astype(str) +"-"+graph_df["frac2"].astype(str)
graph_df["Interval Width"] = np.round(-100*(graph_df["frac1"] - graph_df["frac2"]))
graph_df[f"Intersection in top {top_k}"] = graph_df[feature_name]

alt.Chart(graph_df).mark_bar().encode(
    x='Year',
    y=f"average(Intersection in top {top_k})",
).facet(row='Interval Width:N',column='Method')

In [271]:
import altair as alt

graph_df = ms.copy().reset_index()
graph_df["Interval"] = graph_df["frac1"].astype(str) +"-"+graph_df["frac2"].astype(str)
graph_df["Interval Width"] = np.round(-100*(graph_df["frac1"] - graph_df["frac2"]))
graph_df[f"Intersection in top {top_k}"] = graph_df[feature_name]

error_bars = alt.Chart(graph_df).mark_errorbar(extent='ci').encode(
    x='Interval Width:N',
    y=f"average(Intersection in top {top_k})",
    color='Method'
)

error_bars+alt.Chart(graph_df).mark_line().encode(
    x='Interval Width:N',
    y=f"average(Intersection in top {top_k})",
    color='Method'
)

In [272]:
sensitivity_target = ms.copy()

sensitivity_data = {}
for year in tqdm(years):
    sensitivity_data[year] = {}
    for frac in fracs:
        game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                                "team1_score":games[year]['points1'],
                                "team1_H_A_N": games[year]['H_A_N1'],
                                "team2_name":games[year]['team2_name'],
                                "team2_score":games[year]['points2'],
                                "team2_H_A_N": games[year]['H_A_N1'],
                                "date": games[year]['date']
                               }).sort_values(by='date').drop('date',axis=1)
        mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
        game_df = game_df.loc[mask]
        upper = int(len(game_df)*frac)
        game_df_sample = game_df.iloc[:upper,:]
        sensitivity_data[year][f"frac={frac}"]=game_df_sample
        
description = """
A practitioner wants to predict the degree to which a the rankings during season 
of the NCAA Men’s Basketball are likely to change as more games are played (i.e., sensitivity to more games). 
They want to start the analysis after a minimum of 50% of the games are played. 
They want to run Massey and Colley.

Sensitivity of new games will be measured as the intersection of between two 
rankings derived from before and after the new games are included.
"""
joblib.dump({'description':description,'target':sensitivity_target,'data':sensitivity_data,'other':{'madness_teams':madness_teams,'remaining_games':remaining_games}},"/disk/RPLib/problem_0001.joblib.z")
















  0%|          | 0/17 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 29%|██▉       | 5/17 [00:00<00:00, 47.90it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 65%|██████▍   | 11/17 [00:00<00:00, 49.09it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














100%|██████████| 17/17 [00:00<00:00, 49.85it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['/disk/RPLib/problem_0001.joblib.z']