# RPLib Problem 001
## March Madness Dataset

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import itertools
import joblib

In [6]:
from pathlib import Path
home = str(Path.home())
home

'/home/jupyter-pander14'

In [7]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [8]:
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from sensitivity_tests import *
from utilities import *
from base import *

In [12]:
games={}
remaining_games={}
madness_teams={}
all_teams={}
years = ["2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018"]
for year in years:
    games[year],remaining_games[year] = read_data(f'{home}/marchmadness_study/data/%steams.txt'%year,f'{home}/marchmadness_study/data/%sgames.txt'%year,f'{home}/marchmadness_study/data/%sMadnessTeams.txt'%year)
    madness_teams[year] = list(np.unique(list(games[year].team1_name.loc[games[year].team1_madness == 1]) + list(games[year].team2_name.loc[games[year].team2_madness == 1])))
    all_teams[year] = list(np.unique(list(games[year].team1_name) + list(games[year].team2_name)))
print(year)
games[year]

2018


Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
0,1,16,737011,2017-11-13,1,83,-1,69,Arkansas_St,Abilene_Chr,0,0
1,1,41,737114,2018-02-24,-1,74,1,72,Cent_Arkansas,Abilene_Chr,0,0
3,1,143,737018,2017-11-20,-1,75,1,67,Lipscomb,Abilene_Chr,1,0
4,1,143,737045,2017-12-17,1,67,-1,65,Lipscomb,Abilene_Chr,1,0
5,1,199,737056,2017-12-28,1,77,-1,74,New_Orleans,Abilene_Chr,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5535,351,318,737048,2017-12-20,1,91,-1,74,Utah_St,Youngstown_St,0,0
5536,351,339,737086,2018-01-27,1,85,-1,67,WI_Green_Bay,Youngstown_St,0,0
5537,351,340,737084,2018-01-25,1,66,-1,55,WI_Milwaukee,Youngstown_St,0,0
5538,351,347,737074,2018-01-15,1,77,-1,67,Wright_St,Youngstown_St,1,0


In [13]:
remaining_games[year]

Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
2,1,73,737130,2018-03-12,1,80,-1,73,Drake,Abilene_Chr,0,0
32,2,313,737125,2018-03-07,1,97,-1,90,UNLV,Air_Force,0,0
42,3,79,737126,2018-03-08,0,67,0,58,E_Michigan,Akron,0,0
59,4,137,737128,2018-03-10,0,86,0,63,Kentucky,Alabama,1,1
68,4,326,737135,2018-03-17,0,81,0,58,Villanova,Alabama,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5478,347,287,737133,2018-03-15,0,73,0,47,Tennessee,Wright_St,1,1
5491,348,197,737126,2018-03-08,0,85,0,75,New_Mexico,Wyoming,0,0
5496,349,92,737136,2018-03-18,0,75,0,70,Florida_St,Xavier,1,1
5498,349,234,737127,2018-03-09,0,75,0,72,Providence,Xavier,1,1


In [69]:
# Note to future self: Parameters from FODS paper but might need to be optimized
direct_thress = [1.]
spread_thress = [3.]
weight_indirects = [0.25]
domains_ranges = [('all','madness'),('madness','madness')]

# fracs represent how much of the data to include
fracs = [0.5,0.6,0.7,0.8,0.9,1.]

In [70]:
massey_rankings = {}
colley_rankings = {}
massey_rs = {}
colley_rs = {}

outer_keys = list(itertools.product(domains_ranges,years))
for domain_range,year in tqdm(outer_keys):
    # set the team_domain
    team_domain = None
    if domain_range[0] == 'madness':
        team_domain = madness_teams[year]
    elif domain_range[0] == 'all':
        team_domain = all_teams[year]

    # set the team_range
    team_range = None
    if domain_range[1] == 'madness':
        team_range = madness_teams[year]
    elif domain_range[1] == 'all':
        team_range = all_teams[year]

    columns = ["frac","direct_thres","spread_thres","weight_indirect"]+team_range
    massey_rankings[(domain_range,year)] = pd.DataFrame(columns=columns)
    colley_rankings[(domain_range,year)] = pd.DataFrame(columns=columns)
    massey_rs[(domain_range,year)] = pd.DataFrame(columns=columns)
    colley_rs[(domain_range,year)] = pd.DataFrame(columns=columns)

    game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                            "team1_score":games[year]['points1'],
                            "team1_H_A_N": games[year]['H_A_N1'],
                            "team2_name":games[year]['team2_name'],
                            "team2_score":games[year]['points2'],
                            "team2_H_A_N": games[year]['H_A_N1'],
                            "date": games[year]['date']
                           }).sort_values(by='date').drop('date',axis=1)
    mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
    game_df = game_df.loc[mask]

    keys = list(itertools.product(fracs,direct_thress,spread_thress,weight_indirects))

    def compute(frac,direct_thres,spread_thres,weight_indirect):
        upper = int(len(game_df)*frac)
        game_df_sample = game_df.iloc[:upper,:]

        map_func = lambda linked: pyrankability.construct.colley_matrices(linked,direct_thres=direct_thres,spread_thres=spread_thres,weight_indirect=weight_indirect)
        colley_matrix,colley_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
        colley_matrix = colley_matrix.reindex(index=team_range,columns=team_range)
        colley_b = colley_b.reindex(team_range)
        mask = colley_b.isna()
        colley_b = colley_b.loc[~mask]
        colley_matrix = colley_matrix.loc[~mask,~mask]
        inxs = list(np.where(mask)[0])
        ranking1,r1 = pyrankability.construct.ranking_from_matrices(colley_matrix.fillna(0),colley_b,inxs)

        map_func = lambda linked: pyrankability.construct.massey_matrices(linked,direct_thres=direct_thres,spread_thres=spread_thres,weight_indirect=weight_indirect)
        massey_matrix,massey_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
        massey_matrix = massey_matrix.reindex(index=team_range,columns=team_range)
        massey_b = massey_b.reindex(team_range)
        mask = massey_b.isna()
        massey_b = massey_b.loc[~mask]
        massey_matrix = massey_matrix.loc[~mask,~mask]    
        inxs = list(np.where(mask)[0])    
        ranking2,r2 = pyrankability.construct.ranking_from_matrices(massey_matrix.fillna(0),massey_b,inxs)
        ranking_values1 = [frac,direct_thres,spread_thres,weight_indirect]+list(ranking1)
        ranking_values2 = [frac,direct_thres,spread_thres,weight_indirect]+list(ranking2)
        r_values1 = [frac,direct_thres,spread_thres,weight_indirect]+list(r1)
        r_values2 = [frac,direct_thres,spread_thres,weight_indirect]+list(r2)
        return pd.Series(ranking_values1,index=columns),pd.Series(ranking_values2,index=columns),pd.Series(r_values1,index=columns),pd.Series(r_values2,index=columns)

    #frac,direct_thres,spread_thres,weight_indirect = keys[0]
    #compute(frac,direct_thres,spread_thres,weight_indirect)
    results = Parallel(n_jobs=-1)(delayed(compute)(frac,direct_thres,spread_thres,weight_indirect) for frac,direct_thres,spread_thres,weight_indirect in keys)

    c = 0
    for i,key in enumerate(keys):
        frac,direct_thres,spread_thres,weight_indirect = key
        massey,colley,massey_r,colley_r = results[i]
        massey.name = c
        colley.name = c
        colley_r.name=c
        massey_r.name=c
        massey_rankings[(domain_range,year)] = massey_rankings[(domain_range,year)].append(massey)
        colley_rankings[(domain_range,year)] = colley_rankings[(domain_range,year)].append(colley)
        massey_rs[(domain_range,year)] = massey_rs[(domain_range,year)].append(massey_r)
        colley_rs[(domain_range,year)] = colley_rs[(domain_range,year)].append(colley_r)
        c+=1

100%|██████████| 34/34 [01:10<00:00,  3.06it/s]


In [71]:
top_k = 10
feature_name = f'top{top_k}_intersection'
ms = pd.DataFrame(columns=['frac1','frac2','domain','range',"direct_thres","spread_thres","weight_indirect",'Method','Year',feature_name])

keys = list(itertools.product(domains_ranges,direct_thress,spread_thress,weight_indirects,years))
pair_fracs = [sorted(fracs) for fracs in list(itertools.combinations(fracs,2))]

c=0
for domain_range,dt,st,iw,year in keys:
    dom = domain_range[0]
    ran = domain_range[1]
    for frac1,frac2 in pair_fracs:
        method = 'Massey'
        rankings = massey_rankings[(domain_range,year)].set_index(["direct_thres","spread_thres","weight_indirect"]).loc[(dt,st,iw)]
        rankings1 = rankings.set_index('frac').loc[frac1].T
        rankings1 = rankings1.loc[rankings1 < top_k]
        rankings2 = rankings.set_index('frac').loc[frac2].T
        rankings2 = rankings2.loc[rankings2 < top_k]
        val = len(set(rankings1.index).intersection(set(rankings2.index)))/top_k
        entry = pd.Series([frac1,frac2,dom,ran,dt,st,iw,method,year,val],name=c,index=ms.columns)
        c+=1
        ms=ms.append(entry)
        
        method = 'Colley'
        rankings = colley_rankings[(domain_range,year)].set_index(["direct_thres","spread_thres","weight_indirect"]).loc[(dt,st,iw)]
        rankings1 = rankings.set_index('frac').loc[frac1].T
        rankings1 = rankings1.loc[rankings1 < top_k]
        rankings2 = rankings.set_index('frac').loc[frac2].T
        rankings2 = rankings2.loc[rankings2 < top_k]
        val = len(set(rankings1.index).intersection(set(rankings2.index)))/top_k
        entry = pd.Series([frac1,frac2,dom,ran,dt,st,iw,method,year,val],name=c,index=ms.columns)
        c+=1
        ms=ms.append(entry)

In [92]:
ms.head()

Unnamed: 0,frac1,frac2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,top10_intersection
0,0.5,0.6,all,madness,1.0,3.0,0.25,Massey,2002,0.2
1,0.5,0.6,all,madness,1.0,3.0,0.25,Colley,2002,0.4
2,0.5,0.7,all,madness,1.0,3.0,0.25,Massey,2002,0.1
3,0.5,0.7,all,madness,1.0,3.0,0.25,Colley,2002,0.3
4,0.5,0.8,all,madness,1.0,3.0,0.25,Massey,2002,0.1


In [94]:
import altair as alt

graph_df = ms.copy()
graph_df["Interval"] = graph_df["frac1"].astype(str) +"-"+graph_df["frac2"].astype(str)
graph_df["Interval Width"] = -(graph_df["frac1"] - graph_df["frac2"])
graph_df[f"Intersection in top {top_k}"] = graph_df[feature_name]

alt.Chart(graph_df).mark_bar().encode(
    x='Interval',
    y=f"average(Intersection in top {top_k})",
    color='Interval Width'
).facet(
    column='Method'
)


In [96]:
sensitivity_target = ms.copy()

sensitivity_data = {}
for year in tqdm(years):
    sensitivity_data[year] = {}
    for frac in fracs:
        game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                                "team1_score":games[year]['points1'],
                                "team1_H_A_N": games[year]['H_A_N1'],
                                "team2_name":games[year]['team2_name'],
                                "team2_score":games[year]['points2'],
                                "team2_H_A_N": games[year]['H_A_N1'],
                                "date": games[year]['date']
                               }).sort_values(by='date').drop('date',axis=1)
        mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
        game_df = game_df.loc[mask]
        upper = int(len(game_df)*frac)
        game_df_sample = game_df.iloc[:upper,:]
        sensitivity_data[year][f"frac={frac}"]=game_df_sample
        
description = """
A practitioner wants to predict the degree to which a the rankings during season 
of the NCAA Men’s Basketball are likely to change as more games are played (i.e., sensitivity to more games). 
They want to start the analysis after a minimum of 50% of the games are played. 
They want to run Massey and Colley.

Sensitivity of new games will be measured as the intersection of between two 
rankings derived from before and after the new games are included.
"""
joblib.dump({'description':description,'target':sensitivity_target,'data':sensitivity_data,'other':{'madness_teams':madness_teams,'remaining_games':remaining_games}},"/disk/RPLib/problem_0001.joblib.z")

100%|██████████| 17/17 [00:00<00:00, 51.92it/s]


['/disk/RPLib/problem_0001.joblib.z']