# RPLib
## NCAA Men's Basketball Dataset, ACC

1. We need a team list for every year. We need to adapt the code below so it handles a dynamic team list for each year.
2. There is an overall date range:
   (selection_sunday - 35 TO selection_sunday - 7) of games to consider. but within that range, let's move one game at a time. 

In [2]:
major_description = "Study of ACC"
print(major_description)

Study of ACC


In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import itertools
import joblib
from scipy import stats

In [5]:
from pathlib import Path
home = str(Path.home())
home

'/home/jupyter-pander14'

In [6]:
# pyt in .ipython

In [7]:
import pyrankability

In [8]:
import marchmadness_study.base as base

In [9]:
import pyrplib

In [10]:
games={}
remaining_games={}
madness_teams={}
all_teams={}
years = ["2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018"]
for year in years:
    games[year],remaining_games[year] = base.read_data(f'{home}/marchmadness_study/data/%steams.txt'%year,f'{home}/marchmadness_study/data/%sgames.txt'%year,f'{home}/marchmadness_study/data/%sMadnessTeams.txt'%year)
    madness_teams[year] = list(np.unique(list(games[year].team1_name.loc[games[year].team1_madness == 1]) + list(games[year].team2_name.loc[games[year].team2_madness == 1])))
    all_teams[year] = list(np.unique(list(games[year].team1_name) + list(games[year].team2_name)))
print(year)
games[year]

2018


Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
4147,256,170,737008,2017-11-10,1,92,-1,77,Minnesota,SC_Upstate,0,0
4288,265,293,737008,2017-11-10,1,75,-1,50,Texas_Tech,South_Alabama,1,0
925,56,326,737008,2017-11-10,1,75,-1,60,Villanova,Columbia,1,0
521,32,176,737008,2017-11-10,1,79,-1,78,Monmouth_NJ,Bucknell,0,1
4252,263,235,737008,2017-11-10,1,105,-1,74,Purdue,SIUE,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1985,121,152,737122,2018-03-04,0,65,0,49,Loyola-Chicago,Illinois_St,1,0
933,57,114,737122,2018-03-04,1,81,-1,71,Houston,Connecticut,1,0
1370,81,164,737122,2018-03-04,1,90,-1,70,Memphis,East_Carolina,0,0
1483,88,143,737122,2018-03-04,-1,108,1,96,Lipscomb,FL_Gulf_Coast,1,0


## What does a dataset look like?

In [11]:
remaining_games[year].head()

Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
2,1,73,737130,2018-03-12,1,80,-1,73,Drake,Abilene_Chr,0,0
32,2,313,737125,2018-03-07,1,97,-1,90,UNLV,Air_Force,0,0
42,3,79,737126,2018-03-08,0,67,0,58,E_Michigan,Akron,0,0
59,4,137,737128,2018-03-10,0,86,0,63,Kentucky,Alabama,1,1
68,4,326,737135,2018-03-17,0,81,0,58,Villanova,Alabama,1,1


## Select just the teams from the ACC

In [12]:
#acc_teams = ["Duke","Miami_FL","Wake_Forest","Boston_College","Notre_Dame","Pittsburgh","Virginia","Florida_St","Virginia_Tech","Georgia_Tech","Clemson","North_Carolina","Louisville","Syracuse","NC_State"]
#len(acc_teams)

In [13]:
acc_teams_by_year = {}
for year in years:
    acc_teams_by_year[year] = ["Duke","Miami_FL","Wake_Forest","Boston_College","Notre_Dame","Pittsburgh","Virginia","Florida_St","Virginia_Tech","Georgia_Tech","Clemson","North_Carolina","Louisville","Syracuse","NC_State"]

## Restrict to games where one of these teams played at least

In [14]:
for year, acc_teams in acc_teams_by_year.items():
    team1_name = games[year].team1_name
    team2_name = games[year].team2_name
    games[year] = games[year].loc[team1_name.isin(acc_teams) | team2_name.isin(acc_teams)]
    
    team1_name = remaining_games[year].team1_name
    team2_name = remaining_games[year].team2_name
    remaining_games[year] = remaining_games[year].loc[team1_name.isin(acc_teams) | team2_name.isin(acc_teams)]

In [15]:
len(games['2018'])

324

## Run Massey and Colley
Parameters are selected below

In [16]:
direct_thress = [0] # might be of interest to see how sensitive to preprocessing, but not now
spread_thress = [0]
weight_indirects = [0,0.1,0.5,1]

### Set the date to start and the date to end in terms of delta from the end of the season

In [17]:
from datetime import timedelta

days_to_subtracts = [int(d) for d in np.arange(7+28,0,-7)]

start_delta = timedelta(days=days_to_subtracts[0])
end_delta = timedelta(days=days_to_subtracts[-1])

start_delta,end_delta

(datetime.timedelta(days=35), datetime.timedelta(days=7))

In [18]:
import ray
ray.init(address='auto', _redis_password='5241590000000000')

2021-04-21 02:26:31,275	INFO worker.py:651 -- Connecting to existing Ray cluster at address: 129.65.191.235:6379


RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.

In [27]:
@ray.remote
def compute(dt,st,iw,game_df_sample,team_range,method):
    if method == 'colley':
        map_func = lambda linked: pyrankability.construct.colley_matrices(linked,direct_thres=dt,spread_thres=st)
    
    matrix,b,indirect_matrix,indirect_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
    matrix = matrix.reindex(index=team_range,columns=team_range)
    b = b.reindex(team_range)
    indirect_matrix = indirect_matrix.reindex(index=team_range,columns=team_range)
    indirect_b = indirect_b.reindex(team_range)
    ranking1,r1,perm1 = pyrankability.rank.ranking_from_matrices(matrix.fillna(0),b.fillna(0))
    indirect_ranking1,indirect_r1,indirect_perm1 = pyrankability.rank.ranking_from_matrices(indirect_matrix.fillna(0),indirect_b.fillna(0))
    if iw > 0:
        r1 = r1+iw*indirect_r1
        perm1, ranking1 = pyrankability.rank.perm_ranking_from_r(r1)
    return ranking1, r1, perm1

In [37]:
ix_cols = ["year","direct_thres","spread_thres","weight_indirect","method","new_games"]
all_results = pd.DataFrame(columns = ix_cols+["rankings","rs","perms"]).set_index(ix_cols)

for year in tqdm(years):
    team_domain = acc_teams_by_year[year]
    team_range = team_domain
    
    game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                            "team1_score":games[year]['points1'],
                            "team1_H_A_N": games[year]['H_A_N1'],
                            "team2_name":games[year]['team2_name'],
                            "team2_score":games[year]['points2'],
                            "team2_H_A_N": games[year]['H_A_N1'],
                            "date": games[year]['date']
                           }).sort_values(by='date')#.drop('date',axis=1)
    mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
    game_df = game_df.loc[mask]
    
    selection_sunday = base.selectionSundays[year]
    end_date = pd.to_datetime(selection_sunday,format="%m/%d/%Y")-end_delta
    start_date = pd.to_datetime(selection_sunday,format="%m/%d/%Y")-start_delta
    game_df_end = game_df.loc[game_df["date"] <= end_date]
    game_indices = list(game_df.index[game_df["date"] < start_date])
    new_game_indices = [None] + list(game_df.index[game_df["date"] >= start_date])
    
    outer_keys = list(itertools.product(direct_thress,spread_thress,weight_indirects))
    result_ids = {}
    game_df2 = game_df_end.copy().loc[game_indices].drop('date',axis=1)
    total_new_games_added = 0
    for new_game_index in new_game_indices:
        if new_game_index is not None:
            game_df2 = game_df2.append(game_df_end.loc[new_game_index].drop('date')) # add the game
        for dt, st, iw in outer_keys:
            result_ids[total_new_games_added,dt,st,iw,"colley"] = compute.remote(dt,st,iw,game_df2,team_range,"colley")
        total_new_games_added += 1
    for key in result_ids.keys():
        total_new_games_added,dt,st,iw,method = key
        results = ray.get(result_ids[key])
        all_results = all_results.append(pd.Series(results,index=all_results.columns,name=(year,dt,st,iw,method,total_new_games_added)))

100%|██████████| 17/17 [00:28<00:00,  2.12s/it]


In [38]:
all_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,rankings,rs,perms
year,direct_thres,spread_thres,weight_indirect,method,new_games,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2002,0,0,0.0,colley,0,team2 Duke 1 Miami_FL ...,team2 Duke 7.945684e-01 Miami_FL ...,team2 Duke 0 Miami_FL 1...
2002,0,0,0.1,colley,0,team2 Duke 1 Miami_FL ...,team2 Duke 8.630424e-01 Miami_FL ...,team2 Duke 0 Miami_FL 1...
2002,0,0,0.5,colley,0,team2 Duke 1 Miami_FL ...,team2 Duke 1.136939e+00 Miami_FL ...,team2 Duke 0 Miami_FL 1...
2002,0,0,1.0,colley,0,team2 Duke 1 Miami_FL ...,team2 Duke 1.479309e+00 Miami_FL ...,team2 Duke 0 Miami_FL 1...
2002,0,0,0.0,colley,1,team2 Duke 2 Miami_FL ...,team2 Duke 7.944823e-01 Miami_FL ...,team2 Duke 14 Miami_FL ...
...,...,...,...,...,...,...,...,...
2018,0,0,1.0,colley,57,team2 Duke 2 Miami_FL ...,team2 Duke 1.260866 Miami_FL ...,team2 Duke 6 Miami_FL ...
2018,0,0,0.0,colley,58,team2 Duke 2 Miami_FL ...,team2 Duke 0.668182 Miami_FL ...,team2 Duke 6 Miami_FL ...
2018,0,0,0.1,colley,58,team2 Duke 2 Miami_FL ...,team2 Duke 0.727302 Miami_FL ...,team2 Duke 6 Miami_FL ...
2018,0,0,0.5,colley,58,team2 Duke 2 Miami_FL ...,team2 Duke 0.963785 Miami_FL ...,team2 Duke 6 Miami_FL ...


# TODO: Below this needs to be updated

## What do we have after running

In [21]:
display(colley_rankings[year])

Unnamed: 0,days_left,direct_thres,spread_thres,weight_indirect,Duke,Miami_FL,Wake_Forest,Boston_College,Notre_Dame,Pittsburgh,Virginia,Florida_St,Virginia_Tech,Georgia_Tech,Clemson,North_Carolina,Louisville,Syracuse,NC_State
0,35.0,0.0,0.0,0.0,3.0,6.0,14.0,10.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,7.0,5.0,12.0,4.0
1,35.0,0.0,0.0,0.1,3.0,6.0,14.0,10.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,7.0,5.0,12.0,4.0
2,35.0,0.0,0.0,0.5,4.0,6.0,14.0,10.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,7.0,5.0,12.0,3.0
3,35.0,0.0,0.0,1.0,4.0,6.0,14.0,10.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,7.0,5.0,12.0,3.0
4,34.0,0.0,0.0,0.0,3.0,5.0,14.0,10.0,13.0,15.0,1.0,7.0,9.0,12.0,2.0,6.0,8.0,11.0,4.0
5,34.0,0.0,0.0,0.1,3.0,5.0,14.0,10.0,13.0,15.0,1.0,7.0,9.0,12.0,2.0,6.0,8.0,11.0,4.0
6,34.0,0.0,0.0,0.5,4.0,5.0,14.0,10.0,13.0,15.0,1.0,7.0,9.0,12.0,2.0,6.0,8.0,11.0,3.0
7,34.0,0.0,0.0,1.0,4.0,6.0,14.0,10.0,13.0,15.0,1.0,7.0,9.0,12.0,2.0,5.0,8.0,11.0,3.0
8,33.0,0.0,0.0,0.0,4.0,5.0,14.0,12.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,6.0,7.0,10.0,3.0
9,33.0,0.0,0.0,0.1,4.0,5.0,14.0,12.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,6.0,7.0,10.0,3.0


## Select your parameters

In [22]:
# Constructs a dataframe from selected parameters
# columns is a dictionary of form {col_name: col_value}
def get_sel_df(columns):
    sel_df = pd.DataFrame(columns=list(columns.keys()))
    c = 0
    print(list(columns.values()))
    for values in itertools.product(*columns.values()):
        print(values)
        sel_df = sel_df.append(pd.Series(values, index=sel_df.columns, name=c))
        c += 1

    return sel_df

In [24]:
columns = {'direct_thres': [0], 'spread_thres': [0], 'weight_indirect': [0,0.1,0.5,1], 'Method': ['Massey','Colley']}
sel_df = get_sel_df(columns)
sel_df

[[0], [0], [0, 0.1, 0.5, 1], ['Massey', 'Colley']]
(0, 0, 0, 'Massey')
(0, 0, 0, 'Colley')
(0, 0, 0.1, 'Massey')
(0, 0, 0.1, 'Colley')
(0, 0, 0.5, 'Massey')
(0, 0, 0.5, 'Colley')
(0, 0, 1, 'Massey')
(0, 0, 1, 'Colley')


Unnamed: 0,direct_thres,spread_thres,weight_indirect,Method
0,0,0,0.0,Massey
1,0,0,0.0,Colley
2,0,0,0.1,Massey
3,0,0,0.1,Colley
4,0,0,0.5,Massey
5,0,0,0.5,Colley
6,0,0,1.0,Massey
7,0,0,1.0,Colley


In [None]:
## TODO: add in more options if we want
#direct_thress = [0] # might be of interest to see how sensitive to preprocessing, but not now
#spread_thress = [0]
#weight_indirects = [0,0.1,0.5,1]
#methods = ['Massey','Colley']
#sel_df = pd.DataFrame(columns=['direct_thres','spread_thres','weight_indirect','Method'])
#c = 0
#for dt,st,wi,method in itertools.product(direct_thress,spread_thress,weight_indirects,methods):
#    print([dt,st,wi,method])
#    sel_df = sel_df.append(pd.Series([dt,st,wi,method],index=sel_df.columns,name=c))
#    c+=1
#
#sel_df

In [None]:
top_k = 5
feature_names = [f'top{top_k}_jaccard',f'top{top_k}_union_tau','tau']
ms = pd.DataFrame(columns=['days_left1','days_left2',"direct_thres","spread_thres","weight_indirect",'Method','Year','rankings1','rankings2']+feature_names)

pair_days_to_subtracts = [sorted(days_to_subtracts) for days_to_subtracts in list(itertools.combinations(days_to_subtracts,2))]

c=0
for index,row in sel_df.iterrows():
    dt,st,iw = row.loc['direct_thres'],row.loc['spread_thres'],row.loc['weight_indirect']
    method = row.loc['Method']
    for year in years:
        for days_left1,days_left2 in pair_days_to_subtracts:
            if method == 'Massey':
                rankings = massey_rankings[year].set_index(["direct_thres","spread_thres","weight_indirect"]).loc[(dt,st,iw)]
            elif method == 'Colley':
                rankings = colley_rankings[year].set_index(["direct_thres","spread_thres","weight_indirect"]).loc[(dt,st,iw)]
            else:
                raise Exception('Unsupported')
            rankings1 = rankings.set_index('days_left').loc[days_left1].T
            rankings1_top_k = rankings1.loc[rankings1 < top_k]
            rankings2 = rankings.set_index('days_left').loc[days_left2].T
            rankings2_top_k = rankings2.loc[rankings2 < top_k]
            val = len(set(rankings1_top_k.index).intersection(set(rankings2_top_k.index)))/len(set(rankings1_top_k.index).union(set(rankings2_top_k.index)))#top_k
            union = set(rankings1_top_k.index).union(rankings2_top_k.index)
            rankings1_union = rankings1.loc[union]
            rankings2_union = rankings2.loc[union]
            tau_union, p_value_union = stats.kendalltau(np.argsort(rankings1_union.values), np.argsort(rankings2_union.values))
            tau,p_value = stats.kendalltau(np.argsort(rankings1.values), np.argsort(rankings2.loc[rankings1.index].values))
            entry = pd.Series([days_left1,days_left2,dt,st,iw,method,year,rankings1,rankings2,val,tau_union,tau],name=c,index=ms.columns)
            c+=1
            ms=ms.append(entry)

In [None]:
ms['days_diff'] = ms.days_left2 - ms.days_left1
ms.head()

In [None]:
ms

In [None]:
import altair as alt

alt.Chart(ms).mark_bar().encode(
    alt.X(f'tau', bin=True),
    y='count()',
    row='days_diff'
)

In [None]:
import altair as alt

alt.Chart(ms).mark_bar().encode(
    alt.X(f'top{top_k}_union_tau', bin=True),
    y='count()',
    row='days_diff'
)

In [None]:
import altair as alt

alt.Chart(ms).mark_bar().encode(
    alt.X(f'top{top_k}_jaccard', bin=True),
    y='count()',
    row='days_diff'
)

In [None]:
sensitivity_target = ms.copy()

sensitivity_data = {}
for year in tqdm(years):
    sensitivity_data[year] = {}
    for days_left in days_to_subtracts:
        game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                                "team1_score":games[year]['points1'],
                                "team1_H_A_N": games[year]['H_A_N1'],
                                "team2_name":games[year]['team2_name'],
                                "team2_score":games[year]['points2'],
                                "team2_H_A_N": games[year]['H_A_N1'],
                                "date": games[year]['date']
                               }).sort_values(by='date')
        mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
        game_df = game_df.loc[mask]
        delta = timedelta(days=days_left)
        game_df_sample = game_df.loc[game_df["date"] <= pd.to_datetime(base.selectionSundays[year],format="%m/%d/%Y")-delta].drop('date',axis=1)
        sensitivity_data[year][f"days_left={days_left}"]=game_df_sample
        
joblib.dump({'description':major_description,'target':sensitivity_target,'data':sensitivity_data,'other':{'madness_teams':madness_teams,'remaining_games':remaining_games,'best_df':best_df,'top_k':top_k,'feature_names':feature_names}},"generate.joblib.z")