# RPLib
## NCAA Men's Basketball Dataset, Big 12

1. We need a team list for every year. We need to adapt the code below so it handles a dynamic team list for each year.
2. There is an overall date range:
   (selection_sunday - 35 TO selection_sunday - 7) of games to consider. but within that range, let's move one game at a time. 

In [1]:
major_description = "Study of Big 12"
print(major_description)

Study of Big 12


In [2]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

In [3]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import itertools
import joblib
from scipy import stats

In [4]:
from pathlib import Path
home = str(Path.home())
home

'/home/jupyter-pander14'

In [5]:
# pyt in .ipython

In [6]:
import pyrankability

In [7]:
import marchmadness_study.base as base

In [8]:
import pyrplib

In [9]:
games={}
remaining_games={}
madness_teams={}
all_teams={}
years = ["2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018"]
for year in years:
    games[year],remaining_games[year] = base.read_data(f'{home}/marchmadness_study/data/%steams.txt'%year,f'{home}/marchmadness_study/data/%sgames.txt'%year,f'{home}/marchmadness_study/data/%sMadnessTeams.txt'%year)
    madness_teams[year] = list(np.unique(list(games[year].team1_name.loc[games[year].team1_madness == 1]) + list(games[year].team2_name.loc[games[year].team2_madness == 1])))
    all_teams[year] = list(np.unique(list(games[year].team1_name) + list(games[year].team2_name)))
print(year)
games[year]

2018


Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
4147,256,170,737008,2017-11-10,1,92,-1,77,Minnesota,SC_Upstate,0,0
4288,265,293,737008,2017-11-10,1,75,-1,50,Texas_Tech,South_Alabama,1,0
925,56,326,737008,2017-11-10,1,75,-1,60,Villanova,Columbia,1,0
521,32,176,737008,2017-11-10,1,79,-1,78,Monmouth_NJ,Bucknell,0,1
4252,263,235,737008,2017-11-10,1,105,-1,74,Purdue,SIUE,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1985,121,152,737122,2018-03-04,0,65,0,49,Loyola-Chicago,Illinois_St,1,0
933,57,114,737122,2018-03-04,1,81,-1,71,Houston,Connecticut,1,0
1370,81,164,737122,2018-03-04,1,90,-1,70,Memphis,East_Carolina,0,0
1483,88,143,737122,2018-03-04,-1,108,1,96,Lipscomb,FL_Gulf_Coast,1,0


## What does a dataset look like?

In [10]:
remaining_games[year].head()

Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
2,1,73,737130,2018-03-12,1,80,-1,73,Drake,Abilene_Chr,0,0
32,2,313,737125,2018-03-07,1,97,-1,90,UNLV,Air_Force,0,0
42,3,79,737126,2018-03-08,0,67,0,58,E_Michigan,Akron,0,0
59,4,137,737128,2018-03-10,0,86,0,63,Kentucky,Alabama,1,1
68,4,326,737135,2018-03-17,0,81,0,58,Villanova,Alabama,1,1


## Select just the teams from the conferece

In [11]:
teams_by_year = {}
for year in years:
    #acc_teams_by_year[year] = ["Duke","Miami_FL","Wake_Forest","Boston_College","Notre_Dame","Pittsburgh","Virginia","Florida_St","Virginia_Tech","Georgia_Tech","Clemson","North_Carolina","Louisville","Syracuse","NC_State"]
    teams_by_year[year] = ["TCU","Baylor","Iowa_St","Kansas","Kansas_St","Oklahoma","Oklahoma_St","Texas","Texas_Tech","West_Virginia"]

## Restrict to games where one of these teams played at least

In [12]:
for year, acc_teams in teams_by_year.items():
    team1_name = games[year].team1_name
    team2_name = games[year].team2_name
    games[year] = games[year].loc[team1_name.isin(acc_teams) | team2_name.isin(acc_teams)]
    
    team1_name = remaining_games[year].team1_name
    team2_name = remaining_games[year].team2_name
    remaining_games[year] = remaining_games[year].loc[team1_name.isin(acc_teams) | team2_name.isin(acc_teams)]

In [13]:
len(games['2018'])

217

## Run Massey and Colley
Parameters are selected below

In [14]:
direct_thress = [0] # might be of interest to see how sensitive to preprocessing, but not now
spread_thress = [0]
weight_indirects = [0,0.1,0.5,1]

### Set the date to start and the date to end in terms of delta from the end of the season

In [15]:
from datetime import timedelta

days_to_subtracts = [int(d) for d in np.arange(7+28,0,-7)]

start_delta = timedelta(days=days_to_subtracts[0])
end_delta = timedelta(days=days_to_subtracts[-1])

start_delta,end_delta

(datetime.timedelta(days=35), datetime.timedelta(days=7))

In [16]:
import ray
ray.init(address="auto")

2021-05-05 15:47:49,958	INFO worker.py:651 -- Connecting to existing Ray cluster at address: 129.65.191.235:54457


{'node_ip_address': '129.65.191.235',
 'raylet_ip_address': '129.65.191.235',
 'redis_address': '129.65.191.235:54457',
 'object_store_address': '/tmp/ray/session_2021-04-30_12-38-02_170946_176293/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-04-30_12-38-02_170946_176293/sockets/raylet',
 'webui_url': '127.0.0.1:8266',
 'session_dir': '/tmp/ray/session_2021-04-30_12-38-02_170946_176293',
 'metrics_export_port': 50491,
 'node_id': '78e5e7f90c1715a19bb99afa73902bed5d15557a'}

In [17]:
@ray.remote
def test():
    import sys
    import pyrankability
    return sys.path

id1 = test.remote()
ray.get(id1)

['/opt/tljh/user/lib/python3.7/site-packages/ray/thirdparty_files',
 '/raid/home/jupyter-pander14/RPLib/problems/ACC_NCAA_Men_Basketball',
 '/opt/tljh/user/lib/python3.7/site-packages',
 '/raid/home/jupyter-pander14',
 '/raid/home/jupyter-pander14',
 '/opt/tljh/user/lib/python3.7/site-packages/ray/pickle5_files',
 '/opt/tljh/user/lib/python3.7/site-packages/ray/workers',
 '/raid/home/jupyter-pander14',
 '/home/jupyter-pander14/RPLib',
 '/home/jupyter-pander14/rankability_toolbox_dev',
 '/home/jupyter-pander14/marchmadness_study',
 '/opt/tljh/user/lib/python37.zip',
 '/opt/tljh/user/lib/python3.7',
 '/opt/tljh/user/lib/python3.7/lib-dynload',
 '/opt/tljh/user/lib/python3.7/site-packages',
 '/opt/tljh/user/lib/python3.7/site-packages/IPython/extensions']

In [51]:
@ray.remote
def compute(dt,st,iw,game_df_sample,team_range,method):
    if method == 'colley':
        map_func = lambda linked: pyrankability.construct.colley_matrices(linked,direct_thres=dt,spread_thres=st)
    
    matrix,b,indirect_matrix,indirect_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
    matrix = matrix.reindex(index=team_range,columns=team_range)
    b = b.reindex(team_range)
    indirect_matrix = indirect_matrix.reindex(index=team_range,columns=team_range)
    indirect_b = indirect_b.reindex(team_range)
    ranking1,r1,perm1 = pyrankability.rank.ranking_from_matrices(matrix.fillna(0),b.fillna(0))
    indirect_ranking1,indirect_r1,indirect_perm1 = pyrankability.rank.ranking_from_matrices(indirect_matrix.fillna(0),indirect_b.fillna(0))
    if iw > 0:
        r1 = r1+iw*indirect_r1
        perm1, ranking1 = pyrankability.rank.perm_ranking_from_r(r1)
    return ranking1, r1, perm1, game_df_sample, None, None

In [52]:
ix_cols = ["year","direct_thres","spread_thres","weight_indirect","method","new_games"]
all_results = pd.DataFrame(columns = ix_cols+["rankings","rs","perms","games_df","D","ID"]).set_index(ix_cols)

for year in tqdm(years):
    team_domain = teams_by_year[year]
    team_range = team_domain
    
    game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                            "team1_score":games[year]['points1'],
                            "team1_H_A_N": games[year]['H_A_N1'],
                            "team2_name":games[year]['team2_name'],
                            "team2_score":games[year]['points2'],
                            "team2_H_A_N": games[year]['H_A_N1'],
                            "date": games[year]['date']
                           }).sort_values(by='date')#.drop('date',axis=1)
    mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
    game_df = game_df.loc[mask]
    
    selection_sunday = base.selectionSundays[year]
    end_date = pd.to_datetime(selection_sunday,format="%m/%d/%Y")-end_delta
    start_date = pd.to_datetime(selection_sunday,format="%m/%d/%Y")-start_delta
    game_df_end = game_df.loc[game_df["date"] <= end_date]
    game_indices = list(game_df.index[game_df["date"] < start_date])
    new_game_indices = [None] + list(game_df.index[game_df["date"] >= start_date])
    
    outer_keys = list(itertools.product(direct_thress,spread_thress,weight_indirects))
    result_ids = {}
    game_df2 = game_df_end.copy().loc[game_indices].drop('date',axis=1)
    total_new_games_added = 0
    for new_game_index in new_game_indices:
        if new_game_index is not None:
            game_df2 = game_df2.append(game_df_end.loc[new_game_index].drop('date')) # add the game
        for dt, st, iw in outer_keys:
            result_ids[total_new_games_added,dt,st,iw,"colley"] = compute.remote(dt,st,iw,game_df2,team_range,"colley")
        total_new_games_added += 1
    for key in result_ids.keys():
        total_new_games_added,dt,st,iw,method = key
        results = ray.get(result_ids[key])
        all_results = all_results.append(pd.Series(results,index=all_results.columns,name=(year,dt,st,iw,method,total_new_games_added)))


  0%|          | 0/17 [00:00<?, ?it/s][A
  6%|▌         | 1/17 [00:00<00:14,  1.12it/s][A
 12%|█▏        | 2/17 [00:01<00:13,  1.11it/s][A
 18%|█▊        | 3/17 [00:02<00:12,  1.10it/s][A
 24%|██▎       | 4/17 [00:03<00:11,  1.10it/s][A
 29%|██▉       | 5/17 [00:04<00:11,  1.06it/s][A
 35%|███▌      | 6/17 [00:05<00:10,  1.06it/s][A
 41%|████      | 7/17 [00:06<00:09,  1.09it/s][A
 47%|████▋     | 8/17 [00:07<00:08,  1.06it/s][A
 53%|█████▎    | 9/17 [00:08<00:07,  1.09it/s][A
 59%|█████▉    | 10/17 [00:09<00:06,  1.16it/s][A
 65%|██████▍   | 11/17 [00:10<00:05,  1.07it/s][A
 71%|███████   | 12/17 [00:12<00:06,  1.25s/it][A
 76%|███████▋  | 13/17 [00:13<00:05,  1.38s/it][A
 82%|████████▏ | 14/17 [00:15<00:04,  1.47s/it][A
 88%|████████▊ | 15/17 [00:17<00:03,  1.57s/it][A
 94%|█████████▍| 16/17 [00:19<00:01,  1.63s/it][A
100%|██████████| 17/17 [00:20<00:00,  1.66s/it][A

In [53]:
all_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,rankings,rs,perms,games_df,D,ID
year,direct_thres,spread_thres,weight_indirect,method,new_games,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2002,0,0,0.0,colley,0,team2 TCU 7 Baylor 9 ...,team2 TCU 0.389083 Baylor ...,team2 TCU 3 Baylor 8 Io...,team1_name team1_score team1_H_A_N ...,,
2002,0,0,0.1,colley,0,team2 TCU 7 Baylor 8 ...,team2 TCU 0.429077 Baylor ...,team2 TCU 3 Baylor 8 Io...,team1_name team1_score team1_H_A_N ...,,
2002,0,0,0.5,colley,0,team2 TCU 7 Baylor 8 ...,team2 TCU 0.589050 Baylor ...,team2 TCU 3 Baylor 5 Io...,team1_name team1_score team1_H_A_N ...,,
2002,0,0,1.0,colley,0,team2 TCU 7 Baylor 8 ...,team2 TCU 0.789016 Baylor ...,team2 TCU 3 Baylor 5 Io...,team1_name team1_score team1_H_A_N ...,,
2002,0,0,0.0,colley,1,team2 TCU 7 Baylor 8 ...,team2 TCU 0.387684 Baylor ...,team2 TCU 3 Baylor 5 Io...,team1_name team1_score team1_H_A_N ...,,
...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,1.0,colley,39,team2 TCU 5 Baylor 6 ...,team2 TCU 0.998428 Baylor ...,team2 TCU 3 Baylor 8 Io...,team1_name team1_score team1_H_A_N ...,,
2018,0,0,0.0,colley,40,team2 TCU 5 Baylor 6 ...,team2 TCU 0.500000 Baylor ...,team2 TCU 3 Baylor 8 Io...,team1_name team1_score team1_H_A_N ...,,
2018,0,0,0.1,colley,40,team2 TCU 5 Baylor 7 ...,team2 TCU 0.550000 Baylor ...,team2 TCU 3 Baylor 8 Io...,team1_name team1_score team1_H_A_N ...,,
2018,0,0,0.5,colley,40,team2 TCU 5 Baylor 7 ...,team2 TCU 0.750000 Baylor ...,team2 TCU 3 Baylor 8 Io...,team1_name team1_score team1_H_A_N ...,,


### Add in the D matrix

In [54]:
for index,row in all_results.iterrows():
    year,dt,st,iw,method,new_games = index
    games_df = row["games_df"]
    D,ID = pyrplib.transformers.ComputeDTransformer(dt, st).fit_transform(games_df)
    row["D"] = D
    row["ID"] = ID

In [55]:
all_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,rankings,rs,perms,games_df,D,ID
year,direct_thres,spread_thres,weight_indirect,method,new_games,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2002,0,0,0.0,colley,0,team2 TCU 7 Baylor 9 ...,team2 TCU 0.389083 Baylor ...,team2 TCU 3 Baylor 8 Io...,team1_name team1_score team1_H_A_N ...,team2 Baylor Iowa_St Kansas Kansas...,team2 Baylor Iowa_St Kansas Kansas...
2002,0,0,0.1,colley,0,team2 TCU 7 Baylor 8 ...,team2 TCU 0.429077 Baylor ...,team2 TCU 3 Baylor 8 Io...,team1_name team1_score team1_H_A_N ...,team2 Baylor Iowa_St Kansas Kansas...,team2 Baylor Iowa_St Kansas Kansas...
2002,0,0,0.5,colley,0,team2 TCU 7 Baylor 8 ...,team2 TCU 0.589050 Baylor ...,team2 TCU 3 Baylor 5 Io...,team1_name team1_score team1_H_A_N ...,team2 Baylor Iowa_St Kansas Kansas...,team2 Baylor Iowa_St Kansas Kansas...
2002,0,0,1.0,colley,0,team2 TCU 7 Baylor 8 ...,team2 TCU 0.789016 Baylor ...,team2 TCU 3 Baylor 5 Io...,team1_name team1_score team1_H_A_N ...,team2 Baylor Iowa_St Kansas Kansas...,team2 Baylor Iowa_St Kansas Kansas...
2002,0,0,0.0,colley,1,team2 TCU 7 Baylor 8 ...,team2 TCU 0.387684 Baylor ...,team2 TCU 3 Baylor 5 Io...,team1_name team1_score team1_H_A_N ...,team2 Baylor Iowa_St Kansas Kansas...,team2 Baylor Iowa_St Kansas Kansas...


In [None]:
all_results.tail()

## Select your parameters
Default is to use all the parameters

In [56]:
# Constructs a dataframe from selected parameters
# columns is a dictionary of form {col_name: col_value}
def get_sel_df(columns):
    sel_df = pd.DataFrame(columns=list(columns.keys()))
    c = 0
    for values in itertools.product(*columns.values()):
        sel_df = sel_df.append(pd.Series(values, index=sel_df.columns, name=c))
        c += 1

    return sel_df

In [57]:
columns = {'direct_thres': direct_thress, 'spread_thres': spread_thress, 'weight_indirect': 
           weight_indirects, 'method': ['colley']}
sel_df = get_sel_df(columns)
sel_df

Unnamed: 0,direct_thres,spread_thres,weight_indirect,method
0,0,0,0.0,colley
1,0,0,0.1,colley
2,0,0,0.5,colley
3,0,0,1.0,colley


In [72]:
top_k = 5
feature_names = [f'top{top_k}_jaccard',f'top{top_k}_union_tau','tau','weighted_tau',"delta_lop","nfrac_upper_lop"]
ms = pd.DataFrame(columns=['new_games1','new_games2',"direct_thres","spread_thres","weight_indirect",'method',
                           'year']+feature_names)

pairs_new_games = [sorted(p) for p in list(itertools.combinations(list(all_results.reset_index()['new_games'].unique()),2))]

def features_details(details,D):
    x = pd.DataFrame(details['x'],index=D.index,columns=D.columns)
    r = x.sum(axis=0)
    order = np.argsort(r)
    xstar = x.iloc[order,:].iloc[:,order]
    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)
    inxs = np.triu_indices(len(xstar),k=1)
    xstar_upper = xstar.values[inxs[0],inxs[1]]
    nfrac_upper = sum((xstar_upper > 0) & (xstar_upper < 1))
    return pd.Series([nfrac_upper],index=["nfrac_upper"])

@ray.remote
def compute_entry(rankings1,rankings2,c,index_vals,D1,D2):
    rankings1_top_k = rankings1.loc[rankings1 < top_k]
    rankings2_top_k = rankings2.loc[rankings2 < top_k]
    val = len(set(rankings1_top_k.index).intersection(set(rankings2_top_k.index)))/len(set(rankings1_top_k.index).union(set(rankings2_top_k.index)))#top_k
    union = set(rankings1_top_k.index).union(rankings2_top_k.index)
    rankings1_union = rankings1.loc[union]
    rankings2_union = rankings2.loc[union]
    tau_union, p_value_union = stats.kendalltau(np.argsort(rankings1_union.values), np.argsort(rankings2_union.values))
    tau,p_value = stats.kendalltau(np.argsort(rankings1.values), np.argsort(rankings2.loc[rankings1.index].values))
    weighted_tau,weighted_p_value = stats.weightedtau(np.argsort(rankings1.values), np.argsort(rankings2.loc[rankings1.index].values))
    delta_lop,details_lop = pyrankability.rank.solve(D1.fillna(0),method="lop",cont=True)
    f1 = features_details(details_lop,D1)
    entry = pd.Series(index_vals+[val,tau_union,tau,weighted_tau,delta_lop,f1.loc["nfrac_upper"]],name=c,index=ms.columns)
    return entry

c=0
for index,row in tqdm(list(sel_df.iterrows())):
    dt,st,iw = row.loc['direct_thres'],row.loc['spread_thres'],row.loc['weight_indirect']
    method = row.loc['method']
    for year in years:
        result_ids = []
        for new_games1,new_games2 in pairs_new_games:
            if (year,dt,st,iw,method,new_games1) not in all_results.index or (year,dt,st,iw,method,new_games2) not in all_results.index:
                continue
            rankings1 = all_results.loc[(year,dt,st,iw,method,new_games1),"rankings"]
            rankings2 = all_results.loc[(year,dt,st,iw,method,new_games2),"rankings"]
            D1 = all_results.loc[(year,dt,st,iw,method,new_games1),"D"]
            D2 = all_results.loc[(year,dt,st,iw,method,new_games2),"D"]
            index_vals = [new_games1,new_games2,dt,st,iw,method,year]
            result_ids.append(compute_entry.remote(rankings1,rankings2,c,index_vals,D1,D2))
            c+=1
        
        for rid in result_ids:
            entry = ray.get(rid)
            ms=ms.append(entry)





  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A



 25%|██▌       | 1/4 [00:43<02:11, 43.93s/it][A[A[A[A



 50%|█████     | 2/4 [01:38<01:34, 47.05s/it][A[A[A[A



 75%|███████▌  | 3/4 [02:42<00:52, 52.26s/it][A[A[A[A



100%|██████████| 4/4 [03:55<00:00, 58.35s/it][A[A[A[A

In [74]:
ms

Unnamed: 0,new_games1,new_games2,direct_thres,spread_thres,weight_indirect,method,year,top5_jaccard,top5_union_tau,tau,weighted_tau,delta_lop,nfrac_upper_lop
0,0,1,0,0,0,colley,2002,1.0,0.666667,0.111111,-0.182917,3.0,18
1,0,2,0,0,0,colley,2002,1.0,0.333333,0.155556,-0.124750,3.0,18
2,0,3,0,0,0,colley,2002,1.0,-0.333333,0.822222,0.810324,3.0,18
3,0,4,0,0,0,colley,2002,1.0,0.333333,0.822222,0.810324,3.0,18
4,0,5,0,0,0,colley,2002,1.0,-0.333333,0.822222,0.810324,3.0,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29547,37,39,0,0,1,colley,2018,1.0,0.000000,0.911111,0.852053,23.0,15
29548,37,40,0,0,1,colley,2018,1.0,0.666667,0.555556,0.632691,23.0,15
29549,38,39,0,0,1,colley,2018,1.0,1.000000,0.911111,0.920336,24.0,15
29550,38,40,0,0,1,colley,2018,1.0,1.000000,0.466667,0.629319,24.0,15


In [75]:
ms.head()

Unnamed: 0,new_games1,new_games2,direct_thres,spread_thres,weight_indirect,method,year,top5_jaccard,top5_union_tau,tau,weighted_tau,delta_lop,nfrac_upper_lop
0,0,1,0,0,0,colley,2002,1.0,0.666667,0.111111,-0.182917,3.0,18
1,0,2,0,0,0,colley,2002,1.0,0.333333,0.155556,-0.12475,3.0,18
2,0,3,0,0,0,colley,2002,1.0,-0.333333,0.822222,0.810324,3.0,18
3,0,4,0,0,0,colley,2002,1.0,0.333333,0.822222,0.810324,3.0,18
4,0,5,0,0,0,colley,2002,1.0,-0.333333,0.822222,0.810324,3.0,18


In [76]:
ms.tail()

Unnamed: 0,new_games1,new_games2,direct_thres,spread_thres,weight_indirect,method,year,top5_jaccard,top5_union_tau,tau,weighted_tau,delta_lop,nfrac_upper_lop
29547,37,39,0,0,1,colley,2018,1.0,0.0,0.911111,0.852053,23.0,15
29548,37,40,0,0,1,colley,2018,1.0,0.666667,0.555556,0.632691,23.0,15
29549,38,39,0,0,1,colley,2018,1.0,1.0,0.911111,0.920336,24.0,15
29550,38,40,0,0,1,colley,2018,1.0,1.0,0.466667,0.629319,24.0,15
29551,39,40,0,0,1,colley,2018,1.0,1.0,0.555556,0.711933,25.0,15


In [77]:
ms['games_diff'] = ms.new_games2 - ms.new_games1
ms['games1_games2'] = ms.new_games1.astype(str) + " to " + ms.new_games2.astype(str)

ms.head()

Unnamed: 0,new_games1,new_games2,direct_thres,spread_thres,weight_indirect,method,year,top5_jaccard,top5_union_tau,tau,weighted_tau,delta_lop,nfrac_upper_lop,games_diff,games1_games2
0,0,1,0,0,0,colley,2002,1.0,0.666667,0.111111,-0.182917,3.0,18,1,0 to 1
1,0,2,0,0,0,colley,2002,1.0,0.333333,0.155556,-0.12475,3.0,18,2,0 to 2
2,0,3,0,0,0,colley,2002,1.0,-0.333333,0.822222,0.810324,3.0,18,3,0 to 3
3,0,4,0,0,0,colley,2002,1.0,0.333333,0.822222,0.810324,3.0,18,4,0 to 4
4,0,5,0,0,0,colley,2002,1.0,-0.333333,0.822222,0.810324,3.0,18,5,0 to 5


### Plots plots plots

In [79]:
index_cols = ['direct_thres','spread_thres','weight_indirect']
index_vals = (0,0,0)

In [82]:
for_source

Unnamed: 0,new_games1,new_games2,year,top5_jaccard,top5_union_tau,tau,weighted_tau,delta_lop,nfrac_upper_lop,games_diff,games1_games2
0,0,1,2002,1.0,0.666667,0.111111,-0.182917,3.0,18,1,0 to 1
20,1,2,2002,1.0,0.000000,0.955556,0.936775,3.0,16,1,1 to 2
39,2,3,2002,1.0,1.000000,0.333333,0.064926,5.0,17,1,2 to 3
57,3,4,2002,1.0,1.000000,1.000000,1.000000,5.0,17,1,3 to 4
74,4,5,2002,1.0,1.000000,1.000000,1.000000,4.0,16,1,4 to 5
...,...,...,...,...,...,...,...,...,...,...,...
7373,35,36,2018,1.0,1.000000,1.000000,1.000000,22.0,5,1,35 to 36
7378,36,37,2018,1.0,1.000000,1.000000,1.000000,23.0,15,1,36 to 37
7382,37,38,2018,1.0,0.666667,0.822222,0.767331,23.0,15,1,37 to 38
7385,38,39,2018,1.0,1.000000,0.866667,0.881136,24.0,15,1,38 to 39


In [99]:
import altair as alt
subset_ms = ms.set_index(index_cols).loc[index_vals].reset_index()
for_source = subset_ms.loc[subset_ms['games_diff'] == 10].drop(['direct_thres','spread_thres','weight_indirect','method'],axis=1)

alt.Chart(for_source).mark_point().encode(
    x=alt.X('tau'),#,sort='new_games1:Q'),
    y=alt.Y('delta_lop')
)

In [66]:
subset_all_results = all_results.reset_index().set_index(index_cols).loc[index_vals].reset_index()
subset_rankings = pd.DataFrame(subset_all_results['rankings'].tolist()).join(subset_all_results[['method','new_games','year']])
subset_rankings

  """Entry point for launching an IPython kernel.


Unnamed: 0,TCU,Baylor,Iowa_St,Kansas,Kansas_St,Oklahoma,Oklahoma_St,Texas,Texas_Tech,West_Virginia,method,new_games,year
0,7,9,10,1,5,3,6,4,2,8,colley,0,2002
1,7,8,10,1,5,2,6,4,3,9,colley,1,2002
2,7,8,10,1,5,2,6,3,4,9,colley,2,2002
3,7,9,10,1,5,2,6,3,4,8,colley,3,2002
4,7,9,10,1,5,2,6,3,4,8,colley,4,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,5,6,10,1,4,7,9,8,3,2,colley,36,2018
482,5,6,10,1,4,7,9,8,3,2,colley,37,2018
483,5,6,10,1,4,8,9,7,2,3,colley,38,2018
484,5,6,10,1,4,7,9,8,2,3,colley,39,2018


In [62]:
import altair as alt

source = subset_rankings.melt(id_vars=["method","new_games","year"])

for year in years:
    print(year)
    g = alt.Chart(source.set_index('year').loc[year]).mark_bar().encode(
        x=alt.X('value',title='Ranking'),
        y=alt.Y('variable',title='Team'),
        color='variable',
        column='new_games'
    )
    display(g)

2002


2003


2004


2005


2006


2007


2008


2009


2010


2011


2012


2013


2014


2015


2016


2017


2018


In [66]:
import altair as alt

source = subset_rankings.melt(id_vars=["method","new_games","year"])

for year in years:
    print(year)
    g = alt.Chart(source.set_index('year').loc[year]).mark_bar().encode(
        x=alt.X('new_games',title='New games'),
        y=alt.Y('value',title='Ranking'),
        color='variable',
        column='variable'
    )
    display(g)

2002


2003


2004


2005


2006


2007


2008


2009


2010


2011


2012


2013


2014


2015


2016


2017


2018


In [None]:
g = alt.Chart(source).mark_bar().encode(
    x=alt.X('new_games',title='New games'),
    y=alt.Y('value',title='Ranking'),
    color='variable',
    row='variable',
    column='year'
)
g

In [73]:
for_source

Unnamed: 0,new_games1,new_games2,year,top5_jaccard,top5_union_tau,tau,weighted_tau,delta_lop,games_diff,games1_games2
0,0,1,2002,1.0,0.000000,0.111111,-0.182917,3.0,1,0 to 1
20,1,2,2002,1.0,0.666667,0.955556,0.936775,3.0,1,1 to 2
39,2,3,2002,1.0,1.000000,0.333333,0.064926,5.0,1,2 to 3
57,3,4,2002,1.0,1.000000,1.000000,1.000000,5.0,1,3 to 4
74,4,5,2002,1.0,1.000000,1.000000,1.000000,4.0,1,4 to 5
...,...,...,...,...,...,...,...,...,...,...
7373,35,36,2018,1.0,1.000000,1.000000,1.000000,22.0,1,35 to 36
7378,36,37,2018,1.0,1.000000,1.000000,1.000000,23.0,1,36 to 37
7382,37,38,2018,1.0,0.000000,0.822222,0.767331,23.0,1,37 to 38
7385,38,39,2018,1.0,1.000000,0.866667,0.881136,24.0,1,38 to 39


In [69]:
source = for_source.melt(id_vars=['year','games1_games2','games_diff','new_games1','new_games2'])
source

Unnamed: 0,year,games1_games2,games_diff,new_games1,new_games2,variable,value
0,2002,0 to 1,1,0,1,top5_jaccard,1.0
1,2002,1 to 2,1,1,2,top5_jaccard,1.0
2,2002,2 to 3,1,2,3,top5_jaccard,1.0
3,2002,3 to 4,1,3,4,top5_jaccard,1.0
4,2002,4 to 5,1,4,5,top5_jaccard,1.0
...,...,...,...,...,...,...,...
2340,2018,35 to 36,1,35,36,delta_lop,22.0
2341,2018,36 to 37,1,36,37,delta_lop,23.0
2342,2018,37 to 38,1,37,38,delta_lop,23.0
2343,2018,38 to 39,1,38,39,delta_lop,24.0


In [89]:
import altair as alt

alt.Chart(source.loc[source['variable']=='weighted_tau']).mark_bar().encode(
    x=alt.X('games1_games2',sort= {"field": "new_games1:Q"}),#,sort='new_games1:Q'),
    y=alt.Y('value:Q'),
    row='year'
)

In [None]:
subset_

In [131]:
means = subset_ms.groupby(['direct_thres','spread_thres','weight_indirect','method','games_diff'])[feature_names].mean()
means['metric'] = 'mean'
maxs = subset_ms.groupby(['direct_thres','spread_thres','weight_indirect','method','games_diff'])[feature_names].max()
maxs['metric'] = 'max'
mins = subset_ms.groupby(['direct_thres','spread_thres','weight_indirect','method','games_diff'])[feature_names].min()
mins['metric'] = 'min'
stdevs = subset_ms.groupby(['direct_thres','spread_thres','weight_indirect','method','games_diff'])[feature_names].std()
stdevs['metric'] = 'stdev'
medians = subset_ms.groupby(['direct_thres','spread_thres','weight_indirect','method','games_diff'])[feature_names].median()
medians['metric'] = 'median'

In [132]:
summary = means.reset_index().append(maxs.reset_index()).append(mins.reset_index()).append(stdevs.reset_index()).append(medians.reset_index())
summary

Unnamed: 0,direct_thres,spread_thres,weight_indirect,method,games_diff,top5_jaccard,top5_union_tau,tau,metric
0,0,0,0.0,colley,1,0.938798,0.780510,0.773484,mean
1,0,0,0.0,colley,2,0.898368,0.654639,0.638628,mean
2,0,0,0.0,colley,3,0.870487,0.565043,0.553718,mean
3,0,0,0.0,colley,4,0.847283,0.497504,0.489183,mean
4,0,0,0.0,colley,5,0.824699,0.432129,0.449025,mean
...,...,...,...,...,...,...,...,...,...
53,0,0,0.0,colley,54,0.600000,0.200000,0.047619,median
54,0,0,0.0,colley,55,0.600000,0.333333,0.047619,median
55,0,0,0.0,colley,56,0.600000,-0.200000,0.028571,median
56,0,0,0.0,colley,57,0.600000,0.000000,0.047619,median


In [133]:
import altair as alt

alt.Chart(summary).mark_line().encode(
    x=alt.X('games_diff'),
    y=alt.Y('tau:Q'),
    color='metric:N'
).properties(width=300, height=200)

In [161]:
dt = ms.loc[ms['games_diff']==1]
dt

Unnamed: 0,new_games1,new_games2,direct_thres,spread_thres,weight_indirect,method,year,top5_jaccard,top5_union_tau,tau,games_diff
0,0,1,0,0,0,colley,2002,1.0,0.0,0.466667,1
26,1,2,0,0,0,colley,2002,0.6,-0.4,0.619048,1
51,2,3,0,0,0,colley,2002,1.0,1.0,0.847619,1
75,3,4,0,0,0,colley,2002,0.6,0.2,0.028571,1
98,4,5,0,0,0,colley,2002,1.0,1.0,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...
67429,53,54,0,0,1,colley,2018,1.0,1.0,1.000000,1
67434,54,55,0,0,1,colley,2018,1.0,1.0,1.000000,1
67438,55,56,0,0,1,colley,2018,0.6,-0.2,0.561905,1
67441,56,57,0,0,1,colley,2018,1.0,1.0,1.000000,1


In [164]:
import altair as alt

alt.Chart(dt).mark_line().encode(
    x=alt.X('new_games2'),
    y=alt.Y('top5_jaccard:Q'),
    row='year:N'
).properties(width=300, height=200)

In [None]:
import altair as alt

alt.Chart(summary).mark_line().encode(
    x=alt.X(''),
    y=alt.Y('tau:Q'),
    color='metric:N'
).properties(width=300, height=200)

In [135]:
import altair as alt

alt.Chart(summary).mark_line().encode(
    x=alt.X('games_diff'),
    y=alt.Y('top5_jaccard:Q'),
    color='metric:N'
).properties(width=300, height=200)

In [136]:
import altair as alt

alt.Chart(summary).mark_line().encode(
    x=alt.X('games_diff'),
    y=alt.Y('top5_union_tau:Q'),
    color='metric:N'
).properties(width=300, height=200)

### Let's find those extreme examples!

In [137]:
subset_ms.set_index('games_diff').loc[1].set_index(['new_games1','new_games2','year'])['tau'].min()

-0.18095238095238098

In [144]:
new_games1,new_games2,year = subset_ms.set_index('games_diff').loc[1].set_index(['new_games1','new_games2','year'])['tau'].idxmin()

In [145]:
year

'2018'

In [146]:
rankings1 = all_results.loc[(year,0,0,0,'colley',new_games1),'rankings']
rankings1

team2
Duke               7
Miami_FL           4
Wake_Forest       14
Boston_College    11
Notre_Dame        12
Pittsburgh        15
Virginia           1
Florida_St         9
Virginia_Tech      8
Georgia_Tech      13
Clemson            2
North_Carolina     3
Louisville         6
Syracuse          10
NC_State           5
dtype: int64

In [147]:
rankings2 = all_results.loc[(year,0,0,0,'colley',new_games2),'rankings']
rankings2

team2
Duke               6
Miami_FL           5
Wake_Forest       14
Boston_College    12
Notre_Dame        10
Pittsburgh        15
Virginia           1
Florida_St         9
Virginia_Tech      8
Georgia_Tech      13
Clemson            2
North_Carolina     3
Louisville         7
Syracuse          11
NC_State           4
dtype: int64

In [148]:
tau,p_value = stats.kendalltau(np.argsort(rankings1.values), np.argsort(rankings2.loc[rankings1.index].values))
tau

-0.18095238095238098

In [149]:
year

'2018'

In [150]:
def compute_D(game_df,team_range,direct_thres,spread_thres):
    map_func = lambda linked: pyrankability.construct.support_map_vectorized_direct_indirect(linked,direct_thres=direct_thres,spread_thres=spread_thres)
    Ds = pyrankability.construct.V_count_vectorized(game_df,map_func)
    for i in range(len(Ds)):
        Ds[i] = Ds[i].reindex(index=team_range,columns=team_range)
    return Ds

In [151]:
dt, st, iw = 0,0,0

team_domain = acc_teams_by_year[year]
team_range = team_domain

game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                        "team1_score":games[year]['points1'],
                        "team1_H_A_N": games[year]['H_A_N1'],
                        "team2_name":games[year]['team2_name'],
                        "team2_score":games[year]['points2'],
                        "team2_H_A_N": games[year]['H_A_N1'],
                        "date": games[year]['date']
                       }).sort_values(by='date')#.drop('date',axis=1)
mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
game_df = game_df.loc[mask]

selection_sunday = base.selectionSundays[year]
end_date = pd.to_datetime(selection_sunday,format="%m/%d/%Y")-end_delta
start_date = pd.to_datetime(selection_sunday,format="%m/%d/%Y")-start_delta
game_df_end = game_df.loc[game_df["date"] <= end_date]
game_indices = list(game_df.index[game_df["date"] < start_date])
new_game_indices = [None] + list(game_df.index[game_df["date"] >= start_date])
    
result_ids = {}
game_df2 = game_df_end.copy().loc[game_indices].drop('date',axis=1)
total_new_games_added = 0
for new_game_index in new_game_indices:
    if new_game_index is not None:
        game_df2 = game_df2.append(game_df_end.loc[new_game_index].drop('date')) # add the game
    if total_new_games_added == new_games1:
        D1s = compute_D(game_df2,team_range,dt,st)
    if total_new_games_added == new_games2:
        D2s = compute_D(game_df2,team_range,dt,st)
        break
    total_new_games_added += 1

In [156]:
D1 = D1s[0].fillna(0)
D2 = D2s[0].fillna(0)

In [157]:
D1.to_csv('D1.csv')
D2.to_csv('D2.csv')

In [158]:
D1-D2

team2,Duke,Miami_FL,Wake_Forest,Boston_College,Notre_Dame,Pittsburgh,Virginia,Florida_St,Virginia_Tech,Georgia_Tech,Clemson,North_Carolina,Louisville,Syracuse,NC_State
team1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Duke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Miami_FL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wake_Forest,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Boston_College,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Notre_Dame,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pittsburgh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Virginia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Florida_St,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Virginia_Tech,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Georgia_Tech,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
sensitivity_target = ms.copy()

sensitivity_data = {}
for year in tqdm(years):
    sensitivity_data[year] = {}
    for days_left in days_to_subtracts:
        game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                                "team1_score":games[year]['points1'],
                                "team1_H_A_N": games[year]['H_A_N1'],
                                "team2_name":games[year]['team2_name'],
                                "team2_score":games[year]['points2'],
                                "team2_H_A_N": games[year]['H_A_N1'],
                                "date": games[year]['date']
                               }).sort_values(by='date')
        mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
        game_df = game_df.loc[mask]
        delta = timedelta(days=days_left)
        game_df_sample = game_df.loc[game_df["date"] <= pd.to_datetime(base.selectionSundays[year],format="%m/%d/%Y")-delta].drop('date',axis=1)
        sensitivity_data[year][f"days_left={days_left}"]=game_df_sample
        
joblib.dump({'description':major_description,'target':sensitivity_target,'data':sensitivity_data,'other':{'madness_teams':madness_teams,'remaining_games':remaining_games,'best_df':best_df,'top_k':top_k,'feature_names':feature_names}},"generate.joblib.z")