# RPLib Problem 0001 - Baseline

Provides the baseline approach to rankability problem 0001. 

In [289]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [290]:
import copy
import os

import pandas as pd
import numpy as np

from scipy.stats import pearsonr

from tqdm import tqdm
#import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import joblib
import itertools
from pathlib import Path

from IPython.display import display, Markdown, Latex

**Local packages are located relative to the user's home directory**

In [291]:
home = str(Path.home())

**Import the main rankability package**

In [292]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

**Load the problem information**

In [293]:
# NOTE(review): joblib.load unpickles the archive, so only load files from a trusted source.
# The absolute path below is machine-specific; adjust it to wherever RPLib is stored locally.
problem = joblib.load("/disk/RPLib/problem_0001.joblib.z")

## Explore and setup the problem

In [294]:
problem.keys()

dict_keys(['description', 'target', 'data', 'other'])

In [295]:
print(problem["description"])


A practitioner wants to predict the degree to which a the rankings during season 
of the NCAA Men’s Basketball are likely to change as more games are played (i.e., sensitivity to more games). 
They want to start the analysis after a minimum of 50% of the games are played. 
They want to run Massey and Colley.

Sensitivity of new games will be measured as the intersection of between two 
rankings derived from before and after the new games are included.



In [296]:
problem['target']

Unnamed: 0,frac1,frac2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,top15_intersection
0,0.5,0.6,all,madness,3.0,0,0,Colley,2002,0.800000
1,0.5,0.7,all,madness,3.0,0,0,Colley,2002,0.733333
2,0.5,0.8,all,madness,3.0,0,0,Colley,2002,0.666667
3,0.5,0.9,all,madness,3.0,0,0,Colley,2002,0.666667
4,0.5,1.0,all,madness,3.0,0,0,Colley,2002,0.600000
...,...,...,...,...,...,...,...,...,...,...
3055,0.7,0.9,all,madness,0.0,0,0,Massey,2018,0.933333
3056,0.7,1.0,all,madness,0.0,0,0,Massey,2018,0.866667
3057,0.8,0.9,all,madness,0.0,0,0,Massey,2018,0.866667
3058,0.8,1.0,all,madness,0.0,0,0,Massey,2018,0.866667


In [297]:
problem['data'].keys()

dict_keys(['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

In [298]:
problem['data']['2002'].keys()

dict_keys(['frac=0.5', 'frac=0.6', 'frac=0.7', 'frac=0.8', 'frac=0.9', 'frac=1.0'])

**Create easier to reference variables**

In [299]:
# Season labels, and the "frac=<value>" snapshot keys taken from the first season
years = [*problem['data'].keys()]
frac_keys = [*problem['data'][years[0]].keys()]

# Auxiliary objects shipped with the problem under the 'other' key
remaining_games, madness_teams, best_df, top_k, best_pred_df = (
    problem['other'][name]
    for name in ('remaining_games', 'madness_teams', 'best_df', 'top_k', 'best_pred_df')
)

# Name of the target column, e.g. "top15_intersection" when top_k is 15
target_column = "top{}_intersection".format(top_k)

## Define helper functions

**Function to compute a D matrix from games using hyperparameters**

In [300]:
def compute_D(game_df,team_range,direct_thres,spread_thres,weight_indirect):
    """Build the D matrix for a set of games, restricted to ``team_range``.

    Parameters
    ----------
    game_df : games table handed to ``pyrankability.construct.V_count_vectorized``.
    team_range : labels used for both the rows and the columns of the result.
    direct_thres, spread_thres, weight_indirect : hyperparameters forwarded to
        ``support_map_vectorized_direct_indirect_weighted``.

    Returns
    -------
    The counted matrix reindexed to ``team_range`` on both axes; labels absent
    from the counted matrix come back as NaN rows/columns.
    """
    def support_map(linked):
        # Bind the hyperparameters so the counting routine only supplies `linked`.
        return pyrankability.construct.support_map_vectorized_direct_indirect_weighted(
            linked,
            direct_thres=direct_thres,
            spread_thres=spread_thres,
            weight_indirect=weight_indirect,
        )

    counted = pyrankability.construct.V_count_vectorized(game_df,support_map)
    return counted.reindex(index=team_range,columns=team_range)

In [301]:
def process(data,target,best_df_all):
    """Compute one D matrix per (fraction, season, hyperparameter row).

    Iterates over the module-level ``frac_keys`` and ``years``; for every row of
    ``best_df_all`` whose ``frac`` matches the current fraction, a D matrix is
    built with ``compute_D`` from ``data[year][frac_key]``.

    Parameters
    ----------
    data : mapping ``year -> frac_key -> games table``.
    target : unused; kept so existing callers keep working.
    best_df_all : DataFrame with columns ``frac``, ``range``, ``direct_thres``,
        ``spread_thres`` and ``weight_indirect``.

    Returns
    -------
    DataFrame with a single object column ``"D"`` and a MultiIndex named
    (Year, frac_key, direct_thres, spread_thres, weight_indirect, range).
    Rows are emitted once per ``best_df_all`` row, so repeated hyperparameter
    combinations yield repeated index keys.

    Raises
    ------
    Exception
        If ``range`` is neither ``'madness'`` nor ``'all'``, or if ``'all'`` is
        requested while no module-level ``all_teams`` mapping exists.
    KeyError
        If ``best_df_all`` has no row for one of the fractions.
    """
    index_cols = ["Year","frac_key","direct_thres","spread_thres","weight_indirect","range"]
    # Index the hyperparameter table once rather than once per loop iteration.
    best_by_frac = best_df_all.set_index('frac')
    keys = []
    matrices = []
    for frac_key,year in tqdm(itertools.product(frac_keys,years)):
        frac = float(frac_key.split("=")[1])
        # Selecting with a list keeps a DataFrame even when a single row matches.
        best_df = best_by_frac.loc[[frac]]
        for _,row in best_df.iterrows():
            ran,dt,st,iw = row.loc['range'],row.loc['direct_thres'],row.loc['spread_thres'],row.loc['weight_indirect']
            # set the team_range
            if ran == 'madness':
                team_range = madness_teams[year]
            elif ran == 'all':
                # NOTE(review): `all_teams` is not defined in the visible notebook;
                # fail with an explicit message instead of a bare NameError.
                try:
                    team_range = all_teams[year]
                except NameError as err:
                    raise Exception("range=all requires a module-level `all_teams` mapping (year -> teams)") from err
            else:
                raise Exception(f"range={ran} not supported")
            keys.append((year,frac_key,dt,st,iw,ran))
            matrices.append(compute_D(data[year][frac_key],team_range,dt,st,iw))

    if keys:
        result_index = pd.MultiIndex.from_tuples(keys,names=index_cols)
    else:
        # from_tuples cannot infer the number of levels from an empty list.
        result_index = pd.MultiIndex.from_arrays([[] for _ in index_cols],names=index_cols)

    # Fill an object array element by element so each matrix is stored as one cell.
    cells = np.empty(len(matrices),dtype=object)
    for pos,matrix in enumerate(matrices):
        cells[pos] = matrix
    return pd.DataFrame({"D": cells},index=result_index)

## Create D matrices

In [302]:
best_df

Unnamed: 0,frac,Method,domain,range,direct_thres,spread_thres,weight_indirect
0,0.5,Colley,all,madness,3.0,0,0
1,0.5,Massey,all,madness,0.0,0,0
2,0.6,Colley,all,madness,0.0,0,0
3,0.6,Massey,all,madness,0.0,0,0
4,0.7,Colley,all,madness,0.0,0,0
5,0.7,Massey,all,madness,3.0,0,0
6,0.8,Colley,all,madness,0.0,0,0
7,0.8,Massey,all,madness,3.0,0,0
8,0.9,Colley,all,madness,0.0,0,0
9,0.9,Massey,all,madness,3.0,0,0


In [303]:
Ds = process(problem['data'],problem['target'],best_df)









0it [00:00, ?it/s][A[A[A[A[A[A[A[A







1it [00:00,  2.46it/s][A[A[A[A[A[A[A[A







2it [00:00,  2.49it/s][A[A[A[A[A[A[A[A







3it [00:01,  2.52it/s][A[A[A[A[A[A[A[A







4it [00:01,  2.51it/s][A[A[A[A[A[A[A[A







5it [00:01,  2.48it/s][A[A[A[A[A[A[A[A







6it [00:02,  2.39it/s][A[A[A[A[A[A[A[A







7it [00:02,  2.28it/s][A[A[A[A[A[A[A[A







8it [00:03,  2.25it/s][A[A[A[A[A[A[A[A







9it [00:03,  2.19it/s][A[A[A[A[A[A[A[A







10it [00:04,  2.17it/s][A[A[A[A[A[A[A[A







11it [00:04,  2.04it/s][A[A[A[A[A[A[A[A







12it [00:05,  2.05it/s][A[A[A[A[A[A[A[A







13it [00:05,  2.07it/s][A[A[A[A[A[A[A[A







14it [00:06,  2.08it/s][A[A[A[A[A[A[A[A







15it [00:06,  2.09it/s][A[A[A[A[A[A[A[A







16it [00:07,  2.09it/s][A[A[A[A[A[A[A[A







17it [00:07,  2.07it/s][A[A[A[A[A[A[A[A







18it [00:08,  2.

In [304]:
Ds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,D
Year,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1
2002,frac=0.5,3.0,0,0,madness,team2 Alabama Alcorn_St Arizona B...
2002,frac=0.5,0.0,0,0,madness,team2 Alabama Alcorn_St Arizona B...
2003,frac=0.5,3.0,0,0,madness,team2 Alabama Arizona Arizona_St Aub...
2003,frac=0.5,0.0,0,0,madness,team2 Alabama Arizona Arizona_St Aub...
2004,frac=0.5,3.0,0,0,madness,team2 Air_Force Alabama Alabama_St A...
...,...,...,...,...,...,...
2016,frac=1.0,0.0,0,0,madness,team2 Akron Arizona Ark_Little_Ro...
2017,frac=1.0,3.0,0,0,madness,team2 Arizona Arkansas Baylor Buck...
2017,frac=1.0,0.0,0,0,madness,team2 Arizona Arkansas Baylor Buck...
2018,frac=1.0,3.0,0,0,madness,team2 Alabama Arizona Arkansas Aub...


In [305]:
Ds.iloc[[0,-1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,D
Year,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1
2002,frac=0.5,3.0,0,0,madness,team2 Alabama Alcorn_St Arizona B...
2018,frac=1.0,0.0,0,0,madness,team2 Alabama Arizona Arkansas Aub...


In [306]:
Ds.index.names

FrozenList(['Year', 'frac_key', 'direct_thres', 'spread_thres', 'weight_indirect', 'range'])

### To minimize computation time, limit to 0.5

In [307]:
feature_columns = ["delta_lop","delta_hillside","nfrac_xstar_lop","nfrac_xstar_hillside","diameter_lop","diameter_hillside"]

def _count_fractional_upper(x_values,index,columns):
    """Count strictly fractional entries above the diagonal of a solver's x matrix.

    The matrix is labelled with ``index``/``columns``, its rows and columns are
    reordered by ascending column sum, the values are passed through
    ``pyrankability.common.threshold_x``, and the entries strictly between 0 and 1
    in the strict upper triangle are counted.
    """
    x = pd.DataFrame(x_values,index=index,columns=columns)
    r = x.sum(axis=0)
    order = np.argsort(r)
    xstar = x.iloc[order,:].iloc[:,order]
    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)
    inxs = np.triu_indices(len(xstar),k=1)
    xstar_upper = xstar.values[inxs[0],inxs[1]]
    return sum((xstar_upper > 0) & (xstar_upper < 1))

def compute_features(D,rankings,top_k):
    """Compute the six features for the ``top_k`` best-ranked teams.

    Parameters
    ----------
    D : square DataFrame indexed by team on both axes.
    rankings : Series mapping team -> rank value; the ``top_k`` teams with the
        smallest values are kept.
    top_k : number of teams to keep.

    Returns
    -------
    Series indexed by ``feature_columns``:
    ``delta_*`` is the first value returned by ``pyrankability.rank.solve``,
    ``nfrac_xstar_*`` is twice the count of fractional upper-triangle entries of
    the solver's ``x`` (presumably doubled to account for the lower triangle;
    TODO confirm), and ``diameter_*`` is the ``'tau'`` entry returned by
    ``pyrankability.search.solve_pair_max_tau``.
    """
    top_teams = list(rankings.sort_values().index[:top_k])
    D = D.loc[top_teams,top_teams]

    # LOP: missing entries are replaced with 0 before solving.
    delta_lop,details_lop = pyrankability.rank.solve(D.fillna(0),method='lop',cont=True)
    nfrac_upper_lop = _count_fractional_upper(details_lop['x'],D.index,D.columns)
    _,details_two_distant = pyrankability.search.solve_pair_max_tau(D.fillna(0),method='lop',cont=False,verbose=False)
    d_lop = details_two_distant['tau']

    # Hillside: D is passed as-is, NaNs included.
    # NOTE(review): unlike LOP there is no fillna(0) here; confirm this is intended.
    delta_hillside,details_hillside = pyrankability.rank.solve(D,method='hillside',cont=True)
    nfrac_upper_hillside = _count_fractional_upper(details_hillside['x'],D.index,D.columns)
    _,details_two_distant = pyrankability.search.solve_pair_max_tau(D,method='hillside',verbose=False,cont=False)
    d_hillside = details_two_distant['tau']

    features = pd.Series([delta_lop,delta_hillside,2*nfrac_upper_lop,2*nfrac_upper_hillside,d_lop,d_hillside],index=feature_columns)

    return features

In [308]:
# Derive from the pristine frame so that re-running this cell does not reset the
# index a second time and add a stray "index" column.
best_pred_df = problem['other']['best_pred_df'].reset_index()
# Text key matching the "frac=<value>" keys used in problem['data'][year]
best_pred_df['frac_key'] = "frac="+best_pred_df['frac'].astype(str)
best_pred_df

Unnamed: 0,domain,range,direct_thres,spread_thres,weight_indirect,frac,Method,Year,Predictability,rankings,frac_key
0,all,madness,3.0,0,0,0.5,Colley,2002,64.044944,Alabama 32.0 Alcorn_St 64.0 ...,frac=0.5
1,all,madness,3.0,0,0,0.5,Colley,2003,67.857143,Alabama 7.0 Arizona 3.0 Arizon...,frac=0.5
2,all,madness,3.0,0,0,0.5,Colley,2004,56.989247,Air_Force 36.0 Alabama 22.0 Alabam...,frac=0.5
3,all,madness,3.0,0,0,0.5,Colley,2005,65.934066,Alabama 26.0 Alabama_A&M 65.0 ...,frac=0.5
4,all,madness,3.0,0,0,0.5,Colley,2006,63.953488,Air_Force 19.0 Alabama 47.0 Al...,frac=0.5
...,...,...,...,...,...,...,...,...,...,...,...
199,all,madness,0.0,0,0,1.0,Massey,2014,65.000000,Albany_NY 64.0 American_Univ 59.0 Ar...,frac=1.0
200,all,madness,0.0,0,0,1.0,Massey,2015,69.387755,Albany_NY 61.0 Arizona 3.0 Arkansas...,frac=1.0
201,all,madness,0.0,0,0,1.0,Massey,2016,70.238095,Akron 53.0 Arizona 12....,frac=1.0
202,all,madness,0.0,0,0,1.0,Massey,2017,54.838710,Arizona 26.0 Arkansas 35.0 Ba...,frac=1.0


In [309]:
def create_features(Ds,best_pred_df,top_k):
    """Compute the feature vector for every (D matrix, ranking method) pair.

    Parameters
    ----------
    Ds : DataFrame with an object column ``"D"`` and a MultiIndex whose level
        names are also columns of ``best_pred_df``.
    best_pred_df : DataFrame holding, besides those key columns, a ``Method``
        column and a ``rankings`` column (one rankings object per row).
    top_k : forwarded to ``compute_features``.

    Returns
    -------
    DataFrame with columns ``feature_columns`` and a MultiIndex made of the
    ``Ds`` index levels followed by ``Method``. Each distinct ``Ds`` key
    contributes one row per method found for that key.

    Raises
    ------
    KeyError
        If a ``Ds`` key has no matching row in ``best_pred_df``.
    """
    ds_levels = list(Ds.index.names)
    index_cols = ds_levels+["Method"]
    # Index the predictions once instead of once per D matrix.
    pred_by_key = best_pred_df.set_index(ds_levels)
    keys = []
    records = []
    seen = set()
    for index,row in tqdm(Ds.iterrows()):
        # A repeated key would re-run the expensive solvers for the same key and
        # only add duplicate rows, so each key is processed once.
        if index in seen:
            continue
        seen.add(index)
        # Take the matrix from the row itself; a .loc lookup returns a scalar or a
        # Series depending on whether the index has duplicates.
        D = row["D"]
        # Selecting with a list always yields a DataFrame.
        spec_best_pred_df = pred_by_key.loc[[tuple(index)]]
        by_method = spec_best_pred_df.set_index('Method')
        for method in spec_best_pred_df["Method"].unique():
            rankings = by_method.loc[method,'rankings']
            features = compute_features(D,rankings,top_k)
            keys.append(tuple(index)+(method,))
            records.append(features.reindex(feature_columns).to_numpy())

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and copies on every call.
    X = pd.DataFrame(records,columns=feature_columns)
    if keys:
        X.index = pd.MultiIndex.from_tuples(keys,names=index_cols)
    else:
        # from_tuples cannot infer the number of levels from an empty list.
        X.index = pd.MultiIndex.from_arrays([[] for _ in index_cols],names=index_cols)
    return X

In [None]:
# Features are computed on the top 2*top_k teams of each ranking.
# NOTE(review): best_pred_df was already reset in an earlier cell, so this extra
# reset_index() appears redundant and only adds an "index" column; confirm before removing.
X = create_features(Ds,best_pred_df.reset_index(),top_k*2)









  return self._getitem_tuple(key)
  if __name__ == '__main__':








1it [00:08,  8.15s/it][A[A[A[A[A[A[A[A







2it [00:41, 15.84s/it][A[A[A[A[A[A[A[A







3it [00:47, 12.79s/it][A[A[A[A[A[A[A[A







4it [00:55, 11.28s/it][A[A[A[A[A[A[A[A







5it [00:58,  8.75s/it][A[A[A[A[A[A[A[A







6it [01:01,  7.03s/it][A[A[A[A[A[A[A[A







7it [01:33, 14.72s/it][A[A[A[A[A[A[A[A







8it [01:47, 14.34s/it][A[A[A[A[A[A[A[A







9it [01:53, 11.83s/it][A[A[A[A[A[A[A[A







10it [02:02, 10.99s/it][A[A[A[A[A[A[A[A







11it [02:24, 14.49s/it][A[A[A[A[A[A[A[A







12it [02:37, 13.96s/it][A[A[A[A[A[A[A[A







13it [02:43, 11.38s/it][A[A[A[A[A[A[A[A







14it [02:55, 11.73s/it][A[A[A[A[A[A[A[A







15it [03:39, 21.36s/it][A[A[A[A[A[A[A[A







16it [03:59, 20.90s/it][A[A[A[A[A[A[A[A







17it [04:05, 16.41s/it][A[A[A[A[A[A[A[A




In [None]:
X

## Refine the target dataset

In [None]:
# Collapse the raw target table to one mean value per
# (frac1, frac2, Method, Year, hyperparameters) combination.
target = (
    problem['target']
    .groupby(['frac1','frac2','Method','Year','direct_thres','spread_thres','weight_indirect'])[target_column]
    .mean()
    .to_frame()
)
target

In [None]:
# Flatten the feature index and recover the numeric starting fraction from
# the "frac=<value>" key so it can be matched against the target's frac1.
X_for_join = X.copy().reset_index().assign(
    frac1=lambda frame: frame['frac_key'].str.replace("frac=","").astype(float)
)
X_for_join

In [None]:
target

In [None]:
# Attach the features to every target row sharing the same method, starting
# fraction, season and hyperparameters; rows with any missing value are dropped.
Xy = (
    target.reset_index()
    .set_index(['Method','frac1','Year','direct_thres','spread_thres','weight_indirect'])
    .join(X_for_join.set_index(['Method','frac1','Year','direct_thres','spread_thres','weight_indirect']))
    .dropna()
    .reset_index()
)
Xy

## 0.5 to 0.6

In [None]:
# Rows going from 50% to 60% of the games; one correlation table per method
data = Xy.set_index(['frac1','frac2']).loc[(0.5,0.6)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(
        pd.Series(ix,index=for_corr.index.names),
        for_corr.loc[ix][[target_column]+feature_columns].corr(),
    )

## 0.6 to 0.7

In [None]:
# Rows going from 60% to 70% of the games; one correlation table per method
data = Xy.set_index(['frac1','frac2']).loc[(0.6,0.7)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(
        pd.Series(ix,index=for_corr.index.names),
        for_corr.loc[ix][[target_column]+feature_columns].corr(),
    )

## 0.7 to 0.8

In [None]:
# Rows going from 70% to 80% of the games; one correlation table per method
data = Xy.set_index(['frac1','frac2']).loc[(0.7,0.8)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(
        pd.Series(ix,index=for_corr.index.names),
        for_corr.loc[ix][[target_column]+feature_columns].corr(),
    )

## 0.8 to 0.9

In [None]:
# Rows going from 80% to 90% of the games; one correlation table per method
data = Xy.set_index(['frac1','frac2']).loc[(0.8,0.9)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(
        pd.Series(ix,index=for_corr.index.names),
        for_corr.loc[ix][[target_column]+feature_columns].corr(),
    )

## 0.9 to 1.0

In [None]:
# Rows going from 90% to 100% of the games; one correlation table per method
data = Xy.set_index(['frac1','frac2']).loc[(0.9,1.)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(
        pd.Series(ix,index=for_corr.index.names),
        for_corr.loc[ix][[target_column]+feature_columns].corr(),
    )

In [72]:
# Summarise, per (Method, hyperparameters) group, the correlation between the
# target column and each feature. Uses whichever `data` slice was built last.
for_corr = data.set_index(['Method','direct_thres','spread_thres','weight_indirect'])
# The unique keys already form a MultiIndex carrying the level names, so they can
# label the result directly (this also covers the empty case).
group_keys = for_corr.index.unique()
# Build the frame in one go; DataFrame.append was removed in pandas 2.0 and
# re-copies the frame on every call.
for_display = pd.DataFrame(
    [
        for_corr.loc[ix][[target_column]+feature_columns].corr().loc[target_column,feature_columns].to_numpy()
        for ix in group_keys
    ],
    index=group_keys,
    columns=feature_columns,
)

  """


In [77]:
for_display.T

Method,Massey,Colley
direct_thres,0.0,3.0
spread_thres,3.0,3.0
weight_indirect,0.25,0.00
delta_lop,-0.181122,-0.045966
delta_hillside,-0.118407,0.284547
nfrac_xstar_lop,0.067466,-0.065937
nfrac_xstar_hillside,-0.08798,0.058396
diameter_lop,-0.079867,0.376589
diameter_hillside,0.42151,0.102595


In [76]:
print(for_display.T.to_latex())

\begin{tabular}{lrr}
\toprule
Method &    Massey &    Colley \\
direct\_thres &       0.0 &       3.0 \\
spread\_thres &       3.0 &       3.0 \\
weight\_indirect &      0.25 &      0.00 \\
\midrule
delta\_lop            & -0.181122 & -0.045966 \\
delta\_hillside       & -0.118407 &  0.284547 \\
nfrac\_xstar\_lop      &  0.067466 & -0.065937 \\
nfrac\_xstar\_hillside & -0.087980 &  0.058396 \\
diameter\_lop         & -0.079867 &  0.376589 \\
diameter\_hillside    &  0.421510 &  0.102595 \\
\bottomrule
\end{tabular}

