# RPLib Problem 0001 - Baseline

Provides the baseline solution for rankability problem 0001.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import copy
import os

import pandas as pd
import numpy as np

from scipy.stats import pearsonr

from tqdm import tqdm
#import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import joblib
import itertools
from pathlib import Path
from sklearn.pipeline import Pipeline

from IPython.display import display, Markdown, Latex

**All project packages are imported from directories located relative to the user's home directory**

In [3]:
home = str(Path.home())

**Import the main rankability package**

In [4]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [5]:
sys.path.insert(0,"%s/RPLib"%home)
import pyrplib

**Load the problem information**

In [6]:
# Load the pre-generated problem bundle. The path is relative, so the file is
# looked up in the notebook's current working directory.
# NOTE(review): joblib.load is pickle-based according to the joblib docs, so it
# should only be pointed at files from a trusted source — confirm provenance.
problem = joblib.load("generate.joblib.z")

## Explore and setup the problem

In [7]:
problem.keys()

dict_keys(['description', 'target', 'data', 'other'])

In [8]:
print(problem["description"])

First representative example for the rankability library. Built around the study of NCAA Men's Basketball league.


In [9]:
problem['target']

Unnamed: 0,days_to_subtract1,days_to_subtract2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,rankings1,rankings2,top10_jaccard,top10_tau,days_diff
0,28,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.388889,7
1,21,35,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.636364,-0.111111,14
2,14,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.055556,21
3,7,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.000000,28
4,21,28,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.722222,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,14,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,0.222222,14
1696,7,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,-0.111111,21
1697,14,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,7
1698,7,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,14


In [10]:
problem['data'].keys()

dict_keys(['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

In [11]:
problem['data']['2002'].keys()

dict_keys(['days_to_subtract=35', 'days_to_subtract=28', 'days_to_subtract=21', 'days_to_subtract=14', 'days_to_subtract=7'])

**Create easier to reference variables**

In [12]:
# Top-level keys of problem['data'] (one entry per season), in stored order.
years = [*problem['data']]
# Keys of the first season's entry (the 'days_to_subtract=N' labels).
days_to_subtract_keys = [*problem['data'][years[0]]]
# Pull the auxiliary entries out of problem['other'] in one pass.
remaining_games, madness_teams, best_df, top_k, feature_names = (
    problem['other'][entry]
    for entry in ('remaining_games', 'madness_teams', 'best_df', 'top_k', 'feature_names')
)
#target_column = f"top{top_k}_intersection"

In [13]:
days_to_subtract_keys

['days_to_subtract=35',
 'days_to_subtract=28',
 'days_to_subtract=21',
 'days_to_subtract=14',
 'days_to_subtract=7']

In [14]:
target = problem['target']

In [15]:
target

Unnamed: 0,days_to_subtract1,days_to_subtract2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,rankings1,rankings2,top10_jaccard,top10_tau,days_diff
0,28,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.388889,7
1,21,35,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.636364,-0.111111,14
2,14,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.055556,21
3,7,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.000000,28
4,21,28,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.722222,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,14,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,0.222222,14
1696,7,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,-0.111111,21
1697,14,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,7
1698,7,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,14


## Define helper functions

**Function to compute a D matrix from games using hyperparameters**

In [16]:
#def compute_D(game_df,team_range,direct_thres,spread_thres):
#    map_func = lambda linked: pyrankability.construct.support_map_vectorized_direct_indirect(linked,direct_thres=direct_thres,spread_thres=spread_thres)
#    Ds = pyrankability.construct.V_count_vectorized(game_df,map_func)
#    for i in range(len(Ds)):
#        Ds[i] = Ds[i].reindex(index=team_range,columns=team_range)
#    return Ds

In [17]:
#def process(data,target,best_df_all):
#    index_cols = ["Year","days_to_subtract_key","direct_thres","spread_thres","weight_indirect","range","Method"]
#    Ds = pd.DataFrame(columns=["D"]+index_cols)
#    Ds.set_index(index_cols,inplace=True)
#    for days_to_subtract_key,year in tqdm(itertools.product(days_to_subtract_keys,years)):
#        days_to_subtract = float(days_to_subtract_key.split("=")[1])
#        best_df = best_df_all.set_index('days_to_subtract').loc[days_to_subtract]
#        for index,row in best_df.iterrows():
#            dom,ran,dt,st,iw,method = row.loc['domain'],row.loc['range'],row.loc['direct_thres'],row.loc['spread_thres'],row.loc['weight_indirect'],row.loc['Method']
#            iw = .1 # Set this so we get both direct and indirect D matrices
#            # set the team_range
#            team_range = None
#            if ran == 'madness':
#                team_range = madness_teams[year]
#            elif ran == 'all':
#                team_range = all_teams[year]
#            else:
#                raise Exception(f"range={ran} not supported")
#            name = (year,days_to_subtract_key,dt,st,iw,ran,method)
#            if iw == 0:
#                st = np.Inf
#            pipe = Pipeline([('compute_D', pyrplib.transformers.ComputeDTransformer(team_range, dt, st))])
#            X = data[year][days_to_subtract_key]
#            pipe.fit(X)
#            D = pipe.transform(X)
#            #D = compute_D(data[year][days_to_subtract_key],team_range,dt,st)
#            Ds = Ds.append(pd.Series([D],index=["D"],name=name)) 
#    return Ds

In [19]:
# Names of the index levels of the D-matrix table; create_features (defined
# further down) unpacks each row's index tuple in exactly this order.
index_cols = ["Year","days_to_subtract_key","direct_thres","spread_thres","weight_indirect","range","Method"]
# NOTE(review): the recorded output of this cell is
#   TypeError: ProcessTransformer() takes 3 positional arguments but 6 were given
# so this call does not match the pyrplib version that was installed when the
# notebook was last run. Every later cell depends on `process_pipe`; confirm the
# current ProcessTransformer signature and adjust the arguments accordingly.
process_pipe = Pipeline([('process_Ds', pyrplib.transformers.ProcessTransformer(index_cols, days_to_subtract_keys, years, 'madness', madness_teams, best_df))])

TypeError: ProcessTransformer() takes 3 positional arguments but 6 were given

## Create D matrices

In [None]:
best_df

In [None]:
# Fit the pipeline on the per-season game data, then transform that same data
# into the table of D matrices.
Ds = process_pipe.fit(problem['data']).transform(problem['data'])
#Ds = process(problem['data'],problem['target'],best_df)

In [None]:
Ds

In [None]:
Ds.iloc[[0,-1]]

In [None]:
Ds.loc['2002',"D"][0][0]

In [None]:
Ds.loc['2002',"D"][0][1]

In [None]:
Ds.index.names

In [None]:
# Build a lookup from
#   (days_to_subtract, Year, direct_thres, spread_thres, weight_indirect, range, Method)
# to the ranking stored in `target` for that key.
#
# Two passes are made: first keyed by 'days_to_subtract2' (value from
# 'rankings2'), then keyed by 'days_to_subtract1' (value from 'rankings1').
# A key produced by both passes therefore ends up with the 'rankings1' value.
# For each key the FIRST matching row of `target` supplies the ranking.
#
# drop_duplicates(subset=...) keeps exactly that first row per key, so each key
# is visited once instead of re-scanning the non-unique index for every row.
_shared_key_cols = ['Year','direct_thres','spread_thres','weight_indirect','range','Method']
rankings = {}
for _days_col, _rankings_col in (('days_to_subtract2', 'rankings2'),
                                 ('days_to_subtract1', 'rankings1')):
    _key_cols = [_days_col] + _shared_key_cols
    _first_rows = target.drop_duplicates(subset=_key_cols)
    _keys = _first_rows[_key_cols].itertuples(index=False, name=None)
    rankings.update(zip(_keys, _first_rows[_rankings_col]))

In [None]:
rankings_df = pd.DataFrame(rankings).T

### Compute the features

In [None]:
feature_columns = ["delta_lop","delta_hillside","nfrac_xstar_lop","nfrac_xstar_hillside","diameter_lop","diameter_hillside"]

#def compute_features(D,rankings,top_k):
#    top_teams = list(rankings.sort_values().index[:top_k])
#    D = D.loc[top_teams,top_teams]
#    delta_lop,details_lop = pyrankability.rank.solve(D.fillna(0),method='lop',cont=True)
#
#    x = pd.DataFrame(details_lop['x'],index=D.index,columns=D.columns)
#    r = x.sum(axis=0)
#    order = np.argsort(r)
#    xstar = x.iloc[order,:].iloc[:,order]
#    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)
#    inxs = np.triu_indices(len(xstar),k=1)
#    xstar_upper = xstar.values[inxs[0],inxs[1]]
#    nfrac_upper_lop = sum((xstar_upper > 0) & (xstar_upper < 1))
#    
#    top_teams = xstar.columns[:top_k]
#    
#    k_two_distant,details_two_distant = pyrankability.search.solve_pair(D.fillna(0),method='lop',minimize=False,verbose=False)
#    d_lop = k_two_distant#details_two_distant['tau']
#    
#    delta_hillside,details_hillside = pyrankability.rank.solve(D,method='hillside',cont=True)
#    
#    x = pd.DataFrame(details_hillside['x'],index=D.index,columns=D.columns)
#    r = x.sum(axis=0)
#    order = np.argsort(r)
#    xstar = x.iloc[order,:].iloc[:,order]
#    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)
#    inxs = np.triu_indices(len(xstar),k=1)
#    xstar_upper = xstar.values[inxs[0],inxs[1]]
#    nfrac_upper_hillside = sum((xstar_upper > 0) & (xstar_upper < 1))
#    
#    top_teams = xstar.columns[:top_k]
#    
#    k_two_distant,details_two_distant = pyrankability.search.solve_pair(D,method='hillside',minimize=False,verbose=False)
#    d_hillside = k_two_distant#details_two_distant['tau']
#    
#    features = pd.Series([delta_lop,delta_hillside,2*nfrac_upper_lop,2*nfrac_upper_hillside,d_lop,d_hillside],index=feature_columns)
#
#    return features

In [None]:
def create_features(Ds,rankings_df,top_k):
    """Compute rankability features for every D matrix stored in ``Ds``.

    Parameters
    ----------
    Ds : pandas.DataFrame
        One row per configuration. Each index entry must unpack into
        ``(year, days_to_subtract_key, dt, st, iw, ran, method)`` and the
        ``"D"`` column must hold a sequence of at most two matrices: position 0
        is labelled "Direct" and position 1 "Indirect".
    rankings_df : pandas.DataFrame
        Rankings indexed by
        ``(days_to_subtract, year, dt, st, iw, ran, method)`` where
        ``days_to_subtract`` is the integer after ``"="`` in
        ``days_to_subtract_key``. Missing entries in the selected row are
        dropped before use.
    top_k : object
        Forwarded unchanged to ``pyrplib.utils.compute_features``.

    Returns
    -------
    pandas.DataFrame
        One row per (configuration, construction), indexed by the index names
        of ``Ds`` plus ``"Construction"``. "Direct" and "Indirect" rows come
        from the individual matrices; a "Both" row is added after the second
        matrix, computed on ``direct + iw * indirect``.

    Raises
    ------
    Exception
        If a ``"D"`` cell contains more than two matrices.
    """
    index_cols = list(Ds.index.names)+["Construction"]
    construction_names = ("Direct", "Indirect")

    # Rows are collected in plain lists and assembled once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    row_keys = []
    row_values = []

    for index,row in tqdm(Ds.iterrows(), total=len(Ds)):
        year,days_to_subtract_key,dt,st,iw,ran,method = index
        days_to_subtract = int(days_to_subtract_key.split("=")[1])
        print(days_to_subtract,year,dt,st,iw,ran,method)
        rankings = rankings_df.loc[days_to_subtract,year,dt,st,iw,ran,method].dropna()

        sum_D = None
        # `row` already is the record for `index`, so its "D" cell is read
        # directly instead of looking the same index up in Ds a second time.
        for i,D in enumerate(row["D"]):
            if i >= len(construction_names):
                raise Exception("Error")

            # Running combination: the first matrix unweighted, later ones
            # scaled by the indirect weight.
            if sum_D is None:
                sum_D = D
            else:
                sum_D = sum_D.add(iw*D,fill_value=0)

            row_values.append(pyrplib.utils.compute_features(D,rankings,top_k,feature_columns))
            row_keys.append(tuple(index)+(construction_names[i],))

            if i == 1:
                row_values.append(pyrplib.utils.compute_features(sum_D,rankings,top_k,feature_columns))
                row_keys.append(tuple(index)+("Both",))

    if not row_values:
        # Nothing to compute: return an empty frame with the expected layout.
        return pd.DataFrame(columns=index_cols + feature_columns).set_index(index_cols)

    X = pd.DataFrame(row_values, index=pd.MultiIndex.from_tuples(row_keys, names=index_cols))
    # Keep the declared feature columns first; any additional entries returned
    # by compute_features follow them.
    extra_columns = [column for column in X.columns if column not in feature_columns]
    return X.reindex(columns=list(feature_columns) + extra_columns)

In [None]:
X = create_features(Ds,rankings_df,top_k)

In [None]:
X

## Refine the target dataset

In [None]:
# Average every column listed in feature_names over the rows of the raw target
# table that share the same snapshot pair, method, year and hyperparameters.
target = (
    problem['target']
    .groupby(
        ['days_to_subtract1', 'days_to_subtract2', 'Method', 'Year',
         'direct_thres', 'spread_thres', 'weight_indirect']
    )[feature_names]
    .mean()
)
target

In [None]:
target.corr()

In [None]:
# Flatten the feature table so its index levels become join columns.
# reset_index() already returns a new frame, so X itself is left untouched.
X_for_join = X.reset_index()
# Numeric form of the snapshot label, e.g. 'days_to_subtract=28' -> 28.0.
# regex=False makes the literal-string replacement explicit.
X_for_join['days_to_subtract1'] = (
    X_for_join['days_to_subtract_key']
    .str.replace("days_to_subtract=", "", regex=False)
    .astype(float)
)
# Item assignment (not attribute assignment) so the column is always written:
# `frame.weight_indirect = ...` silently creates a plain attribute instead of a
# column when the column does not exist.
X_for_join['weight_indirect'] = 0.1
X_for_join

In [None]:
target

In [None]:
# Attach the computed features to the averaged targets. Both sides are indexed
# by the same six key columns, rows with any missing value are discarded, and
# the keys are turned back into ordinary columns.
Xy = (
    target.reset_index()
    .set_index(['Method','days_to_subtract1','Year','direct_thres','spread_thres','weight_indirect'])
    .join(
        X_for_join.set_index(
            ['Method','days_to_subtract1','Year','direct_thres','spread_thres','weight_indirect']
        )
    )
    .dropna()
    .reset_index()
)
Xy

## Process results

In [None]:
# Group every unordered pair of snapshot days by the gap between them.
# Each stored pair is (smaller_day, larger_day).
pairs_by_width = {}
for f1,f2 in itertools.combinations(X_for_join['days_to_subtract1'].unique().astype(int),2):
    f1,f2 = min(f1,f2),max(f1,f2)
    width = f2-f1
    pairs_by_width.setdefault(width, []).append((f1,f2))

In [None]:
pairs_by_width

In [None]:
["days_to_subtract1","days_to_subtract2","Method","Construction"]+feature_columns

In [None]:
#!sudo pip install pandas --upgrade

In [None]:
import altair as alt

# For every target metric, correlate it with each feature inside every
# (Method, Construction, days_to_subtract1, days_to_subtract2) group, and tag
# each result row with the gap ("width") between the two snapshot days.
#
# Rows are gathered in lists and each table is built once: DataFrame.append was
# removed in pandas 2.0 and re-copied the table on every call.
index_cols = ['Method', 'Construction', 'days_to_subtract1', 'days_to_subtract2','width']
group_cols = index_cols[:-1]
graph_dfs = {}
for target_column in feature_names:
    corr_rows = []
    corr_keys = []
    for width, pairs in pairs_by_width.items():
        for first_day, second_day in pairs:
            # A boolean mask simply yields no rows for a pair that is absent
            # from Xy, instead of raising KeyError like a .loc lookup would.
            pair_mask = (Xy['days_to_subtract1'] == first_day) & (Xy['days_to_subtract2'] == second_day)
            # sort=False keeps groups in order of first appearance.
            for group_key, group in Xy[pair_mask].groupby(group_cols, sort=False):
                corr_results = group[[target_column]+feature_columns].corr()
                # Keep only the target's correlation with each feature.
                corr_rows.append(corr_results.loc[target_column].drop(target_column))
                corr_keys.append(tuple(group_key) + (width,))

    graph_dfs[target_column] = pd.DataFrame(
        corr_rows,
        index=pd.MultiIndex.from_tuples(corr_keys, names=index_cols),
        columns=feature_columns,
    )

In [None]:
# Turn the index levels of every per-target table back into ordinary columns.
# Only existing keys are reassigned, so iterating over items() is safe here.
for key, indexed_frame in graph_dfs.items():
    graph_dfs[key] = indexed_frame.reset_index()

In [None]:
graph_dfs[key].head()

In [None]:
# One section per target metric; within it, one bar chart per ranking method.
for key in graph_dfs.keys():
    display(Markdown(f'## {key}'))
    # Long format: one row per (index columns, Feature) with its correlation in 'Value'.
    graph_df = graph_dfs[key].melt(value_vars=feature_columns,id_vars=index_cols,value_name='Value',var_name='Feature')

    for method_name in ('Colley', 'Massey'):
        display(Markdown(f'### {method_name}'))
        method_rows = graph_df.set_index('Method').loc[method_name]
        g = alt.Chart(method_rows).mark_bar().encode(
            x='width:N',
            y=alt.Y('average(Value)',scale=alt.Scale(domain=[-.6, .6])),
            row='Feature:N',
            color='Construction:N',
            column='Construction:N'
        )
        display(g)

### Old below this line

### Colley

In [None]:
# Legacy cell: builds a bar chart from whatever `graph_df` was left behind by
# the plotting loop above (i.e. the last target metric processed).
# NOTE(review): this cell sits under the "Colley" heading but selects the
# 'Massey' rows, and the chart is only rendered by the bare `g` in the next
# cell (under the "Massey" heading) — confirm which method was intended.
g = alt.Chart(graph_df.set_index('Method').loc['Massey']).mark_bar().encode(
    x='width:N',
    y=alt.Y('average(Value)',scale=alt.Scale(domain=[-.6, .6])),
    row='Feature:N',
    color='Construction:N',
    column='Construction:N'
)

### Massey

In [None]:
g

## old below

### 0.6 to 0.7

(Note: the cells in this section were already broken when the notebook was last revised; they are kept for reference only.)

In [None]:
# Legacy cell: per-(Method, Construction) correlation matrices for the
# (0.6, 0.7) pair.
# NOTE(review): this indexes Xy by 'frac1'/'frac2', but no visible cell creates
# those columns (the current tables are keyed by days_to_subtract1/2), so this
# presumably predates the current data layout — confirm before re-running.
# It also relies on `target_column` left over from an earlier loop.
data = Xy.set_index(['frac1','frac2']).loc[(0.6,0.7)].reset_index()
for_corr = data.set_index(['Method','Construction'])
for ix in for_corr.index.unique():
    # Show which group the following correlation matrix belongs to.
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

### 0.7 to 0.8

In [None]:
# Legacy cell: per-Method correlation matrices for the (0.7, 0.8) pair.
# NOTE(review): depends on 'frac1'/'frac2' columns that no visible cell
# creates, and on `target_column` left over from an earlier loop — confirm
# before re-running.
data = Xy.set_index(['frac1','frac2']).loc[(0.7,0.8)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    # Show which group the following correlation matrix belongs to.
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

### 0.8 to 0.9

In [None]:
# Legacy cell: per-Method correlation matrices for the (0.8, 0.9) pair.
# NOTE(review): depends on 'frac1'/'frac2' columns that no visible cell
# creates, and on `target_column` left over from an earlier loop — confirm
# before re-running.
data = Xy.set_index(['frac1','frac2']).loc[(0.8,0.9)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    # Show which group the following correlation matrix belongs to.
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

### 0.9 to 1.

In [None]:
# Legacy cell: per-Method correlation matrices for the (0.9, 1.0) pair.
# The `data` frame built here is also the input of the summary cell that follows.
# NOTE(review): depends on 'frac1'/'frac2' columns that no visible cell
# creates, and on `target_column` left over from an earlier loop — confirm
# before re-running.
data = Xy.set_index(['frac1','frac2']).loc[(0.9,1.)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    # Show which group the following correlation matrix belongs to.
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

In [None]:
# Legacy summary: for every (Method, direct_thres, spread_thres,
# weight_indirect) group in `data`, keep the correlation of `target_column`
# with each feature, one row per group.
# `data` and `target_column` are whatever earlier cells left behind.
# Rows are collected first and the table is built once, because
# DataFrame.append was removed in pandas 2.0.
for_corr = data.set_index(['Method','direct_thres','spread_thres','weight_indirect'])
corr_rows = []
for ix in for_corr.index.unique():
    dt = for_corr.loc[ix][[target_column]+feature_columns].corr().loc[target_column,feature_columns]
    dt.name = ix
    corr_rows.append(dt)
for_display = pd.DataFrame(
    corr_rows,
    index=pd.MultiIndex.from_tuples(
        [corr_row.name for corr_row in corr_rows],
        names=list(for_corr.index.names),
    ),
    columns=feature_columns,
)

In [None]:
for_display.T

In [None]:
print(for_display.T.to_latex())