# RPLib Problem 0001 - Baseline

Provides the baseline solution to rankability problem 0001. It focuses on Massey and Colley out of the box, without ties or indirect game information.

In [6]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import copy
import os

import pandas as pd
import numpy as np

from scipy.stats import pearsonr

from tqdm import tqdm
#import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import joblib
import itertools
from pathlib import Path

from IPython.display import display, Markdown, Latex

**All packages are relative to the home directory of the user**

In [8]:
# Home directory of the current user as a plain string; the toolbox import path is built from it.
home = str(Path.home())

**Import the main rankability package**

In [9]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

**Load the problem information**

In [10]:
# Load the serialized problem bundle (a dict with 'description', 'target', 'data' and 'other').
# NOTE(review): the path is absolute and machine-specific, and joblib.load unpickles the file,
# so only load archives from a trusted source.
problem = joblib.load("/disk/RPLib/problem_0001.joblib.z")

## Explore and setup the problem

In [11]:
problem.keys()

dict_keys(['description', 'target', 'data', 'other'])

In [12]:
print(problem["description"])


A practitioner wants to predict the degree to which a the rankings during season 
of the NCAA Men’s Basketball are likely to change as more games are played (i.e., sensitivity to more games). 
They want to start the analysis after a minimum of 50% of the games are played. 
They want to run Massey and Colley.

Sensitivity of new games will be measured as the intersection of between two 
rankings derived from before and after the new games are included.



In [13]:
problem['target']

Unnamed: 0,frac1,frac2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,top15_intersection
0,0.5,0.6,all,madness,0,0,0,Colley,2002,0.733333
1,0.5,0.7,all,madness,0,0,0,Colley,2002,0.600000
2,0.5,0.8,all,madness,0,0,0,Colley,2002,0.600000
3,0.5,0.9,all,madness,0,0,0,Colley,2002,0.666667
4,0.5,1.0,all,madness,0,0,0,Colley,2002,0.533333
...,...,...,...,...,...,...,...,...,...,...
3055,0.7,0.9,all,madness,0,0,0,Massey,2018,0.933333
3056,0.7,1.0,all,madness,0,0,0,Massey,2018,0.866667
3057,0.8,0.9,all,madness,0,0,0,Massey,2018,0.866667
3058,0.8,1.0,all,madness,0,0,0,Massey,2018,0.866667


In [14]:
problem['data'].keys()

dict_keys(['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

In [15]:
problem['data']['2002'].keys()

dict_keys(['frac=0.5', 'frac=0.6', 'frac=0.7', 'frac=0.8', 'frac=0.9', 'frac=1.0'])

**Create easier to reference variables**

In [16]:
# Short aliases for the parts of ``problem`` that the rest of the notebook uses.
_data, _other = problem['data'], problem['other']

# Season keys, and the fraction keys available for the first season.
years = [*_data]
frac_keys = [*_data[years[0]]]

remaining_games = _other['remaining_games']
madness_teams = _other['madness_teams']
best_df = _other['best_df']
top_k = _other['top_k']
# Name of the target column in problem['target'], derived from top_k.
target_column = "top%s_intersection" % top_k
best_pred_df = _other['best_pred_df']

## Define helper functions

**Function to compute a D matrix from games using hyperparameters**

In [17]:
def compute_D(game_df,team_range,direct_thres,spread_thres):
    """Build the D matrices for one set of games, restricted to ``team_range``.

    ``game_df`` is handed to ``pyrankability.construct.V_count_vectorized`` together
    with a support-map callback configured with ``direct_thres`` and ``spread_thres``.
    Every matrix returned is then reindexed so that both its rows and its columns
    are exactly ``team_range`` (teams absent from the games become NaN rows/columns).

    Returns the (mutated) container produced by ``V_count_vectorized``.
    """
    def map_func(linked):
        # Bind the two thresholds so the toolbox only has to supply ``linked``.
        return pyrankability.construct.support_map_vectorized_direct_indirect(
            linked,
            direct_thres=direct_thres,
            spread_thres=spread_thres,
        )

    Ds = pyrankability.construct.V_count_vectorized(game_df,map_func)
    position = 0
    while position < len(Ds):
        Ds[position] = Ds[position].reindex(index=team_range,columns=team_range)
        position += 1
    return Ds

In [18]:
def process(data,target,best_df_all):
    """Compute the D matrices for every (fraction, year, configuration) combination.

    Iterates over the module-level ``frac_keys`` and ``years``. For each pair, the
    rows of ``best_df_all`` whose ``frac`` equals the fraction are selected and
    ``compute_D`` is called once per row.

    Parameters
    ----------
    data : mapping
        Indexed as ``data[year][frac_key]``; each value is passed to ``compute_D``
        as the games table.
    target : object
        Not used by this function; kept so existing calls keep working.
    best_df_all : pandas.DataFrame
        Needs the columns ``frac``, ``domain``, ``range``, ``direct_thres``,
        ``spread_thres``, ``weight_indirect`` and ``Method``.

    Returns
    -------
    pandas.DataFrame
        A single column ``"D"`` holding whatever ``compute_D`` returned, indexed by
        (Year, frac_key, direct_thres, spread_thres, weight_indirect, range, Method).

    Raises
    ------
    Exception
        If a row's ``range`` is neither ``'madness'`` nor ``'all'``.
    KeyError
        If a fraction from ``frac_keys`` has no row in ``best_df_all``.
    """
    index_cols = ["Year","frac_key","direct_thres","spread_thres","weight_indirect","range","Method"]
    # Index once instead of on every loop iteration.
    best_by_frac = best_df_all.set_index('frac')
    names = []
    matrices = []
    combos = itertools.product(frac_keys,years)
    for frac_key,year in tqdm(combos,total=len(frac_keys)*len(years)):
        frac = float(frac_key.split("=")[1])
        # A list key always yields a DataFrame, even when only one row matches.
        best_df = best_by_frac.loc[[frac]]
        for index,row in best_df.iterrows():
            dom,ran,dt,st,iw,method = row.loc['domain'],row.loc['range'],row.loc['direct_thres'],row.loc['spread_thres'],row.loc['weight_indirect'],row.loc['Method']
            iw = 1 # Set this so we get both direct and indirect D matrices
            # set the team_range
            team_range = None
            if ran == 'madness':
                team_range = madness_teams[year]
            elif ran == 'all':
                # NOTE(review): ``all_teams`` is not defined anywhere in the visible
                # notebook; this branch raises NameError unless it is defined elsewhere.
                team_range = all_teams[year]
            else:
                raise Exception(f"range={ran} not supported")
            name = (year,frac_key,dt,st,iw,ran,method)
            # With ``iw`` forced to 1 above this branch is currently never taken.
            if iw == 0:
                st = np.inf
            names.append(name)
            matrices.append(compute_D(data[year][frac_key],team_range,dt,st))

    # Store each compute_D result as one opaque object per cell; an explicit object
    # array stops pandas/numpy from trying to unpack the nested containers.
    column = np.empty(len(matrices),dtype=object)
    for position,D in enumerate(matrices):
        column[position] = D
    row_index = pd.MultiIndex.from_tuples(names,names=index_cols)
    return pd.DataFrame({"D": column},index=row_index)

## Create D matrices

In [19]:
best_df

Unnamed: 0,frac,Method,domain,range,direct_thres,spread_thres,weight_indirect
0,0.5,Colley,all,madness,0,0,0
1,0.5,Massey,all,madness,0,0,0
2,0.6,Colley,all,madness,0,0,0
3,0.6,Massey,all,madness,0,0,0
4,0.7,Colley,all,madness,0,0,0
5,0.7,Massey,all,madness,0,0,0
6,0.8,Colley,all,madness,0,0,0
7,0.8,Massey,all,madness,0,0,0
8,0.9,Colley,all,madness,0,0,0
9,0.9,Massey,all,madness,0,0,0


In [20]:
# Build the D matrices for every year and fraction (the recorded run took roughly 3.5 minutes).
Ds = process(problem['data'],problem['target'],best_df)

102it [03:27,  3.53s/it]


In [21]:
Ds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,D
Year,frac_key,direct_thres,spread_thres,weight_indirect,range,Method,Unnamed: 7_level_1
2002,frac=0.5,0,0,1,madness,Colley,"[[Alabama, Alcorn_St, Arizona, Boston_College,..."
2002,frac=0.5,0,0,1,madness,Massey,"[[Alabama, Alcorn_St, Arizona, Boston_College,..."
2003,frac=0.5,0,0,1,madness,Colley,"[[Alabama, Arizona, Arizona_St, Auburn, Austin..."
2003,frac=0.5,0,0,1,madness,Massey,"[[Alabama, Arizona, Arizona_St, Auburn, Austin..."
2004,frac=0.5,0,0,1,madness,Colley,"[[Air_Force, Alabama, Alabama_St, Arizona, BYU..."
...,...,...,...,...,...,...,...
2016,frac=1.0,0,0,1,madness,Massey,"[[Akron, Arizona, Ark_Little_Rock, Baylor, Bel..."
2017,frac=1.0,0,0,1,madness,Colley,"[[Arizona, Arkansas, Baylor, Bucknell, Butler,..."
2017,frac=1.0,0,0,1,madness,Massey,"[[Arizona, Arkansas, Baylor, Bucknell, Butler,..."
2018,frac=1.0,0,0,1,madness,Colley,"[[Alabama, Arizona, Arkansas, Auburn, Bucknell..."


In [22]:
Ds.iloc[[0,-1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,D
Year,frac_key,direct_thres,spread_thres,weight_indirect,range,Method,Unnamed: 7_level_1
2002,frac=0.5,0,0,1,madness,Colley,"[[Alabama, Alcorn_St, Arizona, Boston_College,..."
2018,frac=1.0,0,0,1,madness,Massey,"[[Alabama, Arizona, Arkansas, Auburn, Bucknell..."


In [23]:
# First 2002 entry, element 0 of its D list (labelled "Direct" by create_features).
# .iloc makes the positional lookup explicit instead of relying on an integer key
# falling back to position on a label-indexed Series.
Ds.loc['2002',"D"].iloc[0][0]

  """Entry point for launching an IPython kernel.


team2,Alabama,Alcorn_St,Arizona,Boston_College,Boston_Univ,California,Central_Conn,Charlotte,Cincinnati,Connecticut,...,UNC_Wilmington,USC,Utah,Valparaiso,W_Kentucky,Wake_Forest,Winthrop,Wisconsin,Wyoming,Xavier
team1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,,,,,,,,,,,...,,,1.0,,,,,,,
Alcorn_St,,,,,,,,,,,...,,,,,,,,,,
Arizona,,,,,,,,,,,...,,,,1.0,,,,,,
Boston_College,,,,,1.0,,,,,,...,,,,,,,,,,
Boston_Univ,,,,0.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wake_Forest,,,,,,,,,,,...,,0.0,,,,,,,,
Winthrop,,,,,,,,,,,...,,,,,,,,,,0.0
Wisconsin,,,,,,,,,,,...,,,,,,,,,,
Wyoming,,,,,,,,,,,...,,0.0,,,,,,,,


In [24]:
# First 2002 entry, element 1 of its D list (labelled "Indirect" by create_features).
# .iloc makes the positional lookup explicit instead of relying on an integer key
# falling back to position on a label-indexed Series.
Ds.loc['2002',"D"].iloc[0][1]

  """Entry point for launching an IPython kernel.


team2,Alabama,Alcorn_St,Arizona,Boston_College,Boston_Univ,California,Central_Conn,Charlotte,Cincinnati,Connecticut,...,UNC_Wilmington,USC,Utah,Valparaiso,W_Kentucky,Wake_Forest,Winthrop,Wisconsin,Wyoming,Xavier
team1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,,0.0,,,,,,1.0,,0.0,...,,0.0,,,,0.0,1.0,1.0,,0.0
Alcorn_St,0.0,,,,,0.0,,,,,...,,,,0.0,,0.0,0.0,0.0,,
Arizona,,,,,,0.0,,2.0,0.0,1.0,...,,1.0,0.0,1.0,,,0.0,,,1.0
Boston_College,,,,,0.0,0.0,1.0,,,1.0,...,0.0,0.0,,,,,1.0,,,0.0
Boston_Univ,,,,0.0,,0.0,0.0,0.0,,0.0,...,,,,0.0,,,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wake_Forest,0.0,1.0,,,,0.0,,0.0,0.0,,...,1.0,0.0,1.0,1.0,,,1.0,,0.0,
Winthrop,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,,...,,,0.0,0.0,,0.0,,,0.0,
Wisconsin,0.0,1.0,,,1.0,,,0.0,0.0,0.0,...,,,,,,,,,,0.0
Wyoming,,,,,,,,,0.0,,...,,,0.0,0.0,,0.0,1.0,,,


In [25]:
Ds.index.names

FrozenList(['Year', 'frac_key', 'direct_thres', 'spread_thres', 'weight_indirect', 'range', 'Method'])

### Compute the features

In [26]:
# Rebuild from the pristine frame stored in ``problem`` so that re-running this cell
# is idempotent (resetting the index of an already-reset frame would add a stray column).
best_pred_df = (
    problem['other']['best_pred_df']
    .reset_index()
    # String key matching the 'frac=<value>' keys used in problem['data'].
    .assign(frac_key=lambda df: "frac="+df['frac'].astype(str))
)
best_pred_df

Unnamed: 0,domain,range,direct_thres,spread_thres,weight_indirect,frac,Method,Year,Predictability,rankings,frac_key
0,all,madness,0,0,0,0.5,Colley,2002,64.044944,Alabama 20.0 Alcorn_St 62.0 ...,frac=0.5
1,all,madness,0,0,0,0.5,Colley,2003,55.952381,Alabama 13.0 Arizona 2.0 Arizon...,frac=0.5
2,all,madness,0,0,0,0.5,Colley,2004,58.064516,Air_Force 43.0 Alabama 23.0 Alabam...,frac=0.5
3,all,madness,0,0,0,0.5,Colley,2005,62.637363,Alabama 18.0 Alabama_A&M 65.0 ...,frac=0.5
4,all,madness,0,0,0,0.5,Colley,2006,65.116279,Air_Force 32.0 Alabama 52.0 Al...,frac=0.5
...,...,...,...,...,...,...,...,...,...,...,...
199,all,madness,0,0,0,1.0,Massey,2014,65.000000,Albany_NY 64.0 American_Univ 59.0 Ar...,frac=1.0
200,all,madness,0,0,0,1.0,Massey,2015,69.387755,Albany_NY 61.0 Arizona 3.0 Arkansas...,frac=1.0
201,all,madness,0,0,0,1.0,Massey,2016,70.238095,Akron 53.0 Arizona 12....,frac=1.0
202,all,madness,0,0,0,1.0,Massey,2017,54.838710,Arizona 26.0 Arkansas 35.0 Ba...,frac=1.0


In [27]:
feature_columns = ["delta_lop","delta_hillside","nfrac_xstar_lop","nfrac_xstar_hillside","diameter_lop","diameter_hillside"]

def _count_fractional_upper(x_values,D):
    """Count strictly fractional entries above the diagonal of a reordered solution.

    ``x_values`` is labelled with ``D``'s index/columns, reordered on both axes by
    ascending column sum, passed through ``pyrankability.common.threshold_x``, and
    the upper-triangle entries lying strictly between 0 and 1 are counted.
    """
    x = pd.DataFrame(x_values,index=D.index,columns=D.columns)
    r = x.sum(axis=0)
    order = np.argsort(r)
    xstar = x.iloc[order,:].iloc[:,order]
    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)
    inxs = np.triu_indices(len(xstar),k=1)
    xstar_upper = xstar.values[inxs[0],inxs[1]]
    return sum((xstar_upper > 0) & (xstar_upper < 1))

def compute_features(D,rankings,top_k):
    """Compute the six rankability features for the ``top_k`` best-ranked teams.

    Parameters
    ----------
    D : pandas.DataFrame
        Square matrix labelled by team on both axes.
    rankings : pandas.Series
        Indexed by team; the ``top_k`` smallest values select the sub-matrix of ``D``.
    top_k : int
        Number of teams to keep.

    Returns
    -------
    pandas.Series
        Indexed by ``feature_columns``:
        ``delta_*`` is the first value returned by ``pyrankability.rank.solve``;
        ``nfrac_xstar_*`` is twice the count of fractional upper-triangle entries of
        the reordered, thresholded ``x`` solution;
        ``diameter_*`` is the ``'tau'`` entry reported by
        ``pyrankability.search.solve_pair_max_tau``.
        The LOP variants run on ``D`` with NaN replaced by 0; the hillside variants
        receive ``D`` as is.
    """
    top_teams = list(rankings.sort_values().index[:top_k])
    D = D.loc[top_teams,top_teams]

    # --- LOP ---
    delta_lop,details_lop = pyrankability.rank.solve(D.fillna(0),method='lop',cont=True)
    nfrac_upper_lop = _count_fractional_upper(details_lop['x'],D)
    _,details_two_distant = pyrankability.search.solve_pair_max_tau(D.fillna(0),method='lop',cont=False,verbose=False)
    d_lop = details_two_distant['tau']

    # --- Hillside ---
    delta_hillside,details_hillside = pyrankability.rank.solve(D,method='hillside',cont=True)
    nfrac_upper_hillside = _count_fractional_upper(details_hillside['x'],D)
    _,details_two_distant = pyrankability.search.solve_pair_max_tau(D,method='hillside',verbose=False,cont=False)
    d_hillside = details_two_distant['tau']

    features = pd.Series([delta_lop,delta_hillside,2*nfrac_upper_lop,2*nfrac_upper_hillside,d_lop,d_hillside],index=feature_columns)

    return features

In [28]:
def create_features(Ds,best_pred_df,top_k):
    """Compute rankability features for every row of ``Ds``.

    Each row of ``Ds`` holds a list of D matrices in its ``"D"`` cell. Element 0 is
    labelled ``"Direct"`` and element 1 ``"Indirect"``; after element 1 an extra
    ``"Both"`` row is produced from ``D[0] + weight_indirect * D[1]`` (missing
    entries treated as 0). Any further element raises.

    Parameters
    ----------
    Ds : pandas.DataFrame
        Output of ``process``: column ``"D"`` with a 7-level index
        (Year, frac_key, direct_thres, spread_thres, weight_indirect, range, Method).
    best_pred_df : pandas.DataFrame
        Needs columns ``Year``, ``frac_key``, ``Method`` and ``rankings``; the first
        matching ``rankings`` entry is passed to ``compute_features``.
    top_k : int
        Forwarded to ``compute_features``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_columns``; index is the index of ``Ds`` extended with a
        ``Construction`` level.
    """
    index_cols = list(Ds.index.names)+["Construction"]
    # Index the rankings table once instead of once per row of Ds.
    rankings_lookup = best_pred_df.set_index(['Year','frac_key',"Method"])
    names = []
    values = []
    for index,row in tqdm(Ds.iterrows(),total=len(Ds)):
        sum_D = None
        year,frac_key,dt,st,iw,ran,method = index
        spec_best_pred_df = rankings_lookup.loc[[(year,frac_key,method)]]
        # Explicit positional access: take the first matching rankings entry.
        rankings = spec_best_pred_df['rankings'].iloc[0]
        for i,D in enumerate(row["D"]):
            if sum_D is None:
                sum_D = D
            else:
                sum_D = sum_D.add(iw*D,fill_value=0)
            if i == 0:
                construction = "Direct"
            elif i == 1:
                construction = "Indirect"
            else:
                raise Exception("Error")
            features = compute_features(D,rankings,top_k)
            names.append(tuple(list(index)+[construction]))
            values.append(features.reindex(feature_columns).to_numpy())

            if i == 1:
                construction = "Both"
                features = compute_features(sum_D,rankings,top_k)
                names.append(tuple(list(index)+[construction]))
                values.append(features.reindex(feature_columns).to_numpy())
    # Assemble the frame once; row-by-row DataFrame.append is quadratic and no
    # longer exists in pandas 2.x.
    row_index = pd.MultiIndex.from_tuples(names,names=index_cols)
    return pd.DataFrame(values,index=row_index,columns=feature_columns)

In [29]:
top_k

15

In [30]:
# NOTE(review): features are computed on the top 2*top_k teams, whereas the target column
# name is derived from top_k itself — confirm the doubling is intentional.
# The recorded run of this cell took about 1h42m.
X = create_features(Ds,best_pred_df,2*top_k)

0it [00:00, ?it/s]

Using license file /home/jupyter-pander14/gurobi.lic
Academic license - for non-commercial use only - expires 2021-02-06


204it [1:42:07, 21.03s/it]


In [31]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
Year,frac_key,direct_thres,spread_thres,weight_indirect,range,Method,Construction,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2002,frac=0.5,0,0,1,madness,Colley,Direct,3.0,6.0,684.0,724.0,-0.485057,-0.604598
2002,frac=0.5,0,0,1,madness,Colley,Indirect,21.0,968.0,458.0,28.0,0.245977,0.935632
2002,frac=0.5,0,0,1,madness,Colley,Both,28.0,1257.0,140.0,24.0,0.678161,0.949425
2002,frac=0.5,0,0,1,madness,Massey,Direct,1.0,2.0,444.0,600.0,0.016092,-0.278161
2002,frac=0.5,0,0,1,madness,Massey,Indirect,22.0,849.0,436.0,92.0,0.177011,0.802299
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,frac=1.0,0,0,1,madness,Colley,Indirect,216.0,4134.0,98.0,6.0,0.779310,0.986207
2018,frac=1.0,0,0,1,madness,Colley,Both,248.0,4329.0,140.0,24.0,0.701149,0.949425
2018,frac=1.0,0,0,1,madness,Massey,Direct,24.0,78.0,532.0,122.0,-0.131034,0.737931
2018,frac=1.0,0,0,1,madness,Massey,Indirect,287.0,4881.0,84.0,74.0,0.806897,0.820690


## Refine the target dataset

In [32]:
# Average the target over every column that is not a grouping key,
# leaving one value per (frac1, frac2, Method, Year, thresholds, weight).
target_group_cols = ['frac1','frac2','Method','Year','direct_thres','spread_thres','weight_indirect']
target = problem['target'].groupby(target_group_cols)[[target_column]].mean()
target

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,top15_intersection
frac1,frac2,Method,Year,direct_thres,spread_thres,weight_indirect,Unnamed: 7_level_1
0.5,0.6,Colley,2002,0,0,0,0.733333
0.5,0.6,Colley,2003,0,0,0,0.600000
0.5,0.6,Colley,2004,0,0,0,0.666667
0.5,0.6,Colley,2005,0,0,0,0.733333
0.5,0.6,Colley,2006,0,0,0,0.800000
...,...,...,...,...,...,...,...
0.9,1.0,Massey,2014,0,0,0,0.866667
0.9,1.0,Massey,2015,0,0,0,0.933333
0.9,1.0,Massey,2016,0,0,0,0.933333
0.9,1.0,Massey,2017,0,0,0,0.866667


In [33]:
# Flatten X for joining with the target table. reset_index() already returns a new
# frame, so no explicit copy is needed.
X_for_join = X.reset_index().assign(
    # Numeric starting fraction parsed from the 'frac=<value>' key (literal replace, no regex).
    frac1=lambda df: df['frac_key'].str.replace("frac=","",regex=False).astype(float),
    # The D matrices were built with weight_indirect forced to 1; reset it to 0 so the
    # rows line up with the target table, whose weight_indirect is 0.
    weight_indirect=0,
)
X_for_join

Unnamed: 0,Year,frac_key,direct_thres,spread_thres,weight_indirect,range,Method,Construction,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside,frac1
0,2002,frac=0.5,0,0,0,madness,Colley,Direct,3.0,6.0,684.0,724.0,-0.485057,-0.604598,0.5
1,2002,frac=0.5,0,0,0,madness,Colley,Indirect,21.0,968.0,458.0,28.0,0.245977,0.935632,0.5
2,2002,frac=0.5,0,0,0,madness,Colley,Both,28.0,1257.0,140.0,24.0,0.678161,0.949425,0.5
3,2002,frac=0.5,0,0,0,madness,Massey,Direct,1.0,2.0,444.0,600.0,0.016092,-0.278161,0.5
4,2002,frac=0.5,0,0,0,madness,Massey,Indirect,22.0,849.0,436.0,92.0,0.177011,0.802299,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,2018,frac=1.0,0,0,0,madness,Colley,Indirect,216.0,4134.0,98.0,6.0,0.779310,0.986207,1.0
608,2018,frac=1.0,0,0,0,madness,Colley,Both,248.0,4329.0,140.0,24.0,0.701149,0.949425,1.0
609,2018,frac=1.0,0,0,0,madness,Massey,Direct,24.0,78.0,532.0,122.0,-0.131034,0.737931,1.0
610,2018,frac=1.0,0,0,0,madness,Massey,Indirect,287.0,4881.0,84.0,74.0,0.806897,0.820690,1.0


In [34]:
target

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,top15_intersection
frac1,frac2,Method,Year,direct_thres,spread_thres,weight_indirect,Unnamed: 7_level_1
0.5,0.6,Colley,2002,0,0,0,0.733333
0.5,0.6,Colley,2003,0,0,0,0.600000
0.5,0.6,Colley,2004,0,0,0,0.666667
0.5,0.6,Colley,2005,0,0,0,0.733333
0.5,0.6,Colley,2006,0,0,0,0.800000
...,...,...,...,...,...,...,...
0.9,1.0,Massey,2014,0,0,0,0.866667
0.9,1.0,Massey,2015,0,0,0,0.933333
0.9,1.0,Massey,2016,0,0,0,0.933333
0.9,1.0,Massey,2017,0,0,0,0.866667


In [35]:
# Attach the features to every target row that shares the same starting fraction and
# configuration; rows with any missing value after the join are discarded.
join_keys = ['Method','frac1','Year','direct_thres','spread_thres','weight_indirect']
target_by_key = target.reset_index().set_index(join_keys)
features_by_key = X_for_join.set_index(join_keys)
Xy = target_by_key.join(features_by_key).dropna()
Xy = Xy.reset_index()
Xy

Unnamed: 0,Method,frac1,Year,direct_thres,spread_thres,weight_indirect,frac2,top15_intersection,frac_key,range,Construction,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
0,Colley,0.5,2002,0,0,0,0.6,0.733333,frac=0.5,madness,Direct,3.0,6.0,684.0,724.0,-0.485057,-0.604598
1,Colley,0.5,2002,0,0,0,0.6,0.733333,frac=0.5,madness,Indirect,21.0,968.0,458.0,28.0,0.245977,0.935632
2,Colley,0.5,2002,0,0,0,0.6,0.733333,frac=0.5,madness,Both,28.0,1257.0,140.0,24.0,0.678161,0.949425
3,Colley,0.5,2002,0,0,0,0.7,0.600000,frac=0.5,madness,Direct,3.0,6.0,684.0,724.0,-0.485057,-0.604598
4,Colley,0.5,2002,0,0,0,0.7,0.600000,frac=0.5,madness,Indirect,21.0,968.0,458.0,28.0,0.245977,0.935632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1525,Massey,0.9,2017,0,0,0,1.0,0.866667,frac=0.9,madness,Indirect,172.0,4056.0,270.0,24.0,0.489655,0.949425
1526,Massey,0.9,2017,0,0,0,1.0,0.866667,frac=0.9,madness,Both,206.0,4440.0,128.0,10.0,0.719540,0.977011
1527,Massey,0.9,2018,0,0,0,1.0,0.866667,frac=0.9,madness,Direct,24.0,106.0,598.0,156.0,-0.245977,0.650575
1528,Massey,0.9,2018,0,0,0,1.0,0.866667,frac=0.9,madness,Indirect,227.0,4677.0,32.0,4.0,0.926437,0.990805


## Process results

In [36]:
# Group every (start, end) pair of fractions by the width of the window in
# percentage points, e.g. (0.5, 0.7) -> 20.
fracs = itertools.combinations([0.5,0.6,0.7,0.8,0.9,1.],2)
pairs_by_width = {}
for first,second in fracs:
    # Order the pair so the smaller fraction comes first.
    lower,upper = (second,first) if second < first else (first,second)
    width = round(100*(upper-lower))
    pairs_by_width.setdefault(width,[]).append((lower,upper))

In [37]:
pairs_by_width

{10: [(0.5, 0.6), (0.6, 0.7), (0.7, 0.8), (0.8, 0.9), (0.9, 1.0)],
 20: [(0.5, 0.7), (0.6, 0.8), (0.7, 0.9), (0.8, 1.0)],
 30: [(0.5, 0.8), (0.6, 0.9), (0.7, 1.0)],
 40: [(0.5, 0.9), (0.6, 1.0)],
 50: [(0.5, 1.0)]}

In [38]:
["frac1","frac2","Method","Construction"]+feature_columns

['frac1',
 'frac2',
 'Method',
 'Construction',
 'delta_lop',
 'delta_hillside',
 'nfrac_xstar_lop',
 'nfrac_xstar_hillside',
 'diameter_lop',
 'diameter_hillside']

In [None]:
#!sudo pip install pandas --upgrade

In [39]:
import altair as alt

# For each window width, correlate the target with every feature within each
# (Method, Construction, frac1, frac2) group and chart the correlations.
summary_index_cols = ['Method','Construction',"frac1","frac2"]
corr_columns = [target_column]+feature_columns
# Index once; the per-pair lookups below reuse it.
Xy_by_pair = Xy.set_index(['frac1','frac2'])

for width in [10,20,30,40]:
    display(Markdown(f'### {width}'))
    summary_keys = []
    summary_rows = []
    for pair in pairs_by_width[width]:
        data = Xy_by_pair.loc[pair].reset_index()
        for_corr = data.set_index(summary_index_cols)
        for ix in for_corr.index.unique():
            corr_results = for_corr.loc[ix][corr_columns].corr()
            # Keep only the target's correlation with each feature.
            summary_keys.append(ix)
            summary_rows.append(corr_results.loc[target_column,feature_columns].to_numpy())
    # Build the table in one go; DataFrame.append no longer exists in pandas 2.x.
    summary = pd.DataFrame(
        summary_rows,
        index=pd.MultiIndex.from_tuples(summary_keys,names=summary_index_cols),
        columns=feature_columns,
    )

    graph_df = summary.reset_index()
    for feature in feature_columns:
        g = alt.Chart(graph_df).mark_bar().encode(
            x='frac1:N',
            y=alt.Y(feature,scale=alt.Scale(domain=[-.6, .6])),
            row='Method:N',
            column='Construction',
            color='frac1:N'
        )
        display(g)

### 10

  import sys
  if sys.path[0] == '':


### 20

### 30

### 40

In [36]:
display(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
Method,Construction,frac1,frac2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Colley,Direct,0.5,0.6,0.123466,-0.108213,0.332554,0.026583,-0.319112,-0.079849
Colley,Indirect,0.5,0.6,-0.387917,-0.241248,0.000728,0.126425,-0.023572,-0.112108
Colley,Both,0.5,0.6,-0.396601,-0.327872,-0.149536,0.216164,0.120507,-0.235282
Massey,Direct,0.5,0.6,0.094033,0.083245,-0.053071,-0.166145,0.08921,0.100923
Massey,Indirect,0.5,0.6,-0.050784,0.01206,-0.098572,-0.289405,0.102642,0.249573
Massey,Both,0.5,0.6,-0.101076,0.025185,-0.343856,-0.034393,0.352831,0.011566
Colley,Direct,0.6,0.7,-0.096102,-0.091739,-0.107759,0.196069,0.134929,0.051433
Colley,Indirect,0.6,0.7,-0.142586,-0.224899,0.361415,0.234138,-0.377886,-0.258264
Colley,Both,0.6,0.7,-0.215121,-0.232977,0.160491,0.099074,-0.133272,-0.126891
Massey,Direct,0.6,0.7,0.434428,0.355113,0.131484,0.116638,-0.091029,0.229646


In [82]:
summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
Method,Construction,frac1,frac2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Colley,Direct,0.5,0.6,1.809442e-16,0.218595,-0.274607,-0.34159,0.304669,0.313927


### 0.6 to 0.7

In [57]:
# Correlation matrix of the target and the features for the 0.6 -> 0.7 window,
# one matrix per (Method, Construction) group.
window_rows = Xy.set_index(['frac1','frac2']).loc[(0.6,0.7)]
data = window_rows.reset_index()
for_corr = data.set_index(['Method','Construction'])
corr_columns = [target_column]+feature_columns
group_keys = for_corr.index.unique()
for ix in group_keys:
    group_label = pd.Series(ix,index=for_corr.index.names)
    display(group_label)
    group_rows = for_corr.loc[ix]
    display(group_rows[corr_columns].corr())

  """Entry point for launching an IPython kernel.


Method          Colley
Construction    Direct
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.033692,0.034758,-0.124823,0.229806,0.275894,-0.168381
delta_lop,-0.033692,1.0,0.735085,0.279614,-0.197032,-0.099624,0.292868
delta_hillside,0.034758,0.735085,1.0,0.341162,-0.555433,-0.315206,0.610122
nfrac_xstar_lop,-0.124823,0.279614,0.341162,1.0,0.070455,-0.931563,-0.051844
nfrac_xstar_hillside,0.229806,-0.197032,-0.555433,0.070455,1.0,0.066163,-0.973198
diameter_lop,0.275894,-0.099624,-0.315206,-0.931563,0.066163,1.0,-0.045087
diameter_hillside,-0.168381,0.292868,0.610122,-0.051844,-0.973198,-0.045087,1.0


Method            Colley
Construction    Indirect
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.04046,-0.067101,0.399105,-0.124068,-0.511588,0.136769
delta_lop,0.04046,1.0,0.933609,0.448464,0.053871,-0.334884,-0.045194
delta_hillside,-0.067101,0.933609,1.0,0.301852,-0.101378,-0.190484,0.113999
nfrac_xstar_lop,0.399105,0.448464,0.301852,1.0,0.271442,-0.951606,-0.221158
nfrac_xstar_hillside,-0.124068,0.053871,-0.101378,0.271442,1.0,-0.240039,-0.995307
diameter_lop,-0.511588,-0.334884,-0.190484,-0.951606,-0.240039,1.0,0.192296
diameter_hillside,0.136769,-0.045194,0.113999,-0.221158,-0.995307,0.192296,1.0


Method          Colley
Construction      Both
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.157214,-0.046902,-0.07849,-0.249868,0.083512,0.2517
delta_lop,-0.157214,1.0,0.906945,0.444305,-0.569154,-0.234295,0.57321
delta_hillside,-0.046902,0.906945,1.0,0.200477,-0.495748,0.045845,0.474313
nfrac_xstar_lop,-0.07849,0.444305,0.200477,1.0,-0.346656,-0.917743,0.378526
nfrac_xstar_hillside,-0.249868,-0.569154,-0.495748,-0.346656,1.0,0.26013,-0.990913
diameter_lop,0.083512,-0.234295,0.045845,-0.917743,0.26013,1.0,-0.300622
diameter_hillside,0.2517,0.57321,0.474313,0.378526,-0.990913,-0.300622,1.0


Method          Massey
Construction    Direct
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.051682,0.070177,0.323252,0.00047,-0.3288,-0.114328
delta_lop,0.051682,1.0,0.68216,0.546261,-0.311122,-0.566135,0.33973
delta_hillside,0.070177,0.68216,1.0,0.059334,-0.608424,-0.139715,0.611381
nfrac_xstar_lop,0.323252,0.546261,0.059334,1.0,0.420533,-0.978047,-0.422727
nfrac_xstar_hillside,0.00047,-0.311122,-0.608424,0.420533,1.0,-0.42608,-0.979986
diameter_lop,-0.3288,-0.566135,-0.139715,-0.978047,-0.42608,1.0,0.431566
diameter_hillside,-0.114328,0.33973,0.611381,-0.422727,-0.979986,0.431566,1.0


Method          Massey
Construction      Both
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.051682,0.070177,0.323252,0.00047,-0.3288,-0.114328
delta_lop,0.051682,1.0,0.68216,0.546261,-0.311122,-0.566135,0.33973
delta_hillside,0.070177,0.68216,1.0,0.059334,-0.608424,-0.139715,0.611381
nfrac_xstar_lop,0.323252,0.546261,0.059334,1.0,0.420533,-0.978047,-0.422727
nfrac_xstar_hillside,0.00047,-0.311122,-0.608424,0.420533,1.0,-0.42608,-0.979986
diameter_lop,-0.3288,-0.566135,-0.139715,-0.978047,-0.42608,1.0,0.431566
diameter_hillside,-0.114328,0.33973,0.611381,-0.422727,-0.979986,0.431566,1.0


### 0.7 to 0.8

In [58]:
# Correlation matrix of the target and the features for the 0.7 -> 0.8 window,
# one matrix per Method (rows of every Construction are pooled within a Method).
window_rows = Xy.set_index(['frac1','frac2']).loc[(0.7,0.8)]
data = window_rows.reset_index()
for_corr = data.set_index(['Method'])
corr_columns = [target_column]+feature_columns
group_keys = for_corr.index.unique()
for ix in group_keys:
    group_label = pd.Series(ix,index=for_corr.index.names)
    display(group_label)
    group_rows = for_corr.loc[ix]
    display(group_rows[corr_columns].corr())

  """Entry point for launching an IPython kernel.


Method    Colley
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.02229,0.023643,-0.145952,-0.019798,0.131875,0.013795
delta_lop,0.02229,1.0,0.856411,-0.600197,-0.833061,0.699814,0.840072
delta_hillside,0.023643,0.856411,1.0,-0.823404,-0.923045,0.869911,0.929844
nfrac_xstar_lop,-0.145952,-0.600197,-0.823404,1.0,0.778909,-0.974619,-0.795137
nfrac_xstar_hillside,-0.019798,-0.833061,-0.923045,0.778909,1.0,-0.854231,-0.996746
diameter_lop,0.131875,0.699814,0.869911,-0.974619,-0.854231,1.0,0.869986
diameter_hillside,0.013795,0.840072,0.929844,-0.795137,-0.996746,0.869986,1.0


Method    Massey
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.244961,0.302184,-0.191937,-0.126197,0.204923,0.362996
delta_lop,0.244961,1.0,0.569115,0.359219,-0.127186,-0.413252,0.369689
delta_hillside,0.302184,0.569115,1.0,0.049907,-0.061282,-0.080169,0.456502
nfrac_xstar_lop,-0.191937,0.359219,0.049907,1.0,0.306418,-0.966973,-0.333667
nfrac_xstar_hillside,-0.126197,-0.127186,-0.061282,0.306418,1.0,-0.301968,-0.789993
diameter_lop,0.204923,-0.413252,-0.080169,-0.966973,-0.301968,1.0,0.373802
diameter_hillside,0.362996,0.369689,0.456502,-0.333667,-0.789993,0.373802,1.0


### 0.8 to 0.9

In [59]:
# Correlation matrix of the target and the features for the 0.8 -> 0.9 window,
# one matrix per Method (rows of every Construction are pooled within a Method).
window_rows = Xy.set_index(['frac1','frac2']).loc[(0.8,0.9)]
data = window_rows.reset_index()
for_corr = data.set_index(['Method'])
corr_columns = [target_column]+feature_columns
group_keys = for_corr.index.unique()
for ix in group_keys:
    group_label = pd.Series(ix,index=for_corr.index.names)
    display(group_label)
    group_rows = for_corr.loc[ix]
    display(group_rows[corr_columns].corr())

  """Entry point for launching an IPython kernel.


Method    Colley
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.063546,-0.04663,0.021394,0.072194,-0.033096,-0.069062
delta_lop,-0.063546,1.0,0.865985,-0.624501,-0.78181,0.731686,0.801398
delta_hillside,-0.04663,0.865985,1.0,-0.833724,-0.913166,0.884851,0.928728
nfrac_xstar_lop,0.021394,-0.624501,-0.833724,1.0,0.77814,-0.964683,-0.805747
nfrac_xstar_hillside,0.072194,-0.78181,-0.913166,0.77814,1.0,-0.836519,-0.987047
diameter_lop,-0.033096,0.731686,0.884851,-0.964683,-0.836519,1.0,0.868255
diameter_hillside,-0.069062,0.801398,0.928728,-0.805747,-0.987047,0.868255,1.0


Method    Massey
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.200329,-0.081585,0.151543,-0.029163,-0.148642,0.088628
delta_lop,-0.200329,1.0,0.410175,0.611958,-0.035864,-0.526587,-0.072341
delta_hillside,-0.081585,0.410175,1.0,0.046915,-0.104072,-0.051227,0.012704
nfrac_xstar_lop,0.151543,0.611958,0.046915,1.0,0.295671,-0.974192,-0.275981
nfrac_xstar_hillside,-0.029163,-0.035864,-0.104072,0.295671,1.0,-0.361536,-0.901983
diameter_lop,-0.148642,-0.526587,-0.051227,-0.974192,-0.361536,1.0,0.351146
diameter_hillside,0.088628,-0.072341,0.012704,-0.275981,-0.901983,0.351146,1.0


### 0.9 to 1.0

In [60]:
# Correlation matrix of the target and the features for the 0.9 -> 1.0 window,
# one matrix per Method (rows of every Construction are pooled within a Method).
window_rows = Xy.set_index(['frac1','frac2']).loc[(0.9,1.)]
data = window_rows.reset_index()
for_corr = data.set_index(['Method'])
corr_columns = [target_column]+feature_columns
group_keys = for_corr.index.unique()
for ix in group_keys:
    group_label = pd.Series(ix,index=for_corr.index.names)
    display(group_label)
    group_rows = for_corr.loc[ix]
    display(group_rows[corr_columns].corr())

  """Entry point for launching an IPython kernel.


Method    Colley
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.080682,0.101705,-0.090726,-0.104757,0.084122,0.138211
delta_lop,0.080682,1.0,0.865931,-0.664981,-0.747569,0.736892,0.760151
delta_hillside,0.101705,0.865931,1.0,-0.839537,-0.86156,0.88946,0.87902
nfrac_xstar_lop,-0.090726,-0.664981,-0.839537,1.0,0.751581,-0.97363,-0.806614
nfrac_xstar_hillside,-0.104757,-0.747569,-0.86156,0.751581,1.0,-0.81121,-0.926384
diameter_lop,0.084122,0.736892,0.88946,-0.97363,-0.81121,1.0,0.875447
diameter_hillside,0.138211,0.760151,0.87902,-0.806614,-0.926384,0.875447,1.0


Method    Massey
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.06008,0.144706,-0.27109,-0.074017,0.379677,0.254621
delta_lop,0.06008,1.0,0.530237,0.151979,0.0749,0.052209,0.195383
delta_hillside,0.144706,0.530237,1.0,-0.453008,-0.12982,0.488829,0.367424
nfrac_xstar_lop,-0.27109,0.151979,-0.453008,1.0,0.181947,-0.909558,-0.203733
nfrac_xstar_hillside,-0.074017,0.0749,-0.12982,0.181947,1.0,-0.157968,-0.883072
diameter_lop,0.379677,0.052209,0.488829,-0.909558,-0.157968,1.0,0.297851
diameter_hillside,0.254621,0.195383,0.367424,-0.203733,-0.883072,0.297851,1.0


In [72]:
# Correlation of the target with each feature, one row per
# (Method, direct_thres, spread_thres, weight_indirect) group of ``data``
# (``data`` is whatever the most recently run window cell left behind).
display_index_cols = ['Method','direct_thres','spread_thres','weight_indirect']
for_corr = data.set_index(display_index_cols)
corr_columns = [target_column]+feature_columns
display_keys = list(for_corr.index.unique())
display_rows = [
    for_corr.loc[ix][corr_columns].corr().loc[target_column,feature_columns].to_numpy()
    for ix in display_keys
]
# Build the table in one go; DataFrame.append no longer exists in pandas 2.x.
for_display = pd.DataFrame(
    display_rows,
    index=pd.MultiIndex.from_tuples(display_keys,names=display_index_cols),
    columns=feature_columns,
)

  """


In [77]:
for_display.T

Method,Massey,Colley
direct_thres,0.0,3.0
spread_thres,3.0,3.0
weight_indirect,0.25,0.00
delta_lop,-0.181122,-0.045966
delta_hillside,-0.118407,0.284547
nfrac_xstar_lop,0.067466,-0.065937
nfrac_xstar_hillside,-0.08798,0.058396
diameter_lop,-0.079867,0.376589
diameter_hillside,0.42151,0.102595


In [76]:
print(for_display.T.to_latex())

\begin{tabular}{lrr}
\toprule
Method &    Massey &    Colley \\
direct\_thres &       0.0 &       3.0 \\
spread\_thres &       3.0 &       3.0 \\
weight\_indirect &      0.25 &      0.00 \\
\midrule
delta\_lop            & -0.181122 & -0.045966 \\
delta\_hillside       & -0.118407 &  0.284547 \\
nfrac\_xstar\_lop      &  0.067466 & -0.065937 \\
nfrac\_xstar\_hillside & -0.087980 &  0.058396 \\
diameter\_lop         & -0.079867 &  0.376589 \\
diameter\_hillside    &  0.421510 &  0.102595 \\
\bottomrule
\end{tabular}

