# RPLib Problem 0001 - Baseline

Provides the baseline version to rankability problem 0001. 

In [31]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import copy
import os

import pandas as pd
import numpy as np

from scipy.stats import pearsonr

from tqdm import tqdm
#import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import joblib
import itertools
from pathlib import Path

from IPython.display import display, Markdown, Latex

**All packages are relative to the home directory of the user**

In [33]:
home = str(Path.home())

**Import the main rankability package**

In [34]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

**Load the problem information**

In [35]:
problem = joblib.load("generate.joblib.z")

## Explore and setup the problem

In [36]:
problem.keys()

dict_keys(['description', 'target', 'data', 'other'])

In [37]:
print(problem["description"])

First representative example for the rankability library. Built around the study of NCAA Men's Basketball league.


In [38]:
problem['target']

Unnamed: 0,days_to_subtract1,days_to_subtract2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,rankings1,rankings2,top10_jaccard,top10_tau,days_diff
0,28,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.388889,7
1,21,35,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.636364,-0.111111,14
2,14,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.055556,21
3,7,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.000000,28
4,21,28,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.722222,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,14,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,0.222222,14
1696,7,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,-0.111111,21
1697,14,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,7
1698,7,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,14


In [39]:
problem['data'].keys()

dict_keys(['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

In [40]:
problem['data']['2002'].keys()

dict_keys(['days_to_subtract=35', 'days_to_subtract=28', 'days_to_subtract=21', 'days_to_subtract=14', 'days_to_subtract=7'])

**Create easier to reference variables**

In [41]:
years = list(problem['data'].keys())
days_to_subtract_keys = list(problem['data'][years[0]].keys())
remaining_games = problem['other']['remaining_games']
madness_teams = problem['other']['madness_teams']
best_df = problem['other']['best_df']
top_k = problem['other']['top_k']
#target_column = f"top{top_k}_intersection"
feature_names = problem['other']['feature_names']

In [70]:
days_to_subtract_keys

['days_to_subtract=35',
 'days_to_subtract=28',
 'days_to_subtract=21',
 'days_to_subtract=14',
 'days_to_subtract=7']

In [50]:
target = problem['target']

In [71]:
target

Unnamed: 0,days_to_subtract1,days_to_subtract2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,rankings1,rankings2,top10_jaccard,top10_tau,days_diff
0,28,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.388889,7
1,21,35,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.636364,-0.111111,14
2,14,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.055556,21
3,7,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.000000,28
4,21,28,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.722222,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,14,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,0.222222,14
1696,7,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,-0.111111,21
1697,14,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,7
1698,7,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,14


## Define helper functions

**Function to compute a D matrix from games using hyperparameters**

In [42]:
def compute_D(game_df,team_range,direct_thres,spread_thres):
    map_func = lambda linked: pyrankability.construct.support_map_vectorized_direct_indirect(linked,direct_thres=direct_thres,spread_thres=spread_thres)
    Ds = pyrankability.construct.V_count_vectorized(game_df,map_func)
    for i in range(len(Ds)):
        Ds[i] = Ds[i].reindex(index=team_range,columns=team_range)
    return Ds

In [78]:
def process(data,target,best_df_all):
    index_cols = ["Year","days_to_subtract_key","direct_thres","spread_thres","weight_indirect","range","Method"]
    Ds = pd.DataFrame(columns=["D"]+index_cols)
    Ds.set_index(index_cols,inplace=True)
    for days_to_subtract_key,year in tqdm(itertools.product(days_to_subtract_keys,years)):
        days_to_subtract = float(days_to_subtract_key.split("=")[1])
        best_df = best_df_all.set_index('days_to_subtract').loc[days_to_subtract]
        for index,row in best_df.iterrows():
            dom,ran,dt,st,iw,method = row.loc['domain'],row.loc['range'],row.loc['direct_thres'],row.loc['spread_thres'],row.loc['weight_indirect'],row.loc['Method']
            iw = .1 # Set this so we get both direct and indirect D matrices
            # set the team_range
            team_range = None
            if ran == 'madness':
                team_range = madness_teams[year]
            elif ran == 'all':
                team_range = all_teams[year]
            else:
                raise Exception(f"range={ran} not supported")
            name = (year,days_to_subtract_key,dt,st,iw,ran,method)
            if iw == 0:
                st = np.Inf
            D = compute_D(data[year][days_to_subtract_key],team_range,dt,st)
            Ds = Ds.append(pd.Series([D],index=["D"],name=name)) 
    return Ds

## Create D matrices

In [79]:
best_df

Unnamed: 0,days_to_subtract,Method,domain,range,direct_thres,spread_thres,weight_indirect
0,7,Colley,all,madness,0,0,0.1
1,7,Massey,all,madness,0,0,0.1
2,14,Colley,all,madness,0,0,0.1
3,14,Massey,all,madness,0,0,0.1
4,21,Colley,all,madness,0,0,0.1
5,21,Massey,all,madness,0,0,0.1
6,28,Colley,all,madness,0,0,0.1
7,28,Massey,all,madness,0,0,0.1
8,35,Colley,all,madness,0,0,0.1
9,35,Massey,all,madness,0,0,0.1


In [80]:
Ds = process(problem['data'],problem['target'],best_df)

85it [04:29,  4.39s/it]


In [81]:
Ds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,D
Year,days_to_subtract_key,direct_thres,spread_thres,weight_indirect,range,Method,Unnamed: 7_level_1
2002,days_to_subtract=35,0,0,0.1,madness,Colley,"[[Alabama, Alcorn_St, Arizona, Boston_College,..."
2002,days_to_subtract=35,0,0,0.1,madness,Massey,"[[Alabama, Alcorn_St, Arizona, Boston_College,..."
2003,days_to_subtract=35,0,0,0.1,madness,Colley,"[[Alabama, Arizona, Arizona_St, Auburn, Austin..."
2003,days_to_subtract=35,0,0,0.1,madness,Massey,"[[Alabama, Arizona, Arizona_St, Auburn, Austin..."
2004,days_to_subtract=35,0,0,0.1,madness,Colley,"[[Air_Force, Alabama, Alabama_St, Arizona, BYU..."
...,...,...,...,...,...,...,...
2016,days_to_subtract=7,0,0,0.1,madness,Massey,"[[Akron, Arizona, Ark_Little_Rock, Baylor, Bel..."
2017,days_to_subtract=7,0,0,0.1,madness,Colley,"[[Arizona, Arkansas, Baylor, Bucknell, Butler,..."
2017,days_to_subtract=7,0,0,0.1,madness,Massey,"[[Arizona, Arkansas, Baylor, Bucknell, Butler,..."
2018,days_to_subtract=7,0,0,0.1,madness,Colley,"[[Alabama, Arizona, Arkansas, Auburn, Bucknell..."


In [82]:
Ds.iloc[[0,-1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,D
Year,days_to_subtract_key,direct_thres,spread_thres,weight_indirect,range,Method,Unnamed: 7_level_1
2002,days_to_subtract=35,0,0,0.1,madness,Colley,"[[Alabama, Alcorn_St, Arizona, Boston_College,..."
2018,days_to_subtract=7,0,0,0.1,madness,Massey,"[[Alabama, Arizona, Arkansas, Auburn, Bucknell..."


In [83]:
Ds.loc['2002',"D"][0][0]

  """Entry point for launching an IPython kernel.


team2,Alabama,Alcorn_St,Arizona,Boston_College,Boston_Univ,California,Central_Conn,Charlotte,Cincinnati,Connecticut,...,UNC_Wilmington,USC,Utah,Valparaiso,W_Kentucky,Wake_Forest,Winthrop,Wisconsin,Wyoming,Xavier
team1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,,,,,,,,,,,...,,,1.0,,,,,,,
Alcorn_St,,,,,,,,,,,...,,,,,,,,,,
Arizona,,,,,,1.0,,,,0.0,...,,1.0,,1.0,,1.0,,,,
Boston_College,,,,,1.0,,,,,,...,,,,,,,,,,
Boston_Univ,,,,0.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wake_Forest,,,0.0,,,0.0,,,,,...,,0.0,,,,,,,,
Winthrop,,,,,,,,,,,...,,,,,,,,,,0.0
Wisconsin,,,,,,,,,,,...,,,,,,,,,,
Wyoming,,,,,,,,,,,...,,0.0,,,,,,,,


In [84]:
Ds.loc['2002',"D"][0][1]

  """Entry point for launching an IPython kernel.


team2,Alabama,Alcorn_St,Arizona,Boston_College,Boston_Univ,California,Central_Conn,Charlotte,Cincinnati,Connecticut,...,UNC_Wilmington,USC,Utah,Valparaiso,W_Kentucky,Wake_Forest,Winthrop,Wisconsin,Wyoming,Xavier
team1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,,0.0,0.0,,,0.0,,1.0,0.0,0.0,...,,0.0,,,,0.0,1.0,1.0,,0.0
Alcorn_St,0.0,,,,,0.0,,0.0,,,...,,,0.0,0.0,,0.0,0.0,0.0,0.0,
Arizona,1.0,,,,,2.0,,2.0,0.0,1.0,...,,2.0,0.0,1.0,,9.0,1.0,,1.0,1.0
Boston_College,,,,,0.0,0.0,1.0,1.0,,2.0,...,0.0,0.0,,,,,2.0,0.0,,0.0
Boston_Univ,,,,0.0,,0.0,0.0,0.0,,0.0,...,,,,0.0,,,0.0,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wake_Forest,0.0,1.0,3.0,,,1.0,,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,,,1.0,0.0,0.0,
Winthrop,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,,...,0.0,,0.0,1.0,,0.0,,,0.0,1.0
Wisconsin,0.0,2.0,,0.0,1.0,,,0.0,0.0,0.0,...,,,,1.0,,0.0,,,0.0,0.0
Wyoming,,1.0,0.0,,,0.0,,,0.0,,...,,,0.0,0.0,,0.0,1.0,0.0,,


In [85]:
Ds.index.names

FrozenList(['Year', 'days_to_subtract_key', 'direct_thres', 'spread_thres', 'weight_indirect', 'range', 'Method'])

In [112]:
rankings = {}
dt = target.set_index(['days_to_subtract2','Year','direct_thres','spread_thres','weight_indirect','range','Method'])
for index in dt.index:
    rankings[index] = dt.loc[index].iloc[0]['rankings2']
dt = target.set_index(['days_to_subtract1','Year','direct_thres','spread_thres','weight_indirect','range','Method'])
for index in dt.index:
    rankings[index] = dt.loc[index].iloc[0]['rankings1']

  after removing the cwd from sys.path.
  import sys


Unnamed: 0_level_0,35,28,21,14,35,28,21,14,35,28,...,7,7,7,7,7,7,7,7,7,7
Unnamed: 0_level_1,2002,2002,2002,2002,2003,2003,2003,2003,2004,2004,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
Unnamed: 0_level_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Unnamed: 0_level_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Unnamed: 0_level_4,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
Unnamed: 0_level_5,madness,madness,madness,madness,madness,madness,madness,madness,madness,madness,...,madness,madness,madness,madness,madness,madness,madness,madness,madness,madness
Unnamed: 0_level_6,Colley,Colley,Colley,Colley,Colley,Colley,Colley,Colley,Colley,Colley,...,Massey,Massey,Massey,Massey,Massey,Massey,Massey,Massey,Massey,Massey
Alabama,6.0,6.0,5.0,6.0,,,,,,,...,,,,,,,,,,
Arizona,7.0,7.0,9.0,9.0,2.0,3.0,1.0,1.0,,,...,,,,,,2.0,3.0,,,
Auburn,,,,,,,,,,,...,,,,,,,,,,
BYU,,,,,,,,,,,...,,5.0,9.0,,,,,,,
Baylor,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Washington,,,,,,,,,,,...,,,6.0,,,,,,,
West_Virginia,,,,,,,,,,,...,,9.0,,,,,,3.0,2.0,9.0
Wichita_St,,,,,,,,,,,...,,,,,,,,,,
Wisconsin,,,,,,,,,,,...,,6.0,,9.0,,,2.0,,,


In [117]:
rankings_df = pd.DataFrame(rankings).T

### Compute the features

In [118]:
feature_columns = ["delta_lop","delta_hillside","nfrac_xstar_lop","nfrac_xstar_hillside","diameter_lop","diameter_hillside"]

def compute_features(D,rankings,top_k):
    top_teams = list(rankings.sort_values().index[:top_k])
    D = D.loc[top_teams,top_teams]
    delta_lop,details_lop = pyrankability.rank.solve(D.fillna(0),method='lop',cont=True)

    x = pd.DataFrame(details_lop['x'],index=D.index,columns=D.columns)
    r = x.sum(axis=0)
    order = np.argsort(r)
    xstar = x.iloc[order,:].iloc[:,order]
    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)
    inxs = np.triu_indices(len(xstar),k=1)
    xstar_upper = xstar.values[inxs[0],inxs[1]]
    nfrac_upper_lop = sum((xstar_upper > 0) & (xstar_upper < 1))
    
    top_teams = xstar.columns[:top_k]
    
    k_two_distant,details_two_distant = pyrankability.search.solve_pair(D.fillna(0),method='lop',minimize=False,verbose=False)
    d_lop = k_two_distant#details_two_distant['tau']
    
    delta_hillside,details_hillside = pyrankability.rank.solve(D,method='hillside',cont=True)
    
    x = pd.DataFrame(details_hillside['x'],index=D.index,columns=D.columns)
    r = x.sum(axis=0)
    order = np.argsort(r)
    xstar = x.iloc[order,:].iloc[:,order]
    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)
    inxs = np.triu_indices(len(xstar),k=1)
    xstar_upper = xstar.values[inxs[0],inxs[1]]
    nfrac_upper_hillside = sum((xstar_upper > 0) & (xstar_upper < 1))
    
    top_teams = xstar.columns[:top_k]
    
    k_two_distant,details_two_distant = pyrankability.search.solve_pair(D,method='hillside',minimize=False,verbose=False)
    d_hillside = k_two_distant#details_two_distant['tau']
    
    features = pd.Series([delta_lop,delta_hillside,2*nfrac_upper_lop,2*nfrac_upper_hillside,d_lop,d_hillside],index=feature_columns)

    return features

In [121]:
def create_features(Ds,rankings_df,top_k):
    index_cols = list(Ds.index.names)+["Construction"]
    X = pd.DataFrame(columns=index_cols + feature_columns)
    X.set_index(index_cols,inplace=True)
    #target = target.set_index(['days_to_subtract2','Year','direct_thres','spread_thres','weight_indirect','range','Method'])
    for index,row in tqdm(Ds.iterrows()):
        sum_D = None
        year,days_to_subtract_key,dt,st,iw,ran,method = index
        days_to_subtract = int(days_to_subtract_key.split("=")[1])
        print(days_to_subtract,year,dt,st,iw,ran,method)
        rankings = rankings_df.loc[days_to_subtract,year,dt,st,iw,ran,method].dropna() #spec_best_pred_df = best_pred_df.set_index(['Year','days_to_subtract_key',"Method"]).loc[[(year,days_to_subtract_key,method)]]
        for i,D in enumerate(Ds.loc[(year,days_to_subtract_key,dt,st,iw,ran,method),"D"]):
            if sum_D is None:
                sum_D = D
            else:
                sum_D = sum_D.add(iw*D,fill_value=0)
            if i == 0:
                construction = "Direct"
            elif i == 1:
                construction = "Indirect"
            else:
                raise Exception("Error")
            features = compute_features(D,rankings,top_k)
            features.name = tuple(list(index)+[construction])
            X = X.append(features)
            
            if i == 1:
                construction = "Both"
                features = compute_features(sum_D,rankings,top_k)
                features.name = tuple(list(index)+[construction])
                X = X.append(features)
    return X

In [122]:
X = create_features(Ds,rankings_df,top_k)





0it [00:00, ?it/s][A[A[A[A

35 2002 0 0 0.1 madness Colley






1it [00:00,  3.80it/s][A[A[A[A

35 2002 0 0 0.1 madness Massey






2it [00:00,  3.51it/s][A[A[A[A

35 2003 0 0 0.1 madness Colley






3it [00:00,  3.58it/s][A[A[A[A

35 2003 0 0 0.1 madness Massey






4it [00:01,  3.67it/s][A[A[A[A

35 2004 0 0 0.1 madness Colley






5it [00:01,  3.61it/s][A[A[A[A

35 2004 0 0 0.1 madness Massey






6it [00:01,  3.73it/s][A[A[A[A

35 2005 0 0 0.1 madness Colley






7it [00:01,  3.83it/s][A[A[A[A

35 2005 0 0 0.1 madness Massey






8it [00:02,  3.90it/s][A[A[A[A

35 2006 0 0 0.1 madness Colley






9it [00:02,  3.53it/s][A[A[A[A

35 2006 0 0 0.1 madness Massey






10it [00:02,  3.44it/s][A[A[A[A

35 2007 0 0 0.1 madness Colley






11it [00:03,  3.57it/s][A[A[A[A

35 2007 0 0 0.1 madness Massey






12it [00:03,  3.69it/s][A[A[A[A

35 2008 0 0 0.1 madness Colley






13it [00:03,  3.01it/s][A[A[A[A

35 2008 0 0 0.1 madness Massey






14it [00:04,  3.26it/s][A[A[A[A

35 2009 0 0 0.1 madness Colley






15it [00:04,  3.41it/s][A[A[A[A

35 2009 0 0 0.1 madness Massey






16it [00:04,  3.58it/s][A[A[A[A

35 2010 0 0 0.1 madness Colley






17it [00:04,  3.43it/s][A[A[A[A

35 2010 0 0 0.1 madness Massey






18it [00:05,  3.55it/s][A[A[A[A

35 2011 0 0 0.1 madness Colley






19it [00:05,  3.44it/s][A[A[A[A

35 2011 0 0 0.1 madness Massey






20it [00:05,  3.48it/s][A[A[A[A

35 2012 0 0 0.1 madness Colley






21it [00:05,  3.56it/s][A[A[A[A

35 2012 0 0 0.1 madness Massey






22it [00:06,  3.64it/s][A[A[A[A

35 2013 0 0 0.1 madness Colley






23it [00:06,  3.63it/s][A[A[A[A

35 2013 0 0 0.1 madness Massey






24it [00:06,  3.48it/s][A[A[A[A

35 2014 0 0 0.1 madness Colley






25it [00:07,  3.56it/s][A[A[A[A

35 2014 0 0 0.1 madness Massey






26it [00:07,  3.66it/s][A[A[A[A

35 2015 0 0 0.1 madness Colley






27it [00:07,  3.71it/s][A[A[A[A

35 2015 0 0 0.1 madness Massey






28it [00:07,  3.65it/s][A[A[A[A

35 2016 0 0 0.1 madness Colley






29it [00:08,  3.38it/s][A[A[A[A

35 2016 0 0 0.1 madness Massey






30it [00:08,  3.53it/s][A[A[A[A

35 2017 0 0 0.1 madness Colley






31it [00:08,  3.66it/s][A[A[A[A

35 2017 0 0 0.1 madness Massey






32it [00:09,  3.41it/s][A[A[A[A

35 2018 0 0 0.1 madness Colley






33it [00:09,  3.45it/s][A[A[A[A

35 2018 0 0 0.1 madness Massey






34it [00:09,  3.61it/s][A[A[A[A

28 2002 0 0 0.1 madness Colley






35it [00:09,  3.56it/s][A[A[A[A

28 2002 0 0 0.1 madness Massey






36it [00:10,  3.69it/s][A[A[A[A

28 2003 0 0 0.1 madness Colley






37it [00:10,  3.68it/s][A[A[A[A

28 2003 0 0 0.1 madness Massey






38it [00:10,  3.79it/s][A[A[A[A

28 2004 0 0 0.1 madness Colley






39it [00:10,  3.62it/s][A[A[A[A

28 2004 0 0 0.1 madness Massey






40it [00:11,  3.46it/s][A[A[A[A

28 2005 0 0 0.1 madness Colley






41it [00:11,  3.60it/s][A[A[A[A

28 2005 0 0 0.1 madness Massey






42it [00:11,  3.71it/s][A[A[A[A

28 2006 0 0 0.1 madness Colley






43it [00:12,  3.53it/s][A[A[A[A

28 2006 0 0 0.1 madness Massey






44it [00:12,  3.48it/s][A[A[A[A

28 2007 0 0 0.1 madness Colley






45it [00:12,  3.63it/s][A[A[A[A

28 2007 0 0 0.1 madness Massey






46it [00:12,  3.72it/s][A[A[A[A

28 2008 0 0 0.1 madness Colley






47it [00:13,  3.50it/s][A[A[A[A

28 2008 0 0 0.1 madness Massey






48it [00:13,  3.25it/s][A[A[A[A

28 2009 0 0 0.1 madness Colley






49it [00:13,  3.20it/s][A[A[A[A

28 2009 0 0 0.1 madness Massey






50it [00:14,  3.29it/s][A[A[A[A

28 2010 0 0 0.1 madness Colley






51it [00:14,  3.42it/s][A[A[A[A

28 2010 0 0 0.1 madness Massey






52it [00:14,  3.54it/s][A[A[A[A

28 2011 0 0 0.1 madness Colley






53it [00:15,  3.55it/s][A[A[A[A

28 2011 0 0 0.1 madness Massey






54it [00:15,  3.53it/s][A[A[A[A

28 2012 0 0 0.1 madness Colley






55it [00:15,  3.51it/s][A[A[A[A

28 2012 0 0 0.1 madness Massey






56it [00:15,  3.62it/s][A[A[A[A

28 2013 0 0 0.1 madness Colley






57it [00:16,  3.59it/s][A[A[A[A

28 2013 0 0 0.1 madness Massey






58it [00:16,  3.56it/s][A[A[A[A

28 2014 0 0 0.1 madness Colley






59it [00:16,  3.39it/s][A[A[A[A

28 2014 0 0 0.1 madness Massey






60it [00:16,  3.56it/s][A[A[A[A

28 2015 0 0 0.1 madness Colley






61it [00:17,  3.41it/s][A[A[A[A

28 2015 0 0 0.1 madness Massey






62it [00:17,  3.34it/s][A[A[A[A

28 2016 0 0 0.1 madness Colley






63it [00:17,  3.29it/s][A[A[A[A

28 2016 0 0 0.1 madness Massey






64it [00:18,  3.39it/s][A[A[A[A

28 2017 0 0 0.1 madness Colley






65it [00:18,  3.43it/s][A[A[A[A

28 2017 0 0 0.1 madness Massey






66it [00:18,  3.59it/s][A[A[A[A

28 2018 0 0 0.1 madness Colley






67it [00:18,  3.69it/s][A[A[A[A

28 2018 0 0 0.1 madness Massey






68it [00:19,  3.65it/s][A[A[A[A

21 2002 0 0 0.1 madness Colley






69it [00:19,  3.38it/s][A[A[A[A

21 2002 0 0 0.1 madness Massey






70it [00:19,  3.45it/s][A[A[A[A

21 2003 0 0 0.1 madness Colley






71it [00:20,  3.50it/s][A[A[A[A

21 2003 0 0 0.1 madness Massey






72it [00:20,  3.57it/s][A[A[A[A

21 2004 0 0 0.1 madness Colley






73it [00:20,  3.52it/s][A[A[A[A

21 2004 0 0 0.1 madness Massey






74it [00:20,  3.62it/s][A[A[A[A

21 2005 0 0 0.1 madness Colley






75it [00:21,  3.68it/s][A[A[A[A

21 2005 0 0 0.1 madness Massey






76it [00:21,  3.76it/s][A[A[A[A

21 2006 0 0 0.1 madness Colley






77it [00:21,  3.43it/s][A[A[A[A

21 2006 0 0 0.1 madness Massey






78it [00:22,  3.39it/s][A[A[A[A

21 2007 0 0 0.1 madness Colley






79it [00:22,  3.36it/s][A[A[A[A

21 2007 0 0 0.1 madness Massey






80it [00:22,  3.41it/s][A[A[A[A

21 2008 0 0 0.1 madness Colley






81it [00:23,  3.48it/s][A[A[A[A

21 2008 0 0 0.1 madness Massey






82it [00:23,  3.55it/s][A[A[A[A

21 2009 0 0 0.1 madness Colley






83it [00:23,  3.59it/s][A[A[A[A

21 2009 0 0 0.1 madness Massey






84it [00:23,  3.62it/s][A[A[A[A

21 2010 0 0 0.1 madness Colley






85it [00:24,  3.57it/s][A[A[A[A

21 2010 0 0 0.1 madness Massey






86it [00:24,  3.68it/s][A[A[A[A

21 2011 0 0 0.1 madness Colley






87it [00:24,  3.72it/s][A[A[A[A

21 2011 0 0 0.1 madness Massey






88it [00:24,  3.40it/s][A[A[A[A

21 2012 0 0 0.1 madness Colley






89it [00:25,  3.53it/s][A[A[A[A

21 2012 0 0 0.1 madness Massey






90it [00:25,  3.59it/s][A[A[A[A

21 2013 0 0 0.1 madness Colley






91it [00:25,  3.54it/s][A[A[A[A

21 2013 0 0 0.1 madness Massey






92it [00:26,  3.25it/s][A[A[A[A

21 2014 0 0 0.1 madness Colley






93it [00:26,  3.36it/s][A[A[A[A

21 2014 0 0 0.1 madness Massey






94it [00:26,  3.56it/s][A[A[A[A

21 2015 0 0 0.1 madness Colley






95it [00:27,  3.39it/s][A[A[A[A

21 2015 0 0 0.1 madness Massey






96it [00:27,  3.59it/s][A[A[A[A

21 2016 0 0 0.1 madness Colley






97it [00:27,  3.69it/s][A[A[A[A

21 2016 0 0 0.1 madness Massey






98it [00:27,  3.49it/s][A[A[A[A

21 2017 0 0 0.1 madness Colley






99it [00:28,  3.45it/s][A[A[A[A

21 2017 0 0 0.1 madness Massey






100it [00:28,  3.50it/s][A[A[A[A

21 2018 0 0 0.1 madness Colley






101it [00:28,  3.62it/s][A[A[A[A

21 2018 0 0 0.1 madness Massey






102it [00:28,  3.68it/s][A[A[A[A

14 2002 0 0 0.1 madness Colley






103it [00:29,  3.50it/s][A[A[A[A

14 2002 0 0 0.1 madness Massey






104it [00:29,  3.63it/s][A[A[A[A

14 2003 0 0 0.1 madness Colley






105it [00:29,  3.75it/s][A[A[A[A

14 2003 0 0 0.1 madness Massey






106it [00:29,  3.85it/s][A[A[A[A

14 2004 0 0 0.1 madness Colley






107it [00:30,  3.60it/s][A[A[A[A

14 2004 0 0 0.1 madness Massey






108it [00:30,  3.70it/s][A[A[A[A

14 2005 0 0 0.1 madness Colley






109it [00:30,  3.48it/s][A[A[A[A

14 2005 0 0 0.1 madness Massey






110it [00:31,  3.58it/s][A[A[A[A

14 2006 0 0 0.1 madness Colley






111it [00:31,  3.54it/s][A[A[A[A

14 2006 0 0 0.1 madness Massey






112it [00:31,  3.60it/s][A[A[A[A

14 2007 0 0 0.1 madness Colley






113it [00:31,  3.56it/s][A[A[A[A

14 2007 0 0 0.1 madness Massey






114it [00:32,  3.70it/s][A[A[A[A

14 2008 0 0 0.1 madness Colley






115it [00:32,  3.45it/s][A[A[A[A

14 2008 0 0 0.1 madness Massey






116it [00:32,  3.39it/s][A[A[A[A

14 2009 0 0 0.1 madness Colley






117it [00:33,  3.54it/s][A[A[A[A

14 2009 0 0 0.1 madness Massey






118it [00:33,  3.54it/s][A[A[A[A

14 2010 0 0 0.1 madness Colley






119it [00:33,  3.42it/s][A[A[A[A

14 2010 0 0 0.1 madness Massey






120it [00:33,  3.55it/s][A[A[A[A

14 2011 0 0 0.1 madness Colley






121it [00:34,  3.64it/s][A[A[A[A

14 2011 0 0 0.1 madness Massey






122it [00:34,  3.63it/s][A[A[A[A

14 2012 0 0 0.1 madness Colley






123it [00:34,  3.50it/s][A[A[A[A

14 2012 0 0 0.1 madness Massey






124it [00:35,  3.53it/s][A[A[A[A

14 2013 0 0 0.1 madness Colley






125it [00:35,  3.63it/s][A[A[A[A

14 2013 0 0 0.1 madness Massey






126it [00:35,  3.63it/s][A[A[A[A

14 2014 0 0 0.1 madness Colley






127it [00:35,  3.71it/s][A[A[A[A

14 2014 0 0 0.1 madness Massey






128it [00:36,  3.82it/s][A[A[A[A

14 2015 0 0 0.1 madness Colley






129it [00:36,  3.77it/s][A[A[A[A

14 2015 0 0 0.1 madness Massey






130it [00:36,  3.43it/s][A[A[A[A

14 2016 0 0 0.1 madness Colley






131it [00:37,  3.33it/s][A[A[A[A

14 2016 0 0 0.1 madness Massey






132it [00:37,  3.46it/s][A[A[A[A

14 2017 0 0 0.1 madness Colley






133it [00:37,  3.47it/s][A[A[A[A

14 2017 0 0 0.1 madness Massey






134it [00:37,  3.51it/s][A[A[A[A

14 2018 0 0 0.1 madness Colley






135it [00:38,  3.64it/s][A[A[A[A

14 2018 0 0 0.1 madness Massey






136it [00:38,  3.68it/s][A[A[A[A

7 2002 0 0 0.1 madness Colley






137it [00:38,  3.76it/s][A[A[A[A

7 2002 0 0 0.1 madness Massey






138it [00:38,  3.79it/s][A[A[A[A

7 2003 0 0 0.1 madness Colley






139it [00:39,  3.77it/s][A[A[A[A

7 2003 0 0 0.1 madness Massey






140it [00:39,  3.68it/s][A[A[A[A

7 2004 0 0 0.1 madness Colley






141it [00:39,  3.76it/s][A[A[A[A

7 2004 0 0 0.1 madness Massey






142it [00:39,  3.80it/s][A[A[A[A

7 2005 0 0 0.1 madness Colley






143it [00:40,  3.52it/s][A[A[A[A

7 2005 0 0 0.1 madness Massey






144it [00:40,  3.56it/s][A[A[A[A

7 2006 0 0 0.1 madness Colley






145it [00:40,  3.66it/s][A[A[A[A

7 2006 0 0 0.1 madness Massey






146it [00:41,  3.66it/s][A[A[A[A

7 2007 0 0 0.1 madness Colley






147it [00:41,  3.68it/s][A[A[A[A

7 2007 0 0 0.1 madness Massey






148it [00:41,  3.80it/s][A[A[A[A

7 2008 0 0 0.1 madness Colley






149it [00:41,  3.67it/s][A[A[A[A

7 2008 0 0 0.1 madness Massey






150it [00:42,  3.50it/s][A[A[A[A

7 2009 0 0 0.1 madness Colley






151it [00:42,  3.41it/s][A[A[A[A

7 2009 0 0 0.1 madness Massey






152it [00:42,  3.54it/s][A[A[A[A

7 2010 0 0 0.1 madness Colley






153it [00:43,  3.67it/s][A[A[A[A

7 2010 0 0 0.1 madness Massey






154it [00:43,  3.53it/s][A[A[A[A

7 2011 0 0 0.1 madness Colley






155it [00:43,  3.62it/s][A[A[A[A

7 2011 0 0 0.1 madness Massey






156it [00:43,  3.73it/s][A[A[A[A

7 2012 0 0 0.1 madness Colley






157it [00:44,  3.79it/s][A[A[A[A

7 2012 0 0 0.1 madness Massey






158it [00:44,  3.70it/s][A[A[A[A

7 2013 0 0 0.1 madness Colley






159it [00:44,  3.49it/s][A[A[A[A

7 2013 0 0 0.1 madness Massey






160it [00:45,  3.39it/s][A[A[A[A

7 2014 0 0 0.1 madness Colley






161it [00:45,  3.49it/s][A[A[A[A

7 2014 0 0 0.1 madness Massey






162it [00:45,  3.52it/s][A[A[A[A

7 2015 0 0 0.1 madness Colley






163it [00:45,  3.65it/s][A[A[A[A

7 2015 0 0 0.1 madness Massey






164it [00:46,  3.56it/s][A[A[A[A

7 2016 0 0 0.1 madness Colley






165it [00:46,  3.67it/s][A[A[A[A

7 2016 0 0 0.1 madness Massey






166it [00:46,  3.44it/s][A[A[A[A

7 2017 0 0 0.1 madness Colley






167it [00:47,  3.49it/s][A[A[A[A

7 2017 0 0 0.1 madness Massey






168it [00:47,  3.63it/s][A[A[A[A

7 2018 0 0 0.1 madness Colley






169it [00:47,  3.59it/s][A[A[A[A

7 2018 0 0 0.1 madness Massey






170it [00:47,  3.45it/s][A[A[A[A

In [124]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
Year,days_to_subtract_key,direct_thres,spread_thres,weight_indirect,range,Method,Construction,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2002,days_to_subtract=35,0,0,0.1,madness,Colley,Direct,0.0,0.0,56.0,70.0,28.0,35.0
2002,days_to_subtract=35,0,0,0.1,madness,Colley,Indirect,4.0,48.0,38.0,6.0,18.0,3.0
2002,days_to_subtract=35,0,0,0.1,madness,Colley,Both,0.4,56.0,12.0,8.0,6.0,4.0
2002,days_to_subtract=35,0,0,0.1,madness,Massey,Direct,0.0,0.0,56.0,68.0,28.0,34.0
2002,days_to_subtract=35,0,0,0.1,madness,Massey,Indirect,7.0,62.0,30.0,8.0,15.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,days_to_subtract=7,0,0,0.1,madness,Colley,Indirect,8.0,62.0,14.0,6.0,7.0,3.0
2018,days_to_subtract=7,0,0,0.1,madness,Colley,Both,1.5,78.0,4.0,12.0,2.0,4.0
2018,days_to_subtract=7,0,0,0.1,madness,Massey,Direct,1.0,1.0,32.0,44.0,16.0,22.0
2018,days_to_subtract=7,0,0,0.1,madness,Massey,Indirect,16.0,80.0,18.0,6.0,9.0,3.0


## Refine the target dataset

In [125]:
target = problem['target'].groupby(['days_to_subtract1','days_to_subtract2','Method','Year','direct_thres','spread_thres','weight_indirect'])[feature_names].mean()
target

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,top10_jaccard,top10_tau
days_to_subtract1,days_to_subtract2,Method,Year,direct_thres,spread_thres,weight_indirect,Unnamed: 7_level_1,Unnamed: 8_level_1
7,14,Colley,2002,0,0,0.1,0.800000,0.277778
7,14,Colley,2003,0,0,0.1,0.800000,-0.111111
7,14,Colley,2004,0,0,0.1,0.636364,0.055556
7,14,Colley,2005,0,0,0.1,1.000000,-0.555556
7,14,Colley,2006,0,0,0.1,0.636364,-0.444444
...,...,...,...,...,...,...,...,...
28,35,Massey,2014,0,0,0.1,0.636364,0.111111
28,35,Massey,2015,0,0,0.1,0.800000,-0.222222
28,35,Massey,2016,0,0,0.1,0.800000,0.166667
28,35,Massey,2017,0,0,0.1,1.000000,0.111111


In [126]:
target.corr()

Unnamed: 0,top10_jaccard,top10_tau
top10_jaccard,1.0,0.025604
top10_tau,0.025604,1.0


In [130]:
X_for_join = X.copy().reset_index()
X_for_join['days_to_subtract1']= X_for_join['days_to_subtract_key'].str.replace("days_to_subtract=","").astype(float)
X_for_join.weight_indirect=0.1
X_for_join

Unnamed: 0,Year,days_to_subtract_key,direct_thres,spread_thres,weight_indirect,range,Method,Construction,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside,days_to_subtract1
0,2002,days_to_subtract=35,0,0,0.1,madness,Colley,Direct,0.0,0.0,56.0,70.0,28.0,35.0,35.0
1,2002,days_to_subtract=35,0,0,0.1,madness,Colley,Indirect,4.0,48.0,38.0,6.0,18.0,3.0,35.0
2,2002,days_to_subtract=35,0,0,0.1,madness,Colley,Both,0.4,56.0,12.0,8.0,6.0,4.0,35.0
3,2002,days_to_subtract=35,0,0,0.1,madness,Massey,Direct,0.0,0.0,56.0,68.0,28.0,34.0,35.0
4,2002,days_to_subtract=35,0,0,0.1,madness,Massey,Indirect,7.0,62.0,30.0,8.0,15.0,4.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,2018,days_to_subtract=7,0,0,0.1,madness,Colley,Indirect,8.0,62.0,14.0,6.0,7.0,3.0,7.0
506,2018,days_to_subtract=7,0,0,0.1,madness,Colley,Both,1.5,78.0,4.0,12.0,2.0,4.0,7.0
507,2018,days_to_subtract=7,0,0,0.1,madness,Massey,Direct,1.0,1.0,32.0,44.0,16.0,22.0,7.0
508,2018,days_to_subtract=7,0,0,0.1,madness,Massey,Indirect,16.0,80.0,18.0,6.0,9.0,3.0,7.0


In [131]:
target

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,top10_jaccard,top10_tau
days_to_subtract1,days_to_subtract2,Method,Year,direct_thres,spread_thres,weight_indirect,Unnamed: 7_level_1,Unnamed: 8_level_1
7,14,Colley,2002,0,0,0.1,0.800000,0.277778
7,14,Colley,2003,0,0,0.1,0.800000,-0.111111
7,14,Colley,2004,0,0,0.1,0.636364,0.055556
7,14,Colley,2005,0,0,0.1,1.000000,-0.555556
7,14,Colley,2006,0,0,0.1,0.636364,-0.444444
...,...,...,...,...,...,...,...,...
28,35,Massey,2014,0,0,0.1,0.636364,0.111111
28,35,Massey,2015,0,0,0.1,0.800000,-0.222222
28,35,Massey,2016,0,0,0.1,0.800000,0.166667
28,35,Massey,2017,0,0,0.1,1.000000,0.111111


In [132]:
Xy = target.reset_index().set_index(['Method','days_to_subtract1','Year','direct_thres','spread_thres','weight_indirect']).join(X_for_join.set_index(['Method','days_to_subtract1','Year','direct_thres','spread_thres','weight_indirect'])).dropna()
Xy = Xy.reset_index()
Xy

Unnamed: 0,Method,days_to_subtract1,Year,direct_thres,spread_thres,weight_indirect,days_to_subtract2,top10_jaccard,top10_tau,days_to_subtract_key,range,Construction,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
0,Colley,7,2002,0,0,0.1,14,0.800000,0.277778,days_to_subtract=7,madness,Direct,1.0,0.0,50.0,54.0,25.0,27.0
1,Colley,7,2002,0,0,0.1,14,0.800000,0.277778,days_to_subtract=7,madness,Indirect,9.0,87.0,8.0,10.0,4.0,5.0
2,Colley,7,2002,0,0,0.1,14,0.800000,0.277778,days_to_subtract=7,madness,Both,2.0,104.0,6.0,10.0,3.0,5.0
3,Colley,7,2002,0,0,0.1,21,0.636364,-0.555556,days_to_subtract=7,madness,Direct,1.0,0.0,50.0,54.0,25.0,27.0
4,Colley,7,2002,0,0,0.1,21,0.636364,-0.555556,days_to_subtract=7,madness,Indirect,9.0,87.0,8.0,10.0,4.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,Massey,28,2017,0,0,0.1,35,1.000000,0.111111,days_to_subtract=28,madness,Indirect,11.0,80.0,16.0,10.0,8.0,5.0
1016,Massey,28,2017,0,0,0.1,35,1.000000,0.111111,days_to_subtract=28,madness,Both,2.1,92.0,12.0,0.0,6.0,-0.0
1017,Massey,28,2018,0,0,0.1,35,0.636364,0.055556,days_to_subtract=28,madness,Direct,2.0,2.0,64.0,64.0,31.0,31.0
1018,Massey,28,2018,0,0,0.1,35,0.636364,0.055556,days_to_subtract=28,madness,Indirect,7.0,84.0,16.0,2.0,8.0,1.0


## Process results

In [133]:
pairs_by_width = {}
for f1,f2 in itertools.combinations(X_for_join['days_to_subtract1'].unique().astype(int),2):
    if f2 < f1:
        f1,f2 = f2,f1
    width = f2-f1#round(100*(f2-f1))
    if width not in pairs_by_width:
        pairs_by_width[width] = []
    pairs_by_width[width].append((f1,f2))

In [134]:
pairs_by_width

{7: [(28, 35), (21, 28), (14, 21), (7, 14)],
 14: [(21, 35), (14, 28), (7, 21)],
 21: [(14, 35), (7, 28)],
 28: [(7, 35)]}

In [135]:
["days_to_subtract1","days_to_subtract2","Method","Construction"]+feature_columns

['days_to_subtract1',
 'days_to_subtract2',
 'Method',
 'Construction',
 'delta_lop',
 'delta_hillside',
 'nfrac_xstar_lop',
 'nfrac_xstar_hillside',
 'diameter_lop',
 'diameter_hillside']

In [136]:
#!sudo pip install pandas --upgrade

In [137]:
import altair as alt

index_cols = ['Method', 'Construction', 'days_to_subtract1', 'days_to_subtract2','width']
graph_dfs = {}
for target_column in feature_names:
    graph_df = pd.DataFrame(columns=index_cols+feature_columns).set_index(index_cols)

    for width in pairs_by_width.keys():
        summary = None
        for pair in pairs_by_width[width]:
            data = Xy.set_index(['days_to_subtract1','days_to_subtract2']).loc[pair].reset_index()
            for_corr = data.set_index(['Method','Construction',"days_to_subtract1","days_to_subtract2"])
            if summary is None:
                summary = pd.DataFrame(columns=["days_to_subtract1","days_to_subtract2","Method","Construction"]+feature_columns).set_index(list(for_corr.index.names))
            for ix in for_corr.index.unique():
                corr_results = for_corr.loc[ix][[target_column]+feature_columns].corr()
                target_corr_results = corr_results.loc[target_column].drop(target_column)
                target_corr_results.name = ix
                summary = summary.append(target_corr_results)

        graph_df1 = summary.reset_index()
        graph_df1['width'] = width
        graph_df1 = graph_df1.set_index(index_cols)
        graph_df = graph_df.append(graph_df1)
    graph_dfs[target_column]=graph_df

  # This is added back by InteractiveShellApp.init_path()
  app.launch_new_instance()


In [138]:
for key in graph_dfs.keys():
    graph_dfs[key] = graph_dfs[key].reset_index()

In [139]:
graph_dfs[key].head()

Unnamed: 0,Method,Construction,days_to_subtract1,days_to_subtract2,width,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
0,Colley,Direct,28,35,7,0.089558,0.27691,-0.005222,0.24395,0.007466,-0.026597
1,Colley,Indirect,28,35,7,0.277504,0.159061,0.245822,-0.395223,0.301307,-0.345514
2,Colley,Both,28,35,7,0.255392,0.249144,0.158624,0.065713,0.197243,-0.018779
3,Massey,Direct,28,35,7,0.178955,0.373124,-0.018475,0.348866,-0.226361,-0.237737
4,Massey,Indirect,28,35,7,0.195509,0.208005,0.172273,0.324947,0.147121,0.244718


In [141]:
for key in graph_dfs.keys():
    display(Markdown(f'## {key}'))
    graph_df = graph_dfs[key].melt(value_vars=feature_columns,id_vars=index_cols,value_name='Value',var_name='Feature')

    display(Markdown('### Colley'))
    g = alt.Chart(graph_df.set_index('Method').loc['Colley']).mark_bar().encode(
        x='width:N',
        y=alt.Y('average(Value)',scale=alt.Scale(domain=[-.6, .6])),
        row='Feature:N',
        color='Construction:N',
        column='Construction:N'
    )
    display(g)
    
    display(Markdown('### Massey'))
    g = alt.Chart(graph_df.set_index('Method').loc['Massey']).mark_bar().encode(
        x='width:N',
        y=alt.Y('average(Value)',scale=alt.Scale(domain=[-.6, .6])),
        row='Feature:N',
        color='Construction:N',
        column='Construction:N'
    )
    display(g)

## top10_jaccard

### Colley

### Massey

## top10_tau

### Colley

### Massey

### Old below this line

### Colley

In [78]:
g = alt.Chart(graph_df.set_index('Method').loc['Massey']).mark_bar().encode(
    x='width:N',
    y=alt.Y('average(Value)',scale=alt.Scale(domain=[-.6, .6])),
    row='Feature:N',
    color='Construction:N',
    column='Construction:N'
)

### Massey

In [79]:
g

## old below

## 0.6 to 0.7

In [57]:
data = Xy.set_index(['frac1','frac2']).loc[(0.6,0.7)].reset_index()
for_corr = data.set_index(['Method','Construction'])
for ix in for_corr.index.unique():
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

  """Entry point for launching an IPython kernel.


Method          Colley
Construction    Direct
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.033692,0.034758,-0.124823,0.229806,0.275894,-0.168381
delta_lop,-0.033692,1.0,0.735085,0.279614,-0.197032,-0.099624,0.292868
delta_hillside,0.034758,0.735085,1.0,0.341162,-0.555433,-0.315206,0.610122
nfrac_xstar_lop,-0.124823,0.279614,0.341162,1.0,0.070455,-0.931563,-0.051844
nfrac_xstar_hillside,0.229806,-0.197032,-0.555433,0.070455,1.0,0.066163,-0.973198
diameter_lop,0.275894,-0.099624,-0.315206,-0.931563,0.066163,1.0,-0.045087
diameter_hillside,-0.168381,0.292868,0.610122,-0.051844,-0.973198,-0.045087,1.0


Method            Colley
Construction    Indirect
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.04046,-0.067101,0.399105,-0.124068,-0.511588,0.136769
delta_lop,0.04046,1.0,0.933609,0.448464,0.053871,-0.334884,-0.045194
delta_hillside,-0.067101,0.933609,1.0,0.301852,-0.101378,-0.190484,0.113999
nfrac_xstar_lop,0.399105,0.448464,0.301852,1.0,0.271442,-0.951606,-0.221158
nfrac_xstar_hillside,-0.124068,0.053871,-0.101378,0.271442,1.0,-0.240039,-0.995307
diameter_lop,-0.511588,-0.334884,-0.190484,-0.951606,-0.240039,1.0,0.192296
diameter_hillside,0.136769,-0.045194,0.113999,-0.221158,-0.995307,0.192296,1.0


Method          Colley
Construction      Both
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.157214,-0.046902,-0.07849,-0.249868,0.083512,0.2517
delta_lop,-0.157214,1.0,0.906945,0.444305,-0.569154,-0.234295,0.57321
delta_hillside,-0.046902,0.906945,1.0,0.200477,-0.495748,0.045845,0.474313
nfrac_xstar_lop,-0.07849,0.444305,0.200477,1.0,-0.346656,-0.917743,0.378526
nfrac_xstar_hillside,-0.249868,-0.569154,-0.495748,-0.346656,1.0,0.26013,-0.990913
diameter_lop,0.083512,-0.234295,0.045845,-0.917743,0.26013,1.0,-0.300622
diameter_hillside,0.2517,0.57321,0.474313,0.378526,-0.990913,-0.300622,1.0


Method          Massey
Construction    Direct
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.051682,0.070177,0.323252,0.00047,-0.3288,-0.114328
delta_lop,0.051682,1.0,0.68216,0.546261,-0.311122,-0.566135,0.33973
delta_hillside,0.070177,0.68216,1.0,0.059334,-0.608424,-0.139715,0.611381
nfrac_xstar_lop,0.323252,0.546261,0.059334,1.0,0.420533,-0.978047,-0.422727
nfrac_xstar_hillside,0.00047,-0.311122,-0.608424,0.420533,1.0,-0.42608,-0.979986
diameter_lop,-0.3288,-0.566135,-0.139715,-0.978047,-0.42608,1.0,0.431566
diameter_hillside,-0.114328,0.33973,0.611381,-0.422727,-0.979986,0.431566,1.0


Method          Massey
Construction      Both
dtype: object

  """


Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.051682,0.070177,0.323252,0.00047,-0.3288,-0.114328
delta_lop,0.051682,1.0,0.68216,0.546261,-0.311122,-0.566135,0.33973
delta_hillside,0.070177,0.68216,1.0,0.059334,-0.608424,-0.139715,0.611381
nfrac_xstar_lop,0.323252,0.546261,0.059334,1.0,0.420533,-0.978047,-0.422727
nfrac_xstar_hillside,0.00047,-0.311122,-0.608424,0.420533,1.0,-0.42608,-0.979986
diameter_lop,-0.3288,-0.566135,-0.139715,-0.978047,-0.42608,1.0,0.431566
diameter_hillside,-0.114328,0.33973,0.611381,-0.422727,-0.979986,0.431566,1.0


### 0.7 to 0.8

In [58]:
data = Xy.set_index(['frac1','frac2']).loc[(0.7,0.8)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

  """Entry point for launching an IPython kernel.


Method    Colley
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.02229,0.023643,-0.145952,-0.019798,0.131875,0.013795
delta_lop,0.02229,1.0,0.856411,-0.600197,-0.833061,0.699814,0.840072
delta_hillside,0.023643,0.856411,1.0,-0.823404,-0.923045,0.869911,0.929844
nfrac_xstar_lop,-0.145952,-0.600197,-0.823404,1.0,0.778909,-0.974619,-0.795137
nfrac_xstar_hillside,-0.019798,-0.833061,-0.923045,0.778909,1.0,-0.854231,-0.996746
diameter_lop,0.131875,0.699814,0.869911,-0.974619,-0.854231,1.0,0.869986
diameter_hillside,0.013795,0.840072,0.929844,-0.795137,-0.996746,0.869986,1.0


Method    Massey
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.244961,0.302184,-0.191937,-0.126197,0.204923,0.362996
delta_lop,0.244961,1.0,0.569115,0.359219,-0.127186,-0.413252,0.369689
delta_hillside,0.302184,0.569115,1.0,0.049907,-0.061282,-0.080169,0.456502
nfrac_xstar_lop,-0.191937,0.359219,0.049907,1.0,0.306418,-0.966973,-0.333667
nfrac_xstar_hillside,-0.126197,-0.127186,-0.061282,0.306418,1.0,-0.301968,-0.789993
diameter_lop,0.204923,-0.413252,-0.080169,-0.966973,-0.301968,1.0,0.373802
diameter_hillside,0.362996,0.369689,0.456502,-0.333667,-0.789993,0.373802,1.0


### 0.8 to 0.9

In [59]:
data = Xy.set_index(['frac1','frac2']).loc[(0.8,0.9)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

  """Entry point for launching an IPython kernel.


Method    Colley
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.063546,-0.04663,0.021394,0.072194,-0.033096,-0.069062
delta_lop,-0.063546,1.0,0.865985,-0.624501,-0.78181,0.731686,0.801398
delta_hillside,-0.04663,0.865985,1.0,-0.833724,-0.913166,0.884851,0.928728
nfrac_xstar_lop,0.021394,-0.624501,-0.833724,1.0,0.77814,-0.964683,-0.805747
nfrac_xstar_hillside,0.072194,-0.78181,-0.913166,0.77814,1.0,-0.836519,-0.987047
diameter_lop,-0.033096,0.731686,0.884851,-0.964683,-0.836519,1.0,0.868255
diameter_hillside,-0.069062,0.801398,0.928728,-0.805747,-0.987047,0.868255,1.0


Method    Massey
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.200329,-0.081585,0.151543,-0.029163,-0.148642,0.088628
delta_lop,-0.200329,1.0,0.410175,0.611958,-0.035864,-0.526587,-0.072341
delta_hillside,-0.081585,0.410175,1.0,0.046915,-0.104072,-0.051227,0.012704
nfrac_xstar_lop,0.151543,0.611958,0.046915,1.0,0.295671,-0.974192,-0.275981
nfrac_xstar_hillside,-0.029163,-0.035864,-0.104072,0.295671,1.0,-0.361536,-0.901983
diameter_lop,-0.148642,-0.526587,-0.051227,-0.974192,-0.361536,1.0,0.351146
diameter_hillside,0.088628,-0.072341,0.012704,-0.275981,-0.901983,0.351146,1.0


### 0.9 to 1.

In [60]:
data = Xy.set_index(['frac1','frac2']).loc[(0.9,1.)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

  """Entry point for launching an IPython kernel.


Method    Colley
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.080682,0.101705,-0.090726,-0.104757,0.084122,0.138211
delta_lop,0.080682,1.0,0.865931,-0.664981,-0.747569,0.736892,0.760151
delta_hillside,0.101705,0.865931,1.0,-0.839537,-0.86156,0.88946,0.87902
nfrac_xstar_lop,-0.090726,-0.664981,-0.839537,1.0,0.751581,-0.97363,-0.806614
nfrac_xstar_hillside,-0.104757,-0.747569,-0.86156,0.751581,1.0,-0.81121,-0.926384
diameter_lop,0.084122,0.736892,0.88946,-0.97363,-0.81121,1.0,0.875447
diameter_hillside,0.138211,0.760151,0.87902,-0.806614,-0.926384,0.875447,1.0


Method    Massey
dtype: object

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,0.06008,0.144706,-0.27109,-0.074017,0.379677,0.254621
delta_lop,0.06008,1.0,0.530237,0.151979,0.0749,0.052209,0.195383
delta_hillside,0.144706,0.530237,1.0,-0.453008,-0.12982,0.488829,0.367424
nfrac_xstar_lop,-0.27109,0.151979,-0.453008,1.0,0.181947,-0.909558,-0.203733
nfrac_xstar_hillside,-0.074017,0.0749,-0.12982,0.181947,1.0,-0.157968,-0.883072
diameter_lop,0.379677,0.052209,0.488829,-0.909558,-0.157968,1.0,0.297851
diameter_hillside,0.254621,0.195383,0.367424,-0.203733,-0.883072,0.297851,1.0


In [72]:
for_corr = data.set_index(['Method','direct_thres','spread_thres','weight_indirect'])
for_display = pd.DataFrame(columns=feature_columns+list(for_corr.index.names))
for_display.set_index(list(for_corr.index.names),inplace=True)
for ix in for_corr.index.unique():
    dt = for_corr.loc[ix][[target_column]+feature_columns].corr().loc[target_column,feature_columns]
    dt.name = ix
    for_display = for_display.append(dt)

  """


In [77]:
for_display.T

Method,Massey,Colley
direct_thres,0.0,3.0
spread_thres,3.0,3.0
weight_indirect,0.25,0.00
delta_lop,-0.181122,-0.045966
delta_hillside,-0.118407,0.284547
nfrac_xstar_lop,0.067466,-0.065937
nfrac_xstar_hillside,-0.08798,0.058396
diameter_lop,-0.079867,0.376589
diameter_hillside,0.42151,0.102595


In [76]:
print(for_display.T.to_latex())

\begin{tabular}{lrr}
\toprule
Method &    Massey &    Colley \\
direct\_thres &       0.0 &       3.0 \\
spread\_thres &       3.0 &       3.0 \\
weight\_indirect &      0.25 &      0.00 \\
\midrule
delta\_lop            & -0.181122 & -0.045966 \\
delta\_hillside       & -0.118407 &  0.284547 \\
nfrac\_xstar\_lop      &  0.067466 & -0.065937 \\
nfrac\_xstar\_hillside & -0.087980 &  0.058396 \\
diameter\_lop         & -0.079867 &  0.376589 \\
diameter\_hillside    &  0.421510 &  0.102595 \\
\bottomrule
\end{tabular}

