# RPLib Problem 0001 - Baseline

Provides the baseline version to rankability problem 0001. 

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import copy
import os

import pandas as pd
import numpy as np

from scipy.stats import pearsonr

from tqdm import tqdm
from joblib import Parallel, delayed
import joblib
import itertools
from pathlib import Path
from sklearn.pipeline import Pipeline

from IPython.display import display, Markdown, Latex

**All packages are relative to the home directory of the user**

In [3]:
home = str(Path.home())

**Import the main rankability package**

In [4]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [5]:
sys.path.insert(0,"%s/RPLib"%home)
import pyrplib

**Load the problem information**

In [6]:
problem = joblib.load("generate.joblib.z")

## Explore and setup the problem

In [7]:
problem.keys()

dict_keys(['description', 'target', 'data', 'other'])

In [8]:
print(problem["description"])

First representative example for the rankability library. Built around the study of NCAA Men's Basketball league.


In [9]:
problem['target']

Unnamed: 0,days_to_subtract1,days_to_subtract2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,rankings1,rankings2,top10_jaccard,top10_tau,days_diff
0,28,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.388889,7
1,21,35,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.636364,-0.111111,14
2,14,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.055556,21
3,7,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.000000,28
4,21,28,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.722222,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,14,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,0.222222,14
1696,7,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,-0.111111,21
1697,14,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,7
1698,7,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,14


In [10]:
problem['data'].keys()

dict_keys(['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

In [11]:
problem['data']['2002'].keys()

dict_keys(['days_to_subtract=35', 'days_to_subtract=28', 'days_to_subtract=21', 'days_to_subtract=14', 'days_to_subtract=7'])

**Create easier to reference variables**

In [12]:
years = list(problem['data'].keys())
days_to_subtract_keys = list(problem['data'][years[0]].keys())
remaining_games = problem['other']['remaining_games']
madness_teams = problem['other']['madness_teams']
best_df = problem['other']['best_df']
top_k = problem['other']['top_k']
#target_column = f"top{top_k}_intersection"
feature_names = problem['other']['feature_names']

In [13]:
days_to_subtract_keys

['days_to_subtract=35',
 'days_to_subtract=28',
 'days_to_subtract=21',
 'days_to_subtract=14',
 'days_to_subtract=7']

In [14]:
target = problem['target']

In [15]:
target

Unnamed: 0,days_to_subtract1,days_to_subtract2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,rankings1,rankings2,top10_jaccard,top10_tau,days_diff
0,28,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.388889,7
1,21,35,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.636364,-0.111111,14
2,14,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.055556,21
3,7,35,all,madness,0,0,0.1,Colley,2002,Alabama 6.0 Arizona 7.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,0.000000,28
4,21,28,all,madness,0,0,0.1,Colley,2002,Alabama 5.0 Arizona 9.0 Cincinnati...,Alabama 6.0 Arizona 7.0 Cincinnati...,0.800000,-0.722222,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,14,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,0.222222,14
1696,7,28,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 6.0 Duke 4.0 Ka...,0.800000,-0.111111,21
1697,14,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 8.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,7
1698,7,21,all,madness,0,0,0.1,Massey,2018,Cincinnati 7.0 Duke 2.0 Ka...,Cincinnati 7.0 Duke 3.0 Ka...,0.800000,0.222222,14


## Create D matrices

In [16]:
best_df

Unnamed: 0,days_to_subtract,Method,domain,range,direct_thres,spread_thres,weight_indirect
0,7,Colley,all,madness,0,0,0.1
1,7,Massey,all,madness,0,0,0.1
2,14,Colley,all,madness,0,0,0.1
3,14,Massey,all,madness,0,0,0.1
4,21,Colley,all,madness,0,0,0.1
5,21,Massey,all,madness,0,0,0.1
6,28,Colley,all,madness,0,0,0.1
7,28,Massey,all,madness,0,0,0.1
8,35,Colley,all,madness,0,0,0.1
9,35,Massey,all,madness,0,0,0.1


In [17]:
#Ds = process(problem['data'],problem['target'],best_df)

In [18]:
#Ds

In [19]:
#Ds.iloc[[0,-1]]

In [20]:
#Ds.loc['2002',"D"][0][0]

In [21]:
#Ds.loc['2002',"D"][0][1]

In [22]:
#Ds.index.names

In [23]:
rankings = {}
dt = target.set_index(['days_to_subtract2','Year','direct_thres','spread_thres','weight_indirect','range','Method'])
for index in dt.index:
    rankings[index] = dt.loc[index].iloc[0]['rankings2']
dt = target.set_index(['days_to_subtract1','Year','direct_thres','spread_thres','weight_indirect','range','Method'])
for index in dt.index:
    rankings[index] = dt.loc[index].iloc[0]['rankings1']

  after removing the cwd from sys.path.
  import sys


In [24]:
rankings_df = pd.DataFrame(rankings).T

### Compute the features

In [26]:
index_cols = ["Year","days_to_subtract_key","direct_thres","spread_thres","weight_indirect","range","Method"]
#process_pipe = Pipeline([('compute_D', pyrplib.transformers.ProcessTransformer(index_cols, days_to_subtract_keys, years, 'madness', madness_teams, best_df))])
#process_pipe.fit(problem['data'])
#Ds = process_pipe.transform(problem['data'])
feature_columns = ["delta_lop","delta_hillside","nfrac_xstar_lop","nfrac_xstar_hillside","diameter_lop","diameter_hillside"]
#feature_pipe = Pipeline([('create_features', pyrplib.transformers.CreateFeaturesTransformer(feature_columns, rankings_df, top_k))])
#feature_pipe.fit(Ds)
#X = feature_pipe.transform(Ds)
process_feature_pipe = Pipeline([('compute_D', pyrplib.transformers.ProcessTransformer(index_cols, days_to_subtract_keys, years, 'madness', madness_teams, best_df)),
                                ('create_features', pyrplib.transformers.CreateFeaturesTransformer(feature_columns, rankings_df, top_k))])
process_feature_pipe.fit(problem['data'])
X = process_feature_pipe.transform(problem['data'])

85it [03:52,  3.85s/it]
85it [03:54,  3.87s/it]
0it [00:00, ?it/s]

35 2002 0 0 0.1 madness Colley
Restricted license - for non-production use only - expires 2022-01-13


1it [00:00,  3.40it/s]

35 2002 0 0 0.1 madness Massey


2it [00:00,  3.50it/s]

35 2003 0 0 0.1 madness Colley


3it [00:00,  3.63it/s]

35 2003 0 0 0.1 madness Massey


4it [00:01,  3.71it/s]

35 2004 0 0 0.1 madness Colley


5it [00:01,  3.78it/s]

35 2004 0 0 0.1 madness Massey


6it [00:01,  3.85it/s]

35 2005 0 0 0.1 madness Colley


7it [00:01,  3.90it/s]

35 2005 0 0 0.1 madness Massey


8it [00:02,  3.93it/s]

35 2006 0 0 0.1 madness Colley


9it [00:02,  3.83it/s]

35 2006 0 0 0.1 madness Massey


10it [00:02,  3.89it/s]

35 2007 0 0 0.1 madness Colley


11it [00:02,  3.90it/s]

35 2007 0 0 0.1 madness Massey


12it [00:03,  3.94it/s]

35 2008 0 0 0.1 madness Colley


13it [00:03,  3.87it/s]

35 2008 0 0 0.1 madness Massey


14it [00:03,  3.92it/s]

35 2009 0 0 0.1 madness Colley


15it [00:03,  3.91it/s]

35 2009 0 0 0.1 madness Massey


16it [00:04,  3.90it/s]

35 2010 0 0 0.1 madness Colley


17it [00:04,  3.87it/s]

35 2010 0 0 0.1 madness Massey


18it [00:04,  3.87it/s]

35 2011 0 0 0.1 madness Colley


19it [00:04,  3.89it/s]

35 2011 0 0 0.1 madness Massey


20it [00:05,  3.93it/s]

35 2012 0 0 0.1 madness Colley


21it [00:05,  3.69it/s]

35 2012 0 0 0.1 madness Massey


22it [00:05,  3.68it/s]

35 2013 0 0 0.1 madness Colley


23it [00:05,  3.72it/s]

35 2013 0 0 0.1 madness Massey


24it [00:06,  3.74it/s]

35 2014 0 0 0.1 madness Colley


25it [00:06,  3.78it/s]

35 2014 0 0 0.1 madness Massey


26it [00:06,  3.80it/s]

35 2015 0 0 0.1 madness Colley


27it [00:07,  3.80it/s]

35 2015 0 0 0.1 madness Massey


28it [00:07,  3.82it/s]

35 2016 0 0 0.1 madness Colley


29it [00:07,  3.70it/s]

35 2016 0 0 0.1 madness Massey


30it [00:07,  3.68it/s]

35 2017 0 0 0.1 madness Colley


31it [00:08,  3.73it/s]

35 2017 0 0 0.1 madness Massey


32it [00:08,  3.48it/s]

35 2018 0 0 0.1 madness Colley


33it [00:08,  3.54it/s]

35 2018 0 0 0.1 madness Massey


34it [00:08,  3.60it/s]

28 2002 0 0 0.1 madness Colley


35it [00:09,  3.59it/s]

28 2002 0 0 0.1 madness Massey


36it [00:09,  3.64it/s]

28 2003 0 0 0.1 madness Colley


37it [00:09,  3.58it/s]

28 2003 0 0 0.1 madness Massey


38it [00:10,  3.65it/s]

28 2004 0 0 0.1 madness Colley


39it [00:10,  3.69it/s]

28 2004 0 0 0.1 madness Massey


40it [00:10,  3.71it/s]

28 2005 0 0 0.1 madness Colley


41it [00:10,  3.74it/s]

28 2005 0 0 0.1 madness Massey


42it [00:11,  3.76it/s]

28 2006 0 0 0.1 madness Colley


43it [00:11,  3.68it/s]

28 2006 0 0 0.1 madness Massey


44it [00:11,  3.58it/s]

28 2007 0 0 0.1 madness Colley


45it [00:11,  3.69it/s]

28 2007 0 0 0.1 madness Massey


46it [00:12,  3.78it/s]

28 2008 0 0 0.1 madness Colley


47it [00:12,  3.80it/s]

28 2008 0 0 0.1 madness Massey


48it [00:12,  3.64it/s]

28 2009 0 0 0.1 madness Colley


49it [00:13,  3.76it/s]

28 2009 0 0 0.1 madness Massey


50it [00:13,  3.84it/s]

28 2010 0 0 0.1 madness Colley


51it [00:13,  3.80it/s]

28 2010 0 0 0.1 madness Massey


52it [00:13,  3.77it/s]

28 2011 0 0 0.1 madness Colley


53it [00:14,  3.85it/s]

28 2011 0 0 0.1 madness Massey


54it [00:14,  3.86it/s]

28 2012 0 0 0.1 madness Colley


55it [00:14,  3.88it/s]

28 2012 0 0 0.1 madness Massey


56it [00:14,  3.89it/s]

28 2013 0 0 0.1 madness Colley


57it [00:15,  3.72it/s]

28 2013 0 0 0.1 madness Massey


58it [00:15,  3.74it/s]

28 2014 0 0 0.1 madness Colley


59it [00:15,  3.75it/s]

28 2014 0 0 0.1 madness Massey


60it [00:15,  3.81it/s]

28 2015 0 0 0.1 madness Colley


61it [00:16,  3.59it/s]

28 2015 0 0 0.1 madness Massey


62it [00:16,  3.70it/s]

28 2016 0 0 0.1 madness Colley


63it [00:16,  3.74it/s]

28 2016 0 0 0.1 madness Massey


64it [00:17,  3.72it/s]

28 2017 0 0 0.1 madness Colley


65it [00:17,  3.80it/s]

28 2017 0 0 0.1 madness Massey


66it [00:17,  3.81it/s]

28 2018 0 0 0.1 madness Colley


67it [00:17,  3.87it/s]

28 2018 0 0 0.1 madness Massey


68it [00:18,  3.88it/s]

21 2002 0 0 0.1 madness Colley


69it [00:18,  3.90it/s]

21 2002 0 0 0.1 madness Massey


70it [00:18,  3.89it/s]

21 2003 0 0 0.1 madness Colley


71it [00:18,  3.86it/s]

21 2003 0 0 0.1 madness Massey


72it [00:19,  3.85it/s]

21 2004 0 0 0.1 madness Colley


73it [00:19,  3.90it/s]

21 2004 0 0 0.1 madness Massey


74it [00:19,  3.91it/s]

21 2005 0 0 0.1 madness Colley


75it [00:19,  3.67it/s]

21 2005 0 0 0.1 madness Massey


76it [00:20,  3.76it/s]

21 2006 0 0 0.1 madness Colley


77it [00:20,  3.75it/s]

21 2006 0 0 0.1 madness Massey


78it [00:20,  3.75it/s]

21 2007 0 0 0.1 madness Colley


79it [00:20,  3.74it/s]

21 2007 0 0 0.1 madness Massey


80it [00:21,  3.82it/s]

21 2008 0 0 0.1 madness Colley


81it [00:21,  3.76it/s]

21 2008 0 0 0.1 madness Massey


82it [00:21,  3.83it/s]

21 2009 0 0 0.1 madness Colley


83it [00:21,  3.81it/s]

21 2009 0 0 0.1 madness Massey


84it [00:22,  3.87it/s]

21 2010 0 0 0.1 madness Colley


85it [00:22,  3.92it/s]

21 2010 0 0 0.1 madness Massey


86it [00:22,  3.94it/s]

21 2011 0 0 0.1 madness Colley


87it [00:22,  3.89it/s]

21 2011 0 0 0.1 madness Massey


88it [00:23,  3.89it/s]

21 2012 0 0 0.1 madness Colley


89it [00:23,  3.90it/s]

21 2012 0 0 0.1 madness Massey


90it [00:23,  3.93it/s]

21 2013 0 0 0.1 madness Colley


91it [00:24,  3.79it/s]

21 2013 0 0 0.1 madness Massey


92it [00:24,  3.77it/s]

21 2014 0 0 0.1 madness Colley


93it [00:24,  3.81it/s]

21 2014 0 0 0.1 madness Massey


94it [00:24,  3.88it/s]

21 2015 0 0 0.1 madness Colley


95it [00:25,  3.71it/s]

21 2015 0 0 0.1 madness Massey


96it [00:25,  3.78it/s]

21 2016 0 0 0.1 madness Colley


97it [00:25,  3.80it/s]

21 2016 0 0 0.1 madness Massey


98it [00:25,  3.80it/s]

21 2017 0 0 0.1 madness Colley


99it [00:26,  3.86it/s]

21 2017 0 0 0.1 madness Massey


100it [00:26,  3.88it/s]

21 2018 0 0 0.1 madness Colley


101it [00:26,  3.88it/s]

21 2018 0 0 0.1 madness Massey


102it [00:26,  3.81it/s]

14 2002 0 0 0.1 madness Colley


103it [00:27,  3.78it/s]

14 2002 0 0 0.1 madness Massey


104it [00:27,  3.85it/s]

14 2003 0 0 0.1 madness Colley


105it [00:27,  3.90it/s]

14 2003 0 0 0.1 madness Massey


106it [00:27,  3.94it/s]

14 2004 0 0 0.1 madness Colley


107it [00:28,  3.95it/s]

14 2004 0 0 0.1 madness Massey


108it [00:28,  3.94it/s]

14 2005 0 0 0.1 madness Colley


109it [00:28,  3.95it/s]

14 2005 0 0 0.1 madness Massey


110it [00:28,  3.95it/s]

14 2006 0 0 0.1 madness Colley


111it [00:29,  3.79it/s]

14 2006 0 0 0.1 madness Massey


112it [00:29,  3.75it/s]

14 2007 0 0 0.1 madness Colley


113it [00:29,  3.76it/s]

14 2007 0 0 0.1 madness Massey


114it [00:30,  3.82it/s]

14 2008 0 0 0.1 madness Colley


115it [00:30,  3.81it/s]

14 2008 0 0 0.1 madness Massey


116it [00:30,  3.59it/s]

14 2009 0 0 0.1 madness Colley


117it [00:30,  3.66it/s]

14 2009 0 0 0.1 madness Massey


118it [00:31,  3.74it/s]

14 2010 0 0 0.1 madness Colley


119it [00:31,  3.81it/s]

14 2010 0 0 0.1 madness Massey


120it [00:31,  3.86it/s]

14 2011 0 0 0.1 madness Colley


121it [00:31,  3.83it/s]

14 2011 0 0 0.1 madness Massey


122it [00:32,  3.89it/s]

14 2012 0 0 0.1 madness Colley


123it [00:32,  3.81it/s]

14 2012 0 0 0.1 madness Massey


124it [00:32,  3.78it/s]

14 2013 0 0 0.1 madness Colley


125it [00:32,  3.80it/s]

14 2013 0 0 0.1 madness Massey


126it [00:33,  3.85it/s]

14 2014 0 0 0.1 madness Colley


127it [00:33,  3.81it/s]

14 2014 0 0 0.1 madness Massey


128it [00:33,  3.85it/s]

14 2015 0 0 0.1 madness Colley


129it [00:33,  3.88it/s]

14 2015 0 0 0.1 madness Massey


130it [00:34,  3.87it/s]

14 2016 0 0 0.1 madness Colley


131it [00:34,  3.70it/s]

14 2016 0 0 0.1 madness Massey


132it [00:34,  3.60it/s]

14 2017 0 0 0.1 madness Colley


133it [00:35,  3.59it/s]

14 2017 0 0 0.1 madness Massey


134it [00:35,  3.57it/s]

14 2018 0 0 0.1 madness Colley


135it [00:35,  3.55it/s]

14 2018 0 0 0.1 madness Massey


136it [00:35,  3.54it/s]

7 2002 0 0 0.1 madness Colley


137it [00:36,  3.64it/s]

7 2002 0 0 0.1 madness Massey


138it [00:36,  3.70it/s]

7 2003 0 0 0.1 madness Colley


139it [00:36,  3.76it/s]

7 2003 0 0 0.1 madness Massey


140it [00:36,  3.77it/s]

7 2004 0 0 0.1 madness Colley


141it [00:37,  3.79it/s]

7 2004 0 0 0.1 madness Massey


142it [00:37,  3.77it/s]

7 2005 0 0 0.1 madness Colley


143it [00:37,  3.77it/s]

7 2005 0 0 0.1 madness Massey


144it [00:38,  3.82it/s]

7 2006 0 0 0.1 madness Colley


145it [00:38,  3.78it/s]

7 2006 0 0 0.1 madness Massey


146it [00:38,  3.83it/s]

7 2007 0 0 0.1 madness Colley


147it [00:38,  3.82it/s]

7 2007 0 0 0.1 madness Massey


148it [00:39,  3.87it/s]

7 2008 0 0 0.1 madness Colley


149it [00:39,  3.79it/s]

7 2008 0 0 0.1 madness Massey


150it [00:39,  3.77it/s]

7 2009 0 0 0.1 madness Colley


151it [00:39,  3.76it/s]

7 2009 0 0 0.1 madness Massey


152it [00:40,  3.80it/s]

7 2010 0 0 0.1 madness Colley


153it [00:40,  3.81it/s]

7 2010 0 0 0.1 madness Massey


154it [00:40,  3.85it/s]

7 2011 0 0 0.1 madness Colley


155it [00:40,  3.81it/s]

7 2011 0 0 0.1 madness Massey


156it [00:41,  3.85it/s]

7 2012 0 0 0.1 madness Colley


157it [00:41,  3.82it/s]

7 2012 0 0 0.1 madness Massey


158it [00:41,  3.81it/s]

7 2013 0 0 0.1 madness Colley


159it [00:42,  3.66it/s]

7 2013 0 0 0.1 madness Massey


160it [00:42,  3.71it/s]

7 2014 0 0 0.1 madness Colley


161it [00:42,  3.78it/s]

7 2014 0 0 0.1 madness Massey


162it [00:42,  3.84it/s]

7 2015 0 0 0.1 madness Colley


163it [00:43,  3.87it/s]

7 2015 0 0 0.1 madness Massey


164it [00:43,  3.83it/s]

7 2016 0 0 0.1 madness Colley


165it [00:43,  3.84it/s]

7 2016 0 0 0.1 madness Massey


166it [00:43,  3.83it/s]

7 2017 0 0 0.1 madness Colley


167it [00:44,  3.58it/s]

7 2017 0 0 0.1 madness Massey


168it [00:44,  3.66it/s]

7 2018 0 0 0.1 madness Colley


169it [00:44,  3.75it/s]

7 2018 0 0 0.1 madness Massey


170it [00:44,  3.81it/s]


In [27]:
# X = create_features(Ds,rankings_df,top_k)

In [28]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
Year,days_to_subtract_key,direct_thres,spread_thres,weight_indirect,range,Method,Construction,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2002,days_to_subtract=35,0,0,0.1,madness,Colley,Direct,0.0,0.0,56.0,70.0,28.0,35.0
2002,days_to_subtract=35,0,0,0.1,madness,Colley,Indirect,4.0,48.0,38.0,6.0,18.0,3.0
2002,days_to_subtract=35,0,0,0.1,madness,Colley,Both,0.4,56.0,12.0,8.0,6.0,4.0
2002,days_to_subtract=35,0,0,0.1,madness,Massey,Direct,0.0,0.0,56.0,68.0,28.0,34.0
2002,days_to_subtract=35,0,0,0.1,madness,Massey,Indirect,7.0,62.0,30.0,8.0,15.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,days_to_subtract=7,0,0,0.1,madness,Colley,Indirect,8.0,62.0,14.0,6.0,7.0,3.0
2018,days_to_subtract=7,0,0,0.1,madness,Colley,Both,1.5,78.0,4.0,12.0,2.0,4.0
2018,days_to_subtract=7,0,0,0.1,madness,Massey,Direct,1.0,1.0,32.0,44.0,16.0,22.0
2018,days_to_subtract=7,0,0,0.1,madness,Massey,Indirect,16.0,80.0,18.0,6.0,9.0,3.0


## Refine the target dataset

In [29]:
target = problem['target'].groupby(['days_to_subtract1','days_to_subtract2','Method','Year','direct_thres','spread_thres','weight_indirect'])[feature_names].mean()
target

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,top10_jaccard,top10_tau
days_to_subtract1,days_to_subtract2,Method,Year,direct_thres,spread_thres,weight_indirect,Unnamed: 7_level_1,Unnamed: 8_level_1
7,14,Colley,2002,0,0,0.1,0.800000,0.277778
7,14,Colley,2003,0,0,0.1,0.800000,-0.111111
7,14,Colley,2004,0,0,0.1,0.636364,0.055556
7,14,Colley,2005,0,0,0.1,1.000000,-0.555556
7,14,Colley,2006,0,0,0.1,0.636364,-0.444444
...,...,...,...,...,...,...,...,...
28,35,Massey,2014,0,0,0.1,0.636364,0.111111
28,35,Massey,2015,0,0,0.1,0.800000,-0.222222
28,35,Massey,2016,0,0,0.1,0.800000,0.166667
28,35,Massey,2017,0,0,0.1,1.000000,0.111111


In [30]:
target.corr()

Unnamed: 0,top10_jaccard,top10_tau
top10_jaccard,1.0,0.025604
top10_tau,0.025604,1.0


In [31]:
X_for_join = X.copy().reset_index()
X_for_join['days_to_subtract1']= X_for_join['days_to_subtract_key'].str.replace("days_to_subtract=","").astype(float)
X_for_join.weight_indirect=0.1
X_for_join

Unnamed: 0,Year,days_to_subtract_key,direct_thres,spread_thres,weight_indirect,range,Method,Construction,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside,days_to_subtract1
0,2002,days_to_subtract=35,0,0,0.1,madness,Colley,Direct,0.0,0.0,56.0,70.0,28.0,35.0,35.0
1,2002,days_to_subtract=35,0,0,0.1,madness,Colley,Indirect,4.0,48.0,38.0,6.0,18.0,3.0,35.0
2,2002,days_to_subtract=35,0,0,0.1,madness,Colley,Both,0.4,56.0,12.0,8.0,6.0,4.0,35.0
3,2002,days_to_subtract=35,0,0,0.1,madness,Massey,Direct,0.0,0.0,56.0,68.0,28.0,34.0,35.0
4,2002,days_to_subtract=35,0,0,0.1,madness,Massey,Indirect,7.0,62.0,30.0,8.0,15.0,4.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,2018,days_to_subtract=7,0,0,0.1,madness,Colley,Indirect,8.0,62.0,14.0,6.0,7.0,3.0,7.0
506,2018,days_to_subtract=7,0,0,0.1,madness,Colley,Both,1.5,78.0,4.0,12.0,2.0,4.0,7.0
507,2018,days_to_subtract=7,0,0,0.1,madness,Massey,Direct,1.0,1.0,32.0,44.0,16.0,22.0,7.0
508,2018,days_to_subtract=7,0,0,0.1,madness,Massey,Indirect,16.0,80.0,18.0,6.0,9.0,3.0,7.0


In [32]:
target

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,top10_jaccard,top10_tau
days_to_subtract1,days_to_subtract2,Method,Year,direct_thres,spread_thres,weight_indirect,Unnamed: 7_level_1,Unnamed: 8_level_1
7,14,Colley,2002,0,0,0.1,0.800000,0.277778
7,14,Colley,2003,0,0,0.1,0.800000,-0.111111
7,14,Colley,2004,0,0,0.1,0.636364,0.055556
7,14,Colley,2005,0,0,0.1,1.000000,-0.555556
7,14,Colley,2006,0,0,0.1,0.636364,-0.444444
...,...,...,...,...,...,...,...,...
28,35,Massey,2014,0,0,0.1,0.636364,0.111111
28,35,Massey,2015,0,0,0.1,0.800000,-0.222222
28,35,Massey,2016,0,0,0.1,0.800000,0.166667
28,35,Massey,2017,0,0,0.1,1.000000,0.111111


In [33]:
Xy = target.reset_index().set_index(['Method','days_to_subtract1','Year','direct_thres','spread_thres','weight_indirect']).join(X_for_join.set_index(['Method','days_to_subtract1','Year','direct_thres','spread_thres','weight_indirect'])).dropna()
Xy = Xy.reset_index()
Xy

Unnamed: 0,Method,days_to_subtract1,Year,direct_thres,spread_thres,weight_indirect,days_to_subtract2,top10_jaccard,top10_tau,days_to_subtract_key,range,Construction,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
0,Colley,7,2002,0,0,0.1,14,0.800000,0.277778,days_to_subtract=7,madness,Direct,1.0,0.0,50.0,54.0,25.0,27.0
1,Colley,7,2002,0,0,0.1,14,0.800000,0.277778,days_to_subtract=7,madness,Indirect,9.0,87.0,8.0,10.0,4.0,5.0
2,Colley,7,2002,0,0,0.1,14,0.800000,0.277778,days_to_subtract=7,madness,Both,2.0,104.0,6.0,10.0,3.0,5.0
3,Colley,7,2002,0,0,0.1,21,0.636364,-0.555556,days_to_subtract=7,madness,Direct,1.0,0.0,50.0,54.0,25.0,27.0
4,Colley,7,2002,0,0,0.1,21,0.636364,-0.555556,days_to_subtract=7,madness,Indirect,9.0,87.0,8.0,10.0,4.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,Massey,28,2017,0,0,0.1,35,1.000000,0.111111,days_to_subtract=28,madness,Indirect,11.0,80.0,16.0,10.0,8.0,5.0
1016,Massey,28,2017,0,0,0.1,35,1.000000,0.111111,days_to_subtract=28,madness,Both,2.1,92.0,12.0,0.0,6.0,-0.0
1017,Massey,28,2018,0,0,0.1,35,0.636364,0.055556,days_to_subtract=28,madness,Direct,2.0,2.0,64.0,64.0,31.0,31.0
1018,Massey,28,2018,0,0,0.1,35,0.636364,0.055556,days_to_subtract=28,madness,Indirect,7.0,84.0,16.0,2.0,8.0,1.0


## Process results

In [34]:
pairs_by_width = {}
for f1,f2 in itertools.combinations(X_for_join['days_to_subtract1'].unique().astype(int),2):
    if f2 < f1:
        f1,f2 = f2,f1
    width = f2-f1#round(100*(f2-f1))
    if width not in pairs_by_width:
        pairs_by_width[width] = []
    pairs_by_width[width].append((f1,f2))

In [35]:
pairs_by_width

{7: [(28, 35), (21, 28), (14, 21), (7, 14)],
 14: [(21, 35), (14, 28), (7, 21)],
 21: [(14, 35), (7, 28)],
 28: [(7, 35)]}

In [36]:
["days_to_subtract1","days_to_subtract2","Method","Construction"]+feature_columns

['days_to_subtract1',
 'days_to_subtract2',
 'Method',
 'Construction',
 'delta_lop',
 'delta_hillside',
 'nfrac_xstar_lop',
 'nfrac_xstar_hillside',
 'diameter_lop',
 'diameter_hillside']

In [37]:
#!sudo pip install pandas --upgrade

In [38]:
import altair as alt

index_cols = ['Method', 'Construction', 'days_to_subtract1', 'days_to_subtract2','width']
graph_dfs = {}
for target_column in feature_names:
    graph_df = pd.DataFrame(columns=index_cols+feature_columns).set_index(index_cols)

    for width in pairs_by_width.keys():
        summary = None
        for pair in pairs_by_width[width]:
            data = Xy.set_index(['days_to_subtract1','days_to_subtract2']).loc[pair].reset_index()
            for_corr = data.set_index(['Method','Construction',"days_to_subtract1","days_to_subtract2"])
            if summary is None:
                summary = pd.DataFrame(columns=["days_to_subtract1","days_to_subtract2","Method","Construction"]+feature_columns).set_index(list(for_corr.index.names))
            for ix in for_corr.index.unique():
                corr_results = for_corr.loc[ix][[target_column]+feature_columns].corr()
                target_corr_results = corr_results.loc[target_column].drop(target_column)
                target_corr_results.name = ix
                summary = summary.append(target_corr_results)

        graph_df1 = summary.reset_index()
        graph_df1['width'] = width
        graph_df1 = graph_df1.set_index(index_cols)
        graph_df = graph_df.append(graph_df1)
    graph_dfs[target_column]=graph_df

  # This is added back by InteractiveShellApp.init_path()
  app.launch_new_instance()


In [39]:
for key in graph_dfs.keys():
    graph_dfs[key] = graph_dfs[key].reset_index()

In [None]:
graph_dfs[key].head()

In [None]:
for key in graph_dfs.keys():
    display(Markdown(f'## {key}'))
    graph_df = graph_dfs[key].melt(value_vars=feature_columns,id_vars=index_cols,value_name='Value',var_name='Feature')

    display(Markdown('### Colley'))
    g = alt.Chart(graph_df.set_index('Method').loc['Colley']).mark_bar().encode(
        x='width:N',
        y=alt.Y('average(Value)',scale=alt.Scale(domain=[-.6, .6])),
        row='Feature:N',
        color='Construction:N',
        column='Construction:N'
    )
    display(g)
    
    display(Markdown('### Massey'))
    g = alt.Chart(graph_df.set_index('Method').loc['Massey']).mark_bar().encode(
        x='width:N',
        y=alt.Y('average(Value)',scale=alt.Scale(domain=[-.6, .6])),
        row='Feature:N',
        color='Construction:N',
        column='Construction:N'
    )
    display(g)

### Old below this line

### Colley

In [None]:
g = alt.Chart(graph_df.set_index('Method').loc['Massey']).mark_bar().encode(
    x='width:N',
    y=alt.Y('average(Value)',scale=alt.Scale(domain=[-.6, .6])),
    row='Feature:N',
    color='Construction:N',
    column='Construction:N'
)

### Massey

In [None]:
g

## old below

## 0.6 to 0.7

(this was already broken)

In [None]:
data = Xy.set_index(['frac1','frac2']).loc[(0.6,0.7)].reset_index()
for_corr = data.set_index(['Method','Construction'])
for ix in for_corr.index.unique():
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

### 0.7 to 0.8

In [None]:
data = Xy.set_index(['frac1','frac2']).loc[(0.7,0.8)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

### 0.8 to 0.9

In [None]:
data = Xy.set_index(['frac1','frac2']).loc[(0.8,0.9)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

### 0.9 to 1.

In [None]:
data = Xy.set_index(['frac1','frac2']).loc[(0.9,1.)].reset_index()
for_corr = data.set_index(['Method'])
for ix in for_corr.index.unique():
    display(pd.Series(ix,index=for_corr.index.names))
    display(for_corr.loc[ix][[target_column]+feature_columns].corr())

In [None]:
for_corr = data.set_index(['Method','direct_thres','spread_thres','weight_indirect'])
for_display = pd.DataFrame(columns=feature_columns+list(for_corr.index.names))
for_display.set_index(list(for_corr.index.names),inplace=True)
for ix in for_corr.index.unique():
    dt = for_corr.loc[ix][[target_column]+feature_columns].corr().loc[target_column,feature_columns]
    dt.name = ix
    for_display = for_display.append(dt)

In [None]:
for_display.T

In [None]:
print(for_display.T.to_latex())