# RPLib Problem 0001
## Baseline

The goal of this notebook is to analyze and visualize the results of the baseline approach to RPLib Problem 0001.

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

In [4]:
from pathlib import Path
home = str(Path.home())
home

'/home/jupyter-pander14'

In [5]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [6]:
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from sensitivity_tests import *
from utilities import *
from base import *

In [7]:
import itertools
import joblib

In [8]:
# Load the serialized problem definition (description, target table, per-year game data, extras).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code — only load
# problem files from a trusted source. The absolute path is machine-specific.
problem = joblib.load("/disk/RPLib/problem_0001.joblib.z")

In [9]:
print(problem["description"])


A practitioner wants to predict the degree to which a the rankings during season 
of the NCAA Men’s Basketball are likely to change as more games are played (i.e., sensitivity to more games). 
They want to start the analysis after a minimum of 50% of the games are played. 
They want to run Massey and Colley.

Sensitivity of new games will be measured as the intersection of between two 
rankings derived from before and after the new games are included.



In [10]:
problem['target']

Unnamed: 0,frac1,frac2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,top15_intersection
0,0.5,0.6,all,madness,0.0,3.0,0.25,Massey,2002,0.733333
1,0.5,0.7,all,madness,0.0,3.0,0.25,Massey,2002,0.800000
2,0.5,0.8,all,madness,0.0,3.0,0.25,Massey,2002,0.733333
3,0.5,0.9,all,madness,0.0,3.0,0.25,Massey,2002,0.733333
4,0.5,1.0,all,madness,0.0,3.0,0.25,Massey,2002,0.600000
...,...,...,...,...,...,...,...,...,...,...
505,0.7,0.9,all,madness,3.0,3.0,0.00,Colley,2018,0.800000
506,0.7,1.0,all,madness,3.0,3.0,0.00,Colley,2018,0.733333
507,0.8,0.9,all,madness,3.0,3.0,0.00,Colley,2018,0.800000
508,0.8,1.0,all,madness,3.0,3.0,0.00,Colley,2018,0.733333


In [11]:
problem['data'].keys()

dict_keys(['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

In [12]:
problem['data']['2002'].keys()

dict_keys(['frac=0.5', 'frac=0.6', 'frac=0.7', 'frac=0.8', 'frac=0.9', 'frac=1.0'])

In [13]:
# Module-level lookups shared by the cells below.
# `years` are the keys of problem['data']; `frac_keys` are read from the first year only,
# so every year is assumed to expose the same 'frac=...' keys — TODO confirm.
years = list(problem['data'].keys())
frac_keys = list(problem['data'][years[0]].keys())
# Extra per-problem data; `madness_teams` is indexed by year in process().
remaining_games = problem['other']['remaining_games']
madness_teams = problem['other']['madness_teams']

In [14]:
# Parameters to search. process() builds one D matrix for every combination
# (itertools.product) of year, frac key and the four lists below.
direct_thress = [0]
spread_thress = [3]
weight_indirects = [0.25]
# Each entry is a (domain, range) pair; process() only reads the second element (the range).
domains_ranges = [('all','madness')]

In [15]:
def compute_D(game_df,team_range,direct_thres,spread_thres,weight_indirect):
    """Build the D matrix for one set of games, restricted to ``team_range``.

    The three threshold/weight arguments are forwarded unchanged to
    ``pyrankability.construct.support_map_vectorized_direct_indirect_weighted``,
    which is used as the map function of ``V_count_vectorized``. The result is
    reindexed so that both rows and columns follow ``team_range``; teams with no
    entry in the computed matrix therefore show up as NaN rows/columns.
    """
    def support_map(linked):
        # Bind this call's parameters to the pyrankability support-map function.
        return pyrankability.construct.support_map_vectorized_direct_indirect_weighted(
            linked,
            direct_thres=direct_thres,
            spread_thres=spread_thres,
            weight_indirect=weight_indirect,
        )

    counts = pyrankability.construct.V_count_vectorized(game_df,support_map)
    return counts.reindex(index=team_range,columns=team_range)

In [33]:
# Names of the features returned by compute_features(); the order must match the order of the
# values in the Series built at the end of that function. Also used as the column set of X.
feature_columns = ["delta_lop","delta_hillside","nfrac_xstar_lop","nfrac_xstar_hillside","diameter_lop","diameter_hillside"]

def calc_tau(n,obj):
    """Return ``(C(n,2) - obj) / C(n,2)``, i.e. ``obj`` rescaled by the number of pairs among ``n`` items."""
    total_pairs = pyrankability.common.nCr(n,2)
    return (total_pairs - obj)/total_pairs

def _solve_and_summarize(D_in,method,n_full,top_k):
    """Run one pyrankability method on ``D_in`` and return ``(delta, nfrac, diameter)``.

    delta    -- first return value of ``pyrankability.rank.solve`` (continuous relaxation).
    nfrac    -- number of strictly fractional entries (0 < x < 1) in the upper triangle of the
                thresholded solution matrix, after reordering it by column sums.
    diameter -- ``calc_tau`` applied to the objective of ``solve_pair_max_tau`` run on the
                first ``top_k`` teams of that ordering.
    """
    delta,details = pyrankability.rank.solve(D_in,method=method,cont=True)

    # Reorder the solution matrix by ascending column sum (same permutation on rows and columns).
    x = pd.DataFrame(details['x'],index=D_in.index,columns=D_in.columns)
    order = np.argsort(x.sum(axis=0))
    xstar = x.iloc[order,:].iloc[:,order]
    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)

    # Count the strictly fractional entries above the diagonal.
    rows,cols = np.triu_indices(len(xstar),k=1)
    upper = xstar.values[rows,cols]
    nfrac = int(np.count_nonzero((upper > 0) & (upper < 1)))

    top_teams = xstar.columns[:top_k]
    _,details_pair = pyrankability.search.solve_pair_max_tau(D_in.loc[top_teams,top_teams],method=method,cont=False,verbose=False)
    # NOTE(review): the pair search runs on the top_k x top_k submatrix, but tau is normalised
    # with the size of the full matrix (n_full). Kept as in the original — confirm this is intended.
    diameter = calc_tau(n_full,details_pair['obj'])
    return delta,nfrac,diameter

def compute_features(D,top_k=20):
    """Compute the LOP and hillside features of a D matrix.

    Parameters
    ----------
    D : pandas.DataFrame
        Square matrix indexed by team on both axes (as produced by ``compute_D``).
    top_k : int, default 20
        Number of leading teams (in the solved ordering) passed to the pair search.

    Returns
    -------
    pandas.Series
        One value per entry of ``feature_columns``: the two deltas, twice the fractional
        upper-triangle counts, and the two diameters.
    """
    n_full = len(D)
    # NOTE(review): NaNs are replaced by 0 for LOP only; hillside receives D with its NaNs,
    # exactly as in the original code — confirm that this asymmetry is intended.
    delta_lop,nfrac_upper_lop,d_lop = _solve_and_summarize(D.fillna(0),'lop',n_full,top_k)
    delta_hillside,nfrac_upper_hillside,d_hillside = _solve_and_summarize(D,'hillside',n_full,top_k)

    features = pd.Series([delta_lop,delta_hillside,2*nfrac_upper_lop,2*nfrac_upper_hillside,d_lop,d_hillside],index=feature_columns)

    return features

In [30]:
target_column = "top15_intersection"
def process(data,target):
    """Build one D matrix per (year, frac key, parameter combination).

    Parameters
    ----------
    data : dict
        Nested mapping ``data[year][frac_key]`` holding the games passed to ``compute_D``.
    target : any
        Unused; kept so existing calls ``process(data, target)`` keep working.

    Returns
    -------
    pandas.DataFrame
        A single object column ``"D"`` holding the matrices, indexed by
        (Year, frac_key, direct_thres, spread_thres, weight_indirect, range).

    The combinations come from the notebook-level ``years``, ``frac_keys``, ``direct_thress``,
    ``spread_thress``, ``weight_indirects`` and ``domains_ranges``.
    """
    index_cols = ["Year","frac_key","direct_thres","spread_thres","weight_indirect","range"]
    outer_keys = list(itertools.product(years,frac_keys,direct_thress,spread_thress,weight_indirects,domains_ranges))

    # Collect the results first and build the frame once: DataFrame.append copies the whole
    # frame on every call and no longer exists in pandas >= 2.0.
    keys = []
    matrices = []
    for year,frac_key,dt,st,iw,domain_range in tqdm(outer_keys):
        # set the team_range
        team_range = None
        ran = domain_range[1]
        if ran == 'madness':
            team_range = madness_teams[year]
        elif ran == 'all' or "top" in ran:
            # NOTE(review): `all_teams` is not defined in the visible part of this notebook,
            # so these ranges raise NameError unless it is defined elsewhere.
            team_range = all_teams[year]
        keys.append((year,frac_key,dt,st,iw,ran))
        matrices.append(compute_D(data[year][frac_key],team_range,dt,st,iw))

    if not keys:
        # Nothing to compute: return the same empty, correctly indexed frame as before.
        return pd.DataFrame(columns=["D"]+index_cols).set_index(index_cols)

    # Fill an object array element by element so numpy never tries to stack the matrices.
    column = np.empty(len(matrices),dtype=object)
    for pos,D in enumerate(matrices):
        column[pos] = D
    index = pd.MultiIndex.from_tuples(keys,names=index_cols)
    Ds = pd.DataFrame({"D": column},index=index)
    return Ds

In [18]:
Ds = process(problem['data'],problem['target'])

100%|██████████| 102/102 [01:05<00:00,  1.10it/s]


In [19]:
Ds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,D
Year,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1
2002,frac=0.5,0,3,0.25,madness,team2 Alabama Alcorn_St Arizona B...
2002,frac=0.6,0,3,0.25,madness,team2 Alabama Alcorn_St Arizona B...
2002,frac=0.7,0,3,0.25,madness,team2 Alabama Alcorn_St Arizona B...
2002,frac=0.8,0,3,0.25,madness,team2 Alabama Alcorn_St Arizona B...
2002,frac=0.9,0,3,0.25,madness,team2 Alabama Alcorn_St Arizona B...
...,...,...,...,...,...,...
2018,frac=0.6,0,3,0.25,madness,team2 Alabama Arizona Arkansas Aub...
2018,frac=0.7,0,3,0.25,madness,team2 Alabama Arizona Arkansas Aub...
2018,frac=0.8,0,3,0.25,madness,team2 Alabama Arizona Arkansas Aub...
2018,frac=0.9,0,3,0.25,madness,team2 Alabama Arizona Arkansas Aub...


In [20]:
Ds.loc[('2002','frac=0.5',0,3,0.25,'madness'),'D']

team2,Alabama,Alcorn_St,Arizona,Boston_College,Boston_Univ,California,Central_Conn,Charlotte,Cincinnati,Connecticut,...,UNC_Wilmington,USC,Utah,Valparaiso,W_Kentucky,Wake_Forest,Winthrop,Wisconsin,Wyoming,Xavier
team1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,,0.00,,,,,,0.25,,0.00,...,,0.00,1.00,,,0.0,0.25,0.25,,0.00
Alcorn_St,0.0,,,,,0.0,,,,,...,,,,0.00,,0.0,0.00,0.00,,
Arizona,,,0.00,,,0.0,,0.50,0.0,0.25,...,,0.25,0.00,1.25,,,0.00,,,0.25
Boston_College,,,,0.00,1.00,0.0,0.25,,,0.25,...,0.00,0.00,,,,,0.25,,,0.00
Boston_Univ,,,,0.00,,0.0,0.00,0.00,,0.00,...,,,,0.00,,,0.00,0.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wake_Forest,0.0,0.25,,,,0.0,,0.00,0.0,,...,0.25,0.00,0.25,0.25,,,0.25,,0.0,
Winthrop,0.0,0.00,0.25,0.00,0.00,0.0,,0.00,0.0,,...,,,0.00,0.00,,0.0,,,0.0,0.00
Wisconsin,0.0,0.25,,,0.25,,,0.00,0.0,0.00,...,,,,,,,,0.00,,0.00
Wyoming,,,,,,,,,0.0,,...,,0.00,0.00,0.00,,0.0,0.25,,,


In [21]:
Ds.index.names

FrozenList(['Year', 'frac_key', 'direct_thres', 'spread_thres', 'weight_indirect', 'range'])

In [34]:
def create_features(Ds):
    """Compute the feature vector of every D matrix in ``Ds``.

    Parameters
    ----------
    Ds : pandas.DataFrame
        Frame with a MultiIndex and a ``"D"`` column of matrices, as returned by ``process``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``Ds`` (same index values and level names) and one column per
        entry of ``feature_columns``.
    """
    index_cols = list(Ds.index.names)

    # Accumulate rows and build the frame once: DataFrame.append copies the whole frame on
    # every call and no longer exists in pandas >= 2.0.
    keys = []
    rows = []
    for index,row in tqdm(Ds.iterrows(),total=len(Ds)):
        features = compute_features(row["D"])
        keys.append(index)
        rows.append(features.to_dict())

    if not rows:
        # Same empty, correctly indexed frame the original started from.
        return pd.DataFrame(columns=index_cols + feature_columns).set_index(index_cols)

    X = pd.DataFrame(rows,index=pd.MultiIndex.from_tuples(keys,names=index_cols),columns=feature_columns)
    return X

### To limit computation to the minimum needed later, compute features only for `frac=0.5`

In [35]:
X = create_features(Ds.xs('frac=0.5',level='frac_key',drop_level=False))



0it [00:00, ?it/s][A[A

1it [00:04,  4.19s/it][A[A

2it [00:07,  4.03s/it][A[A

3it [00:11,  3.92s/it][A[A

4it [00:15,  3.87s/it][A[A

5it [00:19,  3.87s/it][A[A

6it [00:22,  3.84s/it][A[A

7it [00:26,  3.85s/it][A[A

8it [00:30,  3.74s/it][A[A

9it [00:34,  3.80s/it][A[A

10it [00:38,  3.91s/it][A[A

11it [00:42,  3.95s/it][A[A

12it [00:46,  3.99s/it][A[A

13it [00:50,  4.02s/it][A[A

14it [00:54,  4.11s/it][A[A

15it [00:59,  4.20s/it][A[A

16it [01:02,  3.96s/it][A[A

17it [01:06,  3.90s/it][A[A

In [36]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
Year,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2002,frac=0.5,0,3,0.25,madness,41.25,8323.0,1682.0,296.0,0.967788,0.980288
2003,frac=0.5,0,3,0.25,madness,24.0,5961.0,986.0,106.0,0.943452,0.982143
2004,frac=0.5,0,3,0.25,madness,12.5,5545.0,1018.0,398.0,0.975962,0.997115
2005,frac=0.5,0,3,0.25,madness,12.5,5318.0,1160.0,220.0,0.958173,0.992308
2006,frac=0.5,0,3,0.25,madness,18.5,7012.0,528.0,196.0,0.979327,0.996154
2007,frac=0.5,0,3,0.25,madness,28.5,8546.0,1056.0,236.0,0.967788,0.99375
2008,frac=0.5,0,3,0.25,madness,24.5,6082.0,1488.0,68.0,0.937004,0.998512
2009,frac=0.5,0,3,0.25,madness,20.5,7475.0,614.0,176.0,0.984623,0.996528
2010,frac=0.5,0,3,0.25,madness,24.0,7536.0,810.0,200.0,0.976923,0.999038
2011,frac=0.5,0,3,0.25,madness,29.5,10195.0,808.0,296.0,0.982002,0.993854


In [37]:
index_cols = list(Ds.index.names)
index_cols

['Year',
 'frac_key',
 'direct_thres',
 'spread_thres',
 'weight_indirect',
 'range']

### Refine the target dataset
We will try to predict the target averaged over all parameter settings that were run for each method (grouped by `frac1`, `frac2`, `Method`, and `Year`).

In [43]:
# Collapse the target table to one value per (frac1, frac2, Method, Year): the mean of
# `target_column` over all remaining columns (i.e. over the parameter settings).
target = problem['target'].groupby(['frac1','frac2','Method','Year'])[target_column].mean().to_frame()
target

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,top15_intersection
frac1,frac2,Method,Year,Unnamed: 4_level_1
0.5,0.6,Colley,2002,0.800000
0.5,0.6,Colley,2003,0.733333
0.5,0.6,Colley,2004,0.800000
0.5,0.6,Colley,2005,0.933333
0.5,0.6,Colley,2006,0.800000
...,...,...,...,...
0.9,1.0,Massey,2014,0.800000
0.9,1.0,Massey,2015,0.866667
0.9,1.0,Massey,2016,0.800000
0.9,1.0,Massey,2017,0.866667


In [44]:
# Turn the feature index back into columns and derive a numeric `frac1` from the
# 'frac=<value>' key so the features can be joined to the target on (frac1, Year).
X_for_join = X.copy().reset_index()
X_for_join['frac1']= X_for_join['frac_key'].str.replace("frac=","").astype(float)
X_for_join

Unnamed: 0,Year,frac_key,direct_thres,spread_thres,weight_indirect,range,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside,frac1
0,2002,frac=0.5,0,3,0.25,madness,41.25,8323.0,1682.0,296.0,0.967788,0.980288,0.5
1,2003,frac=0.5,0,3,0.25,madness,24.0,5961.0,986.0,106.0,0.943452,0.982143,0.5
2,2004,frac=0.5,0,3,0.25,madness,12.5,5545.0,1018.0,398.0,0.975962,0.997115,0.5
3,2005,frac=0.5,0,3,0.25,madness,12.5,5318.0,1160.0,220.0,0.958173,0.992308,0.5
4,2006,frac=0.5,0,3,0.25,madness,18.5,7012.0,528.0,196.0,0.979327,0.996154,0.5
5,2007,frac=0.5,0,3,0.25,madness,28.5,8546.0,1056.0,236.0,0.967788,0.99375,0.5
6,2008,frac=0.5,0,3,0.25,madness,24.5,6082.0,1488.0,68.0,0.937004,0.998512,0.5
7,2009,frac=0.5,0,3,0.25,madness,20.5,7475.0,614.0,176.0,0.984623,0.996528,0.5
8,2010,frac=0.5,0,3,0.25,madness,24.0,7536.0,810.0,200.0,0.976923,0.999038,0.5
9,2011,frac=0.5,0,3,0.25,madness,29.5,10195.0,808.0,296.0,0.982002,0.993854,0.5


In [45]:
target

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,top15_intersection
frac1,frac2,Method,Year,Unnamed: 4_level_1
0.5,0.6,Colley,2002,0.800000
0.5,0.6,Colley,2003,0.733333
0.5,0.6,Colley,2004,0.800000
0.5,0.6,Colley,2005,0.933333
0.5,0.6,Colley,2006,0.800000
...,...,...,...,...
0.9,1.0,Massey,2014,0.800000
0.9,1.0,Massey,2015,0.866667
0.9,1.0,Massey,2016,0.800000
0.9,1.0,Massey,2017,0.866667


In [46]:
# Left-join the features onto the target rows by (frac1, Year). Target rows whose frac1 has no
# computed features (only frac=0.5 was run above) keep NaN in every feature column.
Xy = target.reset_index().set_index(['frac1','Year']).join(X_for_join.set_index(['frac1','Year']))
Xy

Unnamed: 0_level_0,Unnamed: 1_level_0,frac2,Method,top15_intersection,frac_key,direct_thres,spread_thres,weight_indirect,range,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
frac1,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.5,2002,0.6,Colley,0.800000,frac=0.5,0.0,3.0,0.25,madness,41.25,8323.0,1682.0,296.0,0.967788,0.980288
0.5,2002,0.6,Massey,0.733333,frac=0.5,0.0,3.0,0.25,madness,41.25,8323.0,1682.0,296.0,0.967788,0.980288
0.5,2002,0.7,Colley,0.733333,frac=0.5,0.0,3.0,0.25,madness,41.25,8323.0,1682.0,296.0,0.967788,0.980288
0.5,2002,0.7,Massey,0.800000,frac=0.5,0.0,3.0,0.25,madness,41.25,8323.0,1682.0,296.0,0.967788,0.980288
0.5,2002,0.8,Colley,0.666667,frac=0.5,0.0,3.0,0.25,madness,41.25,8323.0,1682.0,296.0,0.967788,0.980288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.9,2016,1.0,Massey,0.800000,,,,,,,,,,,
0.9,2017,1.0,Colley,0.866667,,,,,,,,,,,
0.9,2017,1.0,Massey,0.866667,,,,,,,,,,,
0.9,2018,1.0,Colley,0.800000,,,,,,,,,,,


## Narrowing our goal to frac1 = 0.5 and frac2 = 0.6

In [47]:
# Keep only the rows with frac1 == 0.5 and frac2 == 0.6 (both stay as index levels).
# NOTE(review): the recorded run emits a warning on this line — presumably because the
# (frac1, frac2) index is not sorted; confirm before changing the lookup.
data = Xy.reset_index().set_index(['frac1','frac2']).loc[0.5,0.6]
data

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Method,top15_intersection,frac_key,direct_thres,spread_thres,weight_indirect,range,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
frac1,frac2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.5,0.6,2002,Colley,0.8,frac=0.5,0.0,3.0,0.25,madness,41.25,8323.0,1682.0,296.0,0.967788,0.980288
0.5,0.6,2002,Massey,0.733333,frac=0.5,0.0,3.0,0.25,madness,41.25,8323.0,1682.0,296.0,0.967788,0.980288
0.5,0.6,2003,Colley,0.733333,frac=0.5,0.0,3.0,0.25,madness,24.0,5961.0,986.0,106.0,0.943452,0.982143
0.5,0.6,2003,Massey,0.8,frac=0.5,0.0,3.0,0.25,madness,24.0,5961.0,986.0,106.0,0.943452,0.982143
0.5,0.6,2004,Colley,0.8,frac=0.5,0.0,3.0,0.25,madness,12.5,5545.0,1018.0,398.0,0.975962,0.997115
0.5,0.6,2004,Massey,0.866667,frac=0.5,0.0,3.0,0.25,madness,12.5,5545.0,1018.0,398.0,0.975962,0.997115
0.5,0.6,2005,Colley,0.933333,frac=0.5,0.0,3.0,0.25,madness,12.5,5318.0,1160.0,220.0,0.958173,0.992308
0.5,0.6,2005,Massey,0.866667,frac=0.5,0.0,3.0,0.25,madness,12.5,5318.0,1160.0,220.0,0.958173,0.992308
0.5,0.6,2006,Colley,0.8,frac=0.5,0.0,3.0,0.25,madness,18.5,7012.0,528.0,196.0,0.979327,0.996154
0.5,0.6,2006,Massey,0.8,frac=0.5,0.0,3.0,0.25,madness,18.5,7012.0,528.0,196.0,0.979327,0.996154


In [48]:
from sklearn.preprocessing import scale
# Grouping keys: every column of `data` that is not a feature, the Year, or the target.
for_index = list(data.drop(feature_columns+['Year']+[target_column],axis=1).columns)
scaled_data = data.copy().reset_index().set_index(for_index)
# Standardize the feature columns separately within each group, so each method /
# parameter combination is scaled using only its own rows.
for ix in scaled_data.index.unique():
    print("Scaling for group of",ix)
    scaled_data.loc[ix,feature_columns] = scale(scaled_data.loc[ix,feature_columns])
scaled_data = scaled_data.reset_index()
# Only feature_columns were scaled above; the target is re-assigned positionally from `data`,
# which relies on the row order being unchanged by the set_index/reset_index round trip.
scaled_data[target_column] = data[target_column].values
scaled_data

Scaling for group of ('Colley', 'frac=0.5', 0.0, 3.0, 0.25, 'madness')
Scaling for group of ('Massey', 'frac=0.5', 0.0, 3.0, 0.25, 'madness')


  return self._getitem_tuple(key)
  coro.send(None)
  return self._getitem_tuple(key)
  coro.send(None)


Unnamed: 0,Method,frac_key,direct_thres,spread_thres,weight_indirect,range,frac1,frac2,Year,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
0,Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2002,0.8,2.373216,0.503512,2.227524,0.600604,-0.108881,-2.617142
1,Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2002,0.733333,2.373216,0.503512,2.227524,0.600604,-0.108881,-2.617142
2,Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2003,0.733333,0.035867,-1.25626,-0.043767,-0.926917,-2.085252,-2.247221
3,Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2003,0.8,0.035867,-1.25626,-0.043767,-0.926917,-2.085252,-2.247221
4,Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2004,0.8,-1.522365,-1.566194,0.06066,1.420642,0.554867,0.739551
5,Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2004,0.866667,-1.522365,-1.566194,0.06066,1.420642,0.554867,0.739551
6,Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2005,0.933333,-1.522365,-1.735317,0.524056,-0.010404,-0.889761,-0.219504
7,Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2005,0.866667,-1.522365,-1.735317,0.524056,-0.010404,-0.889761,-0.219504
8,Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2006,0.8,-0.709374,-0.473228,-1.538382,-0.203354,0.828175,0.54774
9,Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2006,0.8,-0.709374,-0.473228,-1.538382,-0.203354,0.828175,0.54774


In [49]:
import sklearn.linear_model as skl_lm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.svm import SVR
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold

# Based on the 5x2 cross-validated paired t-test from http://ieeexplore.ieee.org/document/6790639/
# Dietterich recommends a resampling method of his own devising called 5x2 cross-validation,
# which involves 5 repeats of 2-fold cross-validation.
# Two folds are chosen to ensure that each observation appears only in the train or
# test dataset for a single estimate of model skill. A paired Student's t-test is used
# on the results, updated to better reflect the limited degrees of freedom given the
# dependence between the estimated skill scores.
# With 5 degrees of freedom the critical value is 2.571 for a 5% threshold and 3.365 for a
# 2% threshold (https://www.medcalc.org/manual/t-distribution.php)

def evaluate(df,feature_cols,pred_col,model1,model2,param_grid1=None,param_grid2=None,seeds=(13, 51, 137, 24659, 347)):
    """Compare two regressors on ``df`` with repeated 2-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to evaluate on (one group when used with ``groupby(...).apply``).
    feature_cols : list of str
        Columns of ``df`` used as model inputs.
    pred_col : str
        Column of ``df`` to predict.
    model1, model2 : estimators
        Models to compare; each is wrapped in a ``GridSearchCV``.
    param_grid1, param_grid2 : dict, optional
        Hyper-parameter grids for the two models. ``None`` (the default) means an empty grid.
    seeds : sequence of int, optional
        One shuffle seed per 2-fold repetition. The default five seeds give the 5x2 scheme;
        the critical values quoted above only apply to five repetitions.

    Returns
    -------
    pandas.Series
        ``t_bar`` (first fold difference divided by the pooled standard deviation estimate),
        followed by the mean/std of the per-fold score differences and of each model's scores.
        Scores are negative mean absolute errors on the held-out fold, so higher is better and
        a positive difference favours ``model1``.
    """
    # Avoid sharing one mutable default dict between calls.
    param_grid1 = {} if param_grid1 is None else param_grid1
    param_grid2 = {} if param_grid2 is None else param_grid2

    trn = df[feature_cols]
    target = df[pred_col]
    n_repeats = len(seeds)
    # Score difference for the 1st fold of the 1st iteration
    p_1_1 = 0.0
    # Running sum of the per-iteration variance estimates
    s_sqr = 0.0
    # Score history for both models
    scores_1 = []
    scores_2 = []
    diff_scores = []
    # A single "split" whose train and test parts are both the full training half:
    # the grid search does no real inner cross-validation.
    cv = [(slice(None), slice(None))]
    # Iterate through the repeated 2-fold CV
    for i_s, seed in enumerate(seeds):
        # Split the dataset in 2 parts with the current seed
        folds = KFold(n_splits=2, shuffle=True, random_state=seed)
        # Score differences of the two folds of this iteration
        p_i = np.zeros(2)
        for i_f, (trn_idx, val_idx) in enumerate(folds.split(target, target)):
            # Split the data
            trn_x, trn_y = trn.iloc[trn_idx], target.iloc[trn_idx]
            val_x, val_y = trn.iloc[val_idx], target.iloc[val_idx]
            grid1 = GridSearchCV(model1,param_grid1,verbose=0,n_jobs=-1,cv=cv,refit=True)
            grid2 = GridSearchCV(model2,param_grid2,verbose=0,n_jobs=-1,cv=cv,refit=True)
            # Train both models
            grid1.fit(trn_x, trn_y)
            grid2.fit(trn_x, trn_y)
            errors1 = val_y - np.ravel(grid1.best_estimator_.predict(val_x))
            errors2 = val_y - np.ravel(grid2.best_estimator_.predict(val_x))
            score_1 = -np.mean(np.abs(errors1))
            score_2 = -np.mean(np.abs(errors2))

            # keep score history for mean and stdev calculation
            scores_1.append(score_1)
            scores_2.append(score_2)
            diff_scores.append(score_1 - score_2)
            # Score difference for the current fold
            p_i[i_f] = score_1 - score_2
            # Keep the score difference of the 1st iteration and 1st fold
            if i_s == 0 and i_f == 0:
                p_1_1 = p_i[i_f]
        # Mean of the score differences for the current 2-fold CV
        p_i_bar = (p_i[0] + p_i[1]) / 2
        # Variance estimate for the current 2-fold CV, added to the overall sum
        s_sqr += (p_i[0] - p_i_bar) ** 2 + (p_i[1] - p_i_bar) ** 2

    # t value: the first difference divided by the square root of the averaged variance estimate
    t_bar = p_1_1 / ((s_sqr / n_repeats) ** .5)

    return pd.Series([t_bar,np.mean(diff_scores), np.std(diff_scores),np.mean(scores_1),np.mean(scores_2),np.std(scores_1),np.std(scores_2)],index=["t_bar","Difference Mean","Difference Stdev","Mean Score 1","Mean Score 2","Stdev 1","Stdev 2"])



In [50]:
for_index

['Method',
 'frac_key',
 'direct_thres',
 'spread_thres',
 'weight_indirect',
 'range']

In [51]:
scaled_data.set_index(for_index)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,frac1,frac2,Year,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
Method,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2002,0.8,2.373216,0.503512,2.227524,0.600604,-0.108881,-2.617142
Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2002,0.733333,2.373216,0.503512,2.227524,0.600604,-0.108881,-2.617142
Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2003,0.733333,0.035867,-1.25626,-0.043767,-0.926917,-2.085252,-2.247221
Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2003,0.8,0.035867,-1.25626,-0.043767,-0.926917,-2.085252,-2.247221
Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2004,0.8,-1.522365,-1.566194,0.06066,1.420642,0.554867,0.739551
Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2004,0.866667,-1.522365,-1.566194,0.06066,1.420642,0.554867,0.739551
Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2005,0.933333,-1.522365,-1.735317,0.524056,-0.010404,-0.889761,-0.219504
Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2005,0.866667,-1.522365,-1.735317,0.524056,-0.010404,-0.889761,-0.219504
Colley,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2006,0.8,-0.709374,-0.473228,-1.538382,-0.203354,0.828175,0.54774
Massey,frac=0.5,0.0,3.0,0.25,madness,0.5,0.6,2006,0.8,-0.709374,-0.473228,-1.538382,-0.203354,0.828175,0.54774


In [52]:
# For each group of `for_index` values, compare a DummyRegressor baseline (model 1) against a
# linear-kernel SVR (model 2) on all feature columns. The SVR is tuned over C and epsilon;
# the other kernels are currently disabled in the grid.
dummy_svr_results = scaled_data.groupby(for_index).apply(evaluate,feature_columns,
                                                         target_column,
                                                         DummyRegressor(),SVR(gamma='scale'),
                                                         param_grid1 = {},
                                                         param_grid2 = {'C': [0.1,1,10], 
                                                                        'epsilon': [0.1,0.5,1.],
                                                                        'kernel': ['linear']#, 'poly', 'rbf', 'sigmoid'],
                                                                       })


In [54]:
dummy_svr_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,t_bar,Difference Mean,Difference Stdev,Mean Score 1,Mean Score 2,Stdev 1,Stdev 2
Method,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Colley,frac=0.5,0.0,3.0,0.25,madness,0.75893,0.006688829,0.010839,-0.066296,-0.072985,0.013896,0.017555
Massey,frac=0.5,0.0,3.0,0.25,madness,-1.051758,-9.020562e-18,0.005188,-0.046111,-0.046111,0.003967,0.005537


In [62]:
from sklearn.linear_model import LinearRegression

# Same comparison as above, but model 2 is an ordinary linear regression that uses the single
# feature "diameter_hillside"; neither model has hyper-parameters to search.
dummy_linear_results = scaled_data.groupby(for_index).apply(evaluate,["diameter_hillside"],
                                                            target_column,
                                                            DummyRegressor(),LinearRegression(),
                                                            param_grid1 = {},
                                                            param_grid2 = {})


In [63]:
dummy_linear_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,t_bar,Difference Mean,Difference Stdev,Mean Score 1,Mean Score 2,Stdev 1,Stdev 2
Method,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Colley,frac=0.5,0.0,3.0,0.25,madness,0.214972,0.005098,0.005961,-0.066296,-0.071395,0.013896,0.015589
Massey,frac=0.5,0.0,3.0,0.25,madness,1.246773,0.002231,0.005013,-0.046111,-0.048342,0.003967,0.004969


In [59]:
scaled_data.set_index('Method').loc['Colley'][[target_column]+feature_columns].corr()

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.014313,0.106036,-0.038364,0.49202,0.278904,-0.012366
delta_lop,-0.014313,1.0,0.665603,0.354442,0.080729,-0.017145,-0.388255
delta_hillside,0.106036,0.665603,1.0,-0.118375,0.175304,0.495383,0.008037
nfrac_xstar_lop,-0.038364,0.354442,-0.118375,1.0,-0.025839,-0.578411,-0.350976
nfrac_xstar_hillside,0.49202,0.080729,0.175304,-0.025839,1.0,0.321597,-0.051514
diameter_lop,0.278904,-0.017145,0.495383,-0.578411,0.321597,1.0,0.315623
diameter_hillside,-0.012366,-0.388255,0.008037,-0.350976,-0.051514,0.315623,1.0


In [61]:
scaled_data.set_index('Method').loc['Massey'][[target_column]+feature_columns].corr()

Unnamed: 0,top15_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top15_intersection,1.0,-0.181122,-0.118407,0.067466,-0.08798,-0.079867,0.42151
delta_lop,-0.181122,1.0,0.665603,0.354442,0.080729,-0.017145,-0.388255
delta_hillside,-0.118407,0.665603,1.0,-0.118375,0.175304,0.495383,0.008037
nfrac_xstar_lop,0.067466,0.354442,-0.118375,1.0,-0.025839,-0.578411,-0.350976
nfrac_xstar_hillside,-0.08798,0.080729,0.175304,-0.025839,1.0,0.321597,-0.051514
diameter_lop,-0.079867,-0.017145,0.495383,-0.578411,0.321597,1.0,0.315623
diameter_hillside,0.42151,-0.388255,0.008037,-0.350976,-0.051514,0.315623,1.0
