# RPLib Problem 0001
## Baseline

Goal of this notebook is to analyze and visualize the results

In [198]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [199]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

In [200]:
from pathlib import Path
home = str(Path.home())
home

'/home/jupyter-pander14'

In [201]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [202]:
sys.path.insert(0,"%s/sensitivity_study/src"%home)
from sensitivity_tests import *
from utilities import *
from base import *

In [203]:
import itertools
import joblib

In [204]:
problem = joblib.load("/disk/RPLib/problem_0001.joblib.z")

In [205]:
print(problem["description"])


A practitioner wants to predict the degree to which a the rankings during season 
of the NCAA Men’s Basketball are likely to change as more games are played (i.e., sensitivity to more games). 
They want to start the analysis after a minimum of 50% of the games are played. 
They want to run Massey and Colley.

Sensitivity of new games will be measured as the intersection of between two 
rankings derived from before and after the new games are included.



In [206]:
problem['target']

Unnamed: 0,frac1,frac2,domain,range,direct_thres,spread_thres,weight_indirect,Method,Year,top15_intersection
0,0.5,0.6,all,madness,3.0,3.0,0.00,Massey,2002,0.266667
1,0.5,0.7,all,madness,3.0,3.0,0.00,Massey,2002,0.266667
2,0.5,0.8,all,madness,3.0,3.0,0.00,Massey,2002,0.333333
3,0.5,0.9,all,madness,3.0,3.0,0.00,Massey,2002,0.333333
4,0.5,1.0,all,madness,3.0,3.0,0.00,Massey,2002,0.400000
...,...,...,...,...,...,...,...,...,...,...
505,0.7,0.9,all,madness,0.0,3.0,0.25,Colley,2018,0.266667
506,0.7,1.0,all,madness,0.0,3.0,0.25,Colley,2018,0.266667
507,0.8,0.9,all,madness,0.0,3.0,0.25,Colley,2018,0.400000
508,0.8,1.0,all,madness,0.0,3.0,0.25,Colley,2018,0.266667


In [207]:
problem['data'].keys()

dict_keys(['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

In [208]:
problem['data']['2002'].keys()

dict_keys(['frac=0.5', 'frac=0.6', 'frac=0.7', 'frac=0.8', 'frac=0.9', 'frac=1.0'])

In [209]:
years = list(problem['data'].keys())
frac_keys = list(problem['data'][years[0]].keys())
remaining_games = problem['other']['remaining_games']
madness_teams = problem['other']['madness_teams']

In [210]:
# Parameters to search
direct_thress = [0]
spread_thress = [0]
weight_indirects = [0.25]
domains_ranges = [('all','madness')]

In [211]:
def compute_D(game_df,team_range,direct_thres,spread_thres,weight_indirect):
    map_func = lambda linked: pyrankability.construct.support_map_vectorized_direct_indirect_weighted(linked,direct_thres=direct_thres,spread_thres=spread_thres,weight_indirect=weight_indirect)
    D = pyrankability.construct.V_count_vectorized(game_df,map_func).reindex(index=team_range,columns=team_range)
    return D

In [212]:
feature_columns = ["delta_lop","delta_hillside","nfrac_xstar_lop","nfrac_xstar_hillside","diameter_lop","diameter_hillside"]

def calc_tau(n,obj):
    nchoose2 = pyrankability.common.nCr(n,2)
    tau = (nchoose2 - obj)/nchoose2
    return tau

def compute_features(D):
    delta_lop,details_lop = pyrankability.rank.solve(D.fillna(0),method='lop',cont=True)

    x = pd.DataFrame(details_lop['x'],index=D.index,columns=D.columns)
    r = x.sum(axis=0)
    order = np.argsort(r)
    xstar = x.iloc[order,:].iloc[:,order]
    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)
    inxs = np.triu_indices(len(xstar),k=1)
    xstar_upper = xstar.values[inxs[0],inxs[1]]
    nfrac_upper_lop = sum((xstar_upper > 0) & (xstar_upper < 1))
    
    k_two_distant,details_two_distant = pyrankability.search.solve_pair_max_tau(D.fillna(0),method='lop',verbose=False)
    d_lop = calc_tau(len(D),details_two_distant['obj'])
    
    delta_hillside,details_hillside = pyrankability.rank.solve(D,method='hillside',cont=True)
    
    x = pd.DataFrame(details_hillside['x'],index=D.index,columns=D.columns)
    r = x.sum(axis=0)
    order = np.argsort(r)
    xstar = x.iloc[order,:].iloc[:,order]
    xstar.loc[:,:] = pyrankability.common.threshold_x(xstar.values)
    inxs = np.triu_indices(len(xstar),k=1)
    xstar_upper = xstar.values[inxs[0],inxs[1]]
    nfrac_upper_hillside = sum((xstar_upper > 0) & (xstar_upper < 1))
    
    k_two_distant,details_two_distant = pyrankability.search.solve_pair_max_tau(D,method='hillside',verbose=False)
    d_hillside = calc_tau(len(D),details_two_distant['obj'])
    
    features = pd.Series([delta_lop,delta_hillside,2*nfrac_upper_lop,2*nfrac_upper_hillside,d_lop,d_hillside],index=feature_columns)

    return features

In [213]:
target_column = "top10_intersection"
def process(data,target):
    index_cols = ["Year","frac_key","direct_thres","spread_thres","weight_indirect","range"]
    Ds = pd.DataFrame(columns=["D"]+index_cols)
    Ds.set_index(index_cols,inplace=True)
    outer_keys = list(itertools.product(years,frac_keys,direct_thress,spread_thress,weight_indirects,domains_ranges))
    for year,frac_key,dt,st,iw,domain_range in tqdm(outer_keys):
       # set the team_range
        team_range = None
        ran = domain_range[1]
        if ran == 'madness':
            team_range = madness_teams[year]
        elif ran == 'all':
            team_range = all_teams[year]
        elif "top" in ran:
            team_range = all_teams[year]
        D = compute_D(data[year][frac_key],team_range,dt,st,iw)
        Ds = Ds.append(pd.Series([D],index=["D"],name=(year,frac_key,dt,st,iw,ran))) 
    return Ds

In [214]:
Ds = process(problem['data'],problem['target'])







  0%|          | 0/102 [00:00<?, ?it/s][A[A[A[A[A[A





  1%|          | 1/102 [00:00<00:35,  2.85it/s][A[A[A[A[A[A





  2%|▏         | 2/102 [00:00<00:37,  2.70it/s][A[A[A[A[A[A





  3%|▎         | 3/102 [00:01<00:40,  2.42it/s][A[A[A[A[A[A





  4%|▍         | 4/102 [00:01<00:46,  2.09it/s][A[A[A[A[A[A





  5%|▍         | 5/102 [00:02<00:56,  1.71it/s][A[A[A[A[A[A





  6%|▌         | 6/102 [00:03<01:07,  1.43it/s][A[A[A[A[A[A





  7%|▋         | 7/102 [00:04<00:55,  1.70it/s][A[A[A[A[A[A





  8%|▊         | 8/102 [00:04<00:50,  1.88it/s][A[A[A[A[A[A





  9%|▉         | 9/102 [00:04<00:48,  1.92it/s][A[A[A[A[A[A





 10%|▉         | 10/102 [00:05<00:50,  1.82it/s][A[A[A[A[A[A





 11%|█         | 11/102 [00:06<00:57,  1.59it/s][A[A[A[A[A[A





 12%|█▏        | 12/102 [00:07<01:05,  1.37it/s][A[A[A[A[A[A





 13%|█▎        | 13/102 [00:07<00:54,  1.65it/s][A[A[A[A[A[A





 14%

In [215]:
Ds.loc[('2002','frac=0.5',0,0,0.25,'madness'),'D']

team2,Alabama,Alcorn_St,Arizona,Boston_College,Boston_Univ,California,Central_Conn,Charlotte,Cincinnati,Connecticut,...,UNC_Wilmington,USC,Utah,Valparaiso,W_Kentucky,Wake_Forest,Winthrop,Wisconsin,Wyoming,Xavier
team1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,,0.00,,,,,,0.25,,0.00,...,,0.00,1.00,,,0.0,0.25,0.25,,0.00
Alcorn_St,0.0,,,,,0.0,,,,,...,,,,0.00,,0.0,0.00,0.00,,
Arizona,,,0.00,,,0.0,,0.50,0.0,0.25,...,,0.25,0.00,1.25,,,0.00,,,0.25
Boston_College,,,,0.00,1.00,0.0,0.25,,,0.25,...,0.00,0.00,,,,,0.25,,,0.00
Boston_Univ,,,,0.00,,0.0,0.00,0.00,,0.00,...,,,,0.00,,,0.00,0.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wake_Forest,0.0,0.25,,,,0.0,,0.00,0.0,,...,0.25,0.00,0.25,0.25,,,0.25,,0.0,
Winthrop,0.0,0.00,0.25,0.00,0.00,0.0,,0.00,0.0,,...,,,0.00,0.00,,0.0,,,0.0,0.00
Wisconsin,0.0,0.25,,,0.25,,,0.00,0.0,0.00,...,,,,,,,,0.00,,0.00
Wyoming,,,,,,,,,0.0,,...,,0.00,0.00,0.00,,0.0,0.25,,,


In [216]:
Ds.index.names

FrozenList(['Year', 'frac_key', 'direct_thres', 'spread_thres', 'weight_indirect', 'range'])

In [217]:
def create_features(Ds):
    index_cols = list(Ds.index.names)
    X = pd.DataFrame(columns=index_cols + feature_columns)
    X.set_index(index_cols,inplace=True)
    for index,row in tqdm(Ds.iterrows()):
        year,frac_key,dt,st,iw,ran = index
        features = compute_features(Ds.loc[(year,frac_key,dt,st,iw,ran),"D"])
        features.name = index
        X = X.append(features)
    return X

In [None]:
X = create_features(Ds)







0it [00:00, ?it/s][A[A[A[A[A[A





1it [26:15, 1575.00s/it][A[A[A[A[A[A





2it [31:04, 1189.31s/it][A[A[A[A[A[A





3it [40:28, 1001.62s/it][A[A[A[A[A[A





4it [1:05:27, 1150.93s/it][A[A[A[A[A[A





5it [1:11:24, 912.93s/it] [A[A[A[A[A[A





6it [1:39:35, 1146.35s/it][A[A[A[A[A[A





7it [1:46:48, 932.15s/it] [A[A[A[A[A[A





8it [1:53:40, 776.05s/it][A[A[A[A[A[A





9it [3:12:26, 1961.21s/it][A[A[A[A[A[A





10it [3:17:17, 1460.20s/it][A[A[A[A[A[A





11it [3:23:55, 1141.48s/it][A[A[A[A[A[A





12it [3:35:50, 1013.58s/it][A[A[A[A[A[A





13it [3:38:54, 764.55s/it] [A[A[A[A[A[A





14it [3:43:11, 612.44s/it][A[A[A[A[A[A





15it [3:47:23, 504.16s/it][A[A[A[A[A[A





16it [3:50:01, 400.45s/it][A[A[A[A[A[A





17it [4:01:45, 491.55s/it][A[A[A[A[A[A





18it [4:04:42, 397.10s/it][A[A[A[A[A[A





19it [4:11:34, 401.49s/it][A[A[A[A[A[A





20it 

In [None]:
X

In [None]:
index_cols = list(Ds.index.names)
index_cols

### Refine the target dataset
We will try to predict the average over the parameters run for each method.

In [197]:
target = problem['target'].groupby(['frac1','frac2','Method','Year'])['top10_intersection']#.mean().to_frame()
target

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f02a45ed048>

In [177]:
X_for_join = X.copy().reset_index()
X_for_join['frac1']= X_for_join['frac_key'].str.replace("frac=","").astype(float)
X_for_join

Unnamed: 0,Year,frac_key,direct_thres,spread_thres,weight_indirect,range,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside,frac1
0,2002,frac=0.5,0,0,0.25,madness,2.50,111.0,3966.0,3790.0,0.047596,0.089904,0.5
1,2002,frac=0.6,0,0,0.25,madness,2.25,159.0,3896.0,3756.0,0.064423,0.098558,0.6
2,2002,frac=0.7,0,0,0.25,madness,5.25,295.0,3922.0,3710.0,0.060577,0.108173,0.7
3,2002,frac=0.8,0,0,0.25,madness,6.00,318.0,3948.0,3774.0,0.054808,0.095673,0.8
4,2002,frac=0.9,0,0,0.25,madness,7.25,426.0,3886.0,3734.0,0.066346,0.102885,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,2018,frac=0.6,0,0,0.25,madness,41.25,6059.0,1212.0,572.0,0.704861,0.869544,0.6
98,2018,frac=0.7,0,0,0.25,madness,58.25,7983.0,1176.0,408.0,0.713294,0.898810,0.7
99,2018,frac=0.8,0,0,0.25,madness,82.50,9938.0,1004.0,410.0,0.752976,0.898313,0.8
100,2018,frac=0.9,0,0,0.25,madness,126.00,11782.0,1226.0,650.0,0.708333,0.894345,0.9


In [178]:
Xy = target.reset_index().set_index(['frac1','Year']).join(X_for_join.set_index(['frac1','Year']))
Xy

Unnamed: 0_level_0,Unnamed: 1_level_0,frac2,Method,top10_intersection,frac_key,direct_thres,spread_thres,weight_indirect,range,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
frac1,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.5,2002,0.6,Colley,0.20,frac=0.5,0,0,0.25,madness,2.50,111.0,3966.0,3790.0,0.047596,0.089904
0.5,2002,0.6,Massey,0.25,frac=0.5,0,0,0.25,madness,2.50,111.0,3966.0,3790.0,0.047596,0.089904
0.5,2002,0.7,Colley,0.20,frac=0.5,0,0,0.25,madness,2.50,111.0,3966.0,3790.0,0.047596,0.089904
0.5,2002,0.7,Massey,0.25,frac=0.5,0,0,0.25,madness,2.50,111.0,3966.0,3790.0,0.047596,0.089904
0.5,2002,0.8,Colley,0.30,frac=0.5,0,0,0.25,madness,2.50,111.0,3966.0,3790.0,0.047596,0.089904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.9,2016,1.0,Massey,0.35,frac=0.9,0,0,0.25,madness,32.25,1706.0,4002.0,3756.0,0.122037,0.175593
0.9,2017,1.0,Colley,0.30,frac=0.9,0,0,0.25,madness,29.75,1435.0,3488.0,3212.0,0.136905,0.203373
0.9,2017,1.0,Massey,0.25,frac=0.9,0,0,0.25,madness,29.75,1435.0,3488.0,3212.0,0.136905,0.203373
0.9,2018,1.0,Colley,0.35,frac=0.9,0,0,0.25,madness,126.00,11782.0,1226.0,650.0,0.708333,0.894345


## Narrowing our goal to 0.5 and 0.6

In [179]:
data = Xy.reset_index().set_index(['frac1','frac2']).loc[0.5,0.6]
data

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Method,top10_intersection,frac_key,direct_thres,spread_thres,weight_indirect,range,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
frac1,frac2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.5,0.6,2002,Colley,0.2,frac=0.5,0,0,0.25,madness,2.5,111.0,3966.0,3790.0,0.047596,0.089904
0.5,0.6,2002,Massey,0.25,frac=0.5,0,0,0.25,madness,2.5,111.0,3966.0,3790.0,0.047596,0.089904
0.5,0.6,2003,Colley,0.3,frac=0.5,0,0,0.25,madness,0.25,51.0,3904.0,3736.0,0.032242,0.073413
0.5,0.6,2003,Massey,0.3,frac=0.5,0,0,0.25,madness,0.25,51.0,3904.0,3736.0,0.032242,0.073413
0.5,0.6,2004,Colley,0.25,frac=0.5,0,0,0.25,madness,0.25,43.0,4082.0,3924.0,0.01875,0.056731
0.5,0.6,2004,Massey,0.4,frac=0.5,0,0,0.25,madness,0.25,43.0,4082.0,3924.0,0.01875,0.056731
0.5,0.6,2005,Colley,0.35,frac=0.5,0,0,0.25,madness,0.0,23.0,4054.0,3776.0,0.029327,0.092308
0.5,0.6,2005,Massey,0.15,frac=0.5,0,0,0.25,madness,0.0,23.0,4054.0,3776.0,0.029327,0.092308
0.5,0.6,2006,Colley,0.3,frac=0.5,0,0,0.25,madness,3.5,308.0,3874.0,3398.0,0.072115,0.183654
0.5,0.6,2006,Massey,0.2,frac=0.5,0,0,0.25,madness,3.5,308.0,3874.0,3398.0,0.072115,0.183654


In [183]:
from sklearn.preprocessing import scale
for_index = list(data.drop(feature_columns+['Year']+[target_column],axis=1).columns)
scaled_data = data.copy().reset_index().set_index(for_index)
for ix in scaled_data.index.unique():
    print("Scaling for group of",ix)
    scaled_data.loc[ix,feature_columns] = scale(scaled_data.loc[ix,feature_columns])
scaled_data = scaled_data.reset_index()
scaled_data[target_column] = data[target_column].values
scaled_data

Scaling for group of ('Colley', 'frac=0.5', 0, 0, 0.25, 'madness')
Scaling for group of ('Massey', 'frac=0.5', 0, 0, 0.25, 'madness')


  return self._getitem_tuple(key)
  coro.send(None)
  return self._getitem_tuple(key)
  coro.send(None)


Unnamed: 0,Method,frac_key,direct_thres,spread_thres,weight_indirect,range,frac1,frac2,Year,top10_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
0,Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2002,0.2,-0.279778,-0.391155,0.167134,0.310972,-0.353926,-0.430557
1,Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2002,0.25,-0.279778,-0.391155,0.167134,0.310972,-0.353926,-0.430557
2,Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2003,0.3,-0.814855,-0.473606,0.05972,0.241582,-0.4706,-0.522362
3,Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2003,0.3,-0.814855,-0.473606,0.05972,0.241582,-0.4706,-0.522362
4,Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2004,0.25,-0.814855,-0.484599,0.368103,0.483164,-0.573125,-0.61523
5,Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2004,0.4,-0.814855,-0.484599,0.368103,0.483164,-0.573125,-0.61523
6,Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2005,0.35,-0.874308,-0.512083,0.319593,0.292982,-0.492752,-0.417174
7,Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2005,0.15,-0.874308,-0.512083,0.319593,0.292982,-0.492752,-0.417174
8,Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2006,0.3,-0.041967,-0.120443,0.007745,-0.192751,-0.167608,0.091348
9,Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2006,0.2,-0.041967,-0.120443,0.007745,-0.192751,-0.167608,0.091348


In [185]:
import sklearn.linear_model as skl_lm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.svm import SVR
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold

# based on 5x2 from http://ieeexplore.ieee.org/document/6790639/
# Dietterich also recommends a resampling method of his own devising called 5×2 cross-validation that involves 5 repeats of 2-fold cross-validation.
# Two folds are chosen to ensure that each observation appears only in the train or 
# test dataset for a single estimate of model skill. A paired Student’s t-test is used 
# on the results, updated to better reflect the limited degrees of freedom given the 
# dependence between the estimated skill scores.
# This interval is 2.571 for a 5% threshold and 3.365 for a 2% thresholds (https://www.medcalc.org/manual/t-distribution.php)

def evaluate(df,feature_cols,pred_col,model1,model2,param_grid1={},param_grid2={}):
    trn = df[feature_cols]
    target = df[pred_col]
    # Choose seeds for each 2-fold iterations
    seeds = [13, 51, 137, 24659, 347]
    # Initialize the score difference for the 1st fold of the 1st iteration 
    p_1_1 = 0.0
    # Initialize a place holder for the variance estimate
    s_sqr = 0.0
    # Initialize scores list for both classifiers
    scores_1 = []
    scores_2 = []
    diff_scores = []
    # Iterate through 5 2-fold CV
    for i_s, seed in enumerate(seeds):
        # Split the dataset in 2 parts with the current seed
        folds = KFold(n_splits=2, shuffle=True, random_state=seed)
        # Initialize score differences
        p_i = np.zeros(2)
        # Go through the current 2 fold
        for i_f, (trn_idx, val_idx) in enumerate(folds.split(target, target)):
            # Split the data
            trn_x, trn_y = trn.iloc[trn_idx], target.iloc[trn_idx]
            val_x, val_y = trn.iloc[val_idx], target.iloc[val_idx]
            cv = [(slice(None), slice(None))] # don't perform any cross validation
            grid1 = GridSearchCV(model1,param_grid1,verbose=0,n_jobs=-1,cv=cv,refit=True)
            grid2 = GridSearchCV(model2,param_grid2,verbose=0,n_jobs=-1,cv=cv,refit=True)
            # Train classifiers
            grid1.fit(trn_x, trn_y)
            grid2.fit(trn_x, trn_y)
            best_estimator1 = grid1.best_estimator_
            best_estimator2 = grid2.best_estimator_
            errors1 = val_y - best_estimator1.predict(val_x).flat
            errors2 = val_y - best_estimator2.predict(val_x).flat
            score_1 = -np.mean(np.abs(errors1))
            score_2 = -np.mean(np.abs(errors2))

            # keep score history for mean and stdev calculation
            scores_1.append(score_1)
            scores_2.append(score_2)
            diff_scores.append(score_1 - score_2)
            #print("Fold %2d score difference = %.6f" % (i_f + 1, score_1 - score_2))
            # Compute score difference for current fold  
            p_i[i_f] = score_1 - score_2
            # Keep the score difference of the 1st iteration and 1st fold
            if (i_s == 0) & (i_f == 0):
                p_1_1 = p_i[i_f]
        # Compute mean of scores difference for the current 2-fold CV
        p_i_bar = (p_i[0] + p_i[1]) / 2
        # Compute the variance estimate for the current 2-fold CV
        s_i_sqr = (p_i[0] - p_i_bar) ** 2 + (p_i[1] - p_i_bar) ** 2 
        # Add up to the overall variance
        s_sqr += s_i_sqr

    # Compute t value as the first difference divided by the square root of variance estimate
    t_bar = p_1_1 / ((s_sqr / 5) ** .5) 
 
    return pd.Series([t_bar,np.mean(diff_scores), np.std(diff_scores),np.mean(scores_1),np.mean(scores_2),np.std(scores_1),np.std(scores_2)],index=["t_bar","Difference Mean","Difference Stdev","Mean Score 1","Mean Score 2","Stdev 1","Stdev 2"])



In [186]:
for_index

['Method',
 'frac_key',
 'direct_thres',
 'spread_thres',
 'weight_indirect',
 'range']

In [196]:
scaled_data.set_index(for_index)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,frac1,frac2,Year,top10_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
Method,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2002,0.2,-0.279778,-0.391155,0.167134,0.310972,-0.353926,-0.430557
Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2002,0.25,-0.279778,-0.391155,0.167134,0.310972,-0.353926,-0.430557
Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2003,0.3,-0.814855,-0.473606,0.05972,0.241582,-0.4706,-0.522362
Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2003,0.3,-0.814855,-0.473606,0.05972,0.241582,-0.4706,-0.522362
Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2004,0.25,-0.814855,-0.484599,0.368103,0.483164,-0.573125,-0.61523
Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2004,0.4,-0.814855,-0.484599,0.368103,0.483164,-0.573125,-0.61523
Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2005,0.35,-0.874308,-0.512083,0.319593,0.292982,-0.492752,-0.417174
Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2005,0.15,-0.874308,-0.512083,0.319593,0.292982,-0.492752,-0.417174
Colley,frac=0.5,0,0,0.25,madness,0.5,0.6,2006,0.3,-0.041967,-0.120443,0.007745,-0.192751,-0.167608,0.091348
Massey,frac=0.5,0,0,0.25,madness,0.5,0.6,2006,0.2,-0.041967,-0.120443,0.007745,-0.192751,-0.167608,0.091348


In [187]:
dummy_svr_results = scaled_data.groupby(for_index).apply(evaluate,feature_columns,
                                                         target_column,
                                                         DummyRegressor(),SVR(gamma='scale'),
                                                         param_grid1 = {},
                                                         param_grid2 = {'C': [0.1,1,10], 
                                                                        'epsilon': [0.1,0.5,1.],
                                                                        'kernel': ['linear']#, 'poly', 'rbf', 'sigmoid'],
                                                                       })


In [188]:
dummy_svr_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,t_bar,Difference Mean,Difference Stdev,Mean Score 1,Mean Score 2,Stdev 1,Stdev 2
Method,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Colley,frac=0.5,0,0,0.25,madness,0.283914,0.02598,0.043992,-0.069306,-0.095286,0.015835,0.036026
Massey,frac=0.5,0,0,0.25,madness,1.085259,0.04916,0.048952,-0.072222,-0.121382,0.011275,0.048174


In [189]:
from sklearn.linear_model import LinearRegression

dummy_linear_results = scaled_data.groupby(for_index).apply(evaluate,feature_columns,
                                                            target_column,
                                                            DummyRegressor(),LinearRegression(),
                                                            param_grid1 = {},
                                                            param_grid2 = {})


In [190]:
dummy_linear_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,t_bar,Difference Mean,Difference Stdev,Mean Score 1,Mean Score 2,Stdev 1,Stdev 2
Method,frac_key,direct_thres,spread_thres,weight_indirect,range,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Colley,frac=0.5,0,0,0.25,madness,0.296564,0.161822,0.190088,-0.069306,-0.231128,0.015835,0.197077
Massey,frac=0.5,0,0,0.25,madness,0.212473,0.223405,0.247342,-0.072222,-0.295627,0.011275,0.25477


In [193]:
scaled_data.set_index('Method').loc['Colley'][[target_column]+feature_columns].corr()

Unnamed: 0,top10_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top10_intersection,1.0,0.235198,0.33286,-0.270931,-0.293501,0.334932,0.339445
delta_lop,0.235198,1.0,0.957045,-0.86135,-0.892216,0.950884,0.947786
delta_hillside,0.33286,0.957045,1.0,-0.939114,-0.960669,0.996449,0.994316
nfrac_xstar_lop,-0.270931,-0.86135,-0.939114,1.0,0.992538,-0.945268,-0.951195
nfrac_xstar_hillside,-0.293501,-0.892216,-0.960669,0.992538,1.0,-0.963285,-0.975179
diameter_lop,0.334932,0.950884,0.996449,-0.945268,-0.963285,1.0,0.995465
diameter_hillside,0.339445,0.947786,0.994316,-0.951195,-0.975179,0.995465,1.0


In [194]:
data.set_index('Method').loc['Colley'][[target_column]+feature_columns].corr()

Unnamed: 0,top10_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top10_intersection,1.0,0.235198,0.33286,-0.270931,-0.293501,0.334932,0.339445
delta_lop,0.235198,1.0,0.957045,-0.86135,-0.892216,0.950884,0.947786
delta_hillside,0.33286,0.957045,1.0,-0.939114,-0.960669,0.996449,0.994316
nfrac_xstar_lop,-0.270931,-0.86135,-0.939114,1.0,0.992538,-0.945268,-0.951195
nfrac_xstar_hillside,-0.293501,-0.892216,-0.960669,0.992538,1.0,-0.963285,-0.975179
diameter_lop,0.334932,0.950884,0.996449,-0.945268,-0.963285,1.0,0.995465
diameter_hillside,0.339445,0.947786,0.994316,-0.951195,-0.975179,0.995465,1.0


In [195]:
scaled_data.set_index('Method').loc['Massey'][[target_column]+feature_columns].corr()

Unnamed: 0,top10_intersection,delta_lop,delta_hillside,nfrac_xstar_lop,nfrac_xstar_hillside,diameter_lop,diameter_hillside
top10_intersection,1.0,0.039379,0.072486,-0.071799,-0.023966,0.072613,0.034406
delta_lop,0.039379,1.0,0.957045,-0.86135,-0.892216,0.950884,0.947786
delta_hillside,0.072486,0.957045,1.0,-0.939114,-0.960669,0.996449,0.994316
nfrac_xstar_lop,-0.071799,-0.86135,-0.939114,1.0,0.992538,-0.945268,-0.951195
nfrac_xstar_hillside,-0.023966,-0.892216,-0.960669,0.992538,1.0,-0.963285,-0.975179
diameter_lop,0.072613,0.950884,0.996449,-0.945268,-0.963285,1.0,0.995465
diameter_hillside,0.034406,0.947786,0.994316,-0.951195,-0.975179,0.995465,1.0
