**Magic Commands that are useful**
* autoreload is helpful because it reloads packages automatically once they are changed (i.e., pyrankability)
* matploblib inline is helpful because it tells the notebook to output figures to the notebook

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

**Convention on imports**
* It is convention to put builtin imports first
* Then standard third party imports
* Then custom imports (pyrankability)

In [2]:
import copy
import os

In [3]:
import pandas as pd
import numpy as np

In [4]:
import sys
# We need to include the path to pyrankability. This could be different for Tim, but altneratively he could point to your copy
sys.path.insert(0,"/disk/home/amy/rankability_toolbox_dev")

In [5]:
sys.path.insert(0,"/disk/home/amy/sensitivity_study/src")

In [6]:
from sensitivity_tests import *

In [7]:
import pyrankability

In [8]:
from base import *

In [9]:
games={}
years = ["2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018"]
for year in years:
    games[year] = read_data('data/%steams.txt'%year,'data/%sgames.txt'%year,'data/%sMadnessTeams.txt'%year)
games[year]

Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
0,1,16,737011,2017-11-13,1,83,-1,69,Arkansas_St,Abilene_Chr,0,0
1,1,41,737114,2018-02-24,-1,74,1,72,Cent_Arkansas,Abilene_Chr,0,0
3,1,143,737018,2017-11-20,-1,75,1,67,Lipscomb,Abilene_Chr,1,0
4,1,143,737045,2017-12-17,1,67,-1,65,Lipscomb,Abilene_Chr,1,0
5,1,199,737056,2017-12-28,1,77,-1,74,New_Orleans,Abilene_Chr,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5535,351,318,737048,2017-12-20,1,91,-1,74,Utah_St,Youngstown_St,0,0
5536,351,339,737086,2018-01-27,1,85,-1,67,WI_Green_Bay,Youngstown_St,0,0
5537,351,340,737084,2018-01-25,1,66,-1,55,WI_Milwaukee,Youngstown_St,0,0
5538,351,347,737074,2018-01-15,1,77,-1,67,Wright_St,Youngstown_St,1,0


In [10]:
for year in games.keys():
    print("Dates for %s:"%year,games[year]['date'].min(),games[year]['date'].max())

Dates for 2002: 2001-11-08 00:00:00 2002-03-03 00:00:00
Dates for 2003: 2002-11-14 00:00:00 2003-03-09 00:00:00
Dates for 2004: 2003-11-13 00:00:00 2004-03-07 00:00:00
Dates for 2005: 2004-11-11 00:00:00 2005-03-06 00:00:00
Dates for 2006: 2005-11-08 00:00:00 2006-03-05 00:00:00
Dates for 2007: 2006-11-07 00:00:00 2007-03-04 00:00:00
Dates for 2008: 2007-11-05 00:00:00 2008-03-09 00:00:00
Dates for 2009: 2008-11-10 00:00:00 2009-03-08 00:00:00
Dates for 2010: 2009-11-09 00:00:00 2010-03-07 00:00:00
Dates for 2011: 2010-11-08 00:00:00 2011-03-06 00:00:00
Dates for 2012: 2011-11-07 00:00:00 2012-03-04 00:00:00
Dates for 2013: 2012-11-09 00:00:00 2013-03-10 00:00:00
Dates for 2014: 2013-11-08 00:00:00 2014-03-09 00:00:00
Dates for 2015: 2014-11-14 00:00:00 2015-03-08 00:00:00
Dates for 2016: 2015-11-13 00:00:00 2016-03-06 00:00:00
Dates for 2017: 2016-11-11 00:00:00 2017-03-05 00:00:00
Dates for 2018: 2017-11-10 00:00:00 2018-03-04 00:00:00


In [138]:
# d_ij is defined as the difference in score between a game between i and j (scorei-scorej)
# d_kj is defined as the difference in score between a game between k and j (scorek-scorej)
# I tried to write the if statements below out for clarity and they can be simplified
def support_map_vectorized1(linked,num_indirect_equal_direct=3):
    # columns
    # 'team_j', 'team_i_name', 'team_i_score', 'team_i_H_A_N',
    # 'team_j_i_score', 'team_j_i_H_A_N', 'game_i_j', 'team_k_name',
    # 'team_k_score', 'team_k_H_A_N', 'team_j_k_score', 'team_j_k_H_A_N',
    # 'game_k_j'
    linked["direct"] = linked["team_i_name"] == linked["team_k_name"]
    # | (linked["team_i_name"] == linked["team_j_k_name"]) | (linked["team_k_name"] == linked["team_j_k_name"])
    for_index1 = linked[["team_i_name","team_k_name"]].copy()
    for_index1.loc[linked["direct"]] = linked.loc[linked["direct"],["team_i_name","team_j_name"]]
    for_index1.columns = ["team1","team2"]
    for_index2 = linked[["team_k_name","team_i_name"]].copy()
    for_index2.loc[linked["direct"]] = linked.loc[linked["direct"],["team_j_name","team_i_name"]]
    for_index2.columns = ["team1","team2"]
    index_ik = pd.MultiIndex.from_frame(for_index1,sortorder=0)
    index_ki = pd.MultiIndex.from_frame(for_index2,sortorder=0)
    
    #######################################
    # part to modify
    # direct
    d_ik = linked['team_i_score'] - linked['team_j_i_score']
    direct_thres = 1
    support_ik = num_indirect_equal_direct*(linked["direct"] & (d_ik > direct_thres)).astype(int)
    support_ki = num_indirect_equal_direct*(linked["direct"] & (d_ik < -direct_thres)).astype(int)

    # indirect
    d_ij = linked["team_i_score"] - linked["team_j_i_score"]
    d_kj = linked["team_k_score"] - linked["team_j_k_score"]
    
    # always a positive and it captures that if i beat j by 5 points and k beat j by 2 points then this spread is 3
    spread = np.abs(d_ij - d_kj) 
    
    support_ik += ((~linked["direct"]) & (d_ij > 0) & (d_kj > 0) & (d_ij > d_kj) & (spread > 10)).astype(int)
    support_ik += ((~linked["direct"]) & (d_ij < 0) & (d_kj < 0) & (d_ij > d_kj) & (spread > 15)).astype(int)
    support_ik += ((~linked["direct"]) & (d_ij > 0) & (d_kj < 0) & (spread > 2)).astype(int)
    
    support_ki += ((~linked["direct"]) & (d_kj > 0) & (d_ij > 0) & (d_kj > d_ij) & (spread > 10)).astype(int)
    support_ki += ((~linked["direct"]) & (d_kj < 0) & (d_ij < 0) & (d_kj > d_ij) & (spread > 15)).astype(int)
    support_ki += ((~linked["direct"]) & (d_kj > 0) & (d_ij < 0) & (spread > 2)).astype(int)
    
    # end part to modify
    #######################################    
    linked["support_ik"]=support_ik
    linked["index_ik"]=index_ik
    linked["support_ki"]=support_ki
    linked["index_ki"]=index_ki
    
    print('Direct')
    print(sum(linked["direct"] & (linked["support_ik"]>0)), sum(linked["direct"] & (linked["support_ki"]>0)))
    print('Indirect')
    print(sum((~linked["direct"]) & (linked["support_ik"]>0)), sum(~linked["direct"] & (linked["support_ki"]>0)))
    
    prepare_ret = linked.drop_duplicates(subset='games', keep='first')[["support_ik","support_ki"]]
    ret1 = prepare_ret.set_index(index_ik)["support_ik"]
    ret2 = prepare_ret.set_index(index_ki)["support_ki"]
    ret = ret1.append(ret2)
    ret = ret.groupby(level=[0,1]).sum()

    return ret



In [139]:
Vs = {"map1": {}}
for year in games.keys():
    madness_teams = np.unique(list(games[year].team1_name.loc[games[year].team1_madness == 1]) + list(games[year].team2_name.loc[games[year].team2_madness == 1]))
    game_list = list(games[year].index)
    
    game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                            "team1_score":games[year]['points1'],
                            "team1_H_A_N": games[year]['H_A_N1'],
                            "team2_name":games[year]['team2_name'],
                            "team2_score":games[year]['points2'],
                            "team2_H_A_N": games[year]['H_A_N1']})
    
    Vs["map1"][year] = pyrankability.construct.V_count_vectorized(game_df,support_map_vectorized1).loc[madness_teams,madness_teams]

Direct
5102 553
Indirect
9614 62560
Direct
5071 496
Indirect
8808 60371
Direct
4930 537
Indirect
8892 58788
Direct
5080 522
Indirect
9019 61549
Direct
5169 511
Indirect
8950 63005
Direct
5442 531
Indirect
10616 71208
Direct
5587 548
Indirect
10876 73352
Direct
5652 576
Indirect
10486 75063
Direct
5721 517
Indirect
10905 75135
Direct
5657 534
Indirect
10736 74452
Direct
5679 491
Indirect
11248 73932
Direct
5750 540
Indirect
11132 76234
Direct
5745 565
Indirect
10654 75476
Direct
5802 557
Indirect
10544 75608
Direct
5782 556
Indirect
10432 76682
Direct
5801 553
Indirect
10758 77446
Direct
5832 558
Indirect
11657 78169


### Grab the acc teams from 2014 and all games they played

In [140]:
acc_teams = ["Boston_College", 
             "Clemson", 
             "Duke", 
             "Georgia_Tech", 
             "Florida_St", 
             "NC_State", 
             "Syracuse", 
             "Louisville", 
             "Miami_FL", 
             "North_Carolina", 
             "Notre_Dame", 
             "Pittsburgh", 
             "Virginia", 
             "Virginia_Tech", 
             "Wake_Forest"]
acc_teams = [team.replace("the ","").replace(" ","_").replace("_University","").replace("_Institute","").replace("_of_Technology","_Tech") for team in acc_teams]
pd.Series(acc_teams)

0     Boston_College
1            Clemson
2               Duke
3       Georgia_Tech
4         Florida_St
5           NC_State
6           Syracuse
7         Louisville
8           Miami_FL
9     North_Carolina
10        Notre_Dame
11        Pittsburgh
12          Virginia
13     Virginia_Tech
14       Wake_Forest
dtype: object

In [141]:
year="2014"
game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                        "team1_score":games[year]['points1'],
                        "team1_H_A_N": games[year]['H_A_N1'],
                        "team2_name":games[year]['team2_name'],
                        "team2_score":games[year]['points2'],
                        "team2_H_A_N": games[year]['H_A_N1']})

In [142]:
teams = list(game_df.loc[game_df.team1_name.isin(acc_teams) | game_df.team1_name.isin(acc_teams)].team1_name.value_counts().index)
teams += list(game_df.loc[game_df.team1_name.isin(acc_teams) | game_df.team1_name.isin(acc_teams)].team2_name.value_counts().index)
something_missing=False
for acc_team in acc_teams:
    if acc_team not in teams:
        print("missing",acc_team)
        something_missing = True
something_missing

False

In [143]:
acc_game_df = game_df.loc[game_df.team1_name.isin(acc_teams) | game_df.team2_name.isin(acc_teams)]
acc_game_df

Unnamed: 0,team1_name,team1_score,team1_H_A_N,team2_name,team2_score,team2_H_A_N
53,Duke,74,0,Alabama,64,0
107,Pittsburgh,58,1,Albany_NY,46,1
157,NC_State,98,1,Appalachian_St,77,1
174,Miami_FL,60,0,Arizona_St,57,0
257,Notre_Dame,93,1,Army,60,1
...,...,...,...,...,...,...
5291,Boston_College,89,0,Washington,78,0
5352,Virginia_Tech,87,1,West_Virginia,82,1
5399,Virginia_Tech,81,1,Winthrop,63,1
5414,Louisville,79,1,WKU,63,1


In [144]:
acc_game_df.to_csv("acc_game_info.csv",index=False)

In [223]:
import random
seed = random.randint(0,100000)
np.random.seed(69229)
#np.random.seed(seed)
print(seed)
sample_acc_game_df = acc_game_df.sample(frac=0.05)
sample_acc_game_df.to_csv("sample_acc_game_info.csv",index=False)
sample_acc_V = pyrankability.construct.V_count_vectorized(sample_acc_game_df,support_map_vectorized1)
sample_acc_V = sample_acc_V.loc[sample_acc_V.columns.isin(acc_teams),sample_acc_V.columns.isin(acc_teams)]
stacked_sample_acc_V = sample_acc_V.stack()
stacked_sample_acc_V.loc[stacked_sample_acc_V!=0]

8451
Direct
16 1
Indirect
3 10


team1           team2         
Georgia_Tech    Boston_College    3.0
                Notre_Dame        3.0
                Virginia_Tech     1.0
Miami_FL        Florida_St        3.0
NC_State        Notre_Dame        3.0
                Virginia_Tech     1.0
North_Carolina  Wake_Forest       6.0
Notre_Dame      Virginia_Tech     3.0
Pittsburgh      Clemson           3.0
                Virginia_Tech     3.0
Syracuse        Notre_Dame        1.0
Virginia_Tech   Clemson           1.0
Wake_Forest     North_Carolina    3.0
dtype: float64

In [224]:
stacked_sample_acc_V.loc[stacked_sample_acc_V!=0].to_csv("stacked_sample_acc_V.csv")

In [214]:
game_df.head().to_csv("sample_game_info.csv",index=False)

In [215]:
thresholds = [0,2,5,10]
results = {"lop":{},"massey":{},"colley":{}}
for t in thresholds:
    results["hillside(t=%d)"%t] = {}
    #results["hillside(t=%d,indices)"%t] = {}
for key in results.keys():
    for V_method in Vs.keys():
        results[key][V_method] = {}

In [216]:
for V_method in Vs.keys():
    for year in games.keys():
        print(V_method,year)
        V = Vs[V_method][year]
        # we have already captured indirect information from the other teams into our V
        for t in thresholds:
            C = pyrankability.construct.C_count(V,t)
            results["hillside(t=%d)"%(t)][V_method][year] = pyrankability.rank.solve(V,c_orig=C,method='hillside',lazy=False, cont=True)
        results["lop"][V_method][year] = pyrankability.rank.solve(V.fillna(0),method='lop',lazy=False, cont=True)
        try:
            results["massey"][V_method][year] = MasseyRankingAlgorithm().rank(V.fillna(0).values),MasseyRatingAlgorithm().rank(V.values)
        except Exception as e:
            results["massey"][V_method][year] = str(e)  
        try:
            results["colley"][V_method][year] = ColleyRankingAlgorithm().rank(V.fillna(0).values),ColleyRatingAlgorithm().rank(V.values)
        except Exception as e:
            results["colley"][V_method][year] = str(e)                
            

map1 2002
map1 2003
map1 2004
map1 2005
map1 2006
map1 2007
map1 2008
map1 2009
map1 2010
map1 2011
map1 2012
map1 2013
map1 2014
map1 2015
map1 2016
map1 2017
map1 2018


In [217]:
V = Vs[V_method]["2018"]
k,details=pyrankability.rank.solve(V.fillna(0),method='lop',lazy=False, cont=True,verbose=True)

Parameter OutputFlag unchanged
   Value: 1  Min: 0  Max: 1  Default: 1
Updating opjective in 0.0853 seconds
Start optimization 0
Changed value of parameter Threads to 7
   Prev: 0  Min: 0  Max: 1024  Default: 0
Changed value of parameter Method to 2
   Prev: -1  Min: -1  Max: 5  Default: -1
Changed value of parameter Crossover to 0
   Prev: -1  Min: -1  Max: 5  Default: -1
Optimize a model with 83328 rows, 2016 columns and 249984 nonzeros
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [1e+00, 2e+01]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+00]
Presolve time: 0.08s
Presolved: 2016 rows, 85344 columns, 252000 nonzeros
Ordering time: 0.00s

Barrier statistics:
 AA' NZ     : 1.250e+05
 Factor NZ  : 1.402e+06 (roughly 50 MBytes of memory)
 Factor Ops : 1.238e+09 (less than 1 second per iteration)
 Threads    : 7

                  Objective                Residual
Iter       Primal          Dual         Primal    Dual     Compl     Time
 

In [220]:
perm=np.array(details['P'][0])

In [221]:
V.iloc[perm,:].iloc[:,perm].sum()

team2
Villanova         60.0
Virginia          35.0
Gonzaga           27.0
Duke              57.0
North_Carolina    87.0
                  ... 
UMBC              51.0
Montana           36.0
CS_Fullerton      54.0
Iona              68.0
Lipscomb          92.0
Length: 64, dtype: float64

### Save the results for later

In [222]:
import joblib
joblib.dump({"results":results,"Vs":Vs,"games":games},"MarchMadnessAnalysis.joblib.z")

['MarchMadnessAnalysis.joblib.z']