# RPLib
## NCAA Men's Basketball Dataset, ACC

1. We need a team list for every year. We need to adapt the code below so it handles a dynamic team list for each year.
2. There is an overall date range:
   (selection_sunday - 35 TO selection_sunday - 7) of games to consider. but within that range, let's move one game at a time. 

In [1]:
major_description = "Study of ACC"
print(major_description)

Study of ACC


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import itertools
import joblib
from scipy import stats

In [4]:
from pathlib import Path
home = str(Path.home())
home

'/home/jupyter-arman'

In [5]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [6]:
sys.path.insert(0,"%s/marchmadness_study/"%home)
import base

In [7]:
sys.path.insert(0,"%s/RPLib"%home)
import pyrplib

In [8]:
games={}
remaining_games={}
madness_teams={}
all_teams={}
years = ["2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018"]
for year in years:
    games[year],remaining_games[year] = base.read_data(f'{home}/marchmadness_study/data/%steams.txt'%year,f'{home}/marchmadness_study/data/%sgames.txt'%year,f'{home}/marchmadness_study/data/%sMadnessTeams.txt'%year)
    madness_teams[year] = list(np.unique(list(games[year].team1_name.loc[games[year].team1_madness == 1]) + list(games[year].team2_name.loc[games[year].team2_madness == 1])))
    all_teams[year] = list(np.unique(list(games[year].team1_name) + list(games[year].team2_name)))
print(year)
games[year]

2018


Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
4147,256,170,737008,2017-11-10,1,92,-1,77,Minnesota,SC_Upstate,0,0
4288,265,293,737008,2017-11-10,1,75,-1,50,Texas_Tech,South_Alabama,1,0
925,56,326,737008,2017-11-10,1,75,-1,60,Villanova,Columbia,1,0
521,32,176,737008,2017-11-10,1,79,-1,78,Monmouth_NJ,Bucknell,0,1
4252,263,235,737008,2017-11-10,1,105,-1,74,Purdue,SIUE,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1985,121,152,737122,2018-03-04,0,65,0,49,Loyola-Chicago,Illinois_St,1,0
933,57,114,737122,2018-03-04,1,81,-1,71,Houston,Connecticut,1,0
1370,81,164,737122,2018-03-04,1,90,-1,70,Memphis,East_Carolina,0,0
1483,88,143,737122,2018-03-04,-1,108,1,96,Lipscomb,FL_Gulf_Coast,1,0


## What does a dataset look like?

In [9]:
remaining_games[year].head()

Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
2,1,73,737130,2018-03-12,1,80,-1,73,Drake,Abilene_Chr,0,0
32,2,313,737125,2018-03-07,1,97,-1,90,UNLV,Air_Force,0,0
42,3,79,737126,2018-03-08,0,67,0,58,E_Michigan,Akron,0,0
59,4,137,737128,2018-03-10,0,86,0,63,Kentucky,Alabama,1,1
68,4,326,737135,2018-03-17,0,81,0,58,Villanova,Alabama,1,1


## Select just the teams from the ACC

In [10]:
list(pd.Series(games['2018'].team1_name.unique()).sort_values())

['Abilene_Chr',
 'Air_Force',
 'Akron',
 'Alabama',
 'Alabama_A&M',
 'Alabama_St',
 'Albany_NY',
 'Alcorn_St',
 'American_Univ',
 'Appalachian_St',
 'Arizona',
 'Arizona_St',
 'Ark_Little_Rock',
 'Ark_Pine_Bluff',
 'Arkansas',
 'Arkansas_St',
 'Army',
 'Auburn',
 'Austin_Peay',
 'BYU',
 'Ball_St',
 'Baylor',
 'Belmont',
 'Bethune-Cookman',
 'Binghamton',
 'Boise_St',
 'Boston_College',
 'Boston_Univ',
 'Bowling_Green',
 'Bradley',
 'Brown',
 'Bryant',
 'Bucknell',
 'Buffalo',
 'Butler',
 'CS_Bakersfield',
 'CS_Fullerton',
 'CS_Northridge',
 'CS_Sacramento',
 'C_Michigan',
 'Cal_Poly',
 'California',
 'Campbell',
 'Canisius',
 'Cent_Arkansas',
 'Central_Conn',
 'Charleston_So',
 'Charlotte',
 'Chattanooga',
 'Chicago_St',
 'Cincinnati',
 'Citadel',
 'Clemson',
 'Cleveland_St',
 'Coastal_Car',
 'Col_Charleston',
 'Colgate',
 'Colorado',
 'Colorado_St',
 'Columbia',
 'Connecticut',
 'Coppin_St',
 'Cornell',
 'Creighton',
 'Dartmouth',
 'Davidson',
 'Dayton',
 'DePaul',
 'Delaware',
 'Dela

In [11]:
#acc_teams = ["Duke","Miami_FL","Wake_Forest","Boston_College","Notre_Dame","Pittsburgh","Virginia","Florida_St","Virginia_Tech","Georgia_Tech","Clemson","North_Carolina","Louisville","Syracuse","NC_State"]
#len(acc_teams)

In [12]:
acc_teams_by_year = {}
for year in years:
    acc_teams_by_year[year] = ["Duke","Miami_FL","Wake_Forest","Boston_College","Notre_Dame","Pittsburgh","Virginia","Florida_St","Virginia_Tech","Georgia_Tech","Clemson","North_Carolina","Louisville","Syracuse","NC_State"]

## Restrict to games where one of these teams played at least

In [13]:
games, remaining_games = pyrplib.utils.filter_teams(games, remaining_games, acc_teams_by_year)
len(games['2018'])

## Run Massey and Colley
Parameters are selected below

In [16]:
direct_thress = [0] # might be of interest to see how sensitive to preprocessing, but not now
spread_thress = [0]
weight_indirects = [0,0.1,0.5,1]

In [17]:
games['2002']

Unnamed: 0,team2,team1,notsure1,date,H_A_N1,points1,H_A_N2,points2,team1_name,team2_name,team1_madness,team2_madness
2204,147,261,731166,2001-11-12,1,78,-1,58,Syracuse,Manhattan,0,0
1023,69,261,731168,2001-11-14,1,74,-1,60,Syracuse,DePaul,0,0
3145,214,180,731168,2001-11-14,1,95,-1,51,NC_State,Prairie_View,1,0
3442,234,180,731169,2001-11-15,1,78,-1,56,NC_State,San_Jose_St,1,0
4532,309,303,731170,2001-11-16,1,105,-1,74,Virginia,W_Michigan,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
617,43,141,731276,2002-03-02,1,90,-1,88,Louisville,Charlotte,0,1
4606,314,211,731276,2002-03-02,1,92,-1,65,Pittsburgh,Weber_St,1,0
4435,303,151,731277,2002-03-03,1,112,-1,92,Maryland,Virginia,1,0
2816,190,73,731277,2002-03-03,1,93,-1,68,Duke,North_Carolina,1,0


### Break up the new information into weeks

In [18]:
from datetime import timedelta

#days_to_subtracts = [int(d) for d in np.arange(7+28,0,-7)]#[7+28]
days_to_subtracts = [int(d) for d in np.arange(7+28,30,-1)]

delta = timedelta(days=days_to_subtracts[0]) # sample
delta

datetime.timedelta(days=35)

In [19]:
days_to_subtracts

[35, 34, 33, 32, 31]

In [20]:
massey_rankings = {}
colley_rankings = {}
massey_rs = {}
colley_rs = {}
colley_perms = {}
massey_perms = {}

for year, acc_teams in tqdm(acc_teams_by_year.items()):
    team_domain = acc_teams
    team_range = acc_teams # for simplicity we are assuming acc teams is consitent. it is not of course from year to year

    columns = ["days_left","direct_thres","spread_thres","weight_indirect"]+team_range
    massey_rankings[year] = pd.DataFrame(columns=columns)
    colley_rankings[year] = pd.DataFrame(columns=columns)
    massey_rs[year] = pd.DataFrame(columns=columns)
    colley_rs[year] = pd.DataFrame(columns=columns)
    massey_perms[year] = pd.DataFrame(columns=columns)
    colley_perms[year] = pd.DataFrame(columns=columns)

    game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                            "team1_score":games[year]['points1'],
                            "team1_H_A_N": games[year]['H_A_N1'],
                            "team2_name":games[year]['team2_name'],
                            "team2_score":games[year]['points2'],
                            "team2_H_A_N": games[year]['H_A_N1'],
                            "date": games[year]['date']
                           }).sort_values(by='date')#.drop('date',axis=1)
    mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
    game_df = game_df.loc[mask]

    keys = list(itertools.product(days_to_subtracts,direct_thress,spread_thress,weight_indirects))

    def compute(days_left,direct_thres,spread_thres,weight_indirect,team_range,all_teams,game_df,selection_sunday):
        #upper = int(len(game_df)*frac)
        #game_df_sample = game_df.iloc[:upper,:]
        delta = timedelta(days=days_left)
        game_df_sample = game_df.loc[game_df["date"] <= pd.to_datetime(selection_sunday,format="%m/%d/%Y")-delta].drop('date',axis=1)

        map_func = lambda linked: pyrankability.construct.colley_matrices(linked,direct_thres=direct_thres,spread_thres=spread_thres)
        colley_matrix,colley_b,indirect_colley_matrix,indirect_colley_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
        colley_matrix = colley_matrix.reindex(index=all_teams,columns=all_teams)
        colley_b = colley_b.reindex(all_teams)
        indirect_colley_matrix = indirect_colley_matrix.reindex(index=all_teams,columns=all_teams)
        indirect_colley_b = indirect_colley_b.reindex(all_teams)
        inxs = []
        for team in team_range:
            inxs.append(int(np.where(colley_b.index == team)[0][0]))
        ranking1,r1,perm1 = pyrankability.rank.ranking_from_matrices(colley_matrix.fillna(0),colley_b.fillna(0),np.array(inxs))
        indirect_ranking1,indirect_r1,indirect_perm1 = pyrankability.rank.ranking_from_matrices(indirect_colley_matrix.fillna(0),indirect_colley_b.fillna(0),np.array(inxs))
        if weight_indirect > 0:
            r1 = r1+weight_indirect*indirect_r1
            perm1, ranking1 = pyrankability.rank.perm_ranking_from_r(r1)
        
        map_func = lambda linked: pyrankability.construct.massey_matrices(linked,direct_thres=direct_thres,spread_thres=spread_thres)
        massey_matrix,massey_b,indirect_massey_matrix,indirect_massey_b = pyrankability.construct.map_vectorized(game_df_sample,map_func)
        massey_matrix = massey_matrix.reindex(index=all_teams,columns=all_teams)
        massey_b = massey_b.reindex(all_teams)
        indirect_massey_matrix = indirect_massey_matrix.reindex(index=all_teams,columns=all_teams)
        indirect_massey_b = indirect_massey_b.reindex(all_teams)
        inxs = []
        for team in team_range:
            inxs.append(int(np.where(massey_b.index == team)[0][0]))
        ranking2,r2,perm2 = pyrankability.rank.ranking_from_matrices(massey_matrix.fillna(0),massey_b.fillna(0),np.array(inxs))
        indirect_ranking2,indirect_r2,indirect_perm2 = pyrankability.rank.ranking_from_matrices(massey_matrix.fillna(0),massey_b.fillna(0),np.array(inxs))
        if weight_indirect > 0:
            r2 = r2+weight_indirect*indirect_r2
            perm2, ranking2 = pyrankability.rank.perm_ranking_from_r(r2)
            
        ranking_values1 = [days_left,direct_thres,spread_thres,weight_indirect]+list(ranking1)
        ranking_values2 = [days_left,direct_thres,spread_thres,weight_indirect]+list(ranking2)
        r_values1 = [days_left,direct_thres,spread_thres,weight_indirect]+list(r1)
        r_values2 = [days_left,direct_thres,spread_thres,weight_indirect]+list(r2)
        perm_values1 = [days_left,direct_thres,spread_thres,weight_indirect]+list(perm1)
        perm_values2 = [days_left,direct_thres,spread_thres,weight_indirect]+list(perm2)
        
        return (pd.Series(ranking_values1,index=columns),
                pd.Series(ranking_values2,index=columns),
                pd.Series(r_values1,index=columns),
                pd.Series(r_values2,index=columns),
                pd.Series(perm_values1,index=columns),
                pd.Series(perm_values2,index=columns)              
               )

    #frac,direct_thres,spread_thres,weight_indirect = keys[0]
    #for frac,direct_thres,spread_thres,weight_indirect in keys:
    #    compute(frac,direct_thres,spread_thres,weight_indirect,team_range,all_teams[year],game_df)
    results = Parallel(n_jobs=-1)(delayed(compute)(days_left,direct_thres,spread_thres,weight_indirect,team_range,all_teams[year],game_df,base.selectionSundays[year]) for days_left,direct_thres,spread_thres,weight_indirect in keys)

    c = 0
    for i,key in enumerate(keys):
        days_left,direct_thres,spread_thres,weight_indirect = key
        colley,massey,colley_r,massey_r,colley_perm,massey_perm = results[i]
        massey.name = c
        colley.name = c
        colley_r.name=c
        massey_r.name=c
        colley_perm.name=c
        massey_perm.name=c
        massey_rankings[year] = massey_rankings[year].append(massey)
        colley_rankings[year] = colley_rankings[year].append(colley)
        massey_rs[year] = massey_rs[year].append(massey_r)
        colley_rs[year] = colley_rs[year].append(colley_r)
        massey_perms[year] = massey_perms[year].append(massey_perm)
        colley_perms[year] = colley_perms[year].append(colley_perm)
        c+=1

100%|██████████| 17/17 [00:18<00:00,  1.13it/s]


In [21]:
colley_perms[year].drop(['days_left','direct_thres','spread_thres','weight_indirect'],axis=1).shape

(20, 15)

## What do we have after running

In [22]:
display(colley_rankings[year])

Unnamed: 0,days_left,direct_thres,spread_thres,weight_indirect,Duke,Miami_FL,Wake_Forest,Boston_College,Notre_Dame,Pittsburgh,Virginia,Florida_St,Virginia_Tech,Georgia_Tech,Clemson,North_Carolina,Louisville,Syracuse,NC_State
0,35.0,0.0,0.0,0.0,3.0,6.0,14.0,10.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,7.0,5.0,12.0,4.0
1,35.0,0.0,0.0,0.1,3.0,6.0,14.0,10.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,7.0,5.0,12.0,4.0
2,35.0,0.0,0.0,0.5,4.0,6.0,14.0,10.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,7.0,5.0,12.0,3.0
3,35.0,0.0,0.0,1.0,4.0,6.0,14.0,10.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,7.0,5.0,12.0,3.0
4,34.0,0.0,0.0,0.0,3.0,5.0,14.0,10.0,13.0,15.0,1.0,7.0,9.0,12.0,2.0,6.0,8.0,11.0,4.0
5,34.0,0.0,0.0,0.1,3.0,5.0,14.0,10.0,13.0,15.0,1.0,7.0,9.0,12.0,2.0,6.0,8.0,11.0,4.0
6,34.0,0.0,0.0,0.5,4.0,5.0,14.0,10.0,13.0,15.0,1.0,7.0,9.0,12.0,2.0,6.0,8.0,11.0,3.0
7,34.0,0.0,0.0,1.0,4.0,6.0,14.0,10.0,13.0,15.0,1.0,7.0,9.0,12.0,2.0,5.0,8.0,11.0,3.0
8,33.0,0.0,0.0,0.0,4.0,5.0,14.0,12.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,6.0,7.0,10.0,3.0
9,33.0,0.0,0.0,0.1,4.0,5.0,14.0,12.0,13.0,15.0,1.0,8.0,9.0,11.0,2.0,6.0,7.0,10.0,3.0


## Select your parameters

In [23]:
columns = {'direct_thres': [0], 'spread_thres': [0], 'weight_indirect': [0,0.1,0.5,1], 'Method': ['Massey','Colley']}
sel_df = pyrplib.utils.get_sel_df(columns)
sel_df

[[0], [0], [0, 0.1, 0.5, 1], ['Massey', 'Colley']]
(0, 0, 0, 'Massey')
(0, 0, 0, 'Colley')
(0, 0, 0.1, 'Massey')
(0, 0, 0.1, 'Colley')
(0, 0, 0.5, 'Massey')
(0, 0, 0.5, 'Colley')
(0, 0, 1, 'Massey')
(0, 0, 1, 'Colley')


Unnamed: 0,direct_thres,spread_thres,weight_indirect,Method
0,0,0,0.0,Massey
1,0,0,0.0,Colley
2,0,0,0.1,Massey
3,0,0,0.1,Colley
4,0,0,0.5,Massey
5,0,0,0.5,Colley
6,0,0,1.0,Massey
7,0,0,1.0,Colley


In [24]:
## TODO: add in more options if we want
#direct_thress = [0] # might be of interest to see how sensitive to preprocessing, but not now
#spread_thress = [0]
#weight_indirects = [0,0.1,0.5,1]
#methods = ['Massey','Colley']
#sel_df = pd.DataFrame(columns=['direct_thres','spread_thres','weight_indirect','Method'])
#c = 0
#for dt,st,wi,method in itertools.product(direct_thress,spread_thress,weight_indirects,methods):
#    print([dt,st,wi,method])
#    sel_df = sel_df.append(pd.Series([dt,st,wi,method],index=sel_df.columns,name=c))
#    c+=1
#
#sel_df

In [25]:
top_k = 5
feature_names = [f'top{top_k}_jaccard',f'top{top_k}_union_tau','tau']
ms = pd.DataFrame(columns=['days_left1','days_left2',"direct_thres","spread_thres","weight_indirect",'Method','Year','rankings1','rankings2']+feature_names)

pair_days_to_subtracts = [sorted(days_to_subtracts) for days_to_subtracts in list(itertools.combinations(days_to_subtracts,2))]

c=0
for index,row in sel_df.iterrows():
    dt,st,iw = row.loc['direct_thres'],row.loc['spread_thres'],row.loc['weight_indirect']
    method = row.loc['Method']
    for year in years:
        for days_left1,days_left2 in pair_days_to_subtracts:
            if method == 'Massey':
                rankings = massey_rankings[year].set_index(["direct_thres","spread_thres","weight_indirect"]).loc[(dt,st,iw)]
            elif method == 'Colley':
                rankings = colley_rankings[year].set_index(["direct_thres","spread_thres","weight_indirect"]).loc[(dt,st,iw)]
            else:
                raise Exception('Unsupported')
            rankings1 = rankings.set_index('days_left').loc[days_left1].T
            rankings1_top_k = rankings1.loc[rankings1 < top_k]
            rankings2 = rankings.set_index('days_left').loc[days_left2].T
            rankings2_top_k = rankings2.loc[rankings2 < top_k]
            val = len(set(rankings1_top_k.index).intersection(set(rankings2_top_k.index)))/len(set(rankings1_top_k.index).union(set(rankings2_top_k.index)))#top_k
            union = set(rankings1_top_k.index).union(rankings2_top_k.index)
            rankings1_union = rankings1.loc[union]
            rankings2_union = rankings2.loc[union]
            tau_union, p_value_union = stats.kendalltau(np.argsort(rankings1_union.values), np.argsort(rankings2_union.values))
            tau,p_value = stats.kendalltau(np.argsort(rankings1.values), np.argsort(rankings2.loc[rankings1.index].values))
            entry = pd.Series([days_left1,days_left2,dt,st,iw,method,year,rankings1,rankings2,val,tau_union,tau],name=c,index=ms.columns)
            c+=1
            ms=ms.append(entry)

  
  app.launch_new_instance()


In [26]:
ms['days_diff'] = ms.days_left2 - ms.days_left1
ms.head()

Unnamed: 0,days_left1,days_left2,direct_thres,spread_thres,weight_indirect,Method,Year,rankings1,rankings2,top5_jaccard,top5_union_tau,tau,days_diff
0,34,35,0,0,0,Massey,2002,Duke 1 Miami_FL 11 Wake...,Duke 1 Miami_FL 11 Wake...,1.0,1.0,1.0,1
1,33,35,0,0,0,Massey,2002,Duke 1 Miami_FL 11 Wake...,Duke 1 Miami_FL 11 Wake...,1.0,1.0,1.0,2
2,32,35,0,0,0,Massey,2002,Duke 1 Miami_FL 11 Wake...,Duke 1 Miami_FL 11 Wake...,0.6,0.4,0.352381,3
3,31,35,0,0,0,Massey,2002,Duke 1 Miami_FL 10 Wake...,Duke 1 Miami_FL 11 Wake...,0.6,0.4,0.295238,4
4,33,34,0,0,0,Massey,2002,Duke 1 Miami_FL 11 Wake...,Duke 1 Miami_FL 11 Wake...,1.0,1.0,1.0,1


In [27]:
ms

Unnamed: 0,days_left1,days_left2,direct_thres,spread_thres,weight_indirect,Method,Year,rankings1,rankings2,top5_jaccard,top5_union_tau,tau,days_diff
0,34,35,0,0,0,Massey,2002,Duke 1 Miami_FL 11 Wake...,Duke 1 Miami_FL 11 Wake...,1.000000,1.000000,1.000000,1
1,33,35,0,0,0,Massey,2002,Duke 1 Miami_FL 11 Wake...,Duke 1 Miami_FL 11 Wake...,1.000000,1.000000,1.000000,2
2,32,35,0,0,0,Massey,2002,Duke 1 Miami_FL 11 Wake...,Duke 1 Miami_FL 11 Wake...,0.600000,0.400000,0.352381,3
3,31,35,0,0,0,Massey,2002,Duke 1 Miami_FL 10 Wake...,Duke 1 Miami_FL 11 Wake...,0.600000,0.400000,0.295238,4
4,33,34,0,0,0,Massey,2002,Duke 1 Miami_FL 11 Wake...,Duke 1 Miami_FL 11 Wake...,1.000000,1.000000,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355,32,34,0,0,1,Colley,2018,Duke 3 Miami_FL 4 Wake...,Duke 4 Miami_FL 6 Wake...,0.600000,-0.200000,0.123810,2
1356,31,34,0,0,1,Colley,2018,Duke 7 Miami_FL 4 Wake...,Duke 4 Miami_FL 6 Wake...,0.333333,-0.333333,0.123810,3
1357,32,33,0,0,1,Colley,2018,Duke 3 Miami_FL 4 Wake...,Duke 4 Miami_FL 6 Wake...,0.600000,-0.200000,0.523810,1
1358,31,33,0,0,1,Colley,2018,Duke 7 Miami_FL 4 Wake...,Duke 4 Miami_FL 6 Wake...,0.333333,-0.333333,0.333333,2


In [28]:
import altair as alt

alt.Chart(ms).mark_bar().encode(
    alt.X(f'tau', bin=True),
    y='count()',
    row='days_diff'
)

In [29]:
import altair as alt

alt.Chart(ms).mark_bar().encode(
    alt.X(f'top{top_k}_union_tau', bin=True),
    y='count()',
    row='days_diff'
)

In [30]:
import altair as alt

alt.Chart(ms).mark_bar().encode(
    alt.X(f'top{top_k}_jaccard', bin=True),
    y='count()',
    row='days_diff'
)

In [31]:
sensitivity_target = ms.copy()

sensitivity_data = {}
for year in tqdm(years):
    sensitivity_data[year] = {}
    for days_left in days_to_subtracts:
        game_df = pd.DataFrame({"team1_name":games[year]['team1_name'],
                                "team1_score":games[year]['points1'],
                                "team1_H_A_N": games[year]['H_A_N1'],
                                "team2_name":games[year]['team2_name'],
                                "team2_score":games[year]['points2'],
                                "team2_H_A_N": games[year]['H_A_N1'],
                                "date": games[year]['date']
                               }).sort_values(by='date')
        mask = game_df.team1_name.isin(team_domain) & game_df.team2_name.isin(team_domain)
        game_df = game_df.loc[mask]
        delta = timedelta(days=days_left)
        game_df_sample = game_df.loc[game_df["date"] <= pd.to_datetime(base.selectionSundays[year],format="%m/%d/%Y")-delta].drop('date',axis=1)
        sensitivity_data[year][f"days_left={days_left}"]=game_df_sample
        
joblib.dump({'description':major_description,'target':sensitivity_target,'data':sensitivity_data,'other':{'madness_teams':madness_teams,'remaining_games':remaining_games,'best_df':best_df,'top_k':top_k,'feature_names':feature_names}},"generate.joblib.z")

100%|██████████| 17/17 [00:00<00:00, 82.52it/s]


NameError: name 'best_df' is not defined