In [13]:
import numpy as np
import pandas as pd
import torch
import ast

from tqdm.notebook import tqdm
from path import Path

In [2]:
# Per-team stadium table used when assembling training data
stadium_county_df = pd.read_csv('Cleaned_stadium_data.csv')

# Grouped data for running Bayesian SC
grouped_df = pd.read_csv('Grouped_df.csv')

# County-level COVID case counts
county_covid = pd.read_csv('County_Covid_Data.csv')
county_covid['date'] = pd.to_datetime(county_covid['date'])

# Keep only rows that name a county other than 'Unknown' and that carry a case count
county_covid = county_covid.loc[
    (county_covid['county'] != 'Unknown') & county_covid['cases'].notnull()
]

# Lower-case county names; non-string entries are passed through unchanged
county_covid['county'] = county_covid['county'].apply(
    lambda name: name.lower() if isinstance(name, str) else name
)

In [3]:
# Two-letter postal code -> full name, for states, DC, territories and the 'NA' (National) placeholder
home_state_dict = {
    'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AS': 'American Samoa', 'AZ': 'Arizona',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut',
    'DC': 'District of Columbia', 'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia', 'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana',
    'KS': 'Kansas', 'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts', 'MD': 'Maryland', 'ME': 'Maine', 'MI': 'Michigan', 'MN': 'Minnesota',
    'MO': 'Missouri', 'MP': 'Northern Mariana Islands', 'MS': 'Mississippi', 'MT': 'Montana',
    'NA': 'National', 'NC': 'North Carolina', 'ND': 'North Dakota', 'NE': 'Nebraska',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NV': 'Nevada', 'NY': 'New York',
    'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon',
    'PA': 'Pennsylvania', 'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina', 'SD': 'South Dakota',
    'TN': 'Tennessee', 'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia', 'VI': 'Virgin Islands', 'VT': 'Vermont',
    'WA': 'Washington', 'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming',
}

In [5]:
# Builds the pre-/post-intervention donor (X) and stadium (Y) case series for the Bayesian model
def get_training_data(team_name_str, stadium_county_str, state_str, intervention_date, show_plot, week):
    """Assemble donor-county and stadium-county case series for one team.

    Reads the module-level ``county_covid``, ``stadium_county_df`` and
    ``home_state_dict`` objects; none of them is modified.

    Parameters
    ----------
    team_name_str : str
        Value matched against ``stadium_county_df['Team']``.
    stadium_county_str : list of str
        County name(s) of the stadium. Empty strings are dropped and the
        names are lower-cased before matching ``county_covid['county']``.
    state_str : list of str
        State names or two-letter codes. Empty strings are dropped and codes
        found in ``home_state_dict`` are expanded to full names.
    intervention_date : list of str
        Candidate date strings. The first entry containing a digit is parsed
        with ``pd.to_datetime``; text-only entries are skipped.
    show_plot, week
        Unused. Kept so existing callers keep working.

    Returns
    -------
    X_train, Y_train : DataFrame, Series
        Donor-county columns and summed stadium-county cases for dates
        strictly before the intervention date.
    test_X, test_Y : DataFrame, Series
        The same for dates on or after the intervention date.
    total_dates : Series
        Every date in the pivot (training and test together).

    Raises
    ------
    ValueError
        If no entry of ``intervention_date`` contains a digit.
    IndexError
        If the team is not present in ``stadium_county_df`` or no row of
        ``county_covid`` matches the stadium counties.
    """
    # Drop empty placeholders; lower-case counties so matching is case-insensitive
    stadium_counties = [name.lower() for name in stadium_county_str if name != ""]

    # Expand two-letter codes to full state names; unknown values pass through
    states = [home_state_dict.get(code, code) for code in state_str if code != ""]

    # Some entries are text only, so take the FIRST entry that contains a digit.
    dated_entries = [d for d in intervention_date if d != "" and any(ch.isdigit() for ch in d)]
    if not dated_entries:
        raise ValueError(
            f"no intervention date containing digits for team {team_name_str!r}: {intervention_date!r}"
        )
    intervention_ts = pd.to_datetime(dated_entries[0])

    # Donor counties are stored as the text of a Python list. Parse it with
    # literal_eval so that file content is never executed as code.
    donor_text = stadium_county_df.loc[stadium_county_df['Team'] == team_name_str, 'Donor_Counties'].iloc[0]
    donor_counties = [name.lower() for name in ast.literal_eval(donor_text)]

    county_names = county_covid['county']
    is_stadium = county_names.isin(stadium_counties)
    is_donor = county_names.isin(donor_counties)
    if team_name_str == 'Washington':
        # Special case: stadium counties are matched in any state, donors only in Maryland
        selected = is_stadium | (is_donor & (county_covid['state'] == 'Maryland'))
    else:
        selected = (is_stadium | is_donor) & county_covid['state'].isin(states)

    # .bfill() is the non-deprecated spelling of fillna(method='bfill')
    stadium_county_data = county_covid.loc[selected].bfill()
    stadium_county_data['date'] = pd.to_datetime(stadium_county_data['date'])

    # Start from the earliest date on which stadium-county data is available.
    stadium_dates = stadium_county_data.loc[stadium_county_data['county'].isin(stadium_counties), 'date']
    if stadium_dates.empty:
        raise IndexError(f"no county_covid rows match stadium counties {stadium_counties!r}")
    earliest_date = stadium_dates.min()
    stadium_county_data = stadium_county_data.loc[stadium_county_data['date'] >= earliest_date]

    # One row per date, one column per county (pivot_table averages duplicate entries)
    total_pivot = stadium_county_data.pivot_table(columns='county', values='cases', index='date').reset_index()

    # Sum the stadium counties into the single target column, then drop the originals
    total_pivot['Stadium_County'] = total_pivot[stadium_counties].sum(axis=1)
    total_pivot = total_pivot.drop(columns=stadium_counties).fillna(0)

    total_dates = total_pivot['date']
    is_pre_intervention = total_pivot['date'] < intervention_ts
    is_post_intervention = total_pivot['date'] >= intervention_ts

    training_pivot = total_pivot.loc[is_pre_intervention].drop(columns=['date'])
    test_pivot = total_pivot.loc[is_post_intervention].drop(columns=['date'])

    X_train = training_pivot.drop(columns=['Stadium_County'])
    Y_train = training_pivot['Stadium_County']

    test_X = test_pivot.drop(columns=['Stadium_County'])
    test_Y = test_pivot['Stadium_County']

    return X_train, Y_train, test_X, test_Y, total_dates

# Example call for a single team
x_train, y_train, x_test, y_test, dates = get_training_data(
    team_name_str='Cincinnati',
    stadium_county_str=["Hamilton"],
    state_str=["OH"],
    intervention_date=['10/04/2020'],
    show_plot=True,
    week=0,
)

In [10]:
# Create the 'dat' output directory; Path here is path.Path (imported at the top of the notebook).
# NOTE(review): mkdir_p is presumably a no-op when the directory already exists - confirm against the path library docs.
p = Path('dat')
p.mkdir_p()

Path('dat')

In [14]:
zipped_input = zip(grouped_df['Team'], grouped_df['County(s)'], grouped_df['State'], grouped_df['First date home stadium open to fans'])

# A zip object has no len(), so pass total= explicitly to give the progress bar a known length.
for team, county, state, date in tqdm(zipped_input, total=len(grouped_df)):
    # NOTE(review): the reason Arizona is excluded is not recorded in this notebook
    if team in ['Arizona']:
        continue

    outpath = Path('dat').joinpath(team)
    outpath.mkdir_p()

    # These columns hold the text of Python lists; parse them back into lists
    date = ast.literal_eval(date)
    county = ast.literal_eval(county)
    state = ast.literal_eval(state)

    # x are donors, y are stadiums, train is pre-intervention, test is post intervention
    x_train, y_train, x_test, y_test, dates = get_training_data(team, county, state, date, True, 0)
    x_train, y_train, x_test, y_test = torch.tensor(x_train.values), torch.tensor(y_train.values), torch.tensor(x_test.values), torch.tensor(y_test.values)

    # Combine donor and stadium series; the stadium series becomes the last column
    train_data = torch.cat((x_train, torch.unsqueeze(y_train, 1)), dim=1)  # this is a T x N matrix
    test_data = torch.cat((x_test, torch.unsqueeze(y_test, 1)), dim=1)  # this is a T x N matrix

    # Save data (test_data.pt must hold the post-intervention tensor, not a second copy of train_data)
    torch.save(train_data, outpath.joinpath('train_data.pt'))
    torch.save(test_data, outpath.joinpath('test_data.pt'))

0it [00:00, ?it/s]

test_data.pt   train_data.pt
