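# Vaccine Data Processing

This notebook downloads the Our World in Data US state vaccination table, picks one fully populated row per state near each of four reference dates in 2021 (mid-February, mid-May, mid-August and mid-November), and writes the resulting one-row-per-state table to `../outputs/state_vac.csv`.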

In [32]:
# downloading the dataset 
!curl -l https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/us_state_vaccinations.csv > ../dataset/us_vac_og.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2188k  100 2188k    0     0  1906k      0  0:00:01  0:00:01 --:--:-- 1906k


In [1]:
import pandas as pd
from datetime import datetime
import datetime as datetime_og


In [2]:
# Load the downloaded table; the `date` column is parsed to datetimes on read
# so it can be compared against datetime objects later on.
vacDF = pd.read_csv(
    '../dataset/us_vac_og.csv',
    parse_dates=['date'],
)

In [3]:
# Processing On

In [4]:
# Show the column names of the loaded table.
vacDF.columns

Index(['date', 'location', 'total_vaccinations', 'total_distributed',
       'people_vaccinated', 'people_fully_vaccinated_per_hundred',
       'total_vaccinations_per_hundred', 'people_fully_vaccinated',
       'people_vaccinated_per_hundred', 'distributed_per_hundred',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'daily_vaccinations_per_million', 'share_doses_used'],
      dtype='object')

# Processing a State 

In [5]:
# Pick the state used to prototype the per-state processing.
# The state is selected by name rather than by its position in `unique()`:
# a positional index silently points at a different location as soon as the
# upstream CSV adds, removes or reorders locations.
states = vacDF['location'].unique()

state_picked = 'California'
assert state_picked in states, f'{state_picked!r} not found in the location column'
print(f'State Picked : {state_picked}')

State Picked : California


In [6]:
# Keep only the picked state's rows, ordered chronologically.
# `sort_values` returns a new frame, so the result is assigned; otherwise
# `state_df` itself would stay in file order and only the display is sorted.
state_df = vacDF[vacDF['location'] == state_picked].sort_values(by='date')
state_df

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
1968,2021-01-12,California,816301.0,3286050.0,703540.0,0.25,2.07,100089.0,1.78,8.32,,,,0.248
1969,2021-01-13,California,891489.0,3435650.0,744545.0,0.34,2.26,133689.0,1.88,8.70,75188.0,75188.0,1903.0,0.259
1970,2021-01-14,California,975293.0,3540175.0,801998.0,,2.47,,2.03,8.96,83804.0,79496.0,2012.0,0.275
1971,2021-01-15,California,1072959.0,3548575.0,865387.0,0.52,2.72,204374.0,2.19,8.98,97666.0,85553.0,2165.0,0.302
1972,2021-01-16,California,,,,,,,,,,88381.0,2237.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2291,2021-12-01,California,59768100.0,70974175.0,31186136.0,63.27,151.26,25000226.0,78.93,179.63,170492.0,150443.0,3808.0,0.842
2292,2021-12-02,California,60030910.0,71231045.0,31255707.0,63.42,151.93,25059980.0,79.10,180.28,262810.0,164917.0,4174.0,0.843
2293,2021-12-03,California,60285583.0,71475395.0,31322387.0,63.56,152.57,25115622.0,79.27,180.89,254673.0,178229.0,4511.0,0.843
2294,2021-12-04,California,60589671.0,71539745.0,31395152.0,63.73,153.34,25182341.0,79.46,181.06,304088.0,198599.0,5026.0,0.847


In [7]:
# Report the earliest and latest dates present for the picked state.
state_dates = state_df['date']
min_date = state_dates.min()
max_date = state_dates.max()
print(min_date, max_date, sep='\n')

2021-01-12 00:00:00
2021-12-05 00:00:00


In [8]:
# Four reference dates (the 15th of Feb, May, Aug and Nov 2021) around which
# the vaccine numbers are sampled, on the assumption that these snapshots
# capture most of the information in the vaccination time series.
vacSample1 = datetime(2021, 2, 15)
vacSample2 = datetime(2021, 5, 15)
vacSample3 = datetime(2021, 8, 15)
vacSample4 = datetime(2021, 11, 15)

In [9]:
# Sample 1: pick one fully populated row strictly within 10 days of vacSample1.
sample_window = datetime_og.timedelta(days=10)
in_window = (
    (state_df['date'] > vacSample1 - sample_window)
    & (state_df['date'] < vacSample1 + sample_window)
)
state_df_sample = state_df[in_window]

# Rows with any missing key metric are dropped before sampling. The fixed
# random_state makes the pick repeatable across kernel restarts.
state_df_sample_key = state_df_sample[['people_vaccinated_per_hundred',
                 'people_fully_vaccinated_per_hundred',
                 'people_vaccinated',
                'people_fully_vaccinated','date']].dropna().sample(n=1, random_state=42)

# Tag the row with its state, then prefix every column so the four samples
# can sit side by side in one wide row.
state_df_sample_key['state'] = state_picked
state_df_sample_key_1 = state_df_sample_key.add_prefix('S1_')


In [10]:
# Sample 2: pick one fully populated row strictly within 10 days of vacSample2.
sample_window = datetime_og.timedelta(days=10)
in_window = (
    (state_df['date'] > vacSample2 - sample_window)
    & (state_df['date'] < vacSample2 + sample_window)
)
state_df_sample = state_df[in_window]

# Rows with any missing key metric are dropped before sampling. The fixed
# random_state makes the pick repeatable across kernel restarts.
state_df_sample_key = state_df_sample[['people_vaccinated_per_hundred',
                 'people_fully_vaccinated_per_hundred',
                 'people_vaccinated',
                'people_fully_vaccinated','date']].dropna().sample(n=1, random_state=42)

# Tag the row with its state, then prefix every column so the four samples
# can sit side by side in one wide row.
state_df_sample_key['state'] = state_picked
state_df_sample_key_2 = state_df_sample_key.add_prefix('S2_')



In [11]:
# Sample 3: pick one fully populated row strictly within 10 days of vacSample3.
sample_window = datetime_og.timedelta(days=10)
in_window = (
    (state_df['date'] > vacSample3 - sample_window)
    & (state_df['date'] < vacSample3 + sample_window)
)
state_df_sample = state_df[in_window]

# Rows with any missing key metric are dropped before sampling. The fixed
# random_state makes the pick repeatable across kernel restarts.
state_df_sample_key = state_df_sample[['people_vaccinated_per_hundred',
                 'people_fully_vaccinated_per_hundred',
                 'people_vaccinated',
                'people_fully_vaccinated','date']].dropna().sample(n=1, random_state=42)

# Tag the row with its state, then prefix every column so the four samples
# can sit side by side in one wide row.
state_df_sample_key['state'] = state_picked
state_df_sample_key_3 = state_df_sample_key.add_prefix('S3_')





In [12]:
# Sample 4: pick one fully populated row strictly within 10 days of vacSample4.
sample_window = datetime_og.timedelta(days=10)
in_window = (
    (state_df['date'] > vacSample4 - sample_window)
    & (state_df['date'] < vacSample4 + sample_window)
)
state_df_sample = state_df[in_window]

# Rows with any missing key metric are dropped before sampling. The fixed
# random_state makes the pick repeatable across kernel restarts.
state_df_sample_key = state_df_sample[['people_vaccinated_per_hundred',
                 'people_fully_vaccinated_per_hundred',
                 'people_vaccinated',
                'people_fully_vaccinated','date']].dropna().sample(n=1, random_state=42)

# Tag the row with its state, then prefix every column so the four samples
# can sit side by side in one wide row.
state_df_sample_key['state'] = state_picked
state_df_sample_key_4 = state_df_sample_key.add_prefix('S4_')






In [13]:
# Join the four single-row samples into one wide row, using the state name as
# the join key. Samples are attached in the order S3, S2, S1 so the column
# layout runs from the latest snapshot to the earliest.
earlier_samples = (
    ('S3_state', state_df_sample_key_3),
    ('S2_state', state_df_sample_key_2),
    ('S1_state', state_df_sample_key_1),
)
for right_key, earlier_sample in earlier_samples:
    state_df_sample_key_4 = state_df_sample_key_4.merge(
        earlier_sample,
        left_on='S4_state',
        right_on=right_key,
    )

In [14]:
# Show the column names of the merged wide row.
state_df_sample_key_4.columns

Index(['S4_people_vaccinated_per_hundred',
       'S4_people_fully_vaccinated_per_hundred', 'S4_people_vaccinated',
       'S4_people_fully_vaccinated', 'S4_date', 'S4_state',
       'S3_people_vaccinated_per_hundred',
       'S3_people_fully_vaccinated_per_hundred', 'S3_people_vaccinated',
       'S3_people_fully_vaccinated', 'S3_date', 'S3_state',
       'S2_people_vaccinated_per_hundred',
       'S2_people_fully_vaccinated_per_hundred', 'S2_people_vaccinated',
       'S2_people_fully_vaccinated', 'S2_date', 'S2_state',
       'S1_people_vaccinated_per_hundred',
       'S1_people_fully_vaccinated_per_hundred', 'S1_people_vaccinated',
       'S1_people_fully_vaccinated', 'S1_date', 'S1_state'],
      dtype='object')

In [15]:
# Remove the redundant join keys and the sampled dates. 'S1_state' is not in
# the list, so it remains as the single state label of the row.
helper_columns = [
    'S4_state', 'S3_state', 'S2_state',
    'S3_date', 'S4_date', 'S2_date', 'S1_date',
]
state_df_sample_key_4 = state_df_sample_key_4.drop(columns=helper_columns)

In [16]:
# Display the final wide row for the picked state.
state_df_sample_key_4

Unnamed: 0,S4_people_vaccinated_per_hundred,S4_people_fully_vaccinated_per_hundred,S4_people_vaccinated,S4_people_fully_vaccinated,S3_people_vaccinated_per_hundred,S3_people_fully_vaccinated_per_hundred,S3_people_vaccinated,S3_people_fully_vaccinated,S2_people_vaccinated_per_hundred,S2_people_fully_vaccinated_per_hundred,S2_people_vaccinated,S2_people_fully_vaccinated,S1_people_vaccinated_per_hundred,S1_people_fully_vaccinated_per_hundred,S1_people_vaccinated,S1_people_fully_vaccinated,S1_state
0,76.84,62.36,30361433.0,24639061.0,65.8,53.49,25998296.0,21134019.0,51.05,34.29,20172726.0,13547291.0,12.06,3.82,4766592.0,1507829.0,California


# Generalizing for all states

In [17]:
# Locations skipped when building the per-state table (same list as the
# original hard-coded condition).
NON_STATE_LOCATIONS = {'Bureau of Prisons', 'Dept of Defense',
                       'Long Term Care', 'Veterans Health'}

# Metrics kept for every sample; `date` is carried along for filtering and
# dropped again before the row is stored.
KEY_COLUMNS = ['people_vaccinated_per_hundred',
               'people_fully_vaccinated_per_hundred',
               'people_vaccinated',
               'people_fully_vaccinated', 'date']

SAMPLE_WINDOW = datetime_og.timedelta(days=10)  # half-width of each sampling window
SAMPLE_SEED = 42                                # fixed seed so reruns pick the same rows

# 4 dates on which the vaccine numbers are sampled,
# assuming that these dates capture a lot of information about the vaccine days.
# Parsed once here instead of on every loop iteration.
vacSample1 = datetime.strptime("15/2/21", "%d/%m/%y")
vacSample2 = datetime.strptime("15/5/21", "%d/%m/%y")
vacSample3 = datetime.strptime("15/8/21", "%d/%m/%y")
vacSample4 = datetime.strptime("15/11/21", "%d/%m/%y")
SAMPLE_DATES = {'S1_': vacSample1, 'S2_': vacSample2,
                'S3_': vacSample3, 'S4_': vacSample4}


def _sample_state_window(state_df, center, prefix, state_name,
                         window=SAMPLE_WINDOW, seed=SAMPLE_SEED):
    """Pick one fully populated row of `state_df` near `center`.

    Parameters
    ----------
    state_df : DataFrame holding the rows of a single location; must contain
        every column listed in KEY_COLUMNS.
    center : datetime at the middle of the sampling window.
    prefix : string prepended to every output column name (e.g. 'S1_').
    state_name : value stored in the added 'state' column.
    window : timedelta; rows strictly within `center` +/- `window` are eligible.
    seed : random_state passed to `DataFrame.sample`.

    Returns
    -------
    A one-row DataFrame with prefixed KEY_COLUMNS plus '<prefix>state', or
    None when no row in the window has all key columns populated.
    """
    in_window = (
        (state_df['date'] > center - window)
        & (state_df['date'] < center + window)
    )
    complete_rows = state_df.loc[in_window, KEY_COLUMNS].dropna()
    if complete_rows.empty:
        # `.sample(n=1)` would raise an opaque ValueError on an empty frame.
        return None
    picked = complete_rows.sample(n=1, random_state=seed)
    return picked.assign(state=state_name).add_prefix(prefix)


FINAL_DATA = []
# Picking a state
states = vacDF['location'].unique()

for state_picked in states:
    print(f'State Picked : {state_picked}')

    # Skip the excluded locations before doing any filtering work.
    if state_picked in NON_STATE_LOCATIONS:
        continue

    # sort_values returns a new frame, so the result has to be assigned.
    state_df = vacDF[vacDF['location'] == state_picked].sort_values(by='date')

    min_date = state_df['date'].min()
    print(min_date)
    max_date = state_df['date'].max()
    print(max_date)

    # One sampled row per reference date, keyed by its column prefix.
    samples = {
        prefix: _sample_state_window(state_df, center, prefix, state_picked)
        for prefix, center in SAMPLE_DATES.items()
    }
    missing = [prefix for prefix, sample in samples.items() if sample is None]
    if missing:
        # Make the gap visible instead of aborting the whole run.
        print(f'Skipping {state_picked}: no complete rows in window(s) {missing}')
        continue

    # Join the four single-row samples on the state name; attaching S3, S2, S1
    # to S4 keeps the column layout running from latest to earliest snapshot.
    state_row = samples['S4_']
    for prefix in ('S3_', 'S2_', 'S1_'):
        state_row = state_row.merge(samples[prefix],
                                    left_on='S4_state',
                                    right_on=f'{prefix}state')

    # Drop the redundant join keys and the sampled dates; 'S1_state' stays as
    # the row's state label.
    state_row = state_row.drop(columns=['S4_state', 'S3_state', 'S2_state',
                                        'S3_date', 'S4_date',
                                        'S2_date', 'S1_date'])

    FINAL_DATA.append(state_row)


State Picked : Alabama
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Alaska
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : American Samoa
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Arizona
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Arkansas
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Bureau of Prisons
State Picked : California
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Colorado
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Connecticut
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Delaware
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Dept of Defense
State Picked : District of Columbia
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Federated States of Micronesia
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Florida
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Georgia
2021-01-12 00:00:00
2021-12-05 00:00:00
State Picked : Guam
2021-01-12 00:00:00
2021-12-05 00:00:00
State

In [18]:
# Stack the per-state rows into one table. Every per-state frame comes out of
# a merge and so carries index 0; ignore_index=True replaces that all-zero,
# duplicated index with a unique 0..N-1 range.
OUTPUT = pd.concat(FINAL_DATA, ignore_index=True)

In [19]:
# Write the combined per-state table to disk (the index is written as the
# first, unnamed column because `index` is left at its default).
OUTPUT.to_csv('../outputs/state_vac.csv')