In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
# NOTE(review): this silences every warning for the rest of the session, so pandas
# diagnostics (e.g. SettingWithCopyWarning, deprecation notices) will not be shown.
warnings.filterwarnings('ignore')

%matplotlib inline

#### Two datasets were created on 4/7/2021
- one dataset includes all vaccines administered for all manufacturers (Pfizer, Moderna, Johnson & Johnson)
- the other has only vaccines administered for two-shot protocol manufacturers (Pfizer and Moderna)
- we read both in and then separate out the Johnson & Johnson counts so we can tell whether each vaccine recipient has been fully vaccinated

In [None]:
# Read the all-manufacturer extract from the working directory and preview three rows.
vaccine_all = pd.read_csv(filepath_or_buffer='COVID_VACC_CountByZipPerDate_All_04072021.csv')
vaccine_all.head(n=3)

In [None]:
# Convert the VACC_DATE column to pandas datetimes so dates can be compared and sorted.
vaccine_all['VACC_DATE'] = pd.to_datetime(vaccine_all['VACC_DATE'])

In [None]:
# Latest vaccination date in the all-manufacturer data.
# Series.max() is vectorized and skips missing dates (NaT), unlike the builtin max().
vaccine_all['VACC_DATE'].max()

In [None]:
# Earliest vaccination date in the all-manufacturer data.
# Series.min() is vectorized and skips missing dates (NaT), unlike the builtin min().
vaccine_all['VACC_DATE'].min()

In [None]:
# Read the two-shot-manufacturer extract from the working directory and preview three rows.
vaccine_noJJ = pd.read_csv(filepath_or_buffer='COVID_VACC_CountByZipPerDate_Two_04072021.csv')
vaccine_noJJ.head(n=3)

In [None]:
# Convert the VACC_DATE column to pandas datetimes, matching the treatment of vaccine_all.
vaccine_noJJ['VACC_DATE'] = pd.to_datetime(vaccine_noJJ['VACC_DATE'])

In [None]:
# Column dtypes and non-null counts for the two-shot extract.
vaccine_noJJ.info()

In [None]:
# Column dtypes and non-null counts for the all-manufacturer extract.
vaccine_all.info()

In [None]:
# Positional rename: replaces all four column labels in their current order.
# NOTE(review): assumes the CSV columns arrive as (count, date, dose number, zipcode) -- TODO confirm
# against the file header; a different order would silently mislabel the data.
vaccine_all.columns = ['count_all', 'vacc_date', 'dose_nbr_all', 'zipcode']

In [None]:
# Positional rename: replaces all four column labels in their current order.
# NOTE(review): assumes the CSV columns arrive as (count, date, dose number, zipcode) -- TODO confirm
# against the file header; a different order would silently mislabel the data.
vaccine_noJJ.columns = ['count_2_shot', 'vacc_date', 'dose_nbr_2_shot', 'zipcode' ]

#### First we'll merge the two datasets on date, zipcode, and dose number

In [None]:
# Outer-join the two frames on date, zipcode and dose number. Rows present in only
# one input are kept, with NaN in the columns contributed by the other input.
vaccine_calc = vaccine_all.merge(
    vaccine_noJJ,
    how='outer',
    left_on=['vacc_date', 'zipcode', 'dose_nbr_all'],
    right_on=['vacc_date', 'zipcode', 'dose_nbr_2_shot'],
)
vaccine_calc.head()

In [None]:
# (rows, columns) of the merged frame.
vaccine_calc.shape

#### Creating a column for one-dose or two-dose protocol

In [None]:
# Start with an empty-string placeholder; the label is filled in per subset in later cells.
vaccine_calc['protocol'] = ''

#### Separating the merged dataframe into first shots and second shots

In [None]:
# Rows whose all-manufacturer dose number is 2.
# Take an explicit copy: a later cell assigns to the `protocol` column of this frame,
# and writing into a filtered slice is what triggers pandas' SettingWithCopyWarning.
second_shots = vaccine_calc.loc[vaccine_calc.dose_nbr_all == 2].copy()
print(second_shots.shape)
second_shots.head()

In [None]:
# Keep only the rows whose all-manufacturer dose number is 1.
is_first_dose = vaccine_calc['dose_nbr_all'] == 1
first_shots = vaccine_calc.loc[is_first_dose]
print(first_shots.shape)
first_shots.head()

#### The second shot rows can be labeled with two-shot protocol

In [None]:
# Label every second-dose row as two-shot protocol.
# `assign` returns a new frame, so nothing is written into a filtered slice, and it
# avoids attribute-style assignment, which would silently create a plain attribute
# (not a column) if `protocol` did not already exist.
second_shots = second_shots.assign(protocol='two-shot')
second_shots.head(2)

#### Comparing the count of first shots from the dataframe with all manufacturers to the one _without_ Johnson & Johnson
- if the count for all manufacturers is greater, the difference is the count of J&J vaccines
- if the counts are the same, the first shots are for one of the two-shot protocol manufacturers
- if the count for all manufacturers is less, this is unexpected and we'll need to decide how to handle those rows

In [None]:
# First-dose rows that have no two-shot count (count_2_shot is null after the outer merge).
first_shots.loc[first_shots.count_2_shot.isnull()]

In [None]:
# Count first-dose rows in each comparison bucket between the two datasets.
all_counts = first_shots['count_all']
two_shot_counts = first_shots['count_2_shot']

# More doses overall than two-shot doses, or no two-shot record at all.
includes_jj = (all_counts > two_shot_counts) | two_shot_counts.isnull()
counts_match = all_counts == two_shot_counts
fewer_than_two_shot = all_counts < two_shot_counts

print('one-shot-protocol:', len(first_shots.loc[includes_jj]))
print('two-shot-protocol:', len(first_shots.loc[counts_match]))
print('problem rows:', len(first_shots.loc[fewer_than_two_shot]))

#### Pulling the rows that contain J&J to a separate dataframe called `mixed`
- the difference between `count_all` and `count_2_shot` is the number of J&J vaccines given
- we'll save the ones that aren't mixed to a dataframe called `firsts_for_protocol2`

In [None]:
# First-dose rows where both datasets report the same count, i.e. no J&J doses.
# The label is applied with `assign` so a new frame is built instead of writing into
# a filtered slice of `first_shots` (which raises pandas' SettingWithCopyWarning).
firsts_for_protocol2 = (
    first_shots
    .loc[first_shots.count_all == first_shots.count_2_shot]
    .assign(protocol='two-shot')
)
firsts_for_protocol2.head()

In [None]:
# First-dose rows that include J&J doses: the all-manufacturer count exceeds the
# two-shot count, or there is no two-shot record for that date/zipcode at all.
# Copy explicitly because later cells add and overwrite columns on this frame.
mixed = first_shots.loc[
    (first_shots.count_all > first_shots.count_2_shot) | (first_shots.count_2_shot.isnull())
].copy()
mixed.head()

In [None]:
# J&J doses = all-manufacturer count minus two-shot count.
# Rows with no two-shot record have a null count_2_shot; treat that as 0 so the
# difference is the full count_all instead of NaN (NaN would lose the J&J count).
# `assign` builds a new frame rather than writing into a filtered slice.
mixed = mixed.assign(diff=mixed.count_all - mixed.count_2_shot.fillna(0))
mixed.head()

In [None]:
# Earliest date on which a row includes J&J doses.
mixed.vacc_date.min()

#### Creating a copy of the mixed dataframe called jj to hold only the Johnson and Johnson stats
- update the count for the JJ dataframe to be the diff
- populate the protocol column with 'one-shot'

In [None]:
# Independent copy of `mixed` (DataFrame.copy is deep by default), so the J&J edits
# made in the following cells do not touch `mixed`.
jj = mixed.copy()
jj.head(n=5)

- update the count -- the count_2_shot has the count of two-shot protocol vaccines administered; we'll set this column in our jj copy to represent the number of Johnson & Johnson vaccines given so that the numbers are in the same column when we re-combine the data

In [None]:
# Put the J&J count in the same column that holds the two-shot counts so the frames
# line up when they are concatenated.
jj['count_2_shot'] = jj['diff']
# Rows with no two-shot record have a null dose_nbr_2_shot; since that column becomes
# the final dose number once dose_nbr_all is dropped, fill it from dose_nbr_all.
jj['dose_nbr_2_shot'] = jj['dose_nbr_all']
jj['protocol'] = 'one-shot'
jj.head()

- set the `protocol` column of the `mixed` dataframe to 'two-shot', since the J&J counts have been pulled out into the separate `jj` dataframe

In [None]:
# What remains in `mixed` represents the two-shot doses (the J&J counts live in `jj`).
# `assign` returns a new frame instead of writing into a filtered slice.
# NOTE(review): rows with a null count_2_shot are also labelled 'two-shot' here even
# though they carry no two-shot count -- decide whether they should be dropped.
mixed = mixed.assign(protocol='two-shot')

In [None]:
# (rows, columns) of the first-dose rows whose counts match in both datasets.
firsts_for_protocol2.shape

In [None]:
# Stack the four labelled subsets and order them by date, then zipcode.
# First-dose rows where count_all < count_2_shot belong to none of these frames, so
# they are absent from the result. `jj` is a copy of `mixed`, so the combined index
# contains repeated labels.
protocol_frames = [firsts_for_protocol2, mixed, jj, second_shots]
vacc_protocols = pd.concat(protocol_frames).sort_values(by=['vacc_date', 'zipcode'])
vacc_protocols.head(n=3)

In [None]:
# (rows, columns) of the combined frame.
vacc_protocols.shape

In [None]:
# Discard the helper columns that were only needed to derive the J&J counts.
helper_columns = ['count_all', 'dose_nbr_all', 'diff']
vacc_protocols = vacc_protocols.drop(helper_columns, axis='columns')
vacc_protocols.head()

In [None]:
# Rename by column name rather than by position, so a change in column order
# upstream cannot silently mislabel the data.
vacc_protocols = vacc_protocols.rename(
    columns={'count_2_shot': 'count_given', 'dose_nbr_2_shot': 'dose_nbr'}
)
# The frame was already sorted by vacc_date and zipcode when it was concatenated;
# the former un-assigned sort_values() call here had no effect and was removed.
vacc_protocols.head()

In [None]:
# Export step, left disabled: uncomment to write the final table to CSV without the index.
#vacc_protocols.to_csv('../vaccine_with_protocol.csv', index = False)