In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

#### Two datasets were created on 4/7/2021
- one dataset includes all vaccines administered for all manufacturers (Pfizer, Moderna, Johnson & Johnson)
- the other has only vaccines adminstered for two-shot protocol manufacturers (Pfizer and Moderna)
- we read both in and then separate out the Johnson & Johnson to tell if the vaccine recipient has been fully vaccinated

In [2]:
vaccine_all = pd.read_csv('../data/COVID_VACC_CountByZipPerDate_All_04072021.csv')
vaccine_all.head(3)

Unnamed: 0,count,VACC_DATE,DOSE_COUNT,zip
0,1,12/12/2020,1,37215.0
1,1,12/14/2020,2,37205.0
2,1,12/14/2020,1,37215.0


In [3]:
vaccine_all['VACC_DATE'] = pd.to_datetime(vaccine_all['VACC_DATE'])

In [4]:
max(vaccine_all['VACC_DATE'] )

Timestamp('2021-04-07 00:00:00')

In [5]:
min(vaccine_all['VACC_DATE'] )

Timestamp('2020-12-12 00:00:00')

In [6]:
vaccine_noJJ = pd.read_csv('../data/COVID_VACC_CountByZipPerDate_Two_04072021.csv')
vaccine_noJJ.head(3)

Unnamed: 0,count,VACC_DATE,DOSE_COUNT,zip
0,1,12/12/2020,1,37215.0
1,1,12/14/2020,2,37205.0
2,1,12/14/2020,1,37215.0


In [7]:
vaccine_noJJ['VACC_DATE'] = pd.to_datetime(vaccine_noJJ['VACC_DATE'])

In [8]:
vaccine_noJJ.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5956 entries, 0 to 5955
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   count       5956 non-null   int64         
 1   VACC_DATE   5956 non-null   datetime64[ns]
 2   DOSE_COUNT  5956 non-null   int64         
 3   zip         5752 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 186.2 KB


In [9]:
vaccine_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5960 entries, 0 to 5959
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   count       5960 non-null   int64         
 1   VACC_DATE   5960 non-null   datetime64[ns]
 2   DOSE_COUNT  5960 non-null   int64         
 3   zip         5756 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 186.4 KB


In [10]:
vaccine_all.columns = ['count_all', 'vacc_date', 'dose_nbr_all', 'zipcode']

In [11]:
vaccine_noJJ.columns = ['count_2_shot', 'vacc_date', 'dose_nbr_2_shot', 'zipcode' ]

#### First we'll merge the two datasets on date, zipcode, and dose number

In [13]:
vaccine_calc = pd.merge(vaccine_all, vaccine_noJJ, how = 'outer', 
                        left_on = ['vacc_date', 'zipcode', 'dose_nbr_all'],
                        right_on = ['vacc_date', 'zipcode', 'dose_nbr_2_shot']
                       )
vaccine_calc.head()

Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot
0,1,2020-12-12,1,37215.0,1.0,1.0
1,1,2020-12-14,2,37205.0,1.0,2.0
2,1,2020-12-14,1,37215.0,1.0,1.0
3,1,2020-12-15,1,37221.0,1.0,1.0
4,1,2020-12-16,1,37215.0,1.0,1.0


In [14]:
vaccine_calc.shape

(5960, 6)

#### Creating a column for one-dose or two-dose protocol

In [15]:
vaccine_calc['protocol'] = ''

#### Separating the merged dataframe into first shots and second shots

In [16]:
second_shots = vaccine_calc.loc[vaccine_calc.dose_nbr_all == 2]
print(second_shots.shape)
second_shots.head()

(2719, 7)


Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol
1,1,2020-12-14,2,37205.0,1.0,2.0,
33,1,2020-12-18,2,37072.0,1.0,2.0,
38,1,2020-12-18,2,37138.0,1.0,2.0,
43,1,2020-12-18,2,37204.0,1.0,2.0,
52,1,2020-12-18,2,37212.0,1.0,2.0,


In [17]:
first_shots = vaccine_calc.loc[vaccine_calc.dose_nbr_all == 1]
print(first_shots.shape)
first_shots.head()

(3241, 7)


Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol
0,1,2020-12-12,1,37215.0,1.0,1.0,
2,1,2020-12-14,1,37215.0,1.0,1.0,
3,1,2020-12-15,1,37221.0,1.0,1.0,
4,1,2020-12-16,1,37215.0,1.0,1.0,
5,1,2020-12-17,1,,1.0,1.0,


#### The second shot rows can be labeled with two-shot protocol

In [18]:
second_shots.protocol = 'two-shot'
second_shots.head(2)

Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol
1,1,2020-12-14,2,37205.0,1.0,2.0,two-shot
33,1,2020-12-18,2,37072.0,1.0,2.0,two-shot


#### Comparing the count of first shots from the dataframe with all manufacturers to the one _without_ Johnson & Johnson
- if the count for all manufacturers is greater, the difference is the count of J&J vaccines
- if the counts are the same, the first shots are for one of the two-shot protocol manufacturers
- if the count for all manufacturers is less, this is unexpected and we'll need to decide how to handle those rows

In [19]:
first_shots.loc[first_shots.count_2_shot.isnull()]

Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol
4227,1,2021-03-09,1,37213.0,,,
4819,1,2021-03-19,1,37213.0,,,
4844,1,2021-03-20,1,37069.0,,,
4879,11,2021-03-20,1,37213.0,,,


In [20]:
print('one-shot-protocol:', first_shots.loc[(first_shots.count_all > first_shots.count_2_shot) | (first_shots.count_2_shot.isnull())].shape[0])
print('two-shot-protocol:', first_shots.loc[first_shots.count_all == first_shots.count_2_shot].shape[0])
print('problem rows:', first_shots.loc[first_shots.count_all < first_shots.count_2_shot].shape[0])

one-shot-protocol: 734
two-shot-protocol: 2507
problem rows: 0


#### Pulling the rows that contain J&J to a separate dataframe called `mixed`
- the difference between `count_all` and `count_2_shot` is the number of J&J vaccines given
- we'll save the ones that aren't mixed to a dataframe called `firsts_for_protocol2`

In [21]:
firsts_for_protocol2 = first_shots.loc[first_shots.count_all == first_shots.count_2_shot]
firsts_for_protocol2['protocol'] = 'two-shot'
firsts_for_protocol2.head()

Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol
0,1,2020-12-12,1,37215.0,1.0,1.0,two-shot
2,1,2020-12-14,1,37215.0,1.0,1.0,two-shot
3,1,2020-12-15,1,37221.0,1.0,1.0,two-shot
4,1,2020-12-16,1,37215.0,1.0,1.0,two-shot
5,1,2020-12-17,1,,1.0,1.0,two-shot


In [22]:
mixed = first_shots.loc[(first_shots.count_all > first_shots.count_2_shot) | (first_shots.count_2_shot.isnull())]
mixed.head()

Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol
3815,108,2021-03-02,1,37214.0,107.0,1.0,
3819,109,2021-03-02,1,37216.0,108.0,1.0,
4010,60,2021-03-06,1,,59.0,1.0,
4012,260,2021-03-06,1,37013.0,256.0,1.0,
4023,94,2021-03-06,1,37115.0,93.0,1.0,


In [23]:
mixed['diff'] = mixed.count_all - mixed.count_2_shot
mixed.head()

Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol,diff
3815,108,2021-03-02,1,37214.0,107.0,1.0,,1.0
3819,109,2021-03-02,1,37216.0,108.0,1.0,,1.0
4010,60,2021-03-06,1,,59.0,1.0,,1.0
4012,260,2021-03-06,1,37013.0,256.0,1.0,,4.0
4023,94,2021-03-06,1,37115.0,93.0,1.0,,1.0


In [24]:
mixed.vacc_date.min()

Timestamp('2021-03-02 00:00:00')

#### Creating a copy of the mixed dataframe called jj to hold only the Johnson and Johnson stats
- update the count for the JJ dataframe to be the diff
- populate the protocol column with 'one-shot'

In [25]:
jj = mixed.copy(deep = True)
jj.head()

Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol,diff
3815,108,2021-03-02,1,37214.0,107.0,1.0,,1.0
3819,109,2021-03-02,1,37216.0,108.0,1.0,,1.0
4010,60,2021-03-06,1,,59.0,1.0,,1.0
4012,260,2021-03-06,1,37013.0,256.0,1.0,,4.0
4023,94,2021-03-06,1,37115.0,93.0,1.0,,1.0


- update the count -- the count_2_shot has the count of two-shot protocol vaccines administered; we'll set this column in our jj copy to represent the number of Johnson & Johnson vaccines given so that the numbers are in the same column when we re-combine the data

In [26]:
jj['count_2_shot'] = jj['diff']
jj['protocol'] = 'one-shot'
jj.head()

Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol,diff
3815,108,2021-03-02,1,37214.0,1.0,1.0,one-shot,1.0
3819,109,2021-03-02,1,37216.0,1.0,1.0,one-shot,1.0
4010,60,2021-03-06,1,,1.0,1.0,one-shot,1.0
4012,260,2021-03-06,1,37013.0,4.0,1.0,one-shot,4.0
4023,94,2021-03-06,1,37115.0,1.0,1.0,one-shot,1.0


- update the mixed protocol to populate the protocol column with 'two-shot' since we are pulling out the J&J counts elsewhere

In [27]:
mixed['protocol'] = 'two-shot'

In [28]:
firsts_for_protocol2.shape

(2507, 7)

In [29]:
vacc_protocols = pd.concat([firsts_for_protocol2, mixed, jj, second_shots]).sort_values(['vacc_date', 'zipcode'])
vacc_protocols.head(3)

Unnamed: 0,count_all,vacc_date,dose_nbr_all,zipcode,count_2_shot,dose_nbr_2_shot,protocol,diff
0,1,2020-12-12,1,37215.0,1.0,1.0,two-shot,
1,1,2020-12-14,2,37205.0,1.0,2.0,two-shot,
2,1,2020-12-14,1,37215.0,1.0,1.0,two-shot,


In [30]:
vacc_protocols.shape

(6694, 8)

In [31]:
vacc_protocols = vacc_protocols.drop(columns = ['count_all', 'dose_nbr_all', 'diff'])
vacc_protocols.head()

Unnamed: 0,vacc_date,zipcode,count_2_shot,dose_nbr_2_shot,protocol
0,2020-12-12,37215.0,1.0,1.0,two-shot
1,2020-12-14,37205.0,1.0,2.0,two-shot
2,2020-12-14,37215.0,1.0,1.0,two-shot
3,2020-12-15,37221.0,1.0,1.0,two-shot
4,2020-12-16,37215.0,1.0,1.0,two-shot


In [34]:
vacc_protocols.columns = ['vacc_date', 'zipcode', 'count_given', 'dose_nbr', 'protocol']
vacc_protocols.sort_values(['vacc_date', 'zipcode'])
vacc_protocols.head()

Unnamed: 0,vacc_date,zipcode,count_given,dose_nbr,protocol
0,2020-12-12,37215.0,1.0,1.0,two-shot
1,2020-12-14,37205.0,1.0,2.0,two-shot
2,2020-12-14,37215.0,1.0,1.0,two-shot
3,2020-12-15,37221.0,1.0,1.0,two-shot
4,2020-12-16,37215.0,1.0,1.0,two-shot


In [35]:
vacc_protocols.to_csv('../data/vaccine_with_protocol.csv', index = False)