In [1]:
#! python3
import os
import pandas as pd
import numpy as np
from datetime import datetime

# Display settings: widen the console output so wide frames are not wrapped,
# and show up to 10 columns before pandas starts eliding them.
desired_width = 320
pd.set_option('display.width', desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_columns', 10)

# Data location, kept in one place so it is easy to adapt on another machine.
# The working directory is switched on purpose: relative paths used further
# down (e.g. when the result is written to disk) resolve against it as well.
DATA_DIR = r'C:\Users\kinan\Desktop\StatisticsandDataScience\Modern Data Analytics\Data\Medical'
VACCINATION_CSV = 'COVID-19_Vaccinations_in_the_United_States_County.csv'
os.chdir(DATA_DIR)

# low_memory=False lets pandas infer every column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# mixed-dtype warning, which is what this option addresses -- confirm on re-run.
data = pd.read_csv(VACCINATION_CSV, header=[0], low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
# Preview the first five rows of the freshly loaded frame
data.head(n=5)

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,...,Census2019_5PlusPop,Census2019_5to17Pop,Census2019_12PlusPop,Census2019_18PlusPop,Census2019_65PlusPop
0,05/03/2022,1013,18,Butler County,AL,...,18348.0,3174.0,16675.0,15174.0,4049.0
1,05/03/2022,1045,18,Dale County,AL,...,45899.0,7915.0,41540.0,37984.0,8502.0
2,05/03/2022,5065,18,Izard County,AR,...,13064.0,1883.0,12084.0,11181.0,3558.0
3,05/03/2022,12043,18,Glades County,FL,...,13407.0,1584.0,12604.0,11823.0,3808.0
4,05/03/2022,13179,18,Liberty County,GA,...,55036.0,11002.0,48507.0,44034.0,5908.0


In [3]:
# Keep only the columns used in the rest of the analysis:
#   Recip_State         -> state
#   Series_Complete_Yes -> total number of people who are fully vaccinated
#   Booster_Doses       -> total number of people who are fully vaccinated and
#                          have received a booster (or additional) dose
COLUMNS_OF_INTEREST = ['Date', 'Recip_County', 'MMWR_week', 'Recip_State',
                       'Series_Complete_Yes', 'Booster_Doses']

# Date cap: only rows strictly before 1 April 2022 are kept.
DATE_CAP = pd.Timestamp('2022-04-01')

# The subset is built in a single chain so that 'Date' is parsed on a fresh
# frame; assigning a column into the result of data[[...]] would raise
# SettingWithCopyWarning. The trailing copy() makes the filtered frame fully
# independent, so later in-place writes are unambiguous.
data = (
    data[COLUMNS_OF_INTEREST]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format="%m/%d/%Y"))
    .loc[lambda frame: frame['Date'] < DATE_CAP]
    .copy()
)

In [4]:
# Check the number of NAs: which columns contain missing values, and how many
# cells are missing in total.
missing_by_column = data.isna().sum()
NAs = missing_by_column[missing_by_column > 0].index.tolist()
num = missing_by_column.sum()
print(f'Missing values: {NAs}. Total number: {num}')

Missing values: ['Series_Complete_Yes', 'Booster_Doses']. Total number: 1209670


In [5]:
# Missing values are blank in the data file; replace them with the literal
# string 'NA'. The affected count columns become object dtype as a result.
data = data.fillna('NA')

In [6]:
# Show row count, per-column non-null counts and dtypes of the current frame
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1555611 entries, 108366 to 1663976
Data columns (total 6 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   Date                 1555611 non-null  datetime64[ns]
 1   Recip_County         1555611 non-null  object        
 2   MMWR_week            1555611 non-null  int64         
 3   Recip_State          1555611 non-null  object        
 4   Series_Complete_Yes  1555611 non-null  object        
 5   Booster_Doses        1555611 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 83.1+ MB


In [7]:
# 12/15/2021 is the first date on which booster doses are recorded (the
# original note cites row 459640), so every earlier row gets a booster count of 0.
BOOSTER_START = pd.Timestamp('2021-12-15')
before_boosters = data['Date'] < BOOSTER_START
data.loc[before_boosters, 'Booster_Doses'] = 0
data.tail()

Unnamed: 0,Date,Recip_County,MMWR_week,Recip_State,Series_Complete_Yes,Booster_Doses
1663972,2020-12-13,Boulder County,51,CO,0.0,0
1663973,2020-12-13,Kit Carson County,51,CO,0.0,0
1663974,2020-12-13,Floyd County,51,IA,0.0,0
1663975,2020-12-13,Brantley County,51,GA,0.0,0
1663976,2020-12-13,Mecosta County,51,MI,0.0,0


In [8]:
# Order the rows by county name and then by date, both descending.
# sort_values returns a new frame, so `data` keeps its own row order.
sort_data = data.sort_values(by=['Recip_County', 'Date'], ascending=False)

In [9]:
def fill_from_previous_day(frame,
                           value_cols=('Series_Complete_Yes', 'Booster_Doses'),
                           county_cols=('Recip_State', 'Recip_County'),
                           date_col='Date',
                           missing_marker='NA'):
    """Impute missing counts with the same county's most recent earlier value.

    Missing values could be replaced with a mean or mode, but when the number
    of boosters/completed series in a place is known for an earlier date,
    carrying that value forward is the better estimate.

    Parameters
    ----------
    frame : pd.DataFrame
        One row per county and date. The index must be unique, because the
        filled values are written back by index label.
    value_cols : sequence of str
        Columns to impute. Cells that are NaN or equal to `missing_marker`
        are treated as missing.
    county_cols : sequence of str
        Columns that together identify one county. The state is part of the
        key on the assumption that county names repeat across states
        (worth verifying against the data).
    date_col : str
        Column used to order each county's rows chronologically.
    missing_marker : str
        Sentinel string that stands for a missing value.

    Returns
    -------
    pd.DataFrame
        A copy of `frame` in its original row order. `value_cols` are numeric;
        each missing cell holds the county's latest earlier value, or 0 when
        the county has no earlier value.

    Raises
    ------
    ValueError
        If a value column contains a non-numeric entry other than the marker.
    """
    value_cols = list(value_cols)
    county_cols = list(county_cols)
    sort_keys = county_cols + [date_col]

    # Turn the sentinel back into NaN and make the columns numeric again so
    # that pandas' own fill logic (and later sums) can work on them.
    work = frame[sort_keys].copy()
    for col in value_cols:
        work[col] = pd.to_numeric(frame[col].mask(frame[col] == missing_marker))

    # Oldest date first within each county, so "forward fill" really means
    # "take the value from the previous recorded day".
    work = work.sort_values(sort_keys)
    carried = (
        work.groupby(county_cols, sort=False)[value_cols]
        .ffill()
        .fillna(0)  # nothing earlier to carry forward for this county
    )

    # assign() aligns on the index, so the caller's row order is preserved.
    return frame.assign(**{col: carried[col] for col in value_cols})


# Vectorised on purpose: a Python loop over every row with .iloc is far too
# slow for a frame of this size.
sort_data = fill_from_previous_day(sort_data)

KeyboardInterrupt: 

In [None]:
# Per-column count of cells that still hold the 'NA' marker
(sort_data == 'NA').sum()

In [None]:
# Aggregate the county rows to one row per (date, state).
def _most_frequent(values):
    """Return the most common entry of a Series (first label of value_counts())."""
    return values.value_counts().index[0]


by_day_and_state = sort_data.groupby(['Date', 'Recip_State'])
# Counts are summed over the counties of a state ...
data_gr1 = by_day_and_state[['Series_Complete_Yes', 'Booster_Doses']].sum()
# ... while the week number is taken as the most frequent value in the group.
data_gr2 = by_day_and_state[['MMWR_week']].agg(_most_frequent)

In [None]:
# Print the structure (index, dtypes, non-null counts) of both grouped frames
for grouped_frame in (data_gr1, data_gr2):
    grouped_frame.info()

In [None]:
# Flatten the two grouped results into one table. Both come from the same
# (Date, Recip_State) grouping, so their rows line up position by position.
state_level_columns = {
    'Date': data_gr1.index.get_level_values('Date'),
    'State': data_gr1.index.get_level_values('Recip_State'),
    'Series_Complete_Yes': data_gr1['Series_Complete_Yes'].tolist(),
    'Booster_Doses': data_gr1['Booster_Doses'].tolist(),
    'MMWR_week': data_gr2['MMWR_week'].tolist(),
}

# Reverse the row order of the assembled table
new_data = pd.DataFrame(state_level_columns).iloc[::-1]

In [None]:
# Write the state-level table as comma-separated text (the default separator),
# without the index column; the relative path resolves against the working directory.
new_data.to_csv('vaccine2.csv', index=False)