In [280]:
import matplotlib.pyplot as plt
import pandas as pd

In [281]:
# Load the two raw inputs. Both CSVs sit in the same folder, so the folder is
# named once; change DATA_DIR to run the notebook from another location.
DATA_DIR = '/Users/scottye/Desktop/InfoVis/Final_Project/Nov30'

# Per-country, per-date records (vaccine-policy indicators, case and death counts).
df_raw = pd.read_csv(f'{DATA_DIR}/OxCGRT_nat_latest.csv')
# Per-country population table; its name and 2020 head-count columns are used later.
df_population = pd.read_csv(f'{DATA_DIR}/population_by_country_2020.csv')

In [282]:
# Preprocessing
# Keep only the columns the analysis needs and shorten the indicator names.
# rename() is chained onto the selection rather than applied in place to the
# slice: that builds a new, independent frame and avoids the
# SettingWithCopyWarning that an in-place rename of a slice of df_raw raises.
df = df_raw[[
    'CountryName', 'Date',
    'V1_Vaccine Prioritisation (summary)',
    'V2A_Vaccine Availability (summary)',
    'V3_Vaccine Financial Support (summary)',
    'V4_Mandatory Vaccination (summary)',
    'ConfirmedCases', 'ConfirmedDeaths',
    'MajorityVaccinated', 'PopulationVaccinated',
]].rename(columns={
    'V1_Vaccine Prioritisation (summary)': 'V1',
    'V2A_Vaccine Availability (summary)': 'V2',
    'V3_Vaccine Financial Support (summary)': 'V3',
    'V4_Mandatory Vaccination (summary)': 'V4',
    # The raw column is used as a percentage downstream, so it is renamed to
    # free 'PopulationVaccinated' for the absolute head-count computed later.
    'PopulationVaccinated': 'PercentageVaccinated',
})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [283]:
# Add total population for each country
# These four entries are removed before the join.
# NOTE(review): presumably they have no usable match in the population table - confirm.
excluded_countries = ["Cote d'Ivoire", "Cape Verde", "Kosovo", "United States Virgin Islands"]
df = df[~df['CountryName'].isin(excluded_countries)]

# Reduce the population table to name + head-count and rename the columns so
# the country column carries the same name as in df.
df_population = df_population[['Country (or dependency)', 'Population (2020)']].rename(
    columns={'Country (or dependency)': 'CountryName',
             'Population (2020)': 'Population'})

# Left join: every row of df is kept; a country missing from df_population gets
# NaN in 'Population'. The key is stated explicitly so the join cannot silently
# change if the two frames ever come to share another column name.
df = pd.merge(df, df_population, how='left', on='CountryName')

# The percentage is divided by 100 to get a fraction, then scaled by the
# country's population to give an absolute number of vaccinated people.
df['PopulationVaccinated'] = df['PercentageVaccinated'] / 100 * df['Population']


In [284]:
# More processing
# Express V1-V3 as a percentage of their divisor (value / divisor * 100).
# NOTE(review): 2, 3 and 5 are presumably the top level of each indicator's
# scale - confirm against the dataset codebook.
for indicator, divisor in (('V1', 2), ('V2', 3), ('V3', 5)):
    df[indicator] = df[indicator] / divisor * 100

# V4 is turned into the strings 'False' / 'True'; any value other than 0 or 1
# (including missing values) becomes NaN because it has no entry in the mapping.
v4_labels = {0: 'False', 1: 'True'}
df['V4'] = df['V4'].map(v4_labels)

In [285]:
# create world statistics
def _first_mode(values):
    """Return the most frequent non-null entry of ``values`` as a single scalar.

    ``Series.mode()`` always returns a Series: several entries when there is a
    tie and none when every entry is null. Used directly as an aggregator it
    can therefore yield a non-scalar result for a group. Taking the first mode
    (modes come back sorted), or NaN when there is none, keeps the aggregated
    column scalar.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else float('nan')


# Hard-coded head-count used as the denominator for the World rows.
WORLD_POPULATION = 7808449406

# One row per date: indicators are averaged across countries, V4 takes the most
# common label, and the counts are summed. groupby().agg() already produces a
# new frame, so no explicit copy of the selection is needed.
df_world = (
    df[['Date', 'V1', 'V2', 'V3', 'V4', 'ConfirmedCases',
        'ConfirmedDeaths', 'PopulationVaccinated']]
    .groupby('Date')
    .agg({'V1': 'mean', 'V2': 'mean', 'V3': 'mean', 'V4': _first_mode,
          'ConfirmedCases': 'sum', 'ConfirmedDeaths': 'sum',
          'PopulationVaccinated': 'sum'})
)

# Fill in the remaining columns that the per-country rows carry.
df_world['Population'] = WORLD_POPULATION
df_world['PercentageVaccinated'] = df_world['PopulationVaccinated'] / df_world['Population'] * 100
# 'V' once more than half of the world population is vaccinated, otherwise 'NV'.
df_world['MajorityVaccinated'] = 'NV'
df_world.loc[df_world.PercentageVaccinated > 50, 'MajorityVaccinated'] = 'V'
df_world['CountryName'] = 'World'
# Turn the 'Date' group key back into an ordinary column.
df_world = df_world.reset_index()

# Append the World rows after the country rows. concat keeps each frame's own
# index labels, so the World rows reuse labels that already occur in df.
df = pd.concat([df, df_world])
df['PopulationVaccinated'] = df['PopulationVaccinated'].round()
df['PercentageVaccinated'] = df['PercentageVaccinated'].round(decimals = 2)
# NOTE(review): of the indicator columns only V2 is rounded; V1 and V3 are left
# as computed - confirm that this is intended.
df['V2'] = df['V2'].round(decimals = 2)
#df.to_csv('/Users/scottye/Desktop/InfoVis/Final_Project/Nov30/daily.csv', encoding='utf-8', index=False)

In [276]:
# Process monthly statistics
# Integer-dividing Date by 100 drops its last two digits, so all rows of one
# month share the same key (e.g. 202001 in the output).
month_key = df['Date'] // 100
df_m = df.assign(Date=month_key)

# One row per (country, month): the last one in row order is kept.
df_m = df_m.drop_duplicates(subset=['CountryName', 'Date'], keep='last')

# Month 202211 is excluded.
# NOTE(review): presumably because that month is incomplete in the data - confirm.
df_m = df_m[df_m['Date'] != 202211]
#df_m.to_csv('/Users/scottye/Desktop/InfoVis/Final_Project/Nov30/monthly.csv', encoding='utf-8', index=False)

In [277]:
# Show the monthly frame as the cell output (one row per country and month).
df_m

Unnamed: 0,CountryName,Date,V1,V2,V3,V4,ConfirmedCases,ConfirmedDeaths,MajorityVaccinated,PercentageVaccinated,Population,PopulationVaccinated
30,Aruba,202001,0.000000,0.00,0.000000,,0.0,0.0,NV,0.00,106845,0.000000e+00
59,Aruba,202002,0.000000,0.00,0.000000,,0.0,0.0,NV,0.00,106845,0.000000e+00
90,Aruba,202003,0.000000,0.00,0.000000,,55.0,0.0,NV,0.00,106845,0.000000e+00
120,Aruba,202004,0.000000,0.00,0.000000,,100.0,2.0,NV,0.00,106845,0.000000e+00
151,Aruba,202005,0.000000,0.00,0.000000,,101.0,3.0,NV,0.00,106845,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...
911,World,202206,98.888889,85.56,92.666667,False,545701276.0,6333723.0,V,60.60,7808449406,4.732141e+09
942,World,202207,98.603352,85.69,94.350282,False,575228934.0,6394828.0,V,61.45,7808449406,4.798493e+09
973,World,202208,98.888889,87.19,94.689266,False,600836185.0,6469770.0,V,62.05,7808449406,4.845153e+09
1003,World,202209,98.603352,86.17,93.750000,False,615547717.0,6520960.0,V,62.47,7808449406,4.878150e+09


In [278]:
# Process statistics for every 10 days
# Integer-dividing Date by 10 drops its last digit, so the new last digit is the
# tens digit of the day: ...0, ...1, ...2 (and ...3 for days 30-31). Each month
# is kept as the three ranges ending in 0, 1 and 2.
ten_day_key = df['Date'] // 10
df_t = df.assign(Date=ten_day_key)

# One row per (country, ten-day bucket): the last one in row order is kept.
df_t = df_t.drop_duplicates(subset=['CountryName', 'Date'], keep='last')

# Discard the short bucket whose key ends in 3 so every month has three ranges.
ends_in_three = df_t['Date'] % 10 == 3
df_t = df_t[~ends_in_three]

# Bucket 2022112 is excluded.
# NOTE(review): presumably because that bucket is incomplete in the data - confirm.
df_t = df_t[df_t['Date'] != 2022112]
#df_t.to_csv('/Users/scottye/Desktop/InfoVis/Final_Project/Nov30/ten_days.csv', encoding='utf-8', index=False)

In [279]:
# Show the ten-day frame as the cell output (one row per country and ten-day bucket).
df_t

Unnamed: 0,CountryName,Date,V1,V2,V3,V4,ConfirmedCases,ConfirmedDeaths,MajorityVaccinated,PercentageVaccinated,Population,PopulationVaccinated
8,Aruba,2020010,0.000000,0.00,0.000000,,,,,,106845,
18,Aruba,2020011,0.000000,0.00,0.000000,,,,,,106845,
28,Aruba,2020012,0.000000,0.00,0.000000,,0.0,0.0,NV,0.00,106845,0.000000e+00
39,Aruba,2020020,0.000000,0.00,0.000000,,0.0,0.0,NV,0.00,106845,0.000000e+00
49,Aruba,2020021,0.000000,0.00,0.000000,,0.0,0.0,NV,0.00,106845,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...
1012,World,2022100,98.295455,87.02,94.186047,False,619174120.0,6532463.0,V,62.70,7808449406,4.895546e+09
1022,World,2022101,98.857143,86.35,93.099415,False,624034208.0,6548681.0,V,62.81,7808449406,4.904261e+09
1032,World,2022102,98.816568,85.45,92.848485,False,627813026.0,6564226.0,V,62.87,7808449406,4.909211e+09
1043,World,2022110,99.342105,86.17,96.054422,False,631537475.0,6580375.0,V,63.03,7808449406,4.921828e+09
