In [65]:
#Import dependencies
import pandas as pd

In [66]:
#Load the raw h1n1 counts; the csv is decoded as Latin-1
swine_data = pd.read_csv(
    'resources/h1n1.csv',
    encoding='latin_1',
)
#Preview the first rows to confirm the load worked
swine_data.head()

Unnamed: 0,Country,Cases,Deaths,Update Time
0,Algeria,5,0.0,7/6/2009 9:00
1,Antigua and Barbuda,2,0.0,7/6/2009 9:00
2,Argentina,2485,60.0,7/6/2009 9:00
3,Australia,5298,10.0,7/6/2009 9:00
4,Austria,19,0.0,7/6/2009 9:00


In [67]:
#Clean country names
countries = swine_data.Country.to_list()
#Remove problematic characters from the names & fix Vietnam
country_clean = []
for country in countries:
    #Strip non-breaking spaces, regular spaces and asterisks from both ends in a
    #single pass. The characters are given as one set so every one of them is
    #actually removed, in whatever order they pad the name.
    name = country.strip('\xa0 *')
    #Rename only after stripping so padded/starred variants are caught as well
    if name == "Viet Nam":
        name = "Vietnam"
    country_clean.append(name)

#Replace Country column
swine_data['Country'] = country_clean
swine_data.head()

Unnamed: 0,Country,Cases,Deaths,Update Time
0,Algeria,5,0.0,7/6/2009 9:00
1,Antigua and Barbuda,2,0.0,7/6/2009 9:00
2,Argentina,2485,60.0,7/6/2009 9:00
3,Australia,5298,10.0,7/6/2009 9:00
4,Austria,19,0.0,7/6/2009 9:00


In [68]:
#Drop the Grand Total row, addressed by its index label
#NOTE(review): the label 1821 is hard-coded - confirm it still points at the
#Grand Total row if the source csv ever changes
swine_data.drop(index=1821, inplace=True)
#Show the last rows to confirm the total is gone
swine_data.tail()

Unnamed: 0,Country,Cases,Deaths,Update Time
1816,Switzerland,2,0.0,5/23/2009 8:00
1817,Thailand,2,0.0,5/23/2009 8:00
1818,Turkey,2,0.0,5/23/2009 8:00
1819,United Kingdom,117,0.0,5/23/2009 8:00
1820,United States of America,6552,9.0,5/23/2009 8:00


In [69]:
#Set data to the week progression in pandemic
#Lookup of every known update timestamp -> week number in the pandemic
week_by_update_time = {
    #Week 1
    '5/23/2009 8:00': 1,
    '5/25/2009 8:00': 1,
    '5/26/2009 6:00': 1,
    '5/27/2009 8:00': 1,
    '5/29/2009 6:00': 1,
    #Week 2
    '6/1/2009 6:00': 2,
    '6/3/2009 6:00': 2,
    '6/5/2009 6:00': 2,
    '6/8/2009 6:00': 2,
    #Week 3
    '6/10/2009 6:00': 3,
    '6/11/2009 14:00': 3,
    '6/12/2009 7:00': 3,
    '6/15/2009 17:00': 3,
    '6/17/2009 12:00': 3,
    #Week 4
    '6/19/2009 7:00': 4,
    '6/22/2009 7:00': 4,
    '6/24/2009 7:00': 4,
    '6/26/2009 7:00': 4,
    #Week 5
    '6/29/2009 9:00': 5,
    '7/1/2009 9:00': 5,
    '7/3/2009 9:00': 5,
    '7/6/2009 9:00': 5,
}
times = swine_data['Update Time'].to_list()
#Fail with a clear message when a timestamp has no week assigned. Skipping it
#silently would leave the list shorter than the dataframe and only surface
#later as a length-mismatch error that does not name the offending value.
unmapped = {time for time in times if time not in week_by_update_time}
if unmapped:
    raise ValueError(
        "No week assigned for update time(s): {}".format(sorted(map(str, unmapped)))
    )
weeks = [week_by_update_time[time] for time in times]
#Add week number to df
swine_data['Week'] = weeks
#Drop Update Time
swine_data.drop('Update Time', axis=1, inplace=True)
swine_data.head()

Unnamed: 0,Country,Cases,Deaths,Week
0,Algeria,5,0.0,5
1,Antigua and Barbuda,2,0.0,5
2,Argentina,2485,60.0,5
3,Australia,5298,10.0,5
4,Austria,19,0.0,5


In [70]:
#Assign data for Territory Countries to Overarching Country
#Names that are replaced only on an exact match
exact_renames = {
    #Puerto Rico & Virgin Islands are counted under the United States
    'Puerto Rico': 'United States',
    'Virgin Islands': 'United States',
    #Macedonia's name
    'The former Yugoslav Republic of Macedonia': 'North Macedonia',
    #Short form of the US name
    'United States of America': 'United States',
    #Short form of Iran's name
    'Iran, Islamic Republic': 'Iran',
    #Put the Republic of Korea's name in natural order
    'Korea, Republic of': 'Republic of Korea',
}


def _overarching_country(name):
    """Return the country name a row should be counted under.

    Substring markers are checked first (UKOT / Crown Dependency, FOC,
    Netherlands); otherwise the exact-match renames apply, and any other
    name is returned unchanged.
    """
    #UKOTs & Crown Dependencies belong to the UK
    if 'UKOT' in name or 'Crown Dependency' in name:
        return 'United Kingdom'
    #FOCs belong to France
    if 'FOC' in name:
        return 'France'
    #Any name mentioning the Netherlands collapses to the Netherlands
    if 'Netherlands' in name:
        return 'Netherlands'
    #Exact rename when one exists, otherwise keep the name as it is
    return exact_renames.get(name, name)


countries = swine_data.Country.to_list()
countries_assigned = [_overarching_country(name) for name in countries]
#Write the reassigned names back to the Country column
swine_data.Country = countries_assigned
swine_data.head()

Unnamed: 0,Country,Cases,Deaths,Week
0,Algeria,5,0.0,5
1,Antigua and Barbuda,2,0.0,5
2,Argentina,2485,60.0,5
3,Australia,5298,10.0,5
4,Austria,19,0.0,5


In [71]:
#Bucket the rows by (Week, Country) pairs
group_keys = ['Week', 'Country']
swine_group = swine_data.groupby(group_keys)
#Per-group row counts as a quick look at the grouping
swine_group.count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cases,Deaths
Week,Country,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Argentina,5,5
1,Australia,5,5
1,Austria,5,5
1,Bahrain,2,2
1,Belgium,5,5


In [72]:
#Sum Cases and Deaths for each (Week, Country) group and collect both totals in a new df
total_cases = swine_group['Cases'].sum()
total_deaths = swine_group['Deaths'].sum()
swine_clean = pd.DataFrame({
    'Total Cases': total_cases,
    'Total Deaths': total_deaths,
})
swine_clean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Cases,Total Deaths
Week,Country,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Argentina,64,0.0
1,Australia,233,0.0
1,Austria,5,0.0
1,Bahrain,2,0.0
1,Belgium,36,0.0


In [73]:
#Persist the aggregated table as a csv so it can be stored and reloaded later
clean_csv_path = 'data/h1n1_clean.csv'
swine_clean.to_csv(clean_csv_path)