### Notebook to scrub the Ebola dataset for Project 2

In [2]:
# Dependencies and Setup
import pandas as pd
import numpy as np
# Raise pandas' display limits so long/wide frames are not truncated when shown
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Input files (paths are relative to the notebook; adjust if the data moves).
# Each CSV is read right after its path is declared.

# Ebola case/death records, one row per country and date
ebola_data_to_load = "Data/ebola_2014_2016_clean.csv"
ebola_data = pd.read_csv(ebola_data_to_load, encoding="ISO-8859-1")

# Country centroids (provides the name, Longitude and Latitude columns used later)
centroid_data_to_load = "Data/clean_country_centroids_az8.csv"
centroid_data = pd.read_csv(centroid_data_to_load)

# Population per country, with one column per year
pop_data_to_load = "Data/pop_data_2009-2019.csv"
pop_data = pd.read_csv(pop_data_to_load)

In [3]:
# Normalise the column names in one pass: spaces become underscores and
# periods are dropped (e.g. "No. of confirmed cases" -> "No_of_confirmed_cases")
ebola_data.columns = [
    col.replace(' ', '_').replace('.', '')
    for col in ebola_data.columns
]

# Derive the calendar year of every record from its Date value
ebola_data['year'] = pd.DatetimeIndex(ebola_data['Date']).year

# Show the earliest and latest dates present, one per line
print(ebola_data['Date'].min(), ebola_data['Date'].max(), sep='\n')
ebola_data.head()

2014-08-29
2016-03-23


Unnamed: 0,Country,Date,No_of_suspected_cases,No_of_probable_cases,No_of_confirmed_cases,"No_of_confirmed,_probable_and_suspected_cases",No_of_suspected_deaths,No_of_probable_deaths,No_of_confirmed_deaths,"No_of_confirmed,_probable_and_suspected_deaths",year
0,Guinea,2014-08-29,25.0,141.0,482.0,648.0,2.0,141.0,287.0,430.0,2014
1,Nigeria,2014-08-29,3.0,1.0,15.0,19.0,0.0,1.0,6.0,7.0,2014
2,Sierra Leone,2014-08-29,54.0,37.0,935.0,1026.0,8.0,34.0,380.0,422.0,2014
3,Liberia,2014-08-29,382.0,674.0,322.0,1378.0,168.0,301.0,225.0,694.0,2014
4,Sierra Leone,2014-09-05,78.0,37.0,1146.0,1261.0,11.0,37.0,443.0,491.0,2014


In [4]:
# Number of records per country, most frequent first
ebola_data.Country.value_counts()

Liberia                     365
Sierra Leone                259
Guinea                      259
Nigeria                     255
Senegal                     254
United States of America    245
Spain                       243
Mali                        243
United Kingdom              221
Italy                       141
Name: Country, dtype: int64

In [5]:
# Collapse the records to one row per (Country, year), keeping the largest
# reported confirmed-case and confirmed-death counts within each group
ebola_1 = (
    ebola_data
    .groupby(by=["Country", "year"], as_index=False)
    .agg(dict.fromkeys(['No_of_confirmed_cases', 'No_of_confirmed_deaths'], 'max'))
)

ebola_1.head()

Unnamed: 0,Country,year,No_of_confirmed_cases,No_of_confirmed_deaths
0,Guinea,2014,2397.0,1433.0
1,Guinea,2015,3351.0,2083.0
2,Guinea,2016,3351.0,2083.0
3,Italy,2015,1.0,0.0
4,Italy,2016,1.0,


In [6]:
# Prepare longitude and latitude data for merging by country.
# Keep only the country name and centroid coordinates from centroid_data, and
# rename the 'United States' label to 'United States of America' so it matches
# the country name used in the Ebola data when the two frames are merged.
#
# The replacement is assigned back through .assign() instead of calling
# .replace(..., inplace=True) on a column pulled from a slice: that chained
# in-place form raises SettingWithCopyWarning and, with pandas Copy-on-Write
# enabled, leaves centroid_data_1 unchanged.
centroid_data_1 = centroid_data[['name', 'Longitude', 'Latitude']].assign(
    name=lambda frame: frame['name'].replace('United States', 'United States of America')
)
centroid_data_1.head()

Unnamed: 0,name,Longitude,Latitude
0,Aruba,-69.982677,12.52088
1,Afghanistan,66.004734,33.835231
2,Angola,17.537368,-12.293361
3,Anguilla,-63.064989,18.223959
4,Albania,20.049834,41.14245


In [7]:
# Attach centroid longitude/latitude to every country-year row.
# Left join: all rows of ebola_1 are kept, matched on Country == name.
ebola_2 = ebola_1.merge(
    centroid_data_1,
    how='left',
    left_on='Country',
    right_on='name',
)
ebola_2.head()

Unnamed: 0,Country,year,No_of_confirmed_cases,No_of_confirmed_deaths,name,Longitude,Latitude
0,Guinea,2014,2397.0,1433.0,Guinea,-10.940666,10.436216
1,Guinea,2015,3351.0,2083.0,Guinea,-10.940666,10.436216
2,Guinea,2016,3351.0,2083.0,Guinea,-10.940666,10.436216
3,Italy,2015,1.0,0.0,Italy,12.070013,42.796626
4,Italy,2016,1.0,,Italy,12.070013,42.796626


In [8]:
# Restrict the population table to the country name and the three years
# covered by the Ebola data (2014-2016)
pop_data_1 = pop_data.loc[:, ['Country', '2014', '2015', '2016']]
pop_data_1.head()

Unnamed: 0,Country,2014,2015,2016
0,WORLD,7295290.759,7379796.967,7464021.934
1,Burundi,9844.301,10160.034,10488.002
2,Comoros,759.39,777.435,795.597
3,Djibouti,898.707,913.998,929.117
4,Eritrea,3311.444,3342.818,3376.558


In [9]:
# Attach the 2014-2016 population columns to every country-year row.
# Left join on Country: all rows of ebola_2 are kept.
ebola_3 = ebola_2.merge(pop_data_1, how='left', on='Country')
ebola_3.head()

Unnamed: 0,Country,year,No_of_confirmed_cases,No_of_confirmed_deaths,name,Longitude,Latitude,2014,2015,2016
0,Guinea,2014,2397.0,1433.0,Guinea,-10.940666,10.436216,11150.97,11432.096,11738.434
1,Guinea,2015,3351.0,2083.0,Guinea,-10.940666,10.436216,11150.97,11432.096,11738.434
2,Guinea,2016,3351.0,2083.0,Guinea,-10.940666,10.436216,11150.97,11432.096,11738.434
3,Italy,2015,1.0,0.0,Italy,12.070013,42.796626,60409.622,60578.489,60663.068
4,Italy,2016,1.0,,Italy,12.070013,42.796626,60409.622,60578.489,60663.068


In [None]:
# Add population column: pick the population figure that matches each row's year.
# The yearly columns appear to be expressed in thousands (e.g. WORLD is
# ~7,295,290.759 for 2014), so every value is scaled by 1000 exactly once.
# Previously the fallback branch re-multiplied the running value while the
# 2015/2016 values were left unscaled, giving each year a different scale.
# Rows whose year is neither 2015 nor 2016 use the 2014 column.
ebola_3['population'] = ebola_3['2014'] * 1000
ebola_3['population'] = np.where(ebola_3['year'] == 2015, ebola_3['2015'] * 1000, ebola_3['population'])
ebola_3['population'] = np.where(ebola_3['year'] == 2016, ebola_3['2016'] * 1000, ebola_3['population'])

# Constant label identifying which pandemic these rows describe
ebola_3['Pandemic'] = "Ebola"

# Replace NaN values in the No_of_confirmed_deaths column with 0.
# Assigned back explicitly rather than using inplace=True on a selected column,
# which is not guaranteed to update the parent frame.
ebola_3['No_of_confirmed_deaths'] = ebola_3['No_of_confirmed_deaths'].fillna(0)
ebola_3.head()

In [None]:
# Final layout: Pandemic, Country, Year, Cases, Deaths, Lon, Lat, population.
# First pick the columns in that order...
ebola_4 = ebola_3.loc[:, [
    'Pandemic',
    'Country',
    'year',
    'No_of_confirmed_cases',
    'No_of_confirmed_deaths',
    'Longitude',
    'Latitude',
    'population',
]]

# ...then give them their short output names
ebola_5 = ebola_4.rename(
    columns={
        "No_of_confirmed_cases": "Cases",
        "No_of_confirmed_deaths": "Deaths",
        "year": "Year",
        "Longitude": "Lon",
        "Latitude": "Lat",
    }
)
ebola_5

In [None]:
# Write the scrubbed table to CSV with a header row and without the index column
ebola_5.to_csv('Data/ebola_data.csv', header=True, index=False)