# Notebook for IART project 2

## Data Importing

The dataframe currently being imported features data from the WHO situation reports from 2020-01-22 to 2020-05-21 (currently) for 188 different countries. This data presents the number of total confirmed cases, number of total confirmed deaths, and number of total recovered patients up to that day.

In [1]:
import pandas as pd
# Load the cleaned COVID-19 dataset from disk and preview it in the notebook.
dataset_path = 'Data/covid_19_clean_complete.csv'
imported_df = pd.read_csv(dataset_path)
display(imported_df)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.000000,65.000000,1/22/20,0,0,0
1,,Albania,41.153300,20.168300,1/22/20,0,0,0
2,,Algeria,28.033900,1.659600,1/22/20,0,0,0
3,,Andorra,42.506300,1.521800,1/22/20,0,0,0
4,,Angola,-11.202700,17.873900,1/22/20,0,0,0
...,...,...,...,...,...,...,...,...
32060,,Sao Tome and Principe,0.186360,6.613081,5/21/20,251,8,4
32061,,Yemen,15.552727,48.516388,5/21/20,197,33,0
32062,,Comoros,-11.645500,43.333300,5/21/20,34,1,8
32063,,Tajikistan,38.861034,71.276093,5/21/20,2350,44,0


## Modifying the data

For the purpose of this project we will perform quality-of-life changes to the data to make it more adequate for our usage. The changes we will perform are:

- Change the date column to a counter of the days passed since the first data was collected.
- Group the data of each country into one single line per day, this means removing the Province/State column.
- Add a column for the number of current active cases (total confirmed cases - (deaths + recovered)).
- Add a column for the number of confirmed cases of the day before.
- Remove lines in which a country has 0 confirmed cases.

### Changing Date column to counter of days passed

In [2]:
# Replace the calendar 'Date' column with 'Day': the whole number of days
# elapsed since the reference date 2020-01-22.
first_report_day = pd.to_datetime('2020-01-22')
report_dates = pd.to_datetime(imported_df['Date'])
imported_df['Day'] = (report_dates - first_report_day).dt.days

# The raw date is no longer needed once the day counter exists.
del imported_df['Date']
display(imported_df)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Confirmed,Deaths,Recovered,Day
0,,Afghanistan,33.000000,65.000000,0,0,0,0
1,,Albania,41.153300,20.168300,0,0,0,0
2,,Algeria,28.033900,1.659600,0,0,0,0
3,,Andorra,42.506300,1.521800,0,0,0,0
4,,Angola,-11.202700,17.873900,0,0,0,0
...,...,...,...,...,...,...,...,...
32060,,Sao Tome and Principe,0.186360,6.613081,251,8,4,120
32061,,Yemen,15.552727,48.516388,197,33,0,120
32062,,Comoros,-11.645500,43.333300,34,1,8,120
32063,,Tajikistan,38.861034,71.276093,2350,44,0,120


### Grouping the data of each country into one single line per day

In [3]:
# Provinces are merged into their country, so the column is dropped first.
del imported_df['Province/State']

# How each remaining column is collapsed into one row per (country, day):
# coordinates keep the first (non-null) value in the group, counts are summed.
column_aggregations = {
    'Lat': 'first',
    'Long': 'first',
    'Confirmed': 'sum',
    'Deaths': 'sum',
    'Recovered': 'sum',
}
per_country_day = imported_df.groupby(['Country/Region', 'Day'])
new_df = per_country_day.agg(column_aggregations).reset_index()
display(new_df)

Unnamed: 0,Country/Region,Day,Lat,Long,Confirmed,Deaths,Recovered
0,Afghanistan,0,33.0,65.0,0,0,0
1,Afghanistan,1,33.0,65.0,0,0,0
2,Afghanistan,2,33.0,65.0,0,0,0
3,Afghanistan,3,33.0,65.0,0,0,0
4,Afghanistan,4,33.0,65.0,0,0,0
...,...,...,...,...,...,...,...
22743,Zimbabwe,116,-20.0,30.0,44,4,17
22744,Zimbabwe,117,-20.0,30.0,46,4,18
22745,Zimbabwe,118,-20.0,30.0,46,4,18
22746,Zimbabwe,119,-20.0,30.0,48,4,18


### Adding a column for the number of current active cases.

In [4]:
# Active cases are the confirmed cases that have neither died nor recovered.
resolved_cases = new_df['Deaths'] + new_df['Recovered']
new_df['Active_Cases'] = new_df['Confirmed'] - resolved_cases
display(new_df)

Unnamed: 0,Country/Region,Day,Lat,Long,Confirmed,Deaths,Recovered,Active_Cases
0,Afghanistan,0,33.0,65.0,0,0,0,0
1,Afghanistan,1,33.0,65.0,0,0,0,0
2,Afghanistan,2,33.0,65.0,0,0,0,0
3,Afghanistan,3,33.0,65.0,0,0,0,0
4,Afghanistan,4,33.0,65.0,0,0,0,0
...,...,...,...,...,...,...,...,...
22743,Zimbabwe,116,-20.0,30.0,44,4,17,23
22744,Zimbabwe,117,-20.0,30.0,46,4,18,24
22745,Zimbabwe,118,-20.0,30.0,46,4,18,24
22746,Zimbabwe,119,-20.0,30.0,48,4,18,26


### Adding a column for the number of confirmed cases of the day before

In [5]:
# Previous day's confirmed total, computed per country.
# The shift is done inside each 'Country/Region' group so that a country's
# first row never inherits the last value of the country listed before it.
# This relies on new_df rows being ordered by 'Day' within each country.
previous_confirmed = new_df.groupby('Country/Region')['Confirmed'].shift()

# A country's first row has no previous day: fall back to its own confirmed
# count, which makes the derived daily increase 0 for that row (the same
# convention the notebook already used for Day 0).
new_df['Yesterdays_Confirmed_Cases'] = previous_confirmed.fillna(new_df['Confirmed'])
display(new_df)

Unnamed: 0,Country/Region,Day,Lat,Long,Confirmed,Deaths,Recovered,Active_Cases,Yesterdays_Confirmed_Cases
0,Afghanistan,0,33.0,65.0,0,0,0,0,0.0
1,Afghanistan,1,33.0,65.0,0,0,0,0,0.0
2,Afghanistan,2,33.0,65.0,0,0,0,0,0.0
3,Afghanistan,3,33.0,65.0,0,0,0,0,0.0
4,Afghanistan,4,33.0,65.0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...
22743,Zimbabwe,116,-20.0,30.0,44,4,17,23,42.0
22744,Zimbabwe,117,-20.0,30.0,46,4,18,24,44.0
22745,Zimbabwe,118,-20.0,30.0,46,4,18,24,46.0
22746,Zimbabwe,119,-20.0,30.0,48,4,18,26,46.0


### Adding a column for the increase in cases from the day before to the present day

In [6]:
# Daily increase: today's confirmed total minus the previous day's total.
confirmed_today = new_df['Confirmed']
confirmed_yesterday = new_df['Yesterdays_Confirmed_Cases']
new_df['Increase_in_Cases'] = confirmed_today - confirmed_yesterday
display(new_df)

Unnamed: 0,Country/Region,Day,Lat,Long,Confirmed,Deaths,Recovered,Active_Cases,Yesterdays_Confirmed_Cases,Increase_in_Cases
0,Afghanistan,0,33.0,65.0,0,0,0,0,0.0,0.0
1,Afghanistan,1,33.0,65.0,0,0,0,0,0.0,0.0
2,Afghanistan,2,33.0,65.0,0,0,0,0,0.0,0.0
3,Afghanistan,3,33.0,65.0,0,0,0,0,0.0,0.0
4,Afghanistan,4,33.0,65.0,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
22743,Zimbabwe,116,-20.0,30.0,44,4,17,23,42.0,2.0
22744,Zimbabwe,117,-20.0,30.0,46,4,18,24,44.0,2.0
22745,Zimbabwe,118,-20.0,30.0,46,4,18,24,46.0,0.0
22746,Zimbabwe,119,-20.0,30.0,48,4,18,26,46.0,2.0


## Creating the dependent variable
Named Will_Infection_Ratio_Increase, this variable will be the dependent variable of our classification prediction model. It indicates whether the increase in cases will be larger on the following day.


In [7]:
# Target label: True when the next day's increase in cases is strictly larger
# than the current day's increase.
# The look-ahead is done inside each 'Country/Region' group so that the last
# row of one country is never compared against the first row of the next
# country. This relies on new_df rows being ordered by 'Day' within a country.
current_increase = new_df['Increase_in_Cases']
next_day_increase = new_df.groupby('Country/Region')['Increase_in_Cases'].shift(-1)

# A country's last row has no following day (NaN); NaN > 0 evaluates to False,
# so those rows are labelled False.
new_df['Will_Infection_Ratio_Increase'] = (next_day_increase - current_increase) > 0

# Day 0 rows are forced to True, as in the original notebook.
new_df.loc[new_df['Day'] < 1, 'Will_Infection_Ratio_Increase'] = True
display(new_df)

Unnamed: 0,Country/Region,Day,Lat,Long,Confirmed,Deaths,Recovered,Active_Cases,Yesterdays_Confirmed_Cases,Increase_in_Cases,Will_Infection_Ratio_Increase
0,Afghanistan,0,33.0,65.0,0,0,0,0,0.0,0.0,True
1,Afghanistan,1,33.0,65.0,0,0,0,0,0.0,0.0,False
2,Afghanistan,2,33.0,65.0,0,0,0,0,0.0,0.0,False
3,Afghanistan,3,33.0,65.0,0,0,0,0,0.0,0.0,False
4,Afghanistan,4,33.0,65.0,0,0,0,0,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...
22743,Zimbabwe,116,-20.0,30.0,44,4,17,23,42.0,2.0,False
22744,Zimbabwe,117,-20.0,30.0,46,4,18,24,44.0,2.0,False
22745,Zimbabwe,118,-20.0,30.0,46,4,18,24,46.0,0.0,True
22746,Zimbabwe,119,-20.0,30.0,48,4,18,26,46.0,2.0,True



### Removing lines in which the countries have 0 confirmed cases

In [8]:
# Keep only the rows where the country already has at least one confirmed
# case, then renumber the index from 0.
has_confirmed_cases = new_df['Confirmed'] != 0
new_df = new_df[has_confirmed_cases].reset_index(drop=True)
display(new_df)

Unnamed: 0,Country/Region,Day,Lat,Long,Confirmed,Deaths,Recovered,Active_Cases,Yesterdays_Confirmed_Cases,Increase_in_Cases,Will_Infection_Ratio_Increase
0,Afghanistan,33,33.0,65.0,1,0,0,1,0.0,1.0,False
1,Afghanistan,34,33.0,65.0,1,0,0,1,1.0,0.0,False
2,Afghanistan,35,33.0,65.0,1,0,0,1,1.0,0.0,False
3,Afghanistan,36,33.0,65.0,1,0,0,1,1.0,0.0,False
4,Afghanistan,37,33.0,65.0,1,0,0,1,1.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...
14687,Zimbabwe,116,-20.0,30.0,44,4,17,23,42.0,2.0,False
14688,Zimbabwe,117,-20.0,30.0,46,4,18,24,44.0,2.0,False
14689,Zimbabwe,118,-20.0,30.0,46,4,18,24,46.0,0.0,True
14690,Zimbabwe,119,-20.0,30.0,48,4,18,26,46.0,2.0,True


### Sort data by day

In [9]:
#new_df = new_df.sort_values(by=['Day','Country/Region'])
#new_df = new_df.reset_index(drop=True)
#display(new_df)

Unnamed: 0,Country/Region,Day,Lat,Long,Confirmed,Deaths,Recovered,Active_Cases,Yesterdays_Confirmed_Cases,Increase_in_Cases,Will_Infection_Ratio_Increase
0,China,0,31.825700,117.226400,548,17,28,503,548.0,0.0,True
1,Japan,0,36.000000,138.000000,2,0,0,2,2.0,0.0,True
2,South Korea,0,36.000000,128.000000,1,0,0,1,1.0,0.0,True
3,Taiwan*,0,23.700000,121.000000,1,0,0,1,1.0,0.0,True
4,Thailand,0,15.000000,101.000000,2,0,0,2,2.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...
14687,West Bank and Gaza,120,31.952200,35.233200,423,2,346,75,398.0,25.0,False
14688,Western Sahara,120,24.215500,-12.885800,6,0,6,0,6.0,0.0,False
14689,Yemen,120,15.552727,48.516388,197,33,0,164,184.0,13.0,False
14690,Zambia,120,-15.416700,28.283300,866,7,302,557,832.0,34.0,False


In [10]:
#CODE SNIPPETS
##pd.set_option('display.max_rows', 40000)
#display(new_df.loc[new_df['Country/Region'] == 'US'])