## Universidad del Valle de Guatemala
### Data Science
#### Juan Marroquin 19845
#### Carlos Raxtum 19721

In [1]:
import pandas as pd

###### Helper functions that rename, reshape, and convert the data

In [2]:
# rename columns
def rename_columns(df):
    """Return a new DataFrame whose identifying columns use short lowercase names.

    Mapping applied:
        'Province/State' -> 'subregion'
        'Country/Region' -> 'country'
        'Lat'            -> 'lat'
        'Long'           -> 'long'

    Columns not listed in the mapping keep their original names.
    """
    column_map = {
        'Province/State': 'subregion',
        'Country/Region': 'country',
        'Lat': 'lat',
        'Long': 'long',
    }
    return df.rename(columns=column_map)

In [3]:
def melt_data(df, new_name):
    """Unpivot a wide frame into long format.

    'country', 'subregion', 'lat' and 'long' are kept as identifier columns.
    Every remaining column is turned into rows: its header goes into
    'date_Raw' and its values go into a column named ``new_name``.
    """
    id_columns = ['country', 'subregion', 'lat', 'long']
    return df.melt(id_vars=id_columns, var_name='date_Raw', value_name=new_name)

In [4]:
def convert_dates(df):
    """Replace the text column 'date_Raw' with a parsed datetime column 'date'.

    The raw values are parsed with the format '%m/%d/%y' (for example
    '1/22/20'). The result is a new DataFrame; ``df`` itself keeps its
    'date_Raw' column.
    """
    parsed_dates = pd.to_datetime(df['date_Raw'], format='%m/%d/%y')
    return df.assign(date=parsed_dates).drop(columns=['date_Raw'])

In [5]:
def arrange_data(df, new_name):
    """Select, order and sort the tidy columns.

    Keeps 'country', 'subregion', 'date', 'lat', 'long' and the value column
    ``new_name`` (in that order), sorts the rows by the first five of those,
    and renumbers the index from 0.
    """
    sort_keys = ['country', 'subregion', 'date', 'lat', 'long']
    selected = df.filter(items=[*sort_keys, new_name])
    ordered = selected.sort_values(by=sort_keys)
    return ordered.reset_index(drop=True)

In [6]:
def get_csv(url, value_var_name):
    """Read a wide time-series CSV and return it as a tidy long-format frame.

    Steps, in order: read the CSV at ``url``, rename the identifying columns,
    melt the remaining columns into rows with ``value_var_name`` as the value
    column, parse the dates, then select and sort the final columns.
    """
    return (
        pd.read_csv(url)
        .pipe(rename_columns)
        .pipe(melt_data, value_var_name)
        .pipe(convert_dates)
        .pipe(arrange_data, value_var_name)
    )

##### Get data from the source URLs

In [7]:
# Each URL below asks the data.humdata.org HXL proxy for one of the
# CSSEGISandData/COVID-19 global time-series CSVs hosted on
# raw.githubusercontent.com; get_csv() downloads it and returns a tidy frame.
# `url` is reused for each download, so after this cell it holds the last one.

# Confirmed cases -> value column 'confirmed'
url ='https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv'
case_confirmed = get_csv(url,'confirmed')
# Deaths -> value column 'death'
url = 'https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_deaths_global.csv&filename=time_series_covid19_deaths_global.csv'
dead_confirmed = get_csv(url,'death')
# Recovered -> value column 'recovered'
url = 'https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_recovered_global.csv&filename=time_series_covid19_recovered_global.csv'
recoverd_confirmed = get_csv(url,'recovered')

In [8]:
# Quick sanity check: show the (rows, columns) shape of each tidy frame.
for label, frame in (
    ('case confirmed: ', case_confirmed),
    ('dead confirmed: ', dead_confirmed),
    ('recoverd confirmed: ', recoverd_confirmed),
):
    print(label, frame.shape)

case confirmed:  (286977, 6)
dead confirmed:  (286977, 6)
recoverd confirmed:  (272082, 6)


###### Drop the `lat`/`long` columns from the deaths and recovered frames before merging

In [9]:
# case_confirmed already carries lat/long, so remove them from the other two
# frames; otherwise the merge would produce suffixed duplicate columns.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: running it a second time no longer raises KeyError because the
# columns have already been removed.
dead_confirmed = dead_confirmed.drop(columns=['lat', 'long'], errors='ignore')
recoverd_confirmed = recoverd_confirmed.drop(columns=['lat', 'long'], errors='ignore')

##### Merge the three DataFrames into one

In [10]:
# Left-join deaths and then recoveries onto the confirmed-case rows, matching
# on country, subregion and date. Every confirmed row is kept; a row with no
# match in the other frame gets a missing value in that column.
merge_data = (
    case_confirmed
    .merge(dead_confirmed, how='left', on=['country', 'subregion', 'date'])
    .merge(recoverd_confirmed, how='left', on=['country', 'subregion', 'date'])
)

In [11]:
# Preview the first five rows of the merged frame.
merge_data.head(n=5)

Unnamed: 0,country,subregion,date,lat,long,confirmed,death,recovered
0,Afghanistan,,2020-01-22,33.93911,67.709953,0,0,0.0
1,Afghanistan,,2020-01-23,33.93911,67.709953,0,0,0.0
2,Afghanistan,,2020-01-24,33.93911,67.709953,0,0,0.0
3,Afghanistan,,2020-01-25,33.93911,67.709953,0,0,0.0
4,Afghanistan,,2020-01-26,33.93911,67.709953,0,0,0.0


In [12]:
# Persist the merged frame as UTF-8 CSV in the working directory, without
# writing the row index as a column.
merge_data.to_csv(
    'covid19.csv',
    encoding='utf-8',
    index=False,
)