## Exploration des données 

In [11]:
import os
import pandas as pd
from datetime import date, timedelta

df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/03-05-2020.csv', sep=',')
df.head(10)

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
0,Hubei,Mainland China,2020-03-05T14:53:03,67466,2902,40592,30.9756,112.2707
1,,South Korea,2020-03-05T09:03:09,6088,35,41,36.0,128.0
2,,Italy,2020-03-05T17:43:03,3858,148,414,43.0,12.0
3,,Iran,2020-03-05T13:43:04,3513,107,739,32.0,53.0
4,Guangdong,Mainland China,2020-03-05T09:23:03,1351,7,1181,23.3417,113.4244
5,Henan,Mainland China,2020-03-05T01:48:26,1272,22,1239,33.882,113.614
6,Zhejiang,Mainland China,2020-03-05T09:43:03,1215,1,1124,29.1832,120.0934
7,Hunan,Mainland China,2020-03-05T08:43:03,1018,4,938,27.6104,111.7088
8,Anhui,Mainland China,2020-03-05T04:33:02,990,6,970,31.8257,117.2264
9,Jiangxi,Mainland China,2020-03-05T01:16:58,935,1,901,27.614,115.7221


In [2]:
BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/'


### Boucle de récupération des fichiers

In [3]:
from datetime import date, timedelta
sdate = date(2020, 1, 22)
edate = date(2020, 3, 5)
delta = edate - sdate

In [4]:
RAWFILES_DIR = '../data/raw'

for i in range(delta.days + 1):
    day=sdate + timedelta(days=i)
    day_label = day.strftime("%m-%d-%Y") + '.csv'
    virus_df=pd.read_csv(BASE_URL + day_label, sep=',')
    virus_df.to_csv(os.path.join(RAWFILES_DIR, day_label), index=False)


In [5]:
# Method2
BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{}.csv'

In [6]:
for i in range(delta.days + 1):
    day=sdate + timedelta(days=i)
    day_label = day.strftime("%m-%d-%Y")
    virus_df=pd.read_csv(BASE_URL.format(day_label), sep=',')
    virus_df.to_csv(os.path.join(RAWFILES_DIR, day_label+'.csv'), index=False)

## Constitution de la table de références lat/log

In [7]:
import glob
RAWFILES_DIR = '../data/raw'

df_list = []
for file in glob.glob(os.path.join(RAWFILES_DIR,'*csv')):
    virus_df=pd.read_csv(file, sep=',')
    if 'Latitude' in virus_df.columns and 'Longitude' in virus_df.columns:
        df_list.append(virus_df)
all_df = pd.concat(df_list)
all_df.shape


(750, 8)

In [8]:
RAWFILES_DIR = '../data/processed'


In [9]:
(all_df[['Province/State','Country/Region','Latitude','Longitude']
       ].drop_duplicates(subset=['Province/State','Country/Region']
                        ).sort_values(['Province/State','Country/Region']
                                     ).to_csv(os.path.join(RAWFILES_DIR,'lon_lag.csv'
                                                          ), index=False))


In [10]:
lon_lag = pd.read_csv(os.path.join(RAWFILES_DIR,'lon_lag.csv'))
df_list = []
for file in glob.glob(os.path.join(RAWFILES_DIR,'*csv')):
    virus_df=pd.read_csv(file, sep=',')
    if not ('Latitude' in virus_df.columns and 'Longitude' in virus_df.columns):
        virus_df = virus_df.merge(lon_lag, on=['Province/State','Country/Region'], how='left')
    
    df_list.append(virus_df.assign(source=os.path.basename(file))) # add a source columns in the table
        
all_df = pd.concat(df_list)
all_df

Unnamed: 0,Province/State,Country/Region,Latitude,Longitude,source
0,"Montreal, QC",Canada,45.5017,-73.5673,lon_lag.csv
1,"Norfolk County, MA",US,42.1767,-71.1449,lon_lag.csv
2,Anhui,Mainland China,31.8257,117.2264,lon_lag.csv
3,Beijing,Mainland China,40.1824,116.4142,lon_lag.csv
4,"Bergen County, NJ",US,40.9263,-74.0770,lon_lag.csv
...,...,...,...,...,...
172,,Tunisia,34.0000,9.0000,lon_lag.csv
173,,UK,55.0000,-3.0000,lon_lag.csv
174,,Ukraine,48.3794,31.1656,lon_lag.csv
175,,United Arab Emirates,24.0000,54.0000,lon_lag.csv
