In [1]:
import subprocess
import os

import pandas as pd

import requests
from bs4 import BeautifulSoup

import json

# Show up to 500 rows when a DataFrame is displayed.
# Uses the canonical option name "display.max_rows".
pd.set_option('display.max_rows', 500)

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Understanding

* RKI, via web scraping: https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* Johns Hopkins University (GitHub): https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retrieve data https://npgeo-corona-npgeo-de.hub.arcgis.com/

### GitHub CSV data
git clone/pull https://github.com/CSSEGISandData/COVID-19.git

In [2]:
# Update the local clone of the COVID-19 data repository in ../data/raw/COVID-19.
# Requires the `git` executable on the PATH (tested on Windows, where the
# installer registers it in the OS environment variables). If it is not on the
# PATH, replace "git" below with the full path to the git installation.
git_pull = subprocess.Popen(["git", "pull"],
                            cwd='../data/raw/COVID-19',
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

# communicate() waits for git to finish and returns the captured output as bytes.
(out, error) = git_pull.communicate()

print("Error : " + str(error))
print("out : " + str(out))

# Report a failed pull explicitly; the cells below would otherwise silently
# keep working with whatever state the local clone is in.
if git_pull.returncode != 0:
    print("git pull failed with exit code " + str(git_pull.returncode))

Error : b''
out : b'Already up to date.\n'


In [3]:
# Time series of confirmed cases (global), read from the local git clone.
data_path = (
    '../data/raw/COVID-19/csse_covid_19_data/'
    'csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
)
pd_raw = pd.read_csv(data_path)

In [4]:
# Display the raw table as loaded from the CSV.
# NOTE(review): this renders the frame up to the row limit configured in the
# first cell — consider pd_raw.head() if the output gets too long.
pd_raw

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/20/22,6/21/22,6/22/22,6/23/22,6/24/22,6/25/22,6/26/22,6/27/22,6/28/22,6/29/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,181725,181808,181912,181987,182033,182072,182149,182228,182324,182403
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,277444,277663,277940,278211,278504,278793,279077,279077,279167,280298
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,265985,265993,266006,266015,266025,266030,266038,266049,266062,266073
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,43449,43449,43774,43774,43774,43774,43774,43774,43774,43774
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,99761,99761,99761,99761,99761,99761,99761,99761,101320,101320
5,,Antarctica,-71.9499,23.347,0,0,0,0,0,0,...,11,11,11,11,11,11,11,11,11,11
6,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,8581,8581,8590,8590,8625,8625,8625,8625,8625,8625
7,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,9341492,9341492,9341492,9341492,9341492,9341492,9367172,9367172,9367172,9367172
8,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,423104,423104,423104,423104,423104,423104,423104,423243,423243,423243
9,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,147942,148996,150123,151113,152199,153012,153939,155047,156472,157678


### Web scraping

In [5]:
# Download the RKI case-number page. The timeout keeps the notebook from
# hanging on an unresponsive server, and raise_for_status() stops here on an
# HTTP error instead of letting the following cells parse an error page.
page = requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html",
                    timeout=30)
page.raise_for_status()

In [6]:
# Parse the downloaded page body with the 'html.parser' backend.
soup = BeautifulSoup(page.content,'html.parser')

In [7]:
# find() returns only the first <table> element of the page (None if the page has no table).
html_table=soup.find('table')

In [8]:
# Collect every <tr> element found inside that table.
all_rows=html_table.find_all('tr')

In [9]:
# Container for the scraped table rows; starts out empty.
final_data_list=[]

In [10]:
# One entry per <tr>: the stripped text of each of its <td> cells.
# The list is rebuilt from scratch here, so re-running this cell does not
# append the same rows a second time.
final_data_list = [
    [each_col.get_text(strip=True) for each_col in rows.find_all('td')]
    for rows in all_rows
]

In [11]:
# Build a table from the scraped rows, then discard every row that has a
# missing cell (rows shorter than the widest one are padded with missing values).
scraped_rows = pd.DataFrame(data=final_data_list)
data_frame = scraped_rows.dropna(axis=0, how='any')

In [12]:
# Descriptive names for the positional columns 0-5 of the scraped table.
column_names = {
    0: 'state',
    1: 'Number of cases',
    2: 'Difference_with_the_day_before',
    3: 'Cases_in_the_last_7_days',
    4: '7-days-incidence',
    5: 'Fatal',
}

# Renumber the rows from 0 (the old index is dropped) and apply the names.
pd_daily_status = (
    data_frame
    .reset_index(drop=True)
    .rename(columns=column_names)
)

In [13]:
# Preview the first rows of the renamed table.
pd_daily_status.head()

Unnamed: 0,state,Number of cases,Difference_with_the_day_before,Cases_in_the_last_7_days,7-days-incidence,Fatal
0,Baden-Württem­berg,3.860.465,15.016,61.47,5536,16.332
1,Bayern,5.158.687,20.36,80.146,6099,24.393
2,Berlin,1.103.337,3.431,15.508,4232,4.648
3,Branden­burg,824.477,2.776,10.852,4288,5.718
4,Bremen,217.164,1.07,5.337,7847,791.0


### REST API calls

In [14]:
# Query the ArcGIS REST service for all state-level records as JSON.
# The timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of failing later
# while decoding the body.
data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronafälle_in_den_Bundesländern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
                  timeout=30)
data.raise_for_status()

In [15]:
# Decode the raw response body into Python objects.
json_object=json.loads(data.content)

In [16]:
# Check the top-level type of the decoded payload.
type(json_object)

dict

In [17]:
# List the top-level keys; the records used further down live under 'features'.
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'geometryType', 'spatialReference', 'fields', 'features'])

In [18]:
# Pull the 'attributes' dict out of every entry under 'features'.
full_list = [feature['attributes'] for feature in json_object['features']]

In [19]:
# Turn the list of attribute dicts into a table: one row per dict, one column per key.
pd_full_list = pd.DataFrame(full_list)

In [20]:
# Preview the first rows of the REST API table.
pd_full_list.head()

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death,cases7_bl_per_100k,cases7_bl,death7_bl,cases7_bl_per_100k_txt,AdmUnitId
0,1,1,Schleswig-Holstein,Land,2910875,15,837055,1656540000000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,28756.13003,45737310000.0,2881496.0,2612,1036.904711,30183,4,10369,1
1,2,2,Hamburg,Freie und Hansestadt,1852478,6,633772,1656540000000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,34212.120198,2089396000.0,418800.2,2724,648.64468,12016,3,6486,2
2,3,3,Niedersachsen,Land,8003421,9,2632555,1656540000000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,32892.871686,129983600000.0,4008988.0,9684,965.849479,77301,6,9658,3
3,4,4,Bremen,Freie Hansestadt,680130,5,217164,1656540000000,4,4132268b-54de-4327-ac1e-760e915112f1,31929.778131,1119157000.0,335717.7,791,784.702924,5337,2,7847,4
4,5,5,Nordrhein-Westfalen,Land,17925570,10,5746467,1656540000000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,32057.373908,87829360000.0,2648673.0,25753,782.407477,140251,16,7824,5


In [21]:
# Store the state-level table as a semicolon-separated CSV.
# to_csv() does not create missing folders, so make sure the target
# directory exists before writing.
output_path = '../data/raw/NPGEO/GER_state_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pd_full_list.to_csv(output_path, sep=';')

### API access via REST service, e.g. USA data (skipped because the website has been shut down)

Example of a REST-conformant interface (attention: registration mandatory; skipped because the website has been shut down)

www.smartable.ai (website has been shut down)