# Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the municipality-level COVID-19 case file; the path is relative to the
# directory the notebook is run from.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for this
# path — confirm where the dataset actually lives before re-running.
covid_path = "../datasets/COVID19BE_CASES_MUNI.csv"
df = pd.read_csv(covid_path)

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/COVID19BE_CASES_MUNI.csv'

In [None]:
# Load the geolocation lookup table into its own frame (df_geo); the path is
# relative to the directory the notebook is run from.
geoloc_path = "../datasets/postal_code_nis_disctric.csv"
df_geo = pd.read_csv(geoloc_path)

# Exploratory data analysis

In [None]:
# Preview the first rows of the loaded case data.
df.head()

In [None]:
# (rows, columns) of the frame before any cleaning.
df.shape

### Remove columns

We need COVID cases per municipality. Many columns in this dataset are not needed for that, so we remove them before any further analysis.

In [None]:
# List the column names to decide which ones can be dropped.
df.columns

In [None]:
# Discard the district-description, province and region columns in place.
df.drop(
    ['TX_ADM_DSTR_DESCR_NL', 'TX_ADM_DSTR_DESCR_FR', 'PROVINCE', 'REGION'],
    axis='columns',
    inplace=True,
)

In [None]:
# Check the first three rows after dropping the unused columns.
df.head(3)

### Missing values

This dataset seems to have some missing values. We will remove them.

In [None]:
# Per-column dtypes and non-null counts, to spot columns with missing values.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

Drop every row that contains a missing value (NaN).

In [None]:
# Keep only the rows in which no column is missing (defaults spelled out).
df = df.dropna(axis='index', how='any')

### Cases

Cases has a "<5" (less than five) value.

As it's not possible in my timeframe to reconstruct the dataset and recover the real numbers, I'll just replace them with 1.

In [None]:
# Frequency of each distinct CASES value, including the "<5" placeholder.
df['CASES'].value_counts()

Replace "<5" with 1 and cast this column to int

In [None]:
# Substitute the "<5" placeholder with 1 and assign the result back to the
# column. Calling replace(..., inplace=True) on df['CASES'] mutates an
# intermediate object: pandas warns about it, and with Copy-on-Write enabled
# df itself is left unchanged, so the later integer cast would fail on "<5".
df['CASES'] = df['CASES'].replace('<5', 1)

In [None]:
# Cast the CASES column to integers, leaving every other column untouched.
df = df.astype({'CASES': int})

In [None]:
# Confirm the dtype of CASES after the cast.
df['CASES'].dtype

### Date

Later we'll aggregate the data by week, so the 'DATE' column needs to become a pandas datetime:

In [None]:
# Parse the DATE column into pandas datetimes so it can be grouped by week.
df = df.assign(DATE=pd.to_datetime(df['DATE']))

In [None]:
# Check the dtype of DATE after the datetime conversion.
df.info()

### NIS5

The "NIS5" or "INS code" is a unique identifier for each Belgian municipality. It is used to merge this dataset with the geolocation dataset.

In [None]:
# Cast the NIS5 identifier to integers, leaving every other column untouched.
df = df.astype({'NIS5': int})

In [None]:
# Check the dtype of NIS5 after the cast.
df.info()

# Dataset transformation

To comply with the streamlit app, the data needs to be transformed as follows:

 - Represent cases by weeks
 - Have geolocation data
 - Be reshaped as one dataframe per week, containing, for each location, as many entries as there are cases at that location.

### Group by week

We subtract one week (7 days) because each date should label the sum of the week ahead of it, not the week before it.

In [None]:
# Store each DATE shifted back by seven days in a helper column.
# NOTE(review): the weekly groupby that follows keys on DATE and aggregates only
# CASES, so this column is not used by it and does not survive it — confirm
# whether it is still needed.
df['week_date'] = df['DATE'].sub(pd.to_timedelta(7, unit='d'))

In [None]:
# Sum the cases per municipality (NIS5) and per week, then order rows by week.
#
# W-MON bins run from Tuesday through Monday. By default each bin is labelled
# with its closing Monday, i.e. the label follows the week it totals. The
# stated goal is the opposite (a date labels the week ahead of it), and the
# week_date shift computed earlier never reaches this aggregation, so
# label='left' is used: each bin is stamped with the Monday that precedes it.
# This equals grouping the 7-day-shifted dates while keeping the column name
# DATE.
df = (
    df.groupby(['NIS5', pd.Grouper(key='DATE', freq='W-MON', label='left')])['CASES']
    .sum()
    .reset_index()
    .sort_values('DATE')
)

In [None]:
# (rows, columns) of the frame after the weekly aggregation.
df.shape

In [None]:
# Preview the first rows of the weekly aggregate.
df.head()

### Merge with geolocation