# DATA BESTAT

## Imports and settings

In [2]:
import urllib3
import pandas as pd
import bs4
from pathlib import Path
from json import loads
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
import math

mpl.rcParams["figure.figsize"] = 10,8
mpl.rcParams["axes.grid"] = True
mpl.rcParams["axes.grid.which"] = "both"

In [1]:
%matplotlib inline

## Scrapping and data gathering

In [6]:
https = urllib3.PoolManager(scheme="https")
domain = "https://bestat.statbel.fgov.be"
resp = https.request("GET", f"{domain}/bestat/crosstable.xhtml?view=5fee32f5-29b0-40df-9fb9-af43d1ac9032")
page = bs4.BeautifulSoup(resp.data.decode(), "html.parser")
p = re.compile("result/CSV")
a = page.find_all("a", href=p) #resultset
df = pd.read_csv(method_domain + a[0].get("href")).drop(["Mannen en vrouwen", "Alle leeftijden"], axis = 1)
df.Geslacht.fillna("M+V", inplace = True)
df.Leeftijdsgroep.fillna("Totaal", inplace = True)
df["G_cat"] = df.Geslacht.astype("category")
df.G_cat.cat.categories = ["F", "M", "MF"]
df["A_cat"] = df.Leeftijdsgroep.astype("category")
df.A_cat.cat.categories = [">64", "<18", "Tot", "19-64"]
processed = df.drop(["Geslacht", "Leeftijdsgroep"], axis = 1)

In [4]:
processed

Unnamed: 0,Bevolking op 01 januari 2010,Bevolking op 01 januari 2011,Bevolking op 01 januari 2012,Bevolking op 01 januari 2013,Bevolking op 01 januari 2014,Bevolking op 01 januari 2015,Bevolking op 01 januari 2016,Bevolking op 01 januari 2017,Bevolking op 01 januari 2018,Bevolking op 01 januari 2019,Bevolking op 01 januari 2020,G_cat,A_cat
0,1082489,1090685,1097544,1104403,1109146,1112811,1116898,1120828,1123794,1125978,1129474,MF,<18
1,3369251,3404220,3419361,3426953,3430647,3438304,3446878,3451947,3457154,3464413,3472205,MF,19-64
2,1075944,1086127,1105242,1120710,1136414,1152835,1166602,1181308,1197216,1212787,1230898,MF,>64
3,5527684,5581032,5622147,5652066,5676207,5703950,5730378,5754083,5778164,5803178,5832577,MF,Tot
4,1131666,1140348,1148091,1154313,1159599,1164347,1168683,1173811,1177701,1179409,1182566,M,<18
5,3396339,3433208,3446480,3454760,3457286,3462994,3472890,3480405,3486865,3496306,3503918,M,19-64
6,784216,796678,819230,838415,857424,877753,895959,913789,933340,952513,973580,M,>64
7,5312221,5370234,5413801,5447488,5474309,5505094,5537532,5568005,5597906,5628228,5660064,M,Tot
8,2214155,2231033,2245635,2258716,2268745,2277158,2285581,2294639,2301495,2305387,2312040,F,<18
9,6765590,6837428,6865841,6881713,6887933,6901298,6919768,6932352,6944019,6960719,6976123,F,19-64


In [8]:
num_ifo_cat = processed.groupby(by=["G_cat", "A_cat"]).sum().T
num_ifo_cat.to_pickle("number_of_civil.pkl")
# bestat gives categories: <18, 19-64, >64
# completely different categorization as covid data, look for other dataset?
num_ifo_cat

G_cat,F,F,F,F,M,M,M,M,MF,MF,MF,MF
A_cat,>64,<18,Tot,19-64,>64,<18,Tot,19-64,>64,<18,Tot,19-64
Bevolking op 01 januari 2010,1860160,2214155,10839905,6765590,784216,1131666,5312221,3396339,1075944,1082489,5527684,3369251
Bevolking op 01 januari 2011,1882805,2231033,10951266,6837428,796678,1140348,5370234,3433208,1086127,1090685,5581032,3404220
Bevolking op 01 januari 2012,1924472,2245635,11035948,6865841,819230,1148091,5413801,3446480,1105242,1097544,5622147,3419361
Bevolking op 01 januari 2013,1959125,2258716,11099554,6881713,838415,1154313,5447488,3454760,1120710,1104403,5652066,3426953
Bevolking op 01 januari 2014,1993838,2268745,11150516,6887933,857424,1159599,5474309,3457286,1136414,1109146,5676207,3430647
Bevolking op 01 januari 2015,2030588,2277158,11209044,6901298,877753,1164347,5505094,3462994,1152835,1112811,5703950,3438304
Bevolking op 01 januari 2016,2062561,2285581,11267910,6919768,895959,1168683,5537532,3472890,1166602,1116898,5730378,3446878
Bevolking op 01 januari 2017,2095097,2294639,11322088,6932352,913789,1173811,5568005,3480405,1181308,1120828,5754083,3451947
Bevolking op 01 januari 2018,2130556,2301495,11376070,6944019,933340,1177701,5597906,3486865,1197216,1123794,5778164,3457154
Bevolking op 01 januari 2019,2165300,2305387,11431406,6960719,952513,1179409,5628228,3496306,1212787,1125978,5803178,3464413
