# Notities

## Handige links
* [Statline portaal](https://opendata.cbs.nl/statline/portal.html?_la=nl&_catalog=CBS)
* [Snelstartgids open data](https://www.cbs.nl/nl-nl/onze-diensten/open-data/statline-als-open-data/snelstartgids)
* [GitHub repo voor cbsodata](https://github.com/J535D165/cbsodata)
* [PyPI documentatie cbsodata](https://pypi.org/project/cbsodata/)
* [Sphinx docs cbsodata](https://cbsodata.readthedocs.io/en/latest/index.html)
* [GitHub voorbeelden OData 3](https://github.com/statistiekcbs/CBS-Open-Data-v3)
* [GitHub voorbeelden OData 4](https://github.com/statistiekcbs/CBS-Open-Data-v4/tree/master/Python)
* [Statline voor derden](https://www.cbs.nl/nl-nl/cijfers/statline/statline-voor-derden)

# Imports

In [None]:
import pandas as pd
import cbsodata

# Functies

In [None]:
def get_relevant_tables(url: str,
                        keywords=(),
                        frequency=('Eenmalig', 'Perjaar', 'Perkwartaal', 'Onregelmatig',
                                   'Permaand', 'Pertweejaar', 'Perhalfjaar', 'Tweemaalperjaar'),
                        select_columns=('Identifier', 'ShortTitle', 'ColumnCount', 'RecordCount',
                                        'Updated', 'Period', 'Summary', 'Frequency',
                                        'ShortDescription', 'ExplanatoryText')) -> pd.DataFrame:
    """
    Return the tables of a CBS catalog whose title contains one of the keywords.

    The full table list of the catalog is downloaded, its size and columns are
    printed, and the list is then filtered on title keyword and on frequency.

    :param str url: URL of the catalog of the CBS databases, i.e.: 'opendata.cbs.nl'
    :param keywords: Iterable of words to search for in the title
        (case-insensitive substring match). A table is kept when at least one
        keyword matches; with no keywords, no table matches.
    :param frequency: Iterable of values of the 'Frequency' column for which
        tables should be returned.
    :param select_columns: Columns to include in the returned DataFrame.
        Columns that do not exist in the catalog are skipped; the remaining
        columns are returned in the order given here.
    :return: DataFrame with one row per matching table, restricted to the
        selected columns. Empty when the catalog itself contains no tables.
    """
    tables_list = cbsodata.get_table_list(catalog_url=url)
    df_tables = pd.DataFrame(tables_list)
    print(f"Total number of tables in this url is: {df_tables.shape[0]}")
    print(f"The columns in these tables are: {df_tables.columns}")

    if df_tables.empty:
        # An empty catalog yields a frame without 'Identifier'/'Frequency'
        # columns, so there is nothing to filter on.
        return df_tables

    # Lower-case the keywords once instead of once per table.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    matching_identifiers = {
        table['Identifier']
        for table in tables_list
        # `or ''` tolerates entries whose title is missing or None.
        if any(keyword in (table.get('Title') or '').lower() for keyword in lowered_keywords)
    }

    # Keep the caller's column order (a set intersection would make the order
    # arbitrary); dict.fromkeys drops duplicates while preserving that order.
    columns_to_include = list(dict.fromkeys(
        column for column in select_columns if column in df_tables.columns
    ))

    is_relevant = (df_tables['Identifier'].isin(matching_identifiers)
                   & df_tables['Frequency'].isin(frequency))
    return df_tables.loc[is_relevant, columns_to_include]

# Explore possible tables

In [None]:
# Download the table list without specifying a catalog URL (the cbsodata
# default is used) and hold it as a DataFrame.
toc = pd.DataFrame(data=cbsodata.get_table_list())

In [None]:
# Number of rows (tables) in the downloaded listing.
len(toc)

## URL = 'opendata.cbs.nl'

In [None]:
# Search the 'opendata.cbs.nl' catalog for tables whose title mentions any of
# these keywords.
df_tables_cbs_opendata = get_relevant_tables(
    url='opendata.cbs.nl',
    keywords=['wmo', 'bevolking', 'gemeente', 'wijk'],
)
# Show (rows, columns) of the filtered result.
df_tables_cbs_opendata.shape

In [None]:
# Inspect five randomly chosen rows of the filtered table list.
df_tables_cbs_opendata.sample(n=5)

## URL = 'dataderden.cbs.nl'

In [None]:
# Search the 'dataderden.cbs.nl' catalog for tables with 'wmo' in the title.
df_tables_cbs_derden = get_relevant_tables(
    url='dataderden.cbs.nl',
    keywords=['wmo'],
)
# Display the complete filtered result.
df_tables_cbs_derden

# Load data

In [None]:
# NOTE: takes a long time if you request everything, so only the first two
# identifiers of the selection are downloaded here.
url = 'dataderden.cbs.nl'
df_to_collect = df_tables_cbs_derden
dict_cbs_tables = {}  # maps table identifier -> DataFrame with the table's data
for i in df_to_collect['Identifier'][:2]:
    print(i)
    try:
        df = pd.DataFrame(cbsodata.get_data(str(i), catalog_url=url))
    except Exception as exc:
        # Best-effort download: keep going with the next table, but report
        # the failure instead of hiding it. An empty frame is stored as a
        # placeholder so the identifier is still present in the dict.
        print(f"Could not load table {i}: {exc!r}")
        df = pd.DataFrame()
    dict_cbs_tables[i] = df

In [None]:
# Inspect ten randomly chosen rows of the table stored under '40072NED'.
dict_cbs_tables['40072NED'].sample(n=10)