# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import os
import dsttools
import pandas as pd

In [None]:
# create the data folder if necessary
# (exist_ok=True makes this a no-op when the folder already exists and
#  avoids the race between checking for the folder and creating it)
os.makedirs('data', exist_ok=True)

# Fetch and convert data

**Step 1:** Find available years and variables for each dataset.

In [None]:
# a. general setup
projectid = '706248'
datafolder = f'{os.getcwd()}/data' # data will be saved here

# b. load information
allyearsdict, allvarset, years, allvars = dsttools.load_info(projectid)

# c. example: print every dataset together with its entry in allyearsdict
for dataset, yearlist in allyearsdict.items():
    print(dataset, yearlist)

print('')

# d. example: print the first two fields of the first 7 elements of allvarset
for i, element in enumerate(allvarset):
    print(element[0], element[1])
    if i >= 6:
        break

**Show variables in a dataset:**

In [None]:
# show the variables in the 'BEFUPD' dataset, using the years and allvars
# objects returned by dsttools.load_info
dsttools.show_variables(years,allvars,'BEFUPD')

**Step 2:** Select variables and their renaming conventions for each dataset:

In [None]:
# dataset names
bef_name, ind_name = 'befupd', 'indupd01'

# a. variables (one list of tuples per dataset)
#    ('x',)    -> fetch variable x and keep its name
#    ('x','y') -> fetch variable x and rename it to y
bef_vars = [
    ('pnr',),
    (bef_name + 'SourceYear', 'year'),
    ('koen', 'sex'),
    ('foed_dag', 'birthday'),
]

# apart from the source year, all variables here keep their original names
ind_vars = [('pnr',), (ind_name + 'SourceYear', 'year')]
ind_vars += [
    (varname,)
    for varname in (
        'dispon_13',
        'perindkialt_13',
        'loenmv_13',
        'netovskud_13',
        'netovskud_gl',
        'offpens_efterlon_13',
        'privat_pension_13',
        'corloen',
    )
]

bfl_vars = [
    ('pnr',),
    ('bflSourceYear', 'year'),
    ('ajo_job_start_dato', 'start_date'),
    ('ajo_job_slut_dato', 'end_date'),
    ('ajo_fuldtid_beskaeftiget', 'hours'),
    ('ajo_smalt_loenbeloeb', 'wage_narrow'),
    ('ajo_bredt_loenbeloeb', 'wage_broad'),
]

# b. datasets (dict keyed by dataset name)
#    overwrite -> fetch and convert no matter what,
#                 else check if the dataset is already on disk
datasets = {
    bef_name: {'vars': bef_vars, 'years': [2012, 2012], 'overwrite': False},
    ind_name: {'vars': ind_vars, 'years': [2011, 2018], 'overwrite': False},
    'bfl': {'vars': bfl_vars, 'years': [2011, 2018], 'overwrite': False},
}

# c. random sub samples (dict)
random_sub_samples = dict(dataset=bef_name, year=2012, seed=17)

**Step 3:** Fetch all the data:

In [None]:
# create the data and log folders if necessary
# (exist_ok=True makes each call a no-op when the folder already exists and
#  avoids the race between checking for the folder and creating it)
os.makedirs(datafolder, exist_ok=True)
os.makedirs('logs', exist_ok=True)

In [None]:
# call dsttools.fetch with the project id, the dataset specification and the data folder
# (%time is an IPython magic that reports the execution time of the statement)
%time dsttools.fetch(projectid,datasets,datafolder)

**Step 4:** Draw pnr's for random sub-samples:

In [None]:
# call dsttools.draw_random_sub_samples with the sub-sample specification
# (dataset, year and seed) and the data folder; %time reports the execution time
%time dsttools.draw_random_sub_samples(random_sub_samples,datafolder)

**Step 5:** Convert all the data to parquet:

In [None]:
# call dsttools.convert_to_parquet with the project id, the dataset specification
# and the data folder; %time reports the execution time
%time dsttools.convert_to_parquet(projectid,datasets,datafolder)

**Step 6:** Combine all yearly files into a single file:

In [None]:
# call dsttools.combine_years with the project id, the dataset specification
# and the data folder; %time reports the execution time
%time dsttools.combine_years(projectid,datasets,datafolder)