### This notebook is not used directly by the application. It is only used to populate the "wave" folder with a single dataset per wave (i.e., all Stata files of that wave merged together). The application reads the files created by this notebook directly.

<br>

> `open_wave(wave, path)`: opens all `.dta` files of a given wave (looked up under `path`) and returns a list with one dataset per file

In [69]:
import pandas as pd
import os
from functools import reduce
import numpy as np
from pandas.io.stata import StataReader


def open_wave(wave, path):
    """Load every Stata file of one wave into its own DataFrame.

    Parameters
    ----------
    wave : int
        Wave number; files are searched in ``<path>/rawdata/wave<wave>``.
    path : str
        Base directory that contains the ``rawdata`` folder. A trailing
        path separator is optional.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per ``.dta`` file found (sub-directories included), in
        the order ``os.walk`` yields them. Empty when no file is found.
    """
    # os.path.join tolerates a base path with or without a trailing separator
    wave_dir = os.path.join(path, "rawdata", f"wave{wave}")
    frames = []

    #iterate over all files that end with ".dta" and add them to the list
    for root, _dirs, files in os.walk(wave_dir):
        for file in files:
            if not file.endswith(".dta"):
                continue

            file_path = os.path.join(root, file)
            print(file_path)

            if wave in (6, 7, 8):
                #waves 6 to 8 are read with the reader's default options
                reader = pd.io.stata.StataReader(file_path)
            else:
                #chunksize makes read_stata return a StataReader instead of
                #a DataFrame; categoricals are left unordered for these waves
                reader = pd.read_stata(
                    file_path,
                    chunksize=1,
                    order_categoricals=False)

            #the context manager closes the underlying file once it is read
            with reader:
                frames.append(reader.read())

    return frames

In [72]:
#merge and save all data into 1 df per wave

#base path used to locate the raw data (output goes to the relative "wave" folder)
path = "/Users/josephbarbier/Desktop/PROJETpython/"

#make sure the output folder exists before spending time on the merges,
#otherwise to_csv would only fail after a whole wave has been merged
os.makedirs("wave", exist_ok=True)

for i in range(7,9):

    frames = open_wave(i, path=path)

    #reduce() without an initial value raises an unhelpful TypeError on an
    #empty list, so report the actual problem instead
    if not frames:
        raise FileNotFoundError(
            f"No .dta file found for wave {i} under {path}rawdata/wave{i}")

    #merge all dfs into one df: outer join on "mergeid", columns that already
    #exist on the left keep their name, the right-hand copy gets "_y2"
    data = reduce(lambda left, right:
                  pd.merge(left,
                           right,
                           on = ["mergeid"],
                           how = "outer",
                           suffixes = (None, "_y2")),
                  frames)

    #all wave 3 variables start with "sl_" (for SHARE LIFE) and we remove them
    #because otherwise our regex functions don't work
    if i==3:
        data.columns = data.columns.str.replace(r'^sl_', '', regex=True)

    #remove columns ending with the "_y2" pattern since they are duplicates
    data = data[data.columns.drop(list(data.filter(regex='_y2$')))]
    print(f"Shape of wave {i}:", data.shape)
    data.to_csv(f"wave/wave{i}", index=False)
    print(f"The file of wave {i} has been saved!\n")

/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_ho.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_fs.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_ra.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_gv_children.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_ph.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_gv_weights.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_sp.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_dn.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_rp.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_ft.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_ex.dta
/Users/josephbarbier/Desktop/PROJETpython/rawdata/wave7/sharew7_rel8-0-0_hh.dta
/Users/josephbarbier/De