### This notebook is not directly used in the streamlit app. Indeed, it is only used to create a unique dataset for each wave (i.e: all stata files merged together). However, we directly use the files created by this notebook. If you want to run it again, remember to change the path used.

<br>

> `open_wave(wave, path)` : opens all files from a given wave and returns a list of all the datasets. We use in the next cell of this notebook by merging all these datasets and saving the result

In [1]:
import pandas as pd
import os
from functools import reduce
import numpy as np
from pandas.io.stata import StataReader


def open_wave(wave, path):
    
    #init the list that will contains all the df
    dfList = []
    
    #iterate over all files that end with ".dta" in a given wave and add them to a dfList
    for index, dirs, files in os.walk(path):
        for file in files:
            if file.startswith(f"sharew{str(wave)}"):
                if file.endswith(".dta"):
                    print(os.path.join(index, file))
    
                    if wave in [6,7,8]:
                        reader = pd.io.stata.StataReader(os.path.join(index, file))
                        df = reader.read()
                    else:
                        df = pd.read_stata(
                            os.path.join(index, file),
                            chunksize=1,
                            order_categoricals=False).read()
                    
                    #add the current df to the list
                    dfList.append(df)
            
    return dfList

In [2]:
#path used to open and save the data
path = "/Users/josephbarbier/Desktop/PROJETpython/"

for i in range(1,9):
    
    #merge all dfs into one df
    data = reduce(lambda left, right:
                  pd.merge(left,
                           right,
                           on = ["mergeid"],
                           how = "outer",
                           suffixes = (None, "_y2")),
                    open_wave(i, path=path))
    
    #since all wave 3 variables start with "sl_" (for Share Life),
    #we remove them because otherwise our regex functions don't work
    if i==3:
        data.columns = data.columns.str.replace(r'^sl_', '', regex=True)

    #remove columns containing the "_y2" pattern since they are dupplicate
    data = data[data.columns.drop(list(data.filter(regex='_y2$')))]
    
    #print informations in order to know what the algo is currently doing
    print(f"Shape of wave {i}:", data.shape)
    data.to_csv(f"wave{i}", index=False)
    print(f"The file of wave {i} has been saved!\n")

/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_ch.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_ac.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_co.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_gv_housing.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_as.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_dn.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_sp.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_ph.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_ho.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_gs.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_hh.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_ft.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_ex.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_gv_isced.dta
/Users/josephbarbier/Desktop/PROJETpython/sharew1_rel8-0-0_iv.