### This notebook is not directly used in the streamlit app. Indeed, it is only used to create a unique dataset for each wave (i.e: all stata files merged together). However, we directly use the files created by this notebook. If you want to run it again, remember to change the path used.

<br>

> `open_wave(wave, path)` : opens all files from a given wave and returns a list of all the datasets. We use in the next cell of this notebook by merging all these datasets and saving the result

In [2]:
import pandas as pd
import os
from functools import reduce
import numpy as np
from pandas.io.stata import StataReader


def open_wave(wave, path):
    
    #define path
    path = path + f"/sharew{wave}_rel8-0-0_ALL_datasets_stata/"
    
    #init the list that will contains all the df
    dfList = []
    
    #datasets list we don't want to open
    not_open = ["imputations",
                "ilextra",
                "dropoff",
                "exrates",
                "cv_r",
                "vignettes",
                "xt",
                "survey",
                "rc",
                "_re.dta",
                "accelerometer"]
    
    #iterate over all files that end with ".dta" in a given wave and add them to a dfList
    for index, dirs, files in os.walk(path):
        for file in files:
            
            #skip datasets in the not_open list
            x = False
            for i in not_open:
                #skip if the file contains at least one of the word in not_open
                if x==False:
                    if i in os.path.join(index, file):
                        x = True
            
            #print file name
            if ((file.endswith(".dta")) and (x==0)):
                print(os.path.join(index, file))
                
                #open the file
                if wave in [6,7,8]:
                    reader = pd.io.stata.StataReader(os.path.join(index, file))
                    df = reader.read()
                else:
                    df = pd.read_stata(
                        os.path.join(index, file),
                        chunksize=1,
                        order_categoricals=False).read()
                
                #don't keep datasets not containing the mergeid
                if "mergeid" in df.columns:
                
                    #add the current opened df to the list
                    dfList.append(df)
                
                #print file shape
                print(df.shape, "\n")
            
    return dfList

In [3]:
#path used to open and save the data
path = os.getcwd()

for i in range(1,9):
    
    #merge all dfs into one df
    data = reduce(lambda left, right:
                  pd.merge(left,
                           right,
                           on = ["mergeid"],
                           how = "outer",
                           suffixes = (None, "_y2")),
                    open_wave(wave=i, path=path))
    
    #since all wave 3 variable names start with "sl_" (for ShareLife),
    #we remove them because otherwise our regex functions don't work
    if i==3:
        data.columns = data.columns.str.replace(r'^sl_', '', regex=True)

    #remove columns containing the "_y2" pattern since they are dupplicate
    data = data[data.columns.drop(list(data.filter(regex='_y2$')))]
    
    #print informations in order to know what the algo is currently doing
    print(f"Shape of wave {i}:", data.shape)
    data.to_csv(f"wave{i}", index=False)
    print(f"The file of wave {i} has been saved! \n \n \n")

/Users/josephbarbier/Desktop/share/sharew1_rel8-0-0_ALL_datasets_stata/sharew1_rel8-0-0_ch.dta
(30419, 203) 

/Users/josephbarbier/Desktop/share/sharew1_rel8-0-0_ALL_datasets_stata/sharew1_rel8-0-0_ac.dta
(30419, 90) 

/Users/josephbarbier/Desktop/share/sharew1_rel8-0-0_ALL_datasets_stata/sharew1_rel8-0-0_co.dta
(30419, 14) 

/Users/josephbarbier/Desktop/share/sharew1_rel8-0-0_ALL_datasets_stata/sharew1_rel8-0-0_gv_housing.dta
(30419, 11) 

/Users/josephbarbier/Desktop/share/sharew1_rel8-0-0_ALL_datasets_stata/sharew1_rel8-0-0_as.dta
(30419, 127) 

/Users/josephbarbier/Desktop/share/sharew1_rel8-0-0_ALL_datasets_stata/sharew1_rel8-0-0_dn.dta
(30419, 75) 

/Users/josephbarbier/Desktop/share/sharew1_rel8-0-0_ALL_datasets_stata/sharew1_rel8-0-0_sp.dta
(30419, 169) 

/Users/josephbarbier/Desktop/share/sharew1_rel8-0-0_ALL_datasets_stata/sharew1_rel8-0-0_ph.dta
(30419, 137) 

/Users/josephbarbier/Desktop/share/sharew1_rel8-0-0_ALL_datasets_stata/sharew1_rel8-0-0_ho.dta
(30419, 68) 

/Users/

KeyboardInterrupt: 