### This notebook is not used directly by the application. It is only run to fill the "wave" folder with a single merged dataset per wave (i.e. all Stata files of that wave merged together). The application reads the files created by this notebook directly.

<br>

> `open_wave(wave, path)` : opens all `.dta` files of a given wave found under `path` and returns them as a list of DataFrames

In [47]:
import pandas as pd
import os
from functools import reduce
import numpy as np

def open_wave(wave, path):
    """Load every Stata file of one wave into memory.

    Recursively walks ``<path>/rawdata/wave<wave>`` and reads each file whose
    name ends with ".dta".

    Parameters
    ----------
    wave : int or str
        Wave identifier; appended to "wave" to form the folder name.
    path : str
        Folder that contains the "rawdata" directory. A trailing path
        separator is optional.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per ".dta" file, in sorted directory / file-name order
        so the result does not depend on the filesystem's listing order.
        Empty when the wave folder is missing or holds no ".dta" file.
    """
    # os.path.join copes with `path` given with or without a trailing slash
    wave_dir = os.path.join(path, "rawdata", f"wave{wave}")
    dfList = []

    #iterate over all files that end with ".dta" and add them to a dfList
    for root, dirs, files in os.walk(wave_dir):
        # sorting `dirs` in place makes os.walk descend in a stable order
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".dta"):
                # read_stata without chunksize/iterator reads the whole file
                # and closes it, instead of leaving an open reader behind
                df = pd.read_stata(
                    os.path.join(root, file),
                    convert_categoricals=False)

                #add the current df to the list
                dfList.append(df)

    return dfList

In [45]:
#merge and save all data into 1 df per wave
# Folder that holds "rawdata/wave<N>/"; adjust to the local machine.
path = "/Users/josephbarbier/Desktop/PROJETpython/"

# to_csv does not create missing parent folders, so make sure "wave/" exists
os.makedirs("wave", exist_ok=True)

for i in range(1,9):

    frames = open_wave(i, path=path)
    if not frames:
        # reduce() on an empty list only raises a cryptic TypeError
        raise FileNotFoundError(f"No .dta file found for wave {i} under {path!r}")

    #merge all dfs into one df: outer join on the respondent key; a column
    #already present on the left keeps its name, the right copy gets "_y2"
    data = reduce(lambda left, right:
                  pd.merge(left,
                           right,
                           on = ["mergeid"],
                           how = "outer",
                           suffixes = (None, "_y2")),
                  frames)

    #all wave 3 variables start with "sl_" (for SHARE LIFE) and we remove them
    #because otherwise our regex functions don't work
    if i==3:
        data.columns = data.columns.str.replace(r'^sl_', '', regex=True)

    #remove columns ending with the "_y2" pattern since they are duplicates;
    #a boolean mask (rather than label selection) stays correct even when
    #some labels are repeated
    is_duplicate = data.columns.str.contains(r'_y2$', regex=True)
    data = data.loc[:, ~is_duplicate]
    print(f"Shape of wave {i}:", data.shape)
    data.to_csv(f"wave/wave{i}", index=False)
    print(f"The file of wave {i} has been saved!\n")

Shape of wave 1: (30419, 1649)
The file of wave 1 has been saved!

Shape of wave 2: (37143, 1897)
The file of wave 2 has been saved!

Shape of wave 3: (28463, 3008)
The file of wave 3 has been saved!

Shape of wave 4: (58000, 2475)
The file of wave 4 has been saved!

Shape of wave 5: (66065, 3059)
The file of wave 5 has been saved!

Shape of wave 6: (68085, 4193)
The file of wave 6 has been saved!

Shape of wave 7: (77202, 6562)
The file of wave 7 has been saved!

Shape of wave 8: (46733, 4298)
The file of wave 8 has been saved!



> `compute_stats(dataframe, wave)` : calculates mean, median and standard deviation for all quantitative variables for a given wave

In [48]:
##compute and save all results
#for wave in tqdm(range(1,9)):
#    
#    #open data
#    data = pd.read_csv(f"wave{wave}.csv", low_memory=False)
#    print(data.shape)
#
#    #apply compute_stats function
#    test = data.copy()
#    stats = compute_stats(test, wave=wave)
#    
#    #save results
#    stats.to_csv(f"stats_wave{wave}.csv", index=False)
#    stats.head()

In [49]:
#all_stats = pd.Series(dtype=np.float64)
##check shape of all stats
#for wave in tqdm(range(1,9)):
#    data = pd.read_csv(f"stats_wave{wave}.csv", low_memory=False)
#    #print(f"Descriptive stats of wave{wave} \n", data.head(), "\n")
#    all_stats = pd.concat([all_stats, data])
#
#print(all_stats.head())