# Function that open all files of a given wave at a time

In [47]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [16]:
def open_wave(wave_index):
    
    #create a list of files name
    prefixes_files_name = ['ac', 'as', 'ax', 'br', 'cc', 'cf', 'ch', 'co',
                           'cs', 'dn', 'dq', 'ep', 'ex', 'fs', 'ft',
                           'gl', 'gs', 'gs', 'gv_big5', 'gv_children',
                           'gv_health', 'gv_housing', 'gv_isced', 'gv_isco',
                           'gv_networks', 'gv_weights', 'gvgrossnet', 'hc', 'hh',
                           'ho', 'it', 'mh', 'pf', 'ph', 'ra', 'rc', 're',
                           'rp', 'sp', 'sr', 'st', 'te', 'tv', 'wq', 'ws', 'xt']
    fileList = [f"sharew{wave_index}_rel8-0-0_{i}.dta" for i in prefixes_files_name]
    
    #read all df
    dfList = []
    for file in fileList:
        try:
            thisDF = pd.read_stata(f"wave{wave_index}/{file}", convert_categoricals=False)
            dfList.append(thisDF)
        except FileNotFoundError:
            pass

    #merge all df
    data = thisDF["mergeid"]
    for i in tqdm(range(len(dfList))):
        data = pd.merge(data, dfList[i], how='outer')
    
    #return the all
    return data


#merge and save all data

wave1 = open_wave(1)
wave1.to_csv('wave1.csv', index=False)

wave2 = open_wave(2)
wave2.to_csv('wave2.csv', index=False)

wave3 = open_wave(3)
wave3.to_csv('wave3.csv', index=False)

wave4 = open_wave(4)
wave4.to_csv('wave4.csv', index=False)

wave5 = open_wave(5)
wave5.to_csv('wave5.csv', index=False)

wave6 = open_wave(6)
wave6.to_csv('wave6.csv', index=False)

wave7 = open_wave(7)
wave7.to_csv('wave7.csv', index=False)

wave8 = open_wave(8)
wave8.to_csv('wave8.csv', index=False)

# Function that computes mean, sd and median for all quantitative variables

In [169]:
def compute_stats(dataframe, wave):
    
    wave = str(wave)
    
    df = dataframe.copy()
    variable = list(df.columns)

    #init lists to fill in
    mean = []
    sd = []
    median = []
    var_name = []
    
    for var in variable:
        
        #compute mean, sd and median for all quant variables
        df[var] = pd.to_numeric(df[var], errors='coerce')
        mean.append(df[var].mean())
        sd.append(df[var].std())
        median.append(df[var].median())
        var_name.append(var+"_"+wave)
        
    #concatenate all lists and name columns
    features = {'Var_name': var_name,
                'Mean': mean, 
                'Std': sd, 
                'Median': median} 
    df_to_return = pd.DataFrame(features, columns= ['Var_name','Mean','Std','Median'])
    df_to_return = df_to_return.dropna(axis=0)
    
    return df_to_return

In [170]:
#compute and save all results
for wave in tqdm(range(1,9)):
    
    #open data
    data = pd.read_csv(f"wave{wave}.csv", low_memory=False)
    print(data.shape)

    #apply compute_stats function
    test = data.copy()
    stats = compute_stats(test, wave=wave)
    
    #save results
    stats.to_csv(f"stats_wave{wave}.csv", index=False)
    stats.head()

  0%|          | 0/8 [00:00<?, ?it/s]

(30419, 1600)


 12%|█▎        | 1/8 [02:47<19:32, 167.45s/it]

(71305, 2025)


 25%|██▌       | 2/8 [18:57<1:03:56, 639.35s/it]

(43058, 3027)


 38%|███▊      | 3/8 [41:21<1:20:05, 961.19s/it]

(85304, 2617)


 50%|█████     | 4/8 [1:08:35<1:21:47, 1226.84s/it]

(98028, 3193)


 62%|██████▎   | 5/8 [1:46:14<1:19:57, 1599.07s/it]

(128220, 4327)


 75%|███████▌  | 6/8 [3:41:48<1:53:45, 3412.93s/it]

(88957, 6526)


 88%|████████▊ | 7/8 [7:00:44<1:43:19, 6199.36s/it]

(58527, 4383)


100%|██████████| 8/8 [9:03:34<00:00, 4076.84s/it]  


In [193]:
all_stats = pd.Series(dtype=np.float64)
#check shape of all stats
for wave in tqdm(range(1,9)):
    data = pd.read_csv(f"stats_wave{wave}.csv", low_memory=False)
    #print(f"Descriptive stats of wave{wave} \n", data.head(), "\n")
    all_stats = pd.concat([all_stats, data])

print(all_stats.head())

100%|██████████| 8/8 [00:00<00:00, 295.36it/s]

    0   Var_name           Mean           Std   Median
0 NaN   exrate_1       2.563018  2.864668e+00      1.0
1 NaN   as003e_1  151841.692249  1.223841e+07   5000.0
2 NaN  as003v1_1    3320.559777  8.666762e+02   3600.0
3 NaN  as003v2_1    6590.547301  1.734869e+03   7100.0
4 NaN  as003v3_1   13208.835095  3.475529e+03  14000.0





In [184]:
data = pd.read_csv("stats_wave7.csv", low_memory=False)
data.head(20)

Unnamed: 0,Var_name,Mean,Std,Median
0,country_7,28.531719,14.971177,25.0
1,language_7,29.699301,15.163099,28.0
2,ac012__7,7.291627,2.379563,8.0
3,ac014__7,2.422281,1.19365,2.0
4,ac015__7,2.663727,1.19149,3.0
5,ac016__7,3.004339,1.189289,3.0
6,ac017__7,1.736182,1.010346,2.0
7,ac018__7,2.976267,1.184026,3.0
8,ac019__7,2.504735,1.251186,3.0
9,ac020__7,1.449935,0.89455,1.0
