In [1]:
import os
import pandas as pd
from tqdm import tqdm

## Get all datasets properties and store them

### Code

The following code **iterates over all files** in the root directory containing all SHARE datasets. It extracts the properties of each dataset and stores them in a pandas dataframe. The properties are:
- dataset name
- number of rows
- number of columns
- column names

### Details

- For some reason, one or multiple files, when trying to read them, throw a `ValueError`. However, `pandas` gives us the solution by telling us we should add `convert_categoricals=False` to the `read_csv` function when this is happening. This is the reason why we have a `try` and `except` block in the code.

In [20]:
# initiate constants
directory = '../../SHARE/data/'
file_names = []
waves = []
n_rows = []
n_columns = []
columns = []

# iterate through all files in the directory
for root, dirs, files in tqdm(os.walk(directory)):
    for file in files:
        if file.endswith('.dta'):
            try:
                dataset = pd.read_stata(os.path.join(root, file))
            except ValueError:
                dataset = pd.read_stata(os.path.join(root, file), convert_categoricals=False)
            file_names.append(file)
            waves.append(file[6])
            n_rows.append(len(dataset))
            n_columns.append(len(dataset.columns))
            columns.append(list(dataset.columns))

# create a dataframe with the results
df = pd.DataFrame({
    'file_name': file_names,
    'wave': waves,
    'n_rows': n_rows,
    'n_columns': n_columns,
    'columns': columns
})

9it [01:21,  9.06s/it]


## Discriminate normal and particular datasets

According to the [official documentation](https://share-eric.eu/data/faqs-support):

*"The naming of variables is harmonised across waves. Variable names in the CAPI instrument data use the following format: mmXXXyyy_LL. “mm” is the module identifier, e.g. DN for the demographics module, “XXX” refers to the question number, e.g. 001, and “yyy” are optional digits for dummy variables (indicated by “d”), euro conversion (indicated by “e”) or unfolding brackets (indicated by “ub”). The separation character “_” is followed by “LL” optional digits for category or loop indication (“outer loop”)."*

For this reason, we add **boolean indicator** columns to the dataframe to discriminate normal datasets from particular ones. This allows us to see that approximately 10% of the datasets are particular, and maybe useless for us.

In [36]:
def is_file_normal(file_name: str) -> bool:
    """
    Detect if the last 3 elements of the string follow the pattern _ab, 
    where 'a' and 'b' are single letters.
    
    Args:
    - file_name (str): The string to be checked.
    Returns:
    - bool: True if the pattern is found, False otherwise.
    """

    # special case the General Variables files
    if '_gv_' in file_name:
        return True
    
    # remove file extension
    suffix = file_name[:-4]

    # check if the suffix is long enough
    if len(suffix) < 3:
        return False
    
    # check if the last three elements follow the pattern
    last_three = suffix[-3:]
    if last_three[0] == '_' and last_three[1].isalpha() and last_three[2].isalpha():
        return True
    else:
        return False
    
# filter out the files that are not normal
df['is_normal'] = df['file_name'].apply(is_file_normal)
print(round(df.is_normal.value_counts()/len(df)*100,2))

is_normal
True     89.43
False    10.57
Name: count, dtype: float64


In [42]:
# save the dataframe to a csv file
df.to_csv('data_info.csv', index=False)
df.sample(10)

Unnamed: 0,file_name,wave,n_rows,n_columns,columns,is_normal
244,sharew2_rel8-0-0_br.dta,2,37143,23,"[mergeid, hhid2, mergeidp2, coupleid2, country...",True
99,sharew3_rel8-0-0_gv_weights.dta,3,28463,14,"[mergeid, hhid3, mergeidp3, coupleid3, country...",True
59,sharew4_rel8-0-0_co.dta,4,58000,25,"[mergeid, hhid4, mergeidp4, coupleid4, country...",True
254,sharew2_rel8-0-0_gv_imputations.dta,2,185715,245,"[mergeid, hhid2, mergeidp2, coupleid2, country...",True
55,sharew4_rel8-0-0_gv_exrates.dta,4,29,69,"[country, euro, currency, exrate_w1, exrate_w2...",True
31,sharew1_rel8-0-0_gv_weights.dta,1,30419,14,"[mergeid, hhid1, mergeidp1, coupleid1, country...",True
175,sharew8_rel8-0-0_cf.dta,8,46733,75,"[mergeid, hhid8, mergeidp8, coupleid8, country...",True
30,sharew1_rel8-0-0_ep_ilextra.dta,1,575,25,"[mergeid, hhid1, mergeidp1, coupleid1, country...",False
183,sharew8_rel8-0-0_gv_accelerometer_hour.dta,8,153600,256,"[mergeid, country, language, month, year, week...",True
34,sharew4_rel8-0-0_mh.dta,4,58000,35,"[mergeid, hhid4, mergeidp4, coupleid4, country...",True


## Get column names for each wave

In [49]:
for wave in range(1,9):
    print(len(df['wave'] == str(wave)))

265
265
265
265
265
265
265
265


In [66]:
df = pd.read_csv('data_info.csv')

# iterate through the waves
for wave in range(1,9):
    columns = []
    subset = df[df['wave'] == wave]
    for row in subset['columns']:
        row = row[1:-1].replace("'", "").split(', ')
        columns.extend(list(row))
    columns = list(set(columns))
    print(f'Wave {wave} has {len(columns)} unique columns.')

    # save as txt file
    with open(f'columns/wave_{wave}_columns.txt', 'w') as f:
        for item in columns:
            f.write("%s\n" % item)

Wave 1 has 2201 unique columns.
Wave 2 has 2496 unique columns.
Wave 3 has 3264 unique columns.
Wave 4 has 4203 unique columns.
Wave 5 has 3816 unique columns.
Wave 6 has 5406 unique columns.
Wave 7 has 8014 unique columns.
Wave 8 has 6478 unique columns.
