In [2]:
import pandas as pd

In [6]:
# Readable disease name -> column code passed to get_rates as `variable`.
disease_to_variable = dict(
    Cancer='ph006d10',
    Alzheimer='ph006d16',
    Parkinson='ph006d12',
    Diabetes='ph006d5',
)
# Reverse lookup: column code -> readable disease name.
code_to_name = dict(zip(disease_to_variable.values(), disease_to_variable.keys()))

# Year associated with each wave; the tuple is in wave order, so its
# 1-based position is the wave number.
year_to_wave = {
    year: wave
    for wave, year in enumerate(
        (2004, 2006, 2008, 2011, 2013, 2015, 2017, 2019), start=1
    )
}
# Reverse lookup: wave number -> year (used to derive respondents' age).
wave_to_year = dict(zip(year_to_wave.values(), year_to_wave.keys()))

def get_rates(wave, variable, code_to_name=code_to_name, wave_to_year=wave_to_year,
              data_dir='../SHARE/data'):
    """Compute, per country, the share of respondents aged 50+ reporting a condition.

    Reads the ``ph`` and ``dn`` Stata files of one wave (release 8-0-0),
    joins them on ``mergeid``, keeps respondents whose age (wave year minus
    year of birth, ``dn003_``) is at least 50, turns the
    'Selected' / 'Not selected' answer into 1 / 0 and averages it by country.

    Parameters
    ----------
    wave : int
        Wave number; used to build the file names and looked up in
        ``wave_to_year``.
    variable : str
        Column code of the condition (e.g. ``'ph006d10'``).
    code_to_name : dict
        Maps a column code to the readable name given to the output series.
    wave_to_year : dict
        Maps a wave number to the year used as reference when computing age.
    data_dir : str
        Directory holding the ``sharew{wave}_rel8-0-0_ALL_datasets_stata``
        folders. Defaults to the location originally hard-coded here.

    Returns
    -------
    pandas.Series
        Mean of the 0/1 indicator for each country, indexed by ``Country``
        and named after the condition.

    Raises
    ------
    KeyError
        If ``variable`` or ``wave`` is missing from the lookup dicts, or a
        required column is absent from the wave's files.
    FileNotFoundError
        If the wave's Stata files do not exist under ``data_dir``.
    """
    # Resolve the lookups first so a bad wave/variable fails before the
    # (large) Stata files are read.
    variable_name = code_to_name[variable]
    year = wave_to_year[wave]

    # open datasets
    stem = f'{data_dir}/sharew{wave}_rel8-0-0_ALL_datasets_stata/sharew{wave}_rel8-0-0'
    ph = pd.read_stata(f'{stem}_ph.dta')
    dn = pd.read_stata(f'{stem}_dn.dta')

    # Select the few columns we need *before* merging: this avoids joining
    # two wide frames and any suffixing of columns both files share.
    # A missing `variable` column raises KeyError here, which callers use to
    # detect that the condition was not asked in this wave.
    df = (
        pd.merge(ph[['mergeid', variable, 'country']],
                 dn[['mergeid', 'dn003_']],
                 on='mergeid')
        .drop(columns='mergeid')
        .rename(columns={variable: variable_name,
                         'dn003_': 'YearOfBirth',
                         'country': 'Country'})
    )

    # deal with NaN: non-substantive answer labels become missing values
    values_to_replace = ["Don't know", "Refusal",
                         "Implausible value/suspected wrong",
                         "Not codable", "Not answered",
                         "Not yet coded", "Not applicable"]
    df = df.replace(values_to_replace, float('NaN'))

    # compute age and keep respondents aged 50+ with a valid answer;
    # rows with a missing year of birth compare False and are dropped too
    age = year - df['YearOfBirth'].astype(float)
    eligible = (
        df.loc[age >= 50, ['Country', variable_name]]
        .dropna(subset=[variable_name])
    )

    # change the condition variable to binary; built as a separate series and
    # attached with assign() so no slice of another frame is mutated in place
    indicator = (
        eligible[variable_name]
        .replace({'Selected': 1, 'Not selected': 0})
        .astype(int)
    )
    eligible = eligible.assign(**{variable_name: indicator})

    # compute the rate per country
    rates = eligible.groupby('Country', observed=True)[variable_name].mean()
    return rates

# Compute the rates of every disease for every wave and save one CSV per pair.
for wave in range(1, 9):
    for disease, variable in disease_to_variable.items():
        print(f'Wave {wave}, {disease}')
        try:
            rates = get_rates(wave, variable)
        except (KeyError, FileNotFoundError):
            # KeyError: the variable's column is absent from this wave's data;
            # FileNotFoundError: the wave's input files could not be found.
            print(f'Wave {wave}, {disease} not in wave')
        else:
            # Written outside the try body so that a failure to save (for
            # example a missing output directory) is raised instead of being
            # misreported as "not in wave".
            rates.to_csv(f'data/{disease}_rates_wave{wave}.csv')

Wave 1, Cancer
Wave 1, Alzheimer
Wave 1, Alzheimer not in wave
Wave 1, Parkinson
Wave 1, Diabetes
Wave 2, Cancer
Wave 2, Alzheimer
Wave 2, Parkinson
Wave 2, Diabetes
Wave 3, Cancer
Wave 3, Cancer not in wave
Wave 3, Alzheimer
Wave 3, Alzheimer not in wave
Wave 3, Parkinson
Wave 3, Parkinson not in wave
Wave 3, Diabetes
Wave 3, Diabetes not in wave
Wave 4, Cancer
Wave 4, Alzheimer
Wave 4, Parkinson
Wave 4, Diabetes
Wave 5, Cancer
Wave 5, Alzheimer
Wave 5, Parkinson
Wave 5, Diabetes
Wave 6, Cancer
Wave 6, Alzheimer
Wave 6, Parkinson
Wave 6, Diabetes
Wave 7, Cancer
Wave 7, Alzheimer
Wave 7, Parkinson
Wave 7, Diabetes
Wave 8, Cancer
Wave 8, Alzheimer
Wave 8, Parkinson
Wave 8, Diabetes


In [4]:
import geopandas as gpd
# Load the country shapefile and export its table as CSV alongside the
# rates files. NOTE(review): the path points at a personal Desktop folder;
# the file name suggests the Natural Earth 1:110m admin-0 countries layer.
world_shapefile = '../../Desktop/cieri/ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp'
world = gpd.read_file(world_shapefile)
world.to_csv('data/world.csv')