# Data collection and cleaning: ukb669914

In [1]:
# Load a variable-name mapping that was saved earlier as JSON.
# NOTE(review): var_temp is reassigned from a literal dict further down in this notebook,
# so the value loaded here looks unused - confirm before removing the load.
import json
with open('/Users/marinacamacho/Desktop/Master_I/var.json') as f:
    var_temp = json.load(f)

# NumPy provides the array type and the vectorised helpers (np.where, np.nan) used by
# the cleaning cells below.
import numpy as np

# pandas provides the DataFrame used for all of the data handling.
# The raw CSV is large, so nrows=1 reads only the header plus the first data row;
# that is enough to inspect which columns are available.
import pandas as pd
df_ = pd.read_csv('/Users/marinacamacho/Desktop/Master_I/Raw_Data/ukb669914.csv', nrows=1)  # Read only the first row

In [2]:
# Show the one-row preview so the available column names can be inspected.
df_

Unnamed: 0,eid,50-0.0,50-1.0,50-2.0,50-3.0,74-0.0,74-1.0,74-2.0,74-3.0,84-0.0,...,130936-0.0,130938-0.0,130940-0.0,130942-0.0,130944-0.0,130946-0.0,130948-0.0,130950-0.0,130952-0.0,130954-0.0
0,1000010,169,,,,4,,,,2008,...,,,,,,,,,,


In [3]:
# Keep only the columns recorded at instance 1, i.e. names of the form
# '<field>-1.<array index>' such as '50-1.0'.
# DataFrame.filter(regex=...) searches anywhere in the name, so the pattern is anchored
# on '-1.' followed by digits up to the end of the name; a bare '-1' would also match a
# column such as '84-10.0'.
# NOTE(review): the variable name says "minus_0" although instance 1 is selected. The
# name is kept so that any later reference to it keeps working.
columns_including_minus_0 = df_.filter(regex=r'-1\.\d+$')

In [4]:
# Columns for which no second assessment is available, so they cannot be traced.
# (The renaming dictionary uses '20122-0.0' for this field.)
not_treaceable = ["20122-1.0"]

In [5]:
# var_temp maps each original column name to the readable name used in the rest of the
# notebook ('original_name': 'new_name'). It serves two purposes: its keys select the
# columns to read from the raw CSV, and the whole mapping is passed to DataFrame.rename.
# This dictionary will contain general external exposures.
#
# Fields stored across several array columns ('<field>-1.0', '<field>-1.1', ...) are
# generated with a comprehension; the number passed to range() is the number of array
# columns kept for that field. Insertion order matches the column order of the mapping.
# NOTE(review): 'Family_relationship satisfaction' contains a space where every other
# name uses an underscore. It is kept as is because later code may refer to it.
var_temp = {
    'eid': 'f.eid',
    '50-1.0': 'Standing_height',
    '74-1.0': 'Fasting_time',
    **{f'84-1.{slot}': f'Cancer_age_0.{slot}' for slot in range(6)},
    '134-1.0': 'Number_cancers',
    '136-1.0': 'Number_operation',
    '806-1.0': 'Job_walking_standing',
    '816-1.0': 'Job_heavy_manual',
    '826-1.0': 'Job_involves_shift',
    '1031-1.0': 'Frequency_friend_family_visits',
    '1050-1.0': 'Time_outdoors_summer',
    '1060-1.0': 'Time_outdoors_winter',
    '1110-1.0': 'Length_phone_use',
    '1120-1.0': 'Weekly_phone_use',
    '1140-1.0': 'Difference_phone_use',
    '1170-1.0': 'Getting_up_in_morning',
    '1180-1.0': 'Morning_evening_person',
    '1210-1.0': 'Snoring',
    '1498-1.0': 'Coffee_intake',
    '1717-1.0': 'Skin_colour',
    '1920-1.0': 'Mood_swings',
    '1930-1.0': 'Miserableness',
    '1940-1.0': 'Irritability',
    '1950-1.0': 'Sensitivity',
    '1960-1.0': 'Fed-up_feelings',
    '1970-1.0': 'Nervous_feelings',
    '1980-1.0': 'Worrier/Anxious_feelings',
    '1990-1.0': 'Tense',
    '2000-1.0': 'Worry_too_long_after_embarrassment',
    '2010-1.0': 'Suffer_from_nerves',
    '2020-1.0': 'Loneliness_isolation',
    '2030-1.0': 'Guilty_feelings',
    '2040-1.0': 'Risk_taking',
    '2110-1.0': 'Able_to_confide',
    '2178-1.0': 'Overall_health_rating',
    '2237-1.0': 'Plays_computer_games',
    '2247-1.0': 'Hearing_difficulty/problems',
    '2267-1.0': 'Use_of_sun/uv_protection',
    '2277-1.0': 'Frequency_of_solarium/sunlamp',
    '2415-1.0': 'Had_major_operation',
    '2453-1.0': 'Cancer_diagnosed_by_doctor',
    '2463-1.0': 'Fractured/broken_bones',
    '2473-1.0': 'Other_serious_condition',
    '2844-1.0': 'Had_other_major_operations',
    '4526-1.0': 'Happiness',
    '4537-1.0': 'Work/job_satisfaction',
    '4548-1.0': 'Health_satisfaction',
    '4559-1.0': 'Family_relationship satisfaction',
    '4570-1.0': 'Friendships_satisfaction',
    '4581-1.0': 'Financial_satisfaction',
    '4642-1.0': 'Ever_manic/hyper',
    '4653-1.0': 'Ever_highly_irritable/argumentative',
    '4728-1.0': 'Leg_pain_on_walking',
    '4803-1.0': 'Tinnitus',
    '5663-1.0': 'Length_manic/irritable_episode',
    '5674-1.0': 'Severity_of_manic/irritable_episodes',
    **{f'6149-1.{slot}': f'Mouth/teeth_problems_{slot}' for slot in range(6)},
    **{f'6156-1.{slot}': f'Manic/hyper_symptoms_{slot}' for slot in range(4)},
    **{f'6159-1.{slot}': f'Pain_type_{slot}' for slot in range(7)},
    **{f'6160-1.{slot}': f'Leisure/social_activities_{slot}' for slot in range(5)},
    '20023-1.0': 'Mean_time_to_identify_matches',
    **{f'20107-1.{slot}': f'Illnesses_father_{slot}' for slot in range(10)},
    **{f'20110-1.{slot}': f'Illnesses_mother_{slot}' for slot in range(11)},
    **{f'20111-1.{slot}': f'Illnesses_siblings_{slot}' for slot in range(12)},
    '20122-0.0': 'Bipolar_disorder_status',
    '21002-1.0': 'Weight',
}

In [6]:
# Load the full dataset (every row, not just the header preview), reading only the
# columns named in var_temp and renaming them to their readable names in one pass.
# This replaces the one-row preview previously held in df_.
df_ = (
    pd.read_csv(
        '/Users/marinacamacho/Desktop/Master_I/Raw_Data/ukb669914.csv',
        usecols=var_temp.keys(),
    )
    .rename(columns=var_temp)
)

KeyboardInterrupt: 

In [None]:
# Collapse the six 'Cancer_age_0.x' columns into a single column, 'Cancer_age_first',
# holding the smallest recorded value in each row, then drop the six source columns.
#
# DataFrame.min(axis=1) skips NaN, so a row is NaN only when all six entries are
# missing. Python's built-in min() applied row by row returns NaN whenever the first
# entry is NaN, whatever the later entries hold, and is much slower on a large frame.
#
# NOTE(review): the preview row shows 2008 in column '84-0.0', so these values may be
# calendar years rather than ages. Any negative special codes, if present, would also
# win the minimum - confirm how this field is coded before relying on the result.
cancer_age_columns = [f'Cancer_age_0.{slot}' for slot in range(6)]
df_['Cancer_age_first'] = df_[cancer_age_columns].min(axis=1)
df_ = df_.drop(columns=cancer_age_columns)

In [None]:
# Turn the six 'Mouth/teeth_problems_x' answer columns (field 6149) into one 0/1 flag
# column per problem, then drop the answer columns.
#
# Answer codes and the column name used for each one.
code_meaning = {
    1: 'Mouth_ulcers',
    2: 'Painful_gums',
    3: 'Bleeding_gums',
    4: 'Loose_teeth',
    5: 'Toothache',
    6: 'Dentures',
    -7: 'None_of_the_above',
    -3: 'Prefer_not_to_answer'
}

columns_to_extract = [f'Mouth/teeth_problems_{slot}' for slot in range(6)]
extracted_df = df_[columns_to_extract]

# Row-level facts about the answer as a whole.
none_selected = extracted_df.isin([-7]).any(axis=1).to_numpy()
# Any recognised answer other than "prefer not to answer" counts as a usable response.
usable_codes = [code for code in code_meaning if code != -3]
answered = extracted_df.isin(usable_codes).any(axis=1).to_numpy()

# Flag values, first matching rule wins:
#   "none of the above" was given    -> 0
#   this problem was selected        -> 1
#   some other usable answer exists  -> 0 (the respondent did not tick this problem)
#   otherwise (no answer / declined) -> NaN
# The third rule matters: without it, a respondent who ticked only 'Toothache' would
# get NaN, not 0, for every other problem.
result_df = pd.DataFrame(index=df_.index)  # share df_'s index so concat aligns rows
for code, meaning in code_meaning.items():
    if code < 0:
        continue  # special answers only shape the flags; they get no column of their own
    chosen = extracted_df.isin([code]).any(axis=1).to_numpy()
    result_df[meaning] = np.select(
        [none_selected, chosen, answered], [0.0, 1.0, 0.0], default=np.nan
    )

# Replace the raw answer columns with the flag columns.
df_ = pd.concat([df_.drop(columns=columns_to_extract), result_df], axis=1)

In [None]:
# Turn the four 'Manic/hyper_symptoms_x' answer columns (field 6156) into one 0/1 flag
# column per symptom, then drop the answer columns.
#
# Answer codes and the column name used for each one.
code_meaning = {
    11: 'Active_than_usual',
    12: 'Talkative_than_usual',
    13: 'Needed_less_sleep_than_usual',
    14: 'Creative_than_usual',
    15: 'All_of_the_above',
    -7: 'None_of_the_above',
}

columns_to_extract = [f'Manic/hyper_symptoms_{slot}' for slot in range(4)]
extracted_df = df_[columns_to_extract]

# Row-level facts about the answer as a whole.
all_selected = extracted_df.isin([15]).any(axis=1).to_numpy()
none_selected = extracted_df.isin([-7]).any(axis=1).to_numpy()
answered = extracted_df.isin(list(code_meaning)).any(axis=1).to_numpy()

# Flag values, first matching rule wins:
#   "all of the above" was given        -> 1
#   "none of the above" was given       -> 0
#   this symptom was selected           -> 1
#   some other recognised answer exists -> 0 (the respondent did not tick this symptom)
#   otherwise (no recorded answer)      -> NaN
# The fourth rule matters: without it, a respondent who ticked only one symptom would
# get NaN, not 0, for the other symptoms.
result_df = pd.DataFrame(index=df_.index)  # share df_'s index so concat aligns rows
for code, meaning in code_meaning.items():
    if code in (15, -7):
        continue  # handled through all_selected / none_selected
    chosen = extracted_df.isin([code]).any(axis=1).to_numpy()
    result_df[meaning] = np.select(
        [all_selected, none_selected, chosen, answered], [1.0, 0.0, 1.0, 0.0], default=np.nan
    )

# 'All_of_the_above' stays in the frame as 1 where that answer was given and NaN
# elsewhere, so any later reference to the column keeps working.
# NOTE(review): confirm whether this column is still needed downstream.
result_df['All_of_the_above'] = np.where(all_selected, 1.0, np.nan)

# Replace the raw answer columns with the flag columns.
df_ = pd.concat([df_.drop(columns=columns_to_extract), result_df], axis=1)

In [None]:
# Turn the seven 'Pain_type_x' answer columns (field 6159) into one 0/1 flag column per
# pain type, then drop the answer columns.
#
# Answer codes and the column name used for each one.
code_meaning = {
    1: 'Headache',
    2: 'Facial_pain',
    3: 'Neck_shoulder_pain',
    4: 'Back_pain',
    5: 'Stomach_abdominal_pain',
    6: 'Hip_pain',
    7: 'Knee_pain',
    8: 'Pain_body',
    -7: 'None_of_the_above',
    -3: 'Prefer_not_to_answer'
}

columns_to_extract = [f'Pain_type_{slot}' for slot in range(7)]
extracted_df = df_[columns_to_extract]

# Row-level facts about the answer as a whole.
none_selected = extracted_df.isin([-7]).any(axis=1).to_numpy()
# Any recognised answer other than "prefer not to answer" counts as a usable response.
usable_codes = [code for code in code_meaning if code != -3]
answered = extracted_df.isin(usable_codes).any(axis=1).to_numpy()

# Flag values, first matching rule wins:
#   "none of the above" was given    -> 0
#   this pain type was selected      -> 1
#   some other usable answer exists  -> 0 (the respondent did not tick this pain type)
#   otherwise (no answer / declined) -> NaN
# The third rule matters: without it, a respondent who ticked only 'Back_pain' would
# get NaN, not 0, for every other pain type.
result_df = pd.DataFrame(index=df_.index)  # share df_'s index so concat aligns rows
for code, meaning in code_meaning.items():
    if code < 0:
        continue  # special answers only shape the flags; they get no column of their own
    chosen = extracted_df.isin([code]).any(axis=1).to_numpy()
    result_df[meaning] = np.select(
        [none_selected, chosen, answered], [0.0, 1.0, 0.0], default=np.nan
    )

# Replace the raw answer columns with the flag columns.
df_ = pd.concat([df_.drop(columns=columns_to_extract), result_df], axis=1)

In [None]:
# Turn the five 'Leisure/social_activities_x' answer columns (field 6160) into one 0/1
# flag column per activity, then drop the answer columns.
#
# Answer codes and the column name used for each one.
code_meaning = {
    1: 'Sports_club_or_gym',
    2: 'Pub_or_social_club',
    3: 'Religious_group',
    4: 'Adult_education_class',
    5: 'Other_group_activity',
    -7: 'None_of_the_above',
    -3: 'Prefer_not_to_answer'
}

columns_to_extract = [f'Leisure/social_activities_{slot}' for slot in range(5)]
extracted_df = df_[columns_to_extract]

# Row-level facts about the answer as a whole.
none_selected = extracted_df.isin([-7]).any(axis=1).to_numpy()
# Any recognised answer other than "prefer not to answer" counts as a usable response.
usable_codes = [code for code in code_meaning if code != -3]
answered = extracted_df.isin(usable_codes).any(axis=1).to_numpy()

# Flag values, first matching rule wins:
#   "none of the above" was given    -> 0
#   this activity was selected       -> 1
#   some other usable answer exists  -> 0 (the respondent did not tick this activity)
#   otherwise (no answer / declined) -> NaN
# The third rule matters: without it, a respondent who ticked only 'Religious_group'
# would get NaN, not 0, for every other activity.
result_df = pd.DataFrame(index=df_.index)  # share df_'s index so concat aligns rows
for code, meaning in code_meaning.items():
    if code < 0:
        continue  # special answers only shape the flags; they get no column of their own
    chosen = extracted_df.isin([code]).any(axis=1).to_numpy()
    result_df[meaning] = np.select(
        [none_selected, chosen, answered], [0.0, 1.0, 0.0], default=np.nan
    )

# Replace the raw answer columns with the flag columns.
df_ = pd.concat([df_.drop(columns=columns_to_extract), result_df], axis=1)

In [None]:
# Turn the ten 'Illnesses_father_x' answer columns (field 20107) into one flag column
# per illness (suffix '_F'), then drop the answer columns.
#
# Answer codes and the column name used for each one. Negative codes are the special
# answers, which the dictionary labels as belonging to "group 1" or "group 2".
code_meaning = {
    14: 'Hip_fracture_F',
    13: 'Prostate_cancer_F',
    12: 'Severe_depression_F',
    11: 'Parkinson_disease_F',
    10: 'Alzheimer_disease/dementia_F',
    9: 'Diabetes_F',
    8: 'High_blood_pressure_F',
    6: 'Chronic_bronchitis/emphysema_F',
    5: 'Breast_cancer_F',
    4: 'Bowel_cancer_F',
    3: 'Lung_cancer_F',
    2: 'Stroke_F',
    1: 'Heart_disease_F',
    -11: 'Do_not_know_(group_1)_F',
    -13: 'Prefer_not_to_answer_(group_1)_F',
    -17: 'None_of_the_above_(group_1)_F',
    -21: 'Do_not_know_(group_2)_F',
    -23: 'Prefer_not_to_answer_(group_2)_F',
    -27: 'None_of_the_above_(group_2)_F'
}

columns_to_extract = [f'Illnesses_father_{slot}' for slot in range(10)]
extracted_df = df_[columns_to_extract]

# For every code: True in the rows where that code appears in any answer column.
present = {
    meaning: extracted_df.isin([code]).any(axis=1)
    for code, meaning in code_meaning.items()
}

# A row is unusable when "prefer not to answer" was given for both groups, or when
# "do not know" was given for both groups.
declined_both = present['Prefer_not_to_answer_(group_1)_F'] & present['Prefer_not_to_answer_(group_2)_F']
unknown_both = present['Do_not_know_(group_1)_F'] & present['Do_not_know_(group_2)_F']
unusable = (declined_both | unknown_both).to_numpy()

# Each illness flag is 1 if the code was reported, 0 if not, and NaN for unusable rows.
# NOTE(review): rows with no recorded answer at all are coded 0, not NaN, and a special
# answer given for only one group leaves every flag untouched - confirm both are intended.
result_df = pd.DataFrame(index=df_.index)  # share df_'s index so concat aligns rows
for code, meaning in code_meaning.items():
    if code < 0:
        continue  # special answers only drive the mask; they get no column of their own
    result_df[meaning] = np.where(unusable, np.nan, present[meaning].astype(float))

# Replace the raw answer columns with the flag columns.
df_ = pd.concat([df_.drop(columns=columns_to_extract), result_df], axis=1)

In [None]:
# Turn the eleven 'Illnesses_mother_x' answer columns (field 20110) into one flag column
# per illness (suffix '_M'), then drop the answer columns.
#
# Answer codes and the column name used for each one. Negative codes are the special
# answers, which the dictionary labels as belonging to "group 1" or "group 2".
code_meaning = {
    14: 'Hip_fracture_M',
    13: 'Prostate_cancer_M',
    12: 'Severe_depression_M',
    11: 'Parkinson_disease_M',
    10: 'Alzheimer_disease/dementia_M',
    9: 'Diabetes_M',
    8: 'High_blood_pressure_M',
    6: 'Chronic_bronchitis/emphysema_M',
    5: 'Breast_cancer_M',
    4: 'Bowel_cancer_M',
    3: 'Lung_cancer_M',
    2: 'Stroke_M',
    1: 'Heart_disease_M',
    -11: 'Do_not_know_(group_1)_M',
    -13: 'Prefer_not_to_answer_(group_1)_M',
    -17: 'None_of_the_above_(group_1)_M',
    -21: 'Do_not_know_(group_2)_M',
    -23: 'Prefer_not_to_answer_(group_2)_M',
    -27: 'None_of_the_above_(group_2)_M'
}

columns_to_extract = [f'Illnesses_mother_{slot}' for slot in range(11)]
extracted_df = df_[columns_to_extract]

# For every code: True in the rows where that code appears in any answer column.
present = {
    meaning: extracted_df.isin([code]).any(axis=1)
    for code, meaning in code_meaning.items()
}

# A row is unusable when "prefer not to answer" was given for both groups, or when
# "do not know" was given for both groups.
declined_both = present['Prefer_not_to_answer_(group_1)_M'] & present['Prefer_not_to_answer_(group_2)_M']
unknown_both = present['Do_not_know_(group_1)_M'] & present['Do_not_know_(group_2)_M']
unusable = (declined_both | unknown_both).to_numpy()

# Each illness flag is 1 if the code was reported, 0 if not, and NaN for unusable rows.
# NOTE(review): rows with no recorded answer at all are coded 0, not NaN, and a special
# answer given for only one group leaves every flag untouched - confirm both are intended.
result_df = pd.DataFrame(index=df_.index)  # share df_'s index so concat aligns rows
for code, meaning in code_meaning.items():
    if code < 0:
        continue  # special answers only drive the mask; they get no column of their own
    result_df[meaning] = np.where(unusable, np.nan, present[meaning].astype(float))

# Replace the raw answer columns with the flag columns.
df_ = pd.concat([df_.drop(columns=columns_to_extract), result_df], axis=1)

In [None]:
# Assuming your dataframe is named 'df' with the renamed columns
# Creating a dictionary to map the code to its corresponding meaning
code_meaning = {
14:'Hip_fracture_S',
13:'Prostate_cancer_S',
12:'Severe_depression_S',
11:'Parkinson_disease_S',
10:'Alzheimer_disease/dementia_S',
9:'Diabetes_S',
8:'High_blood_pressure_S',
6:'Chronic_bronchitis/emphysema_S',
5:'Breast_cancer_S',
4:'Bowel_cancer_S',
3:'Lung_cancer_S',
2:'Stroke_S',
1:'Heart_disease_S',
-11:'Do_not_know_(group_1)_S',
-13:'Prefer_not_to_answer_(group_1)_S',
-17:'None_of_the_above_(group_1)_S',
-21:'Do_not_know_(group_2)_S',
-23:'Prefer_not_to_answer_(group_2)_S',
-27:'None_of_the_above_(group_2)_S'
}

# Specify the column names you want to extract
columns_to_extract = ['Illnesses_siblings_0', 'Illnesses_siblings_1', 'Illnesses_siblings_2',
                      'Illnesses_siblings_3', 'Illnesses_siblings_4', 'Illnesses_siblings_5',
                      'Illnesses_siblings_6', 'Illnesses_siblings_7', 'Illnesses_siblings_8',
                     'Illnesses_siblings_9', 'Illnesses_siblings_10', 'Illnesses_siblings_11']

# Turn the raw sibling-illness answer codes into one 0/1 indicator column per
# meaning, blank out the disease indicators for participants who gave no usable
# answer, and finally drop the raw/helper columns.

# Work on a copy of the raw answer columns so df_ is not touched while scanning.
extracted_df = df_[columns_to_extract].copy()

# One indicator per (code -> meaning) pair: 1 when the code occurs in any of
# the extracted columns of a row, 0 otherwise.
indicator_columns = {
    meaning: np.where(extracted_df.isin([code]).any(axis=1), 1, 0)
    for code, meaning in code_meaning.items()
}

# Build the indicator frame on df_'s own index: pd.concat(axis=1) aligns on
# index labels, so a default RangeIndex here would misalign the rows whenever
# df_ does not carry a default index.
result_df = pd.DataFrame(indicator_columns, index=df_.index)

# Append the indicator columns to the working dataframe.
df_ = pd.concat([df_, result_df], axis=1)

# Disease indicators that must become missing (NaN) for non-responders.
sibling_disease_flags = [
    'Hip_fracture_S', 'Prostate_cancer_S', 'Severe_depression_S',
    'Parkinson_disease_S', 'Alzheimer_disease/dementia_S', 'Diabetes_S',
    'High_blood_pressure_S', 'Chronic_bronchitis/emphysema_S',
    'Breast_cancer_S', 'Bowel_cancer_S', 'Lung_cancer_S', 'Stroke_S',
    'Heart_disease_S',
]

# A row has no usable answer when the same non-answer was given for BOTH groups.
# NOTE(review): a non-answer in only one group leaves the 0/1 flags untouched;
# confirm that this is the intended rule.
declined_both = (
    (df_['Prefer_not_to_answer_(group_1)_S'] == 1)
    & (df_['Prefer_not_to_answer_(group_2)_S'] == 1)
)
unknown_both = (
    (df_['Do_not_know_(group_1)_S'] == 1)
    & (df_['Do_not_know_(group_2)_S'] == 1)
)
no_usable_answer = declined_both | unknown_both

for flag in sibling_disease_flags:
    # np.where with np.nan always yields a float column, answered or not.
    df_[flag] = np.where(no_usable_answer, np.nan, df_[flag])

# The raw answer columns and the non-answer helper indicators are no longer
# needed once the disease indicators have been derived.
helper_columns = [f'Illnesses_siblings_{i}' for i in range(12)] + [
    'Do_not_know_(group_1)_S',
    'Prefer_not_to_answer_(group_1)_S',
    'None_of_the_above_(group_1)_S',
    'Do_not_know_(group_2)_S',
    'Prefer_not_to_answer_(group_2)_S',
    'None_of_the_above_(group_2)_S',
]
df_ = df_.drop(columns=helper_columns)

In [None]:
# Columns whose special negative answer codes are recoded below.
# NOTE(review): a later cell repeats this same recoding over a superset of
# these columns, so this cell looks redundant; confirm before removing it.
# NOTE(review): 'Family_relationship satisfaction' contains a space rather than
# an underscore; verify it matches the renamed column exactly.
variables = ['Standing_height','Fasting_time','Number_cancers',
            'Number_operation','Job_walking_standing','Job_heavy_manual',
            'Job_involves_shift','Frequency_friend_family_visits',
            'Time_outdoors_summer','Time_outdoors_winter','Length_phone_use',
            'Weekly_phone_use','Difference_phone_use','Getting_up_in_morning',
            'Morning_evening_person','Snoring','Coffee_intake','Skin_colour',
            'Mood_swings','Miserableness','Irritability','Sensitivity',
            'Fed-up_feelings','Nervous_feelings','Worrier/Anxious_feelings',
            'Tense','Worry_too_long_after_embarrassment','Suffer_from_nerves',
            'Loneliness_isolation','Guilty_feelings','Risk_taking','Able_to_confide',
            'Overall_health_rating','Plays_computer_games','Hearing_difficulty/problems',
            'Use_of_sun/uv_protection','Frequency_of_solarium/sunlamp','Had_major_operation',
            'Cancer_diagnosed_by_doctor','Fractured/broken_bones','Other_serious_condition',
            'Health_satisfaction','Family_relationship satisfaction','Friendships_satisfaction',
            'Financial_satisfaction','Ever_manic/hyper','Ever_highly_irritable/argumentative',
            'Leg_pain_on_walking','Tinnitus','Length_manic/irritable_episode',
            'Severity_of_manic/irritable_episodes','Mean_time_to_identify_matches',
            'Bipolar_disorder_status','Weight']

# Special answer codes and their replacements, matched position by position:
# -1, -3, -121 and -818 become missing, -2 becomes 999 and -10 becomes 0.5.
# NOTE(review): presumably these are the questionnaire's "do not know" /
# "prefer not to answer" / "less than one" style codes; confirm the meaning of
# each code (and of the 999 placeholder) against the data dictionary.
special_codes = [-1, -2, -3, -10, -121, -818]
# np.nan instead of np.NaN: the np.NaN alias no longer exists in NumPy >= 2.0.
recoded_values = [np.nan, 999, np.nan, 0.5, np.nan, np.nan]

for col in variables:
    df_[col] = df_[col].replace(special_codes, recoded_values)

In [None]:
# Display the current (n_rows, n_columns) of df_ as a quick sanity check.
df_.shape

In [None]:
# Columns whose special negative answer codes are recoded below. This rebinds
# the notebook-level name `variables`.
# NOTE(review): 'Family_relationship satisfaction' contains a space rather than
# an underscore; verify it matches the renamed column exactly.
variables = ['Standing_height','Fasting_time','Number_cancers',
            'Number_operation','Job_walking_standing','Job_heavy_manual',
            'Job_involves_shift','Frequency_friend_family_visits',
            'Time_outdoors_summer','Time_outdoors_winter','Length_phone_use',
            'Weekly_phone_use','Difference_phone_use','Getting_up_in_morning',
            'Morning_evening_person','Snoring','Coffee_intake','Skin_colour',
            'Mood_swings','Miserableness','Irritability','Sensitivity',
            'Fed-up_feelings','Nervous_feelings','Worrier/Anxious_feelings',
            'Tense','Worry_too_long_after_embarrassment','Suffer_from_nerves',
            'Loneliness_isolation','Guilty_feelings','Risk_taking','Able_to_confide',
            'Overall_health_rating','Plays_computer_games','Hearing_difficulty/problems',
            'Use_of_sun/uv_protection','Frequency_of_solarium/sunlamp','Had_major_operation',
            'Cancer_diagnosed_by_doctor','Fractured/broken_bones','Other_serious_condition',
            'Health_satisfaction','Family_relationship satisfaction','Friendships_satisfaction',
            'Financial_satisfaction','Ever_manic/hyper','Ever_highly_irritable/argumentative',
            'Leg_pain_on_walking','Tinnitus','Length_manic/irritable_episode',
            'Severity_of_manic/irritable_episodes','Mean_time_to_identify_matches',
            'Had_other_major_operations',
            'Happiness',
            'Work/job_satisfaction',
            'Cancer_age_first',
            'Bipolar_disorder_status','Weight']

# Special answer codes and their replacements, matched position by position:
# -1, -3, -121 and -818 become missing, -2 becomes 999 and -10 becomes 0.5.
# NOTE(review): presumably these are the questionnaire's "do not know" /
# "prefer not to answer" / "less than one" style codes; confirm the meaning of
# each code (and of the 999 placeholder) against the data dictionary.
special_codes = [-1, -2, -3, -10, -121, -818]
# np.nan instead of np.NaN: the np.NaN alias no longer exists in NumPy >= 2.0.
recoded_values = [np.nan, 999, np.nan, 0.5, np.nan, np.nan]

for col in variables:
    df_[col] = df_[col].replace(special_codes, recoded_values)

In [None]:
# List every column that still holds at least one value below zero, i.e. a
# negative code that survived the recoding steps.
columns_with_negatives = df_.columns[(df_ < 0).any()]
columns_with_negatives

In [None]:
# Persist the cleaned dataframe as CSV: header row included, row index omitted.
df_.to_csv(
    r'/Users/marinacamacho/Desktop/Master_I/Raw_Data/Time_0/ukb669914_clean.csv',
    header=True,
    index=False,
)