In [None]:
# Import necessary libraries 
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import numpy as np
import pandas as pd
import pickle
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

In [None]:
# Import dataset
df = pd.read_csv('../Processed datasets/Before splitting/DF_with_target.csv')

In [None]:
# Drop duplicates in the dataset 
df = df.drop_duplicates()

#### Create overview of participant characteristics

In [None]:
# Create a dataframe consisting of the basic characteristics of the participants 
basics_target = df[['YOB_C', 'ETHN_DS', 'SEX', 'diabetic_outcome']].copy()

In [None]:
# Age in 2008: YOB_C is added to 1900 to obtain the full year of birth
year_of_birth = basics_target.loc[:, 'YOB_C'] + 1900
basics_target.loc[:, 'AGE_PHASE_9'] = 2008 - year_of_birth

# Histogram of participant age, drawn through the Axes object returned by pandas
ax = basics_target['AGE_PHASE_9'].plot.hist(bins=23, grid=True, rwidth=0.9, color='gray')

# Horizontal grid lines only
ax.yaxis.grid(True)
ax.xaxis.grid(False)

ax.set_title('Histogram of Participant Age in 2008')
ax.set_xlabel('Age')
ax.set_ylabel('Counts')
ax.grid(axis='y', alpha=0.75)

In [None]:
# Print the ethnicity, sex and diabetic outcome distributions (before any participants are excluded)
print(basics_target['ETHN_DS'].value_counts())
print(basics_target['SEX'].value_counts())
print(basics_target['diabetic_outcome'].value_counts())

In [None]:
# Print the number of rows, number of columns,  and datatypes
print('Number of rows:', df.shape[0])
print('Number of columns:', df.shape[1])
print('Column Data Types:', df.dtypes)

#### Remove features used during target creation

In [None]:
# Make a list of columns that were used for creating the target 
used_target_creation =[ 'JDIABYR',"JDRG0611","JDRG6112", 
                       "JDRG6110", "JDRG6111", "JDRG0612", 
                       "JDRG6122", "JDRG6120", "JDRG6123", 
                       "JDRG6121", "JHBA"]

In [None]:
# Delete the columns that were used for creating the target (only those actually present)
present_target_columns = [col for col in used_target_creation if col in df.columns]
df = df.drop(columns=present_target_columns)

In [None]:
# Make a list of features that provide direct information about the outcome 
diabetes_features = ['JDMKNOWN',  'JDIABDR1', 'JDIABDR2', 'JDIABDR3', 'JDIABDR4',
                      'JDMKNCUM',  'JDMADATO', 'JDMWHOTO', 'JDMINCUM', 
                     'JDRG611', 'JDRG612', 'JDIABDRG', 'JPRDIADR','JINCLAU_L', 'JDRG22', 
                     'JDRG25', 'JDRG66', 'JDRG255', 'JDRG412', 'JDRG29', 'JDRG26', 'JLIPDRG', 
                     'JDIABET' , 'JDIABET1', 'JHRSFAST', 'JFASTING', 'JDMGCL2H', 'JDMGCL_F', 
                     'JGLUC_F', 'JINSU_F', 'MTIMGLU','JFASTED', 'JTIMGLU','JFOODYT', 'JFAINTED' , 
                     'JBLTRIES','JTIMSMP1', 'JTIMSMP2']

In [None]:
# Drop the features that provide direct information about the outcome (only those actually present)
leaking_columns = [col for col in diabetes_features if col in df.columns]
df = df.drop(columns=leaking_columns)

In [None]:
# Keep only participants whose diabetic_outcome is not 1, 3 or 4; this removes the people that have
# diabetes type 1, only have diabetes type 2 in phase 9, or have diabetes type 2 in both phase 9 and 11
non_diabetes_type_134_rows = ~df['diabetic_outcome'].isin([1, 3, 4])
df = df[non_diabetes_type_134_rows]

### Get an overview of percentage of missing values per feature

In [None]:
# Define a function that calculates the percentage of missing values per column
def missing_values_columns(dataframe):
    """Return the percentage of missing values for every column of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per input column (indexed by column name) with the columns
        ``column_name`` and ``percent_missing``, sorted ascending by
        ``percent_missing``.
    """
    n_rows = len(dataframe)
    share_missing = dataframe.isnull().sum() * 100 / n_rows
    overview = pd.DataFrame({'column_name': dataframe.columns,
                             'percent_missing': share_missing})
    return overview.sort_values('percent_missing')

In [None]:
# Create a dataframe that presents the percentage of missing values per column 
missing_df = missing_values_columns(df)

In [None]:
# Plot the missing data as a histogram 
missing_df['percent_missing'].hist(bins=100)
plt.title('Histogram to visualize % missing data of columns')
plt.xlabel('Percentage of data missing')
plt.ylabel('Number of columns')
plt.show()

#### Handle the missing values for the basic data

In [None]:
# Create a dataframe with only the basic characteristics
basics = df[['YOB_C', 'ETHN_DS', 'SEX']]

In [None]:
# Print the amount of missing values per column
basics.isna().sum()

In [None]:
# Replace the missing values with a 1, equalling white 
df['ETHN_DS'] = df['ETHN_DS'].fillna(1)

#### Create function to remove columns with a lot of missing values

In [None]:
# Create a function that removes the features with missing data above a chosen threshold
def remove_columns_with_missing_values(dataframe, missing_values_dataframe, threshold):
    """Drop every column whose missing percentage is strictly above ``threshold``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to remove columns from; it is not modified.
    missing_values_dataframe : pandas.DataFrame
        Overview whose first column (by position) holds column names and whose
        second column (by position) holds the percentage of missing values.
    threshold : float
        Columns with a percentage greater than this value are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the selected columns. The number of removed
        columns is printed.
    """
    names = missing_values_dataframe.iloc[:, 0]
    percentages = missing_values_dataframe.iloc[:, 1]
    remove = names[percentages > threshold].tolist()
    print("The number of columns that will be removed is",len(remove))
    return dataframe.drop(columns=remove)

#### Drop columns with only missing values

In [None]:
df = df.dropna(axis=1, how='all')

#### Check for columns with only 1 possible value

In [None]:
# Collect the features that have exactly 1 distinct value (nunique() == 1), as these are deemed useless for prediction
one_value_features = [column for column in df.columns if df[column].nunique() == 1]

one_value_features

In [None]:
# Create a function that removes a list of columns from a dataframe
def remove_columns(df, list):
    """Return a new dataframe without the columns named in ``list``.

    ``df`` itself is left untouched. Note that the parameter name ``list``
    shadows the builtin inside this function; it is kept for compatibility
    with existing callers.
    """
    columns_to_drop = list
    return df.drop(columns=columns_to_drop)

In [None]:
# Remove all features that have the same value for all participants
df = remove_columns(df, one_value_features)

#### Divide the dataframe into multiple dataframes corresponding to the type of question (questionnaire, FFQ, clinical events and clinical measures)

In [None]:
# Load the feature dictionary from file; it is used below to split df into one dataframe per dictionary key
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so only load this pickle
# if it was produced by a trusted step of this project.
with open('../Processed datasets/Before splitting/feature_dictionary.pkl', 'rb') as f:
    columns_whitehall = pickle.load(f)

In [None]:
# Create a function that creates separate dataframes based on the feature dictionary
def create_dataframes_from_dict(df, feature_dict):
    """Split ``df`` into one dataframe per key of ``feature_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain the participant id column ``Id_random_DPUK``.
    feature_dict : dict
        Maps a group key to an iterable of column names belonging to that group.

    Returns
    -------
    dict
        Maps each key that has at least one of its columns present in ``df``
        to an independent copy of ``df`` restricted to ``Id_random_DPUK``
        followed by those columns. Keys without any matching column are
        omitted.
    """
    id_column = 'Id_random_DPUK'
    result_dataframes = {}
    for key, columns in feature_dict.items():
        # The id column is always added in front, so skip it here to avoid a duplicate column
        feature_columns = [column for column in columns
                           if column in df.columns and column != id_column]
        # Test the matched feature columns: a list seeded with the id column is never empty
        if feature_columns:
            # Copy so that later cleaning of a sub-frame cannot write back into (or warn about) df
            result_dataframes[key] = df[[id_column] + feature_columns].copy()
    return result_dataframes

In [None]:
# Apply function to arrive multiple seperate dataframes for further cleaning
result_dataframes = create_dataframes_from_dict(df, columns_whitehall)

In [None]:
# Retrieve datasets from created dictionary using the keys 
questionnaire_data = result_dataframes.get("questionnaire_fts")
FFQ_data = result_dataframes.get("FFQ_fts")
clinical_measure_data = result_dataframes.get('clinical_measures_fts')
clinical_events_data = result_dataframes.get('clinical_events_fts')

In [None]:
# Create a list with all columns from the FFQ data
FFQ_columns = FFQ_data.columns.tolist()

In [None]:
# Remove the administrative variables from the list.
# A filter is used instead of list.remove so the cell can be re-run and does not fail
# when one of the administrative columns is absent.
admin = ['Id_random_DPUK','JFFQDOC','JFFQDOR']
FFQ_columns = [column for column in FFQ_columns if column not in admin]

# Remove participants that did not fill in the FFQ (all FFQ answers missing)
# NOTE(review): only FFQ_data and df are filtered here; questionnaire_data, clinical_measure_data and
# clinical_events_data were created earlier and still contain these participants - confirm this is intended.
FFQ_data = FFQ_data.dropna(subset=FFQ_columns, how='all')
df = df.dropna(subset=FFQ_columns, how='all')

In [None]:
# Print the ethnicity, sex and diabetic outcome distributions of the participants remaining in df
print(df['ETHN_DS'].value_counts())
print(df['SEX'].value_counts())
print(df['diabetic_outcome'].value_counts())

In [None]:
# Derive the age of the remaining participants from their year of birth.
# The age is computed from the current df: basics_target was built before any participants were
# excluded, so plotting it again would just reproduce the first histogram.
age_after_exclusions = 2008 - (df['YOB_C'] + 1900)

# Plot the age of participants with a histogram
ax = age_after_exclusions.plot.hist(bins=23, grid=True, rwidth=0.9,
                                    color='gray')

# Horizontal grid lines only
ax.yaxis.grid(True)
ax.xaxis.grid(False)

ax.set_title('Histogram of Participant Age in 2008')
ax.set_xlabel('Age')
ax.set_ylabel('Counts')
ax.grid(axis='y', alpha=0.75)

#### Questionnaire data

In [None]:
missing_df_questionnaire = missing_values_columns(questionnaire_data)
missing_df_questionnaire['percent_missing'].hist(bins=100)

In [None]:
# Sort the dataframe to determine which column has the most missing data 
missing_df_questionnaire.sort_values(by=['percent_missing'], ascending=False)

#### Medical consultation

In [None]:
questionnaire_data_processed = questionnaire_data.copy()

In [None]:
# Create a function that applies a specified value to a column if a set of specific other columns are all NaNs
def assign_value_based_on_nan(df, column_list, target_column, value, overwrite=True):
    """Set ``target_column`` to ``value`` on rows where all of ``column_list`` are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_list : list
        Columns that must all be missing on a row for the row to be updated.
    target_column : str
        Column that receives ``value``.
    value : scalar
        Value to assign.
    overwrite : bool, default True
        With the default, ``target_column`` is replaced on every matching row,
        including rows where it already holds an answer. Pass ``False`` to
        fill only rows where ``target_column`` itself is missing.

    Returns
    -------
    pandas.DataFrame
        A modified copy of ``df``.
    """
    df_modified = df.copy()
    condition = df_modified[column_list].isna().all(axis=1)
    # Optionally protect answers that are already present in the target column
    if not overwrite and target_column in df_modified.columns:
        condition = condition & df_modified[target_column].isna()
    df_modified.loc[condition, target_column] = value
    return df_modified

In [None]:
# Assign the value 2, equalling no, to the longstanding illness question if all follow-up questions were left unanswered 
medical_conditions = ['JLONGIL1','JLONGIL2', 'JLONGIL3', 'JLONGIL4',
                      'JLONGIL5','JLONGIL6', 'JLONGIL7', 'JLONGIL8',
                      'JLONGIL9', 'JLONGL10' ]

questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, medical_conditions, 'JLONGILL', 2)
df = assign_value_based_on_nan(df, medical_conditions, 'JLONGILL', 2)

#### Medicine

In [None]:
# Create a function that creates a list based on the first few characters of a feature
def extract_strings_starting_with(dataframe, column, string):
    """Return the values of ``dataframe[column]`` whose string form starts with ``string``.

    The values are returned as a list, in their original order and with their
    original type; the string conversion is used only for the prefix test.
    """
    values = dataframe[column]
    matches = values.astype(str).str.startswith(string)
    return values[matches].tolist()

In [None]:
# Create a function that can be used to filter a list
def filter_list(list, starts_with):
    """Return the strings in ``list`` that start with at least one prefix in ``starts_with``.

    The order of ``list`` is preserved. Note that the parameter name ``list``
    shadows the builtin inside this function; it is kept for compatibility
    with existing callers.
    """
    prefixes = tuple(starts_with)
    return [s for s in list if s.startswith(prefixes)]

In [None]:
# Extract all features related to medicine by filtering on JPR
JPR_list = extract_strings_starting_with(missing_df_questionnaire, 'column_name', 'JPR')

In [None]:
# Extract the open questions from the list of features related to medicine
open_questions = filter_list(JPR_list, ['JPRESD', 'JPRSDR'])

In [None]:
# Remove one feature that is not an open question
open_questions.remove('JPRESDOC')

In [None]:
# Set 'have you been taking any medicines' to 2 (no) if all the follow up questions are unanswered 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, open_questions, 'JPRESDOC', 2)
df = assign_value_based_on_nan(df, open_questions, 'JPRESDOC', 2)

In [None]:
# Drop the open questions 
questionnaire_data_processed = questionnaire_data_processed.drop(columns=open_questions)
df = df.drop(columns=open_questions)

#### Chest pain

In [None]:
# Create a function that assigns the value 2 (no) to a set of columns if the value of a specified column equals a specified value
def set_columns_to_two(dataframe, condition_column, target_columns, value):
    """Fill ``target_columns`` with 2 on rows that match a condition.

    A row is updated only when ``condition_column`` equals ``value`` AND every
    column in ``target_columns`` is missing on that row.

    ``dataframe`` is modified in place and also returned.
    """
    all_targets_missing = dataframe[target_columns].isna().all(axis=1)
    rows_to_fill = (dataframe[condition_column] == value) & all_targets_missing
    dataframe.loc[rows_to_fill, target_columns] = 2
    return dataframe

In [None]:
# Create a function that sets specified columns to 0 based on whether a specified column equals a set value
def set_columns_to_zero(dataframe, condition_column, target_columns, value):
    """Fill ``target_columns`` with 0 on rows that match a condition.

    A row is updated only when ``condition_column`` equals ``value`` AND every
    column in ``target_columns`` is missing on that row.

    ``dataframe`` is modified in place and also returned.
    """
    all_targets_missing = dataframe[target_columns].isna().all(axis=1)
    rows_to_fill = (dataframe[condition_column] == value) & all_targets_missing
    dataframe.loc[rows_to_fill, target_columns] = 0
    return dataframe

In [None]:
# Create a list of questions that are follow-up questions of the question asking whether someone has pain in their chest
follow_up_questions_chest_pain = ['JCHPUPH', 'JCHPLEV', 'JCHPACT', 'JCHPSTOP', 'JCHPTIME', 'JCHPDOC', 'JCHPNUM']

# Set the follow-up questions to 0 for participants who answered 2 to JCHPAIN and left all follow-up questions unanswered
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JCHPAIN',follow_up_questions_chest_pain ,2)
df = set_columns_to_zero(df,'JCHPAIN' , follow_up_questions_chest_pain, 2)

In [None]:
# Set the answer to JCHPEXT to 2 (no) if the answer to JCHPAIN was 2 (no), because participants were asked to skip that question in that case
questionnaire_data_processed = set_columns_to_two(questionnaire_data_processed, 'JCHPAIN',['JCHPEXT'] ,2)
df = set_columns_to_two(df,'JCHPAIN',['JCHPEXT'], 2)

In [None]:
# Set the question about severe pain across the front of the chest to 2 (no) if the follow up questions were left unanswered
follow_up_questions_chest_pain = ['JCHPDOC', 'JCHPNUM']
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, follow_up_questions_chest_pain, 'JCHPEXT', 2)
df = assign_value_based_on_nan(df, follow_up_questions_chest_pain, 'JCHPEXT', 2)

In [None]:
# Set the number of pain attacks to 0 if someone answered 2 (no) to having severe pain across the chest 
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JCHPEXT',follow_up_questions_chest_pain ,2)
df = set_columns_to_zero(df,'JCHPEXT', follow_up_questions_chest_pain, 2)

In [None]:
# Create list of open questions related to chest pain
open_questions_chest_pain = ['JCHPSIT1', 'JCHPSIT2', 'JCHPSIT3', 'JCHPSIT4', 
                             'JCHPSIT5', 'JCHPSIT6', 'JCHPSIT7', 'JCHPSIT8', 
                             'JCHPSIT9', 'JCHPDIAG']

# Delete the open questions about chest pain
for col in open_questions_chest_pain:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Tests and treatments

In [None]:
# If someone was redirected to question 106, the answer to the skipped questions is set to 0 (a new category)
questionnaire_data_processed.loc[(questionnaire_data_processed['JLR2005'] == 1) & (questionnaire_data_processed['JLRCLGD'].isna()), 'JLRCLGD'] = 0
df.loc[(df['JLR2005'] == 1) & (df['JLRCLGD'].isna()), 'JLRCLGD'] = 0

In [None]:
# If someone has no first treatment (first question answered with 2), there was also not a second treatment:
# the unanswered second question is set to 2 as well
first_and_second_treatment = [('JEXECG1', 'JEXECG2'),
                              ('JAGRAM1', 'JAGRAM2'),
                              ('JAPLAS1', 'JAPLAS2'),
                              ('JADMCH1', 'JADMCH2')]

for first, second in first_and_second_treatment:
    # Apply the same rule to the questionnaire subset and to the full dataframe
    for frame in (questionnaire_data_processed, df):
        frame.loc[(frame[first] == 2) & frame[second].isna(), second] = 2

In [None]:
# Set all remaining NaNs to 2, assuming that people who did not answer did not have treatment
# NOTE(review): 'JCABG1' is listed without a matching 'JCABG2', unlike the other treatments - confirm this is intended.
test_and_treatments = ['JEXECG1', 'JEXECG2', 
                       'JAGRAM1', 'JAGRAM2', 
                       'JADMCH1', 'JADMCH2',
                       'JAPLAS1', 'JAPLAS2',
                       'JCABG1' ]

# Apply the same fill to the questionnaire subset and to the full dataframe
for column in test_and_treatments:
    questionnaire_data_processed[column] =  questionnaire_data_processed[column].fillna(2)
    df[column] = df[column].fillna(2)

In [None]:
# Create a list of all open questions related to treatment 
open_questions_treatments = ['JEXECGY1', 'JEXECGY2', 
                       'JAGRAMY1', 'JAGRAMY2', 
                       'JADMCHY1', 'JADMCHY2',
                       'JAPLASY1', 'JAPLASY2',
                       'JCABGY1','JCABGY2',
                       'JOHTOAY1','JOHTOAT1', 
                       'JOHTOAT2', 'JOHTOAT3', 
                       'JOHTOAT4', 'JOHTOAT5']

# Delete all the open questions related to treatment
for col in open_questions_treatments:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Angina

In [None]:
# Create a list of open questions related to angina
open_questions_angina = ['JOHTDX1']

# Assign a 2 (no) to the question about whether someone had heart trouble if the follow-up questions were left unanswered 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, open_questions_angina, 'JOHT', 2)
df = assign_value_based_on_nan(df, open_questions_angina, 'JOHT', 2)

#### Hospital admittance

In [None]:
# Create a list of open questions about hospital admittance
open_questions_admittance = ['JHS1RSN', 'JHS2RSN', 'JHS3RSN', 'JHS4RSN',
                         'JHS1MNTH', 'JHS2MNTH', 'JHS3MNTH', 'JHS4MNTH',
                         'JHS1YR', 'JHS2YR', 'JHS3YR', 'JHS4YR']


# Assign a 2 (no) to the question about hospital admittance (JHSADMYR) if the follow-up questions were left unanswered.
# The target used to be 'JOHT' (the heart-trouble question handled in the angina section), which looks like a
# copy-paste slip: it changed the heart-trouble answer and left the admittance question untouched.
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, open_questions_admittance, 'JHSADMYR', 2)
df = assign_value_based_on_nan(df, open_questions_admittance, 'JHSADMYR', 2)

In [None]:
# Set the number of hospital admittance to 0 if someone answered not to been admitted to the hospital
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JHSADMYR', ['JHSADMNO'], 2)
df = set_columns_to_zero(df, 'JHSADMYR', ['JHSADMNO'], 2)

#### Neurological symptoms

In [None]:
# Create a list of open questions related to neurological symptoms 
open_questions_neurological = ['JNSTAM1', 'JNSTAY1', 'JNSTAM2', 'JNSTAY2']

# Set the answer to the question about whether someone has been treated by a docter to 2 (no) if the follow-up quesitons were left unanswered 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, open_questions_neurological, 'JNSTADOC', 2)

# Set the answer to the question about whether someone had a slurred speech to 2 (no) if the follow-up quesitons were left unanswered 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, open_questions_neurological, 'JNSTALK', 2)

df = assign_value_based_on_nan(df, open_questions_neurological, 'JNSTADOC', 2)
df = assign_value_based_on_nan(df, open_questions_neurological, 'JNSTALK', 2)

#### Stroke symptoms

In [None]:
# Create a list of open questions related to strokes 
open_questions_stroke = ['JSTRKM1', 'JSTRKY1', 'JSTRKM2', 'JSTRKY2']

# Set the answer to the question about having had a stroke to 2 (no) if the follow up questions are unanswered 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, open_questions_stroke, 'JSTRKDOC', 2)
df = assign_value_based_on_nan(df, open_questions_stroke, 'JSTRKDOC', 2)

In [None]:
# Set the answer to the question about having had a stroke to 2 (no) if the question about being treated was skipped 
follow_up_questions_stroke = ['JSTRKDOC']
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, 
                                                         follow_up_questions_stroke, 'JSTROKE', 2)
df = assign_value_based_on_nan(df, follow_up_questions_stroke, 'JSTROKE', 2)

In [None]:
# Drop open questions related to diseases
# (a stray ')' before the closing bracket made this list a SyntaxError)
open_questions_disease = ['JSTRKM1', 'JSTRKY1', 'JSTRKM2', 'JSTRKY2',
                         'JNSTAM1', 'JNSTAY1', 'JNSTAM2', 'JNSTAY2',
                         'JHS1RSN', 'JHS2RSN', 'JHS3RSN', 'JHS4RSN',
                         'JHS1MNTH', 'JHS2MNTH', 'JHS3MNTH', 'JHS4MNTH',
                         'JHS1YR', 'JHS2YR', 'JHS3YR', 'JHS4YR',
                         'JOHTDX1', 'JOHTDX2', 'JOHTDX3', 'JOHTDX4', 'JSTRDIAG']

# Drop each listed column that is present in the questionnaire subset from both dataframes
for col in open_questions_disease:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Visual symptoms

In [None]:
# Set the follow-up question JNSVIDOC to 2 (no) for participants who answered 2 to JNSVISUA and left JNSVIDOC unanswered
questionnaire_data_processed = set_columns_to_two(questionnaire_data_processed, 'JNSVISUA', 
                                                   ['JNSVIDOC'], 2)

df = set_columns_to_two(df, 'JNSVISUA',  ['JNSVIDOC'], 2)

In [None]:
# Drop open questions related to visual symptoms
open_questions_visual_symptoms = ['JNSVISM1', 'JNSVISY1',
                                  'JNSVISM2', 'JNSVISY2',
                                  'JNSVISYM']

for col in open_questions_visual_symptoms:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Open questions vigorous physical activity

In [None]:
# Drop the open questions related to physical activity 
open_questions_physical_activities = ['JVIG_OB1', 'JVIG_OB2', 'JVIG_OB3', 'JVIG_OB4', 'JVIG_OB5', 'JVIG_OBF', 'JVIG_OBH',
                                      'JPHYSA11', 'JPHYSA12', 'JPHYSA13', 'JPHYSA1F', 'JPHYSA1H', 'JPHYSA21', 'JPHYSA22', 
                                      'JPHYSA23', 'JPHYSA2F', 'JPHYSA2H', 'JDIY11', 'JDIY12', 'JDIY13', 'JDIY1F', 'JDIY1H']

for col in open_questions_physical_activities:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Open questions sport

In [None]:
# Drop open questions related to sports 
open_questions_sport = ['JSPORT11', 'JSPORT12', 'JSPORT13', 'JSPORT1F', 'JSPORT1H', 
                        'JSPORT21', 'JSPORT22', 'JSPORT23', 'JSPORT2F', 'JSPORT2H']

for col in open_questions_sport:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Jobs

In [None]:
# Set the answer to the question about employment to 1 (employee) if the two follow-up questions were left unanswered 
# because participants were asked to skip these when being a employee
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed,['JLREMPPL', 'JLREPPLN'] ,'JLREMPEE', 1)
questionnaire_data_processed = set_columns_to_two(questionnaire_data_processed, 'JLREMPEE',['JLREMPPL', 'JLREPPLN'], 1)

df = assign_value_based_on_nan(df,['JLREMPPL', 'JLREPPLN'] ,'JLREMPEE', 1)
df = set_columns_to_two(df, 'JLREMPEE',['JLREMPPL', 'JLREPPLN'], 1)

In [None]:
# Set the answer to the question about being a manager to 3 (no) if the next questions was skipped, because participants were redirected 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed,['JLRECHNO'] ,'JLRECHAR', 3)
df = assign_value_based_on_nan(df,['JLRECHNO'] ,'JLRECHAR', 3)

In [None]:
# Drop open questions related to civil services 
questionnaire_data_processed = questionnaire_data_processed.drop(columns=['JLRROUTO', 'JGRLUMP'])
df = df.drop(columns=['JLRROUTO', 'JGRLUMP'])

In [None]:
# Create a new category (0) for the question about how many people someone manages if someone answered not to be a manager 
questionnaire_data_processed['JLRECHNO'] =  questionnaire_data_processed['JLRECHNO'].fillna(0)
df['JLRECHNO'] =  df['JLRECHNO'].fillna(0)

#### Arthritis

In [None]:
# Create a list of open questions and closed questions related to arthritis (paired by position)
open_questions_arthritis = ['JOST_AYR', 'JRHE_AYR', 'JGOUT_YR', 'JOST_PYR']
closed_questions = ['JOST_ART', 'JRHE_ART', 'JGOUT', 'JOST_POR']

for closed_question, open_question in zip(closed_questions, open_questions_arthritis):
    # Set the answer to the closed question to 2 (no) if the corresponding open question was left unanswered
    questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, [open_question], closed_question, 2)
    df = assign_value_based_on_nan(df, [open_question], closed_question, 2)

    # Then set the unanswered open question to 0 where the closed question equals 2
    questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, closed_question, [open_question], 2)
    df = set_columns_to_zero(df, closed_question, [open_question], 2)

#### Providing care

In [None]:
# If someone did not answer the questions about providing regular care, there is assumed that they do not provide regular care
care_options_list = ['JCARCH','JCARGC','JCARFR', 'JCARRL', 'JCARPA', 'JCARSP']
for column in care_options_list:
    questionnaire_data_processed[column] = questionnaire_data_processed[column].fillna(2)
    df[column] = df[column].fillna(2)

# Set the number of hours spend on providing care to 0 if respondent answered 2 (no) to the question about providing care.
# The hours columns are listed in the SAME order as care_options_list (each is the care option name + 'HR').
# The two lists were previously ordered differently, so e.g. JCARFR was paired with JCARSPHR.
care_open_questions = ['JCARCHHR', 'JCARGCHR', 'JCARFRHR', 'JCARRLHR', 'JCARPAHR', 'JCARSPHR']
for care_option, care_hours in zip(care_options_list, care_open_questions):
    questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, care_option,
                                                       [care_hours], 2)
    df = set_columns_to_zero(df, care_option, [care_hours], 2)

#### Claudication

In [None]:
# If someone did not answer the question about being told to have bad circulation in the arteries of their legs
# there is assumed that they have never been told (2)
questionnaire_data_processed['JINCLAU'] =  questionnaire_data_processed['JINCLAU'].fillna(2)
df['JINCLAU'] =  df['JINCLAU'].fillna(2)

# Drop open question about claudation
questionnaire_data_processed = questionnaire_data_processed.drop(columns='JINCLAYR')
df = df.drop(columns='JINCLAYR')

#### Housework

In [None]:
# Drop open questions related to work around the house 
open_q = ['JHOUSW11', 'JHOUSW12', 'JHOUSW13', 'JHOUSW1F', 'JHOUSW1H', 'JHOUSW2F', 
          'JHOUSW2H', 'JHOUSW21', 'JHOUSW22', 'JHOUSW23', 'JHOUSW2F', 'JHOUSW2H']

for col in open_q:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Open questions community work

In [None]:
# Drop open questions related to community work
open_questions_cw = ['JSFOTH1','JSFOTH2','JSFOTH3','JSFOTH4','JSFOTH5','JSFOTH6', 
                     'JSFOTH1F', 'JSFOTH2F', 'JSFOTH3F','JSFOTH4F', 'JSFOTH5F', 'JSFOTH6F']

for col in open_questions_cw:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Open questions garden

In [None]:
# Drop open questions related to gardening 
open_questions_garden = ['JGARDN11', 'JGARDN12', 'JGARDN13', 'JGARDN1F', 'JGARDN1H']

for col in open_questions_garden:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Smoking

In [None]:
# If someone did not smoke in the past, they get redirected to question 76 and they skip the question about when they stopped smoking
# Therefore, if someone did not answer the question about when they stopped smoking, smoking in the past is set to 2 (no)
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, ['JSMKSTOP'], 'JSMKPAST', 2)
df  = assign_value_based_on_nan(df, ['JSMKSTOP'], 'JSMKPAST', 2)

# If someone does not smoke, the number of cigarettes per day is 0
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JSMOKE', ['JCIGNUM'], 2)
df = set_columns_to_zero(df, 'JSMOKE', ['JCIGNUM'], 2)

# If someone did not answer how old they were when they stopped smoking, there is assumed that they got redirected
# The answer to the question of how old they were when they stopped smoking is set to 0, considering there is expected that the younger
# People stop with smoking, the less T2D risk
questionnaire_data_processed['JSMKSTOP'] = questionnaire_data_processed['JSMKSTOP'].fillna(0)
df['JSMKSTOP'] = df['JSMKSTOP'].fillna(0)

#### Alcohol consumption

In [None]:
# Set the answer to the question about being a non-drinker to 1 (yes) if all follow-up questions were left unanswered
# Because respondents got redirected to the next part of the survey
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, ['JALCWK', 'JSPRTWK', 'JWINEWK', 'JBEERWK',
                                                                                         'JDRNKHNG', 'JDRNKGLT', 'JDRNKANN', 'JDRNKCUT'], 'JNONDRNK', 1)

df = assign_value_based_on_nan(df, ['JALCWK', 'JSPRTWK', 'JWINEWK', 'JBEERWK','JDRNKHNG', 'JDRNKGLT', 'JDRNKANN', 'JDRNKCUT'], 'JNONDRNK', 1)

In [None]:
# Add a new category represented by 0 for the question about having had an alcoholic drink in the last seven days
# that represents people who have never drunk an alcoholic drink (JNONDRNK == 1)
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JNONDRNK', ['JALCWK'], 1)
df = set_columns_to_zero(df, 'JNONDRNK', ['JALCWK'], 1)

# Someone who had a drink in the last seven days (JALCWK == 1) is not a non-drinker (JNONDRNK = 2)
questionnaire_data_processed = set_columns_to_two(questionnaire_data_processed, 'JALCWK', ['JNONDRNK'], 1)
df = set_columns_to_two(df, 'JALCWK', ['JNONDRNK'], 1)

# Set 'have you had an alcoholic drink in the last seven days' to 2 (no) if no amount of drinks was entered.
# Rows already placed in the 0 (never drinks) category are skipped: non-drinkers have no amounts either, so an
# unconditional overwrite would turn every 0 into 2 and erase the category that was just created.
weekly_drink_columns = ['JSPRTWK', 'JWINEWK', 'JBEERWK']
for frame in (questionnaire_data_processed, df):
    no_amounts_entered = frame[weekly_drink_columns].isna().all(axis=1)
    frame.loc[no_amounts_entered & (frame['JALCWK'] != 0), 'JALCWK'] = 2

# If someone did not drink the last 7 days or never had an alcoholic drink, the amount of drinks during last week are all set to 0
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JALCWK', weekly_drink_columns, 2)
df = set_columns_to_zero(df, 'JALCWK', weekly_drink_columns, 2)

questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JALCWK', weekly_drink_columns, 0)
df = set_columns_to_zero(df, 'JALCWK', weekly_drink_columns, 0)

# If someone never had an alcoholic drink (JNONDRNK == 1), the questions regarding abusive drinking are all set to 2 (no).
# The condition value used to be 2, which selects the drinkers instead of the non-drinkers described here.
questionnaire_data_processed = set_columns_to_two(questionnaire_data_processed, 'JNONDRNK', ['JDRNKHNG', 'JDRNKGLT', 'JDRNKANN', 'JDRNKCUT'], 1)
df = set_columns_to_two(df, 'JNONDRNK', ['JDRNKHNG', 'JDRNKGLT', 'JDRNKANN', 'JDRNKCUT'], 1)

# Because of the assumption that someone only fills in the number of drinks of the category that he/she drinks and
# leaves the category he/she does not drink blank, the NaNs are filled with 0
for column in weekly_drink_columns:
    questionnaire_data_processed[column] = questionnaire_data_processed[column].fillna(0)
    df[column] = df[column].fillna(0)

#### Remove built environment

In [None]:
# Extract all features related to built environment
BE_list = extract_strings_starting_with(missing_df_questionnaire, 'column_name', 'JBE')

In [None]:
# Remove the features related to beer consumption from the built environment list
# (they also start with 'JBE'). A filter is used instead of list.remove so the cell can be
# re-run and does not fail when one of the beer columns is absent.
beer = ['JBEERWK','JBERUWK0', 'JBEERWK0']
BE_list = [item for item in BE_list if item not in beer]

In [None]:
# drop the questions related to the built environment 
for col in BE_list:
    if col in questionnaire_data_processed.columns:
        questionnaire_data_processed = questionnaire_data_processed.drop(col, axis=1)
        df = df.drop(col, axis=1)

#### Leg pain

In [None]:
# Extract all features related to leg pain 
JLP_list = extract_strings_starting_with(missing_df_questionnaire, 'column_name', 'JLP')

In [None]:
# Assign a 2 (no) to leg pain if all the follow up questions are unanswered 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, JLP_list, 'JLEGPAIN', 2)
df = assign_value_based_on_nan(df, JLP_list, 'JLEGPAIN', 2)

In [None]:
# Assign a 2 (no) to all the follow up questions if a person has no leg pain 
questionnaire_data_processed = set_columns_to_two(questionnaire_data_processed, 'JLEGPAIN', JLP_list, 2)
df = set_columns_to_two(df, 'JLEGPAIN', JLP_list, 2 )

#### Marriage

In [None]:
# Check if the 'JMARCOH' feature is in the dataframe to use to impute JNOTMAR
'JMARCOH' in missing_df_questionnaire['column_name'].values

In [None]:
# Create a new category 'married/partnership' (0) for the 'JNOTMAR' feature if
# the participant answered that they are married/cohabiting/in a civil partnership
# in the previous question (JMARCOH == 1) and left JNOTMAR blank.
# The mask is built separately for each frame, like the other imputation cells do,
# so the assignment never depends on both frames sharing the same row index.
for frame in (questionnaire_data_processed, df):
    condition = frame['JNOTMAR'].isna() & (frame['JMARCOH'] == 1)
    frame.loc[condition, 'JNOTMAR'] = 0

In [None]:
# Drop open question related to someones civil status 
questionnaire_data_processed = questionnaire_data_processed.drop(columns='JWDSYEAR')
df = df.drop(columns='JWDSYEAR')

#### Drop four columns with a lot of missing values that cannot be found back in the questionnaire

In [None]:
# Drop four features that could not be found back in the questionnaire with a lot of missing data 
questionnaire_data_processed = questionnaire_data_processed.drop(columns = ['JLRESC', 'JJOBSOC', 'JLRESEG', 'JLRGRLMP'])
df = df.drop(columns = ['JLRESC', 'JJOBSOC', 'JLRESEG', 'JLRGRLMP'])

#### Drop two columns that have more than 50% missing data without a clear reason

In [None]:
questionnaire_data_processed = questionnaire_data_processed.drop(columns = ['JLRROUT','JLGRLUMP'])
df = df.drop(columns = ['JLRROUT','JLGRLUMP'])

#### Pets

In [None]:
# If someone answered that he/she does not have a pet, a new category for the feature pet attachment is created and set to 0 (no pet)
questionnaire_data_processed.loc[(questionnaire_data_processed['JPET'] == 2) & (questionnaire_data_processed['JPETATTA'].isna()) , 'JPETATTA'] = 0 
df.loc[(df['JPET'] == 2) & (df['JPETATTA'].isna()) , 'JPETATTA'] = 0 

#### Snoring

In [None]:
# Set 'do you snore' to 2 (no) if all the follow up questions are unanswered 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, ['JSNORHOW', 'JSNOROFT', 'JSNORBOT'], 'JSNORE', 2)
df = assign_value_based_on_nan(df, ['JSNORHOW', 'JSNOROFT', 'JSNORBOT'], 'JSNORE', 2)

In [None]:
# Create a new category 'not snoring' (0) for the follow up questions and assign the value 0 if someone does not snor
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JSNORE', ['JSNORHOW', 'JSNOROFT', 'JSNORBOT'], 2)
df = set_columns_to_zero(df, 'JSNORE', ['JSNORHOW', 'JSNOROFT', 'JSNORBOT'], 2)

In [None]:
# Also assign the value 0 if someone does not now if he/she snores 
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JSNORE', ['JSNORHOW', 'JSNOROFT', 'JSNORBOT'], 3)
df = set_columns_to_zero(df, 'JSNORE', ['JSNORHOW', 'JSNOROFT', 'JSNORBOT'], 3)

#### Body pain

In [None]:
# Create a list with all the features related to body pain
twelve_months = ['JPNN', 'JPNS', 'JPNUB', 'JPNA', 'JPNLB']

# Each prefix has three linked columns: <prefix>12M (pain in the last 12 months),
# <prefix>_OD (pain on more than three occasions) and <prefix>14D (pain in the last 14 days).
# All masks are computed on questionnaire_data_processed (the frame being filled) and
# applied to `df` as well; `df` is assumed to share its row index -- TODO confirm.
for column in twelve_months:
    col_m = f"{column}12M"
    col_od = f"{column}_OD"
    col_d = f"{column}14D"
    columns_to_check = [col_m, col_od, col_d]

    # If someone had pain during the last 14 days, they had pain during the last 12 months
    recent_pain_unanswered = (
        (questionnaire_data_processed[col_d] == 1)
        & questionnaire_data_processed[col_m].isna()
    )
    questionnaire_data_processed.loc[recent_pain_unanswered, col_m] = 1
    df.loc[recent_pain_unanswered, col_m] = 1

    # If someone did not have pain during the last 12 months,
    # they did not have pain more than three times during the last 12 months or pain during the last 14 days
    for follow_up in (col_d, col_od):
        no_pain_unanswered = (
            (questionnaire_data_processed[col_m] == 2)
            & questionnaire_data_processed[follow_up].isna()
        )
        questionnaire_data_processed.loc[no_pain_unanswered, follow_up] = 2
        df.loc[no_pain_unanswered, follow_up] = 2

    # If they left all the three options blank, there is assumed that they had no pain.
    # One boolean mask replaces the former row-by-row iterrows() loop.
    all_blank = questionnaire_data_processed[columns_to_check].isna().all(axis=1)
    questionnaire_data_processed.loc[all_blank, columns_to_check] = 2
    df.loc[all_blank, columns_to_check] = 2

In [None]:
# Build the full list of body-pain column names: every prefix combined with the
# three question suffixes, in the order 12M, _OD, 14D.
twelve_months = ['JPNN', 'JPNS', 'JPNUB', 'JPNA', 'JPNLB']

twelve_months_complete = [
    f"{column}{suffix}"
    for column in twelve_months
    for suffix in ("12M", "_OD", "14D")
]

In [None]:
# Any body-pain answer that is still missing is set to 2,
# assuming that people who did not answer did not have severe pain.
for frame in (questionnaire_data_processed, df):
    frame[twelve_months_complete] = frame[twelve_months_complete].fillna(2)

#### Employment

In [None]:
# If someone was redirected to question 108, the answer to the skipped questions is set to 0 (a new category representing being employed
questionnaire_data_processed.loc[(questionnaire_data_processed['JLREMPL'] == 1) & (questionnaire_data_processed['JLRNE'].isna()), 'JLRNE'] = 0 
df.loc[(df['JLREMPL'] == 1) & (df['JLRNE'].isna()), 'JLRNE'] = 0   

#### Sports

In [None]:
# A frequency of 0 in <sport>F means the sport is not practised,
# so the matching hours column <sport>H is set to 0 in both frames.
# The mask comes from the questionnaire frame and is re-used for `df`.
sports = ['JSOCCER', 'JGOLF', 'JSWIM']

for sport in sports:
    never_practised = questionnaire_data_processed[f"{sport}F"] == 0
    for frame in (questionnaire_data_processed, df):
        frame.loc[never_practised, f"{sport}H"] = 0

#### Manual lawn mowing

In [None]:
# If someone does not mow their garden, the total hours spent is set to 0 
condition = (questionnaire_data_processed['JMOWF'] == 0) & (questionnaire_data_processed['JMOWH'].isna())
questionnaire_data_processed.loc[condition, ['JMOWH']] = 0
df.loc[condition, ['JMOWH']] = 0

#### Painting

In [None]:
# If someone does not paint or decorate, the total hours spend is set to 0
condition = (questionnaire_data_processed['JPAIDECF'] == 0) & (questionnaire_data_processed['JPAIDECH'].isna())
questionnaire_data_processed.loc[condition, ['JPAIDECH']] = 0
df.loc[condition, ['JPAIDECH']] = 0

#### JLR2005

In [None]:
# If someone did not answer the follow up questions about civil service, they chose 1 and were redirected to 106
# Therefore, if the follow up questions are NaN, the first question is set to 1
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, ['JLRCLGD'], 'JLR2005', 1)
df  = assign_value_based_on_nan(df, ['JLRCLGD'], 'JLR2005', 1)

In [None]:
# If the first question is 1, the follow up questions are assigned to a new category 0 (a category representing leaving civil service before 2005)
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JLR2005', ['JLRCLGD'], 1)
df = set_columns_to_zero(df, 'JLR2005', ['JLRCLGD'], 1)

#### Carwash

In [None]:
# If someone does not wash the car, the total hours spent is set to 0 
condition = (questionnaire_data_processed['JCARWASF'] == 0) & (questionnaire_data_processed['JCARWASH'].isna())
questionnaire_data_processed.loc[condition, ['JCARWASH']] = 0
df.loc[condition, ['JCARWASH']] = 0

#### NaNs for employment

In [None]:
# If someone chose 1 for JLREMPL (question 106), that person gets redirected to question 108 
# Therefore, if the answer to question 107 is NaN, the answer to question 106 is set to 1 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, ['JLRNE'], 'JLREMPL', 1)
df = assign_value_based_on_nan(df, ['JLRNE'], 'JLREMPL', 1)

In [None]:
# If the answer to question 106 is 1, the asnwer to question JLRN is set to 0 (a new category representing being employed)
questionnaire_data_processed = set_columns_to_zero(questionnaire_data_processed, 'JLREMPL', ['JLRNE'], 1)
df = set_columns_to_zero(df, 'JLREMPL', ['JLRNE'], 1)

#### Follow up questions frequency

In [None]:
# Paired questions: <activity>F records how often something is done and
# <activity>H how many hours are spent on it.
# Where the frequency is 0 the hours are set to 0 as well (mask taken from the
# questionnaire frame and applied to both frames).

freq = ['JWEED', 'JHANGW', 'JCOOK', 'JCARRYH']

for activity in freq:
    hours_column = f"{activity}H"
    never_done = questionnaire_data_processed[f"{activity}F"] == 0
    questionnaire_data_processed.loc[never_done, hours_column] = 0
    df.loc[never_done, hours_column] = 0

#### Self-fulfillment

In [None]:
# If someone did not answer a question about self-fulfillment activities, there is assumed that they never do it, and the value is set to 1 
self_fulfillment_list = ['JSFCONSU', 'JSFBOOKS', 'JSFCHARI', 'JSFENVIR', 'JSFPOLIT', 'JSFCOMMU']

for column in self_fulfillment_list:
    questionnaire_data_processed[column] = questionnaire_data_processed[column].fillna(1)
    df[column] = df[column].fillna(1)

#### Gardening

In [None]:
'JSPAGDNF' in missing_df_questionnaire['column_name'].values

In [None]:
# JSPAGDNF == 4 means the participant is never involved in gardening; in that case
# unanswered mowing / weeding frequency and hours are set to 0.
# Each frame gets its own mask.
for frame in (questionnaire_data_processed, df):
    never_gardens = frame['JSPAGDNF'] == 4
    for column in ('JWEEDF', 'JWEEDH', 'JMOWF', 'JMOWH'):
        frame.loc[never_gardens & frame[column].isna(), column] = 0

#### Remove variables that are not in the questionnaire

In [None]:
# Remove features thast cannot be found in the questionnaire 
questionnaire_data_processed = questionnaire_data_processed.drop(columns=['JCAGE', 'JSPATOT'])
df = df.drop(columns=['JCAGE', 'JSPATOT'])

In [None]:
# When both the number of occasions (<activity>F) and the time spent (<activity>H)
# are unanswered, the participant is assumed not to do the activity: both are set to 0.
FH_columns = ['JMOW', 'JSOCCER', 'JWEED', 'JGOLF', 'JSWIM', 'JCARRYH', 'JCOOK', 'JHANGW', 'JCARWAS', 'JPAIDEC']

for activity in FH_columns:
    pair = [f"{activity}F", f"{activity}H"]
    for frame in (questionnaire_data_processed, df):
        both_blank = frame[pair].isna().all(axis=1)
        frame.loc[both_blank, pair] = 0

In [None]:
# Whatever is still missing in the activity columns (occasions or time spent)
# is filled with 0 in both frames.
for activity in FH_columns:
    for frame in (questionnaire_data_processed, df):
        for name in (f"{activity}F", f"{activity}H"):
            frame[name] = frame[name].fillna(0)

#### Remove derived features 

In [None]:
# Some of the features were derived from other features, but there is no information on how these were derived 
# Therefore, handling missing values is challenging and the features are removed from the dataframe 
derived = ['JDPN_SUM', 'JWALKMET', 'JCYCMET', 'JPCS', 'JMCS', 'JPRACT1','JCONF1', 'JNEG1', 'JLAD', 'JPROXY', 'JTRLPP','JGHQC']

questionnaire_data_processed = questionnaire_data_processed.drop(columns=derived)
df = df.drop(columns=derived)

#### Breathing pauses

In [None]:
# If someone did not answer the question about breathing pauses during sleep, there is assumed that he/she did not know (6)
questionnaire_data_processed['JSLBRPAU'] = questionnaire_data_processed['JSLBRPAU'].fillna(6)
df['JSLBRPAU'] = df['JSLBRPAU'].fillna(6)

#### Activities

In [None]:
# Create a list with all free-time activities 
activity_list = extract_strings_starting_with(missing_df_questionnaire, 'column_name', 'JSPA')

In [None]:
# Remove the aggragated variable 
activity_list.remove('JSPATOT')

In [None]:
# If someone did not answer the questions about activities in their spare time, there is assumed that the never take part in these activities 
for column in activity_list:
    questionnaire_data_processed[column] = questionnaire_data_processed[column].fillna(4)
    df[column] = df[column].fillna(4)

#### Alcoholism

In [None]:
# Create a additional feature that shows whether someone did or did not answer the questions about problemtic drinking behaviour 
questionnaire_data_processed['problematic_drinking'] = questionnaire_data_processed['JDRNKANN'].isna().astype(int)
df['problematic_drinking'] = df['JDRNKANN'].isna().astype(int)

In [None]:
# Fill the missing values in the questions related alcoholic drinking behaviour with a 2 (no)
for column in ['JDRNKCUT', 'JDRNKANN', 'JDRNKGLT', 'JDRNKHNG']:
    questionnaire_data_processed[column] = questionnaire_data_processed[column].fillna(2)
    df[column] = df[column].fillna(2)

#### Closest person

In [None]:
# Create a list with all variables related to how many people the participants feel close to 
JCP_list = extract_strings_starting_with(missing_df_questionnaire, 'column_name', 'JCP')

In [None]:
# Drop an open question related to who felt the closest in the last 12 months, as this is an open question and not usefull
questionnaire_data_processed = questionnaire_data_processed.drop(columns=['JCP1'])
df = df.drop(columns=['JCP1'])

In [None]:
# Remove open questions from the list 
JCP_list.remove('JCP1')
JCP_list.remove('JCPNO')

In [None]:
# If someone did not answer the question about how many people they feel very close to, there is assumed that the answer is 0 
questionnaire_data_processed = assign_value_based_on_nan(questionnaire_data_processed, JCP_list, 'JCPNO', 0)
df = assign_value_based_on_nan(df, JCP_list, 'JCPNO', 0)

In [None]:
# Helper that writes 1 into a group of follow-up columns for rows selected by a condition
def set_columns_to_one(dataframe, condition_column, target_columns, value):
    """Set `target_columns` to 1 on rows where the follow-ups are all unanswered.

    A row is updated only when `condition_column` equals `value` AND every
    column in `target_columns` is NaN for that row.

    The frame is modified in place and also returned.
    """
    rows_to_fill = (
        dataframe[target_columns].isna().all(axis=1)
        & dataframe[condition_column].eq(value)
    )
    dataframe.loc[rows_to_fill, target_columns] = 1
    return dataframe

In [None]:
# If someone is not close to anybody, all the followed up questions related to these close persons are set to 1 (not at all)
questionnaire_data_processed = set_columns_to_one(questionnaire_data_processed, 'JCPNO', JCP_list, 0)
df = set_columns_to_one(df, 'JCPNO', JCP_list, 0)

#### JDPN features

In [None]:
# Drop features that could not be found in the questionnaire 
questionnaire_data_processed = questionnaire_data_processed.drop(columns=['JDPN_NCP', 'JDPN_PCT'])
df = df.drop(columns=['JDPN_NCP', 'JDPN_PCT'])

In [None]:
# Create a list of the features with data type 'object'
object_columns = questionnaire_data_processed.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Drop the columns that are of dtype object because they cannot be found back in the questionnaire 
questionnaire_data_processed = questionnaire_data_processed.drop(columns=object_columns)
df = df.drop(columns=object_columns)

In [None]:
# questionnaire_data_processed.to_csv('../Processed datasets/cleaned_data/questionnaire_data_clean_with_target.csv', index=False)

In [None]:
#df.to_csv('../Processed datasets/cleaned_data/fulldata_questionnaire_data_clean.csv', index=False)

#### FFQ data

In [None]:
missing_df_FFQ = missing_values_columns(FFQ_data)
missing_df_FFQ['percent_missing'].hist(bins=100)

In [None]:
# Create a list with all columns from the FFQ data
FFQ_columns = FFQ_data.columns.tolist()

In [None]:
# Remove the administrative variables (participant id and the two FFQ date
# columns) from the list of FFQ columns.
# A filter is used instead of list.remove() so the cell can be re-run safely:
# remove() raises ValueError once the names are already gone.
admin = ['Id_random_DPUK','JFFQDOC','JFFQDOR']
FFQ_columns = [name for name in FFQ_columns if name not in admin]

In [None]:
# Remove participants that did not fill in the FFQ 
FFQ_data = FFQ_data.dropna(subset=FFQ_columns, how='all')
df = df.dropna(subset=FFQ_columns, how='all')

In [None]:
FFQ_data_processed = FFQ_data.copy()

In [None]:
# Remove open ended questions because we do not have the information.
# They are selected as the FFQ columns whose name ends in a digit.
digit_suffixes = tuple("0123456789")
columns_to_throw = FFQ_data.columns[FFQ_data.columns.str.endswith(digit_suffixes)]

for frame in (FFQ_data_processed, df):
    frame.drop(columns=columns_to_throw, inplace=True)

In [None]:
# Drop the dates that show when someone filled in the FFQ 
dates = ['JFFQDOC', 'JFFQDOR']

FFQ_data_processed.drop(columns=dates, inplace=True)
df.drop(columns=dates, inplace=True)

In [None]:
missing_df_FFQ = missing_values_columns(FFQ_data_processed)
missing_df_FFQ

In [None]:
# Set diet length to 0 if someone is not on a slimming diet 
FFQ_data_processed.loc[FFQ_data_processed['JDIET']==2, 'JDIETLNG'] = 0 
df.loc[df['JDIET']==2, 'JDIETLNG'] = 0 

In [None]:
# Remove whether people eat any other foods more than once a week because a lot of data is missing
# and we do not have the information about what kind of food
FFQ_data_processed = FFQ_data_processed.drop(columns=['JFOODOTH'])
df = df.drop(columns=['JFOODOTH'])

In [None]:
# Separate the FFQ features by number of distinct non-missing values:
# more than two -> non-binary, two or fewer -> binary.
unique_counts = FFQ_data_processed.nunique()

non_binary_features_FFQ = unique_counts[unique_counts > 2].index.tolist()
binary_features_FFQ = unique_counts[unique_counts <= 2].index.tolist()

In [None]:
binary_features_FFQ

In [None]:
# Fill the question about whether the food in the FFQ is representative with the mode 
FFQ_data_processed['JREPRES'].value_counts().plot(kind='bar')
FFQ_data_processed.loc[:,'JREPRES'] = FFQ_data_processed['JREPRES'].fillna(1.0)
df.loc[:,'JREPRES'] = df['JREPRES'].fillna(1.0)

In [None]:
# Assumption that everyone who did not tick a box in case of a follow up question meant the option 'no'
for column in binary_features_FFQ:
    FFQ_data_processed.loc[:,column] = FFQ_data_processed[column].fillna(2)
    df.loc[:,column] = df[column].fillna(2)

In [None]:
# Remove the participant ID from the list of non binary features 
non_binary_features_FFQ.remove('Id_random_DPUK')

In [None]:
# Assumption that everyone that did not tick a box in case of a food frequency question did not eat that specific food 
for column in non_binary_features_FFQ:
    FFQ_data_processed.loc[:,column] = FFQ_data_processed[column].fillna(1)
    df.loc[:,column] = df[column].fillna(1)

#### Correct dtypes 

In [None]:
# Helper that casts the given columns of a dataframe to integers
def convert_floats_to_int(dataframe, columns):
    """Cast every column named in `columns` to `int`, in place.

    The same dataframe object is returned. `astype(int)` raises if a column
    still contains NaN, so missing values must be filled beforehand.
    """
    for name in columns:
        as_int = dataframe[name].astype(int)
        dataframe[name] = as_int
    return dataframe

In [None]:
# Transform all datatypes to integers 
FFQ_data_processed = convert_floats_to_int(FFQ_data_processed, binary_features_FFQ)
FFQ_data_processed = convert_floats_to_int(FFQ_data_processed, non_binary_features_FFQ)

df = convert_floats_to_int(df, binary_features_FFQ)
df = convert_floats_to_int(df, non_binary_features_FFQ)

In [None]:
#df.to_csv('../Processed datasets/cleaned_data/questionnaire_FFQ_data__with_target_clean.csv', index=False)

In [None]:
#FFQ_data_processed.to_csv('../Processed datasets/cleaned_data/FFQ_data_clean.csv', index=False)

#### Clinical measure data 

In [None]:
missing_df_clinical_measure = missing_values_columns(clinical_measure_data)
missing_df_clinical_measure['percent_missing'].hist(bins=100)

In [None]:
clinical_measures_processed = clinical_measure_data.copy()

In [None]:
# Helper that drops the features whose share of missing data exceeds a chosen threshold
def remove_columns_with_missing_values(dataframe, missing_values_dataframe, threshold):
    """Return `dataframe` without the columns flagged as too sparse.

    `missing_values_dataframe` is read positionally: its first column holds the
    column names and its second column the missing-value figure that is compared
    with `threshold` (strictly greater than). The number of dropped columns is
    printed. The input dataframe is not modified.
    """
    names = missing_values_dataframe.iloc[:, 0]
    missing_share = missing_values_dataframe.iloc[:, 1]
    above_threshold = names[missing_share > threshold].tolist()
    print("The number of columns that will be removed is", len(above_threshold))
    return dataframe.drop(columns=above_threshold)

In [None]:
# Remove all the clinical measure features that have more than 40% missing values 
clinical_measures_processed = remove_columns_with_missing_values(clinical_measures_processed, missing_df_clinical_measure, 40)
df = remove_columns_with_missing_values(df, missing_df_clinical_measure, 40)

In [None]:
# Create a dataframe with only the features with dtype object to inspect what type of features they are 
object_columns = clinical_measures_processed.select_dtypes(include=['object']).columns.tolist()
object_df = clinical_measures_processed[object_columns]
object_df

In [None]:
# Convert the feature about left or right arms into a binary variable: 'L' -> 0, 'R' -> 1
# (any other value becomes NaN through .map).
# The column is replaced with plain `frame[column] = ...` rather than
# `frame.loc[:, column] = ...`: under pandas 2 the .loc form writes into the existing
# object-dtype column in place, so the result would stay dtype object instead of numeric.
# NOTE: re-running this cell maps the already-converted 0/1 values to NaN.
columns_to_convert_to_binary = ['JBLDARM', 'JBLD2ARM']
arm_codes = {'L': 0, 'R': 1}
for column in columns_to_convert_to_binary:
    clinical_measures_processed[column] = clinical_measures_processed[column].map(arm_codes)
    df[column] = df[column].map(arm_codes)

In [None]:
# Create a list with all time stamp data 
time_list = object_df.columns.to_list()
time_list.remove('JBLDARM')
time_list.remove('JBLD2ARM')

In [None]:
# Drop all time stamp variables 
clinical_measures_processed = clinical_measures_processed.drop(columns=time_list)
df = df.drop(columns=time_list)

In [None]:
missing_df_clinical_measure = missing_values_columns(clinical_measures_processed)
missing_df_clinical_measure

In [None]:
columns_with_inf = [col for col in clinical_measures_processed.columns if clinical_measures_processed[col].isin([np.inf, -np.inf]).any()]

In [None]:
clinical_measures_processed = clinical_measures_processed.drop(columns=columns_with_inf)
df = df.drop(columns=columns_with_inf)

#### Clinical events data

In [None]:
missing_df_clinical_events = missing_values_columns(clinical_events_data)
missing_df_clinical_events['percent_missing'].hist(bins=100)

In [None]:
clinical_events_processed = clinical_events_data.copy()

In [None]:
# Create a dataframe with only the features with dtype object to inspect what type of features they are 
object_columns = clinical_events_processed.select_dtypes(include=['object']).columns.tolist()
objects_CE = clinical_events_processed[object_columns]
objects_CE

In [None]:
# Dropping all columns with dtype object 
clinical_events_processed = clinical_events_processed.drop(columns=object_columns)
df = df.drop(columns=object_columns)

In [None]:
# Create a list with all the features that carry information about clinical events after phase 9 
future_columns = clinical_events_processed.filter(regex='^(ej|dj|ij|hj19|EJ21)')
future_columns_list = future_columns.columns.tolist()

In [None]:
# Drop all features that carry information about clinical events after phase 9 
clinical_events_processed = clinical_events_processed.drop(columns=future_columns_list)
df = df.drop(columns=future_columns_list)

#### Data splitting

In [None]:
# Seperate the predictors and the outcome variable
X = df.drop('diabetic_outcome', axis=1)
y = df['diabetic_outcome']

In [None]:
# Split the data into a train set of 60%, a validation set of 20% and a test set of 20% 
# Step 1: hold out 20% of all rows as the test set, stratified on the outcome.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# Step 2: take 25% of the remaining 80% as validation (0.25 * 0.8 = 0.2 of all rows),
# which leaves 60% of all rows for training. Both steps use random_state=42.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, stratify=y_train, random_state=42)

# Report the (rows, columns) shape of each split
print("Train set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)

In [None]:
# Add the splited dataframes to a list 
predictor_dataframes = [X_train, X_val, X_test]

#### Imputation

#### Assets

In [None]:
# The missing values for the ASSET questions are filled with the mode.
# The mode is taken from the training split only and re-used for validation and test,
# so all three splits receive the same fill value and no statistic is learned from held-out rows.
# The loop variable is named `split` so the global `df` is not overwritten.
mode_value_assets = X_train['JASSETXH'].mode()[0]
for split in predictor_dataframes:
    split['JASSETXH'] = split['JASSETXH'].fillna(mode_value_assets)

In [None]:
# Fill missing JASSETHH values with the mode of the training split (the same value is
# used for validation and test). `split` is used as the loop variable so the global
# `df` is not overwritten.
mode_value_assets = X_train['JASSETHH'].mode()[0]
for split in predictor_dataframes:
    split['JASSETHH'] = split['JASSETHH'].fillna(mode_value_assets)

#### Medical consultation

In [None]:
# The missing values for the number of GP consultations (JGPVISYR) are filled with the mean.
# The mean is computed once, on the training split only: the previous version took it from
# the full pre-split questionnaire frame, which also contains the validation and test rows.
# `split` is used as the loop variable so the global `df` is not overwritten.
mean_value_MC = X_train['JGPVISYR'].mean()
for split in predictor_dataframes:
    split['JGPVISYR'] = split['JGPVISYR'].fillna(mean_value_MC)

#### Total income

In [None]:
# The small percentage of missing values related to the total income are filled with the mode.
# The mode is taken from the training split and applied to all three splits.
# `split` is used as the loop variable so the global `df` is not overwritten.
mode_value_TI = X_train['JINCHH'].mode()[0]
for split in predictor_dataframes:
    split['JINCHH'] = split['JINCHH'].fillna(mode_value_TI)

#### JDPN features

In [None]:
# Create a list with all the questions related to behavioural statements 
JDPN_list = [f"JDPN{str(i).zfill(2)}" for i in range(1,21)]

In [None]:
# Fill the missing values for the columns in the created list with the mode.
# Each item's mode is computed once on the training split and then applied to
# train, validation and test alike.
# `split` is used as the loop variable so the global `df` is not overwritten.
for item in JDPN_list:
    mode_value = X_train[item].mode()[0]
    for split in predictor_dataframes:
        split[item] = split[item].fillna(mode_value)

In [None]:
# The remainder of the columns only have a very small percentage of missing values
# Considering the questions are multiple choice, the resulting NaNs are filled with the mode (representing the box that is chosen the most often)
# The mode is learned on the training split and applied to every split.
# `split` is used as the loop variable so the global `df` is not overwritten.
for column in questionnaire_data_processed.columns:
    if column not in X_train.columns:
        continue
    train_mode = X_train[column].mode()
    # mode() is empty when the training column holds no non-missing value;
    # there is nothing to fill with in that case (previously an IndexError).
    if train_mode.empty:
        continue
    mode_value = train_mode.iloc[0]
    for split in predictor_dataframes:
        if column in split.columns:
            split[column] = split[column].fillna(mode_value)

In [None]:
# Create a list with the behavioural statements that need to be reverse coded because of differences in sentiment 
JDPN_list_complete = extract_strings_starting_with(missing_df_questionnaire, 'column_name', 'JDPN')
JDPN_list_complete = [item for item in JDPN_list_complete if item not in JDPN_list]
print(JDPN_list_complete)

In [None]:
must_be_removed =['JDPN_NCP', 'JDPN_PCT','JDPN_SUM']
for item in must_be_removed:
    JDPN_list_complete.remove(item)

In [None]:
JDPN_list_reversed = JDPN_list_complete

In [None]:
# Reverse code the columns without the missing values.
# A reversed answer is (scale_min + scale_max) - answer, i.e. 4 - answer here.
# assumes the items are coded on a 0..4 scale -- TODO confirm against the questionnaire
# NOTE: running this cell a second time reverses the columns back.
scale_min = 0
scale_max = 4
reversal_total = scale_min + scale_max

# `split` is used as the loop variable so the global `df` is not overwritten.
for split in predictor_dataframes:
    split[JDPN_list_reversed] = reversal_total - split[JDPN_list_reversed]

In [None]:
# Convert all the multiple choice questions from floats to integers
admin = ['JWITHDRW', 'JPART', 'JPARTTYP', 'JQUESTYP', 'JSITE', 'JAGE_Q', 'JPROXY', 'JDATCOMP', 'JDATEOR', 'JVERSION', 'JCOMPLET','JPHDATE','Id_random_DPUK']

# The candidate columns are the non-administrative float columns of the questionnaire
# frame; this does not depend on the split, so it is determined once.
float_questionnaire_columns = [
    column
    for column in questionnaire_data_processed.columns
    if column not in admin and questionnaire_data_processed[column].dtype == float
]

# `split` is used as the loop variable so the global `df` is not overwritten.
for split in predictor_dataframes:
    for column in float_questionnaire_columns:
        # Skip questionnaire columns that are not part of the predictor frames
        # (previously a KeyError).
        if column in split.columns:
            split[column] = split[column].astype(int)

## Clinical measures numerical imputation

In [None]:
# Drop the 'S9DATE.1' column from every split that still contains it.
# `split` is used as the loop variable so the global `df` is not overwritten.
for split in predictor_dataframes:
    if 'S9DATE.1' in split.columns:
        split.drop(columns='S9DATE.1', inplace=True)

In [None]:
X_train_ids = X_train['Id_random_DPUK']
X_val_ids = X_val['Id_random_DPUK']
X_test_ids = X_test['Id_random_DPUK']

In [None]:
X_train.drop(columns='Id_random_DPUK', inplace=True)
X_val.drop(columns='Id_random_DPUK', inplace=True)
X_test.drop(columns='Id_random_DPUK', inplace=True)

In [None]:
original_dtypes = X_train.dtypes

In [None]:
# Nearest-neighbour imputation (one neighbour). The imputer is fitted on the
# training split only and then applied to all three splits; column names and
# row index of each split are preserved.
imputer = KNNImputer(n_neighbors=1)
imputer.fit(X_train)

X_train_imputed, X_val_imputed, X_test_imputed = (
    pd.DataFrame(imputer.transform(split), columns=split.columns, index=split.index)
    for split in (X_train, X_val, X_test)
)

In [None]:
# Cast every imputed column back to the dtype it had in X_train before imputation.
dtype_by_column = original_dtypes.to_dict()

X_train_imputed = X_train_imputed.astype(dtype_by_column)
X_val_imputed = X_val_imputed.astype(dtype_by_column)
X_test_imputed = X_test_imputed.astype(dtype_by_column)


In [None]:
# Put the participant id back as the first column of each imputed split
# (column-wise concat, aligned on the row index).
X_train_imputed, X_val_imputed, X_test_imputed = (
    pd.concat([ids, imputed], axis=1)
    for ids, imputed in (
        (X_train_ids, X_train_imputed),
        (X_val_ids, X_val_imputed),
        (X_test_ids, X_test_imputed),
    )
)

In [None]:
X_train_imputed.to_csv('../cleaned_imputed_split/X_train.csv', index=False)

In [None]:
X_val_imputed.to_csv('../cleaned_imputed_split/X_val.csv', index=False)

In [None]:
X_test_imputed.to_csv('../cleaned_imputed_split/X_test.csv', index=False)

In [None]:
y_train.to_csv('../cleaned_imputed_split/y_train.csv', index=False)

In [None]:
y_val.to_csv('../cleaned_imputed_split/y_val.csv', index=False)

In [None]:
y_test.to_csv('../cleaned_imputed_split/y_test.csv', index=False)