### Preprocessing 25-feature POI case/control cohort

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Read the raw case and control participant tables of the 25-feature cohort.
case_cohort, control_cohort = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/25-feature/POI-case-25_participant.csv',
        '../datasets/25-feature/POI-control-25_participant.csv',
    )
)

In [3]:
# Preview the first rows of the raw case cohort.
case_cohort.head()

Unnamed: 0,eid,smoking status,sleep duration,sleeplessness,coffee intake,tea intake,moderate physical activity time spent,vigorous physical activity time spent,duration of moderate physical activity-pilot,duration of vigorous physical activity-pilot,...,job code,qualifications,BMI,waist circumference,HDL-C,LDL-C,apolipoprotein B,treatment/medication code,ICD10,T2D
0,1004331,Previous,9,Usually,1,Less than one,,,,,...,Brokers,"Other professional qualifications eg: nursing,...",18.2098,65.0,-1.16566,-0.314136,0.925,,"O04.9 Complete or unspecified, without complic...",
1,1005087,Previous,7,Never/rarely,4,0,,,,,...,Shopkeepers and wholesale/retail dealers,O levels/GCSEs or equivalent,31.2883,91.0,1.11553,-0.377054,1.082,azelastine|beclometasone|hay-crom eye drops|ne...,H02.3 Blepharochalasis|H92.0 Otalgia|I26.9 Pul...,
2,1007659,Previous,5,Usually,0,Less than one,,,,,...,,College or University degree|A levels/AS level...,31.8405,102.0,-0.464109,1.57059,1.113,budesonide|cetirizine|meloxicam|omeprazole,C50.4 Upper-outer quadrant of breast|D12.4 Des...,-0.395678
3,1014838,Never,7,Sometimes,1,4,,,,,...,,None of the above,24.308,70.0,0.150831,1.16908,1.209,calcichew d3 tablet|cetirizine|cod liver oil c...,H02.8 Other specified disorders of eyelid|M16....,
4,1019851,Never,9,Never/rarely,5,0,,,,,...,,None of the above,25.1797,77.0,-0.638671,1.74931,0.835,aspirin|bendroflumethiazide|felodipine|ibuprof...,E10.9 Without complications|E78.0 Pure hyperch...,0.442098


In [4]:
# Count the missing values in each column of the case cohort.
case_cohort.isna().sum()

eid                                                 0
smoking status                                      0
sleep duration                                      0
sleeplessness                                       0
coffee intake                                       0
tea intake                                          0
moderate physical activity time spent            9299
vigorous physical activity time spent            9299
duration of moderate physical activity-pilot    10565
duration of vigorous physical activity-pilot    10595
alcohol intake frequency                            0
energy intake                                    9299
salad/raw vegetable intake                          0
fresh fruit intake                                  0
cooked vegetable intake                             0
pork intake                                         0
job code                                         5232
qualifications                                     81
BMI                         

In [5]:
# Count the missing values in each column of the control cohort.
control_cohort.isna().sum()

eid                                                 0
smoking status                                      2
sleep duration                                      2
sleeplessness                                       2
coffee intake                                       2
tea intake                                          2
moderate physical activity time spent           43779
vigorous physical activity time spent           43779
duration of moderate physical activity-pilot    50278
duration of vigorous physical activity-pilot    50392
alcohol intake frequency                            2
energy intake                                   43779
salad/raw vegetable intake                          2
fresh fruit intake                                  2
cooked vegetable intake                             2
pork intake                                         2
job code                                        21975
qualifications                                    391
BMI                         

In [6]:
# Tally how often each smoking-status answer occurs in the control cohort.
control_cohort['smoking status'].value_counts()

Never                   28690
Previous                17133
Current                  4564
Prefer not to answer      198
Name: smoking status, dtype: int64

In [7]:
# Binarise smoking status in both cohorts: '1' for current smokers, '0' for
# every other value (missing entries never equal 'Current', so they become '0').
for cohort in (case_cohort, control_cohort):
    is_current = cohort['smoking status'] == 'Current'
    cohort.loc[~is_current, 'smoking status'] = '0'
    cohort.loc[is_current, 'smoking status'] = '1'

In [8]:
# Fill missing BMI values with the mean BMI of the same cohort.
# The filled Series is assigned back to the column: calling
# `fillna(..., inplace=True)` on `df['BMI']` mutates a temporary object, which
# pandas deprecates and which no longer updates the frame under Copy-on-Write.
# NOTE(review): each cohort is imputed with its own mean, so imputed rows carry
# a cohort-specific constant — confirm this is acceptable for downstream models.
for cohort in (case_cohort, control_cohort):
    cohort['BMI'] = cohort['BMI'].fillna(cohort['BMI'].mean())

In [9]:
# Replace missing, 'Do not know' and 'Prefer not to answer' sleep durations
# with '7'.
# NOTE(review): '7' is hard-coded rather than computed as the mode — confirm it
# still matches the data.
# The fill is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame. Both
# cohorts are filled; this is a no-op for a cohort without missing values.
for cohort in (case_cohort, control_cohort):
    cohort['sleep duration'] = cohort['sleep duration'].fillna('7')
    non_answer = cohort['sleep duration'].isin(['Do not know', 'Prefer not to answer'])
    cohort.loc[non_answer, 'sleep duration'] = '7'

In [10]:
# Binarise sleeplessness: 'Usually' -> '1'; 'Never/rarely', 'Sometimes',
# 'Prefer not to answer' and missing values -> '0'. Any other value is kept.
# The filled Series is assigned back to the column rather than relying on a
# chained `fillna(..., inplace=True)`, which does not reliably update the frame.
sleeplessness_codes = {
    'Never/rarely': '0',
    'Sometimes': '0',
    'Usually': '1',
    'Prefer not to answer': '0',
}
for cohort in (case_cohort, control_cohort):
    cohort['sleeplessness'] = (
        cohort['sleeplessness']
        .fillna('Prefer not to answer')  # missing answers are treated as non-answers
        .replace(sleeplessness_codes)
    )

In [11]:
# Code missing, 'Less than one', 'Do not know' and 'Prefer not to answer'
# coffee-intake answers as '0'; all other values are left as they are.
# The fill is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame. Both
# cohorts are filled; this is a no-op for a cohort without missing values.
for cohort in (case_cohort, control_cohort):
    cohort['coffee intake'] = cohort['coffee intake'].fillna('0')
    no_count = cohort['coffee intake'].isin(
        ['Less than one', 'Do not know', 'Prefer not to answer']
    )
    cohort.loc[no_count, 'coffee intake'] = '0'

In [12]:
# Code missing, 'Less than one', 'Do not know' and 'Prefer not to answer'
# tea-intake answers as '0'; all other values are left as they are.
# The fill is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame. Both
# cohorts are filled; this is a no-op for a cohort without missing values.
for cohort in (case_cohort, control_cohort):
    cohort['tea intake'] = cohort['tea intake'].fillna('0')
    no_count = cohort['tea intake'].isin(
        ['Less than one', 'Do not know', 'Prefer not to answer']
    )
    cohort.loc[no_count, 'tea intake'] = '0'

In [13]:
# Binarise time spent on moderate physical activity: missing, 'None' and
# 'Under 10 minutes' -> '0'; every other answer -> '1'.
# The result is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame.
# NOTE(review): every other answer — including any 'Do not know' /
# 'Prefer not to answer' responses, if present — is coded '1'; confirm intended.
activity_col = 'moderate physical activity time spent'
for cohort in (case_cohort, control_cohort):
    answers = cohort[activity_col].fillna('None')
    # '0' is listed so that re-running the cell keeps already-coded rows at '0'.
    inactive = answers.isin(['None', 'Under 10 minutes', '0'])
    cohort[activity_col] = inactive.map({True: '0', False: '1'})

In [14]:
# Binarise time spent on vigorous physical activity: missing, 'None' and
# 'Under 10 minutes' -> '0'; every other answer -> '1'.
# The result is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame.
# NOTE(review): every other answer — including any 'Do not know' /
# 'Prefer not to answer' responses, if present — is coded '1'; confirm intended.
activity_col = 'vigorous physical activity time spent'
for cohort in (case_cohort, control_cohort):
    answers = cohort[activity_col].fillna('None')
    # '0' is listed so that re-running the cell keeps already-coded rows at '0'.
    inactive = answers.isin(['None', 'Under 10 minutes', '0'])
    cohort[activity_col] = inactive.map({True: '0', False: '1'})

In [15]:
# Binarise the pilot question on duration of moderate physical activity:
# missing, 'Do not know', 'Prefer not to answer' and 'Less than 30 mins' -> '0';
# every other answer -> '1'.
# The result is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame.
pilot_col = 'duration of moderate physical activity-pilot'
# '0' is listed so that re-running the cell keeps already-coded rows at '0'.
short_or_unknown = ['Do not know', 'Prefer not to answer', 'Less than 30 mins', '0']
for cohort in (case_cohort, control_cohort):
    answers = cohort[pilot_col].fillna('Do not know')
    cohort[pilot_col] = answers.isin(short_or_unknown).map({True: '0', False: '1'})

In [16]:
# Binarise the pilot question on duration of vigorous physical activity:
# missing, 'Do not know', 'Prefer not to answer' and 'Less than 30 mins' -> '0';
# every other answer -> '1'.
# The result is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame.
pilot_col = 'duration of vigorous physical activity-pilot'
# '0' is listed so that re-running the cell keeps already-coded rows at '0'.
short_or_unknown = ['Do not know', 'Prefer not to answer', 'Less than 30 mins', '0']
for cohort in (case_cohort, control_cohort):
    answers = cohort[pilot_col].fillna('Do not know')
    cohort[pilot_col] = answers.isin(short_or_unknown).map({True: '0', False: '1'})

In [17]:
# Binarise alcohol intake frequency: 'Daily or almost daily' and
# 'Three or four times a week' -> '1'; every other answer, including missing
# values, -> '0'.
# The flag is computed once and assigned back to the column, which removes the
# chained `fillna(..., inplace=True)` that does not reliably update the frame.
# '1' is listed so that re-running the cell keeps already-coded rows at '1'.
frequent_drinking = ['Daily or almost daily', 'Three or four times a week', '1']
for cohort in (case_cohort, control_cohort):
    is_frequent = cohort['alcohol intake frequency'].isin(frequent_drinking)
    cohort['alcohol intake frequency'] = is_frequent.map({True: '1', False: '0'})

In [18]:
# Fill missing energy-intake values with the mean of the same cohort.
# The filled Series is assigned back to the column: a chained
# `fillna(..., inplace=True)` mutates a temporary object, which pandas
# deprecates and which no longer updates the frame under Copy-on-Write.
# NOTE(review): each cohort is imputed with its own mean, so imputed rows carry
# a cohort-specific constant — confirm this is acceptable for downstream models.
for cohort in (case_cohort, control_cohort):
    cohort['energy intake'] = cohort['energy intake'].fillna(cohort['energy intake'].mean())

In [19]:
# Code missing, 'Less than one', 'Do not know' and 'Prefer not to answer'
# salad/raw vegetable answers as '0'; all other values are left as they are.
# The fill is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame. Both
# cohorts are filled; this is a no-op for a cohort without missing values.
intake_col = 'salad/raw vegetable intake'
for cohort in (case_cohort, control_cohort):
    cohort[intake_col] = cohort[intake_col].fillna('0')
    no_count = cohort[intake_col].isin(
        ['Less than one', 'Do not know', 'Prefer not to answer']
    )
    cohort.loc[no_count, intake_col] = '0'

In [20]:
# Code missing, 'Less than one', 'Do not know' and 'Prefer not to answer'
# fresh-fruit answers as '0'; all other values are left as they are.
# The fill is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame. Both
# cohorts are filled; this is a no-op for a cohort without missing values.
intake_col = 'fresh fruit intake'
for cohort in (case_cohort, control_cohort):
    cohort[intake_col] = cohort[intake_col].fillna('0')
    no_count = cohort[intake_col].isin(
        ['Less than one', 'Do not know', 'Prefer not to answer']
    )
    cohort.loc[no_count, intake_col] = '0'

In [21]:
# Code missing, 'Less than one', 'Do not know' and 'Prefer not to answer'
# cooked-vegetable answers as '0'; all other values are left as they are.
# The fill is assigned back to the column instead of using a chained
# `fillna(..., inplace=True)`, which does not reliably update the frame. Both
# cohorts are filled; this is a no-op for a cohort without missing values.
intake_col = 'cooked vegetable intake'
for cohort in (case_cohort, control_cohort):
    cohort[intake_col] = cohort[intake_col].fillna('0')
    no_count = cohort[intake_col].isin(
        ['Less than one', 'Do not know', 'Prefer not to answer']
    )
    cohort.loc[no_count, intake_col] = '0'

In [22]:
# Binarise pork intake: '2-4 times a week' -> '1'; every other answer,
# including missing values, -> '0'.
# The flag is computed once and assigned back to the column, which removes the
# chained `fillna(..., inplace=True)` that does not reliably update the frame.
# NOTE(review): only the exact answer '2-4 times a week' is coded '1'; any more
# frequent category present in the data would be coded '0' — confirm intended.
for cohort in (case_cohort, control_cohort):
    # '1' is listed so that re-running the cell keeps already-coded rows at '1'.
    eats_pork_often = cohort['pork intake'].isin(['2-4 times a week', '1'])
    cohort['pork intake'] = eats_pork_often.map({True: '1', False: '0'})

In [23]:
# Binarise qualifications: '1' when the answer lists
# 'College or University degree', otherwise '0' (missing values count as
# 'None of the above').
# The filled Series is kept in a local variable and the final flag is assigned
# back to the column. With the chained `fillna(..., inplace=True)` the fill may
# not reach the frame, leaving NaN in the `str.contains` mask so that `.loc`
# fails.
for cohort in (case_cohort, control_cohort):
    answers = cohort['qualifications'].fillna('None of the above')
    # The pattern is a literal phrase, so it is matched as plain text.
    # `== '1'` keeps already-coded rows at '1' when the cell is re-run.
    has_degree = (
        answers.str.contains('College or University degree', regex=False)
        | (answers == '1')
    )
    cohort['qualifications'] = has_degree.map({True: '1', False: '0'})

In [24]:
# Fill missing waist circumference, HDL-C, LDL-C and apolipoprotein B values
# with the mean of the same column in the same cohort.
# The filled Series is assigned back to the column: a chained
# `fillna(..., inplace=True)` mutates a temporary object, which pandas
# deprecates and which no longer updates the frame under Copy-on-Write.
# NOTE(review): each cohort is imputed with its own mean, so imputed rows carry
# a cohort-specific constant — confirm this is acceptable for downstream models.
mean_imputed_columns = ('waist circumference', 'HDL-C', 'LDL-C', 'apolipoprotein B')
for cohort in (case_cohort, control_cohort):
    for column in mean_imputed_columns:
        cohort[column] = cohort[column].fillna(cohort[column].mean())

In [25]:
# Binarise the medication list: '1' when it mentions any of the statins or
# vitamin names below, otherwise '0' (missing lists count as 'No').
# The filled Series is kept in a local variable and the final flag is assigned
# back to the column. With the chained `fillna(..., inplace=True)` the fill may
# not reach the frame, leaving NaN in the `str.contains` mask so that `.loc`
# fails.
# NOTE(review): names are matched as plain substrings, so e.g. 'vitamin c'
# would also match a longer name such as 'multivitamin capsule' if one occurs
# in the data — confirm intended.
flagged_medications = (
    'atorvastatin',
    'cerivastatin',
    'fluvastatin',
    'lovastatin',
    'pravastatin',
    'rosuvastatin',
    'simvastatin',
    'vitamin c',
    'vitamin b',
)
medication_col = 'treatment/medication code'
for cohort in (case_cohort, control_cohort):
    medications = cohort[medication_col].fillna('No')
    # Start from rows already coded '1' so that re-running the cell is stable.
    takes_flagged = medications == '1'
    for name in flagged_medications:
        takes_flagged = takes_flagged | medications.str.contains(name, regex=False)
    cohort[medication_col] = takes_flagged.map({True: '1', False: '0'})

In [26]:
# Binarise the ICD10 diagnosis list: '1' when it contains code 'Y42.5',
# otherwise '0' (missing lists count as 'No').
# Two fixes over the previous version:
#   * the code is matched literally (`regex=False`); as a regular expression
#     the '.' in 'Y42.5' would match any character;
#   * the filled Series is kept locally and the flag assigned back to the
#     column, instead of a chained `fillna(..., inplace=True)` that may not
#     reach the frame and would leave NaN in the mask.
for cohort in (case_cohort, control_cohort):
    diagnoses = cohort['ICD10'].fillna('No')
    # `== '1'` keeps already-coded rows at '1' when the cell is re-run.
    has_code = diagnoses.str.contains('Y42.5', regex=False) | (diagnoses == '1')
    cohort['ICD10'] = has_code.map({True: '1', False: '0'})

In [27]:
# Use 0 to fill the missing T2D values in both cohorts.
# The filled Series is assigned back to the column: a chained
# `fillna(..., inplace=True)` mutates a temporary object, which pandas
# deprecates and which no longer updates the frame under Copy-on-Write.
for cohort in (case_cohort, control_cohort):
    cohort['T2D'] = cohort['T2D'].fillna(0)

In [28]:
# Remove the 'job code' column from both cohorts.
case_cohort, control_cohort = (
    cohort.drop(columns=['job code'])
    for cohort in (case_cohort, control_cohort)
)

In [29]:
# Display the control cohort after the recoding and imputation cells above.
control_cohort

Unnamed: 0,eid,smoking status,sleep duration,sleeplessness,coffee intake,tea intake,moderate physical activity time spent,vigorous physical activity time spent,duration of moderate physical activity-pilot,duration of vigorous physical activity-pilot,...,pork intake,qualifications,BMI,waist circumference,HDL-C,LDL-C,apolipoprotein B,treatment/medication code,ICD10,T2D
0,1000215,1,6,0,2,0,0,0,0,0,...,0,1,24.6355,80.0,0.088206,0.233285,1.179000,0,0,-0.537662
1,1000544,0,7,0,0,4,1,0,0,0,...,0,0,21.8195,67.0,1.367390,-1.021660,0.857000,0,0,0.000000
2,1000821,1,5,1,2,4,0,0,0,0,...,0,0,26.0156,74.0,-0.658663,0.453621,0.926000,1,0,0.000000
3,1001040,0,7,1,0,10,0,0,0,0,...,1,0,21.9854,72.0,0.725695,-0.454302,1.046000,0,0,0.000000
4,1001071,1,7,0,2,4,0,0,0,0,...,0,0,22.8538,78.0,0.022764,-0.077343,1.060069,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50582,6022151,0,8,0,0,2,0,0,0,0,...,0,0,27.7651,87.0,-1.058820,-0.942836,1.311000,0,0,-1.360960
50583,6022571,0,6,0,0,7,0,0,0,0,...,0,0,25.9651,74.0,1.139240,-0.222365,1.094000,0,0,0.000000
50584,6023181,0,6,1,4,4,0,0,0,0,...,0,0,23.8150,74.0,-0.896226,-0.271863,0.842000,0,0,-1.624070
50585,6023229,1,7,1,1,6,0,0,0,0,...,0,0,27.7441,88.0,0.022764,-0.077343,1.060069,0,0,0.000000


In [30]:
# Save the cleaned cohorts as CSV files without the row index.
# `index=False` is the documented boolean form; `index=None` only worked
# because None is falsy.
case_cohort.to_csv('../datasets/25-feature/cleaned_case.csv', index=False)
control_cohort.to_csv('../datasets/25-feature/cleaned_control.csv', index=False)