## Notebook Generated using Apocrita OnDemand

Hudak et al., (2018). Open OnDemand: A web-based client portal for HPC centers. Journal of Open Source Software, 3(25), 622, https://doi.org/10.21105/joss.00622

In [25]:
pwd

'/data/home/bt211037/dissertation'

In [None]:
# Imports
import pandas as pd # dataframes
import matplotlib.pyplot as plt # plots
import seaborn as sns # plots
import missingno as msno # Missing values
import numpy as np # arrays
import math # math.ceil
from dython.nominal import associations # correlation anal
from timeit import default_timer as timer # Time how long commands take

import warnings # Control warnings
warnings.filterwarnings('ignore') # Ignore warnings to clean up visual output change back to 'default' to see them

%matplotlib inline
plt.style.use('ggplot')

# Get dataframe

In [None]:
# Read in phenotype dataframe
df = pd.read_csv('/data/home/bt211037/dissertation/input/raw_data/FINAL_25.tab', sep='\t', header=0, index_col=0)

In [None]:
# Read in disease dataframe to extract thyroid cancer column
diseases_df = pd.read_csv('/data/home/bt211037/dissertation/input/raw_data/phecodes_clean_2021.txt', sep='\t', header=0, index_col=0)

In [None]:
# Add thyroid cancer column to phenotype df
df['thyroid_cancer']=diseases_df['p193']

In [None]:
# Remove rows with missing thyroid cancer values
df=df.loc[df['thyroid_cancer'].notnull()]

# Convert phenotype codes to descriptions

In [None]:

# Function for converting phecodes to descriptions. Works when descriptions/phecodes are the headers.
def phecode_header_conversion(data, dictionary_df, key_col, new_col, index_col=False):

    dataframe = data # re-assign dataframe
    colnames = list(dataframe.columns)  # Create a list of phecodes in existing df
    new = []  # New column entries - descriptions

    for item in colnames:  # Iterate over phecodes
        # Iterate over dictionary df keys (phecodes) and values (descriptions)
        for key, value in zip(list(dictionary_df[key_col]), list(dictionary_df[new_col])):
            if item == key:  # If phecode in current df
                new.append(value)  # Add corresponding description to new list
            else:  # If not pass
                pass

    dataframe.set_axis(new, 1, inplace=True)   # Replace headers with descriptions

    return dataframe  # Return new dataframe

In [None]:
# Read in data containg phenotype descriptions
dictionary_df = pd.read_csv('/data/home/bt211037/dissertation/input/raw_data/ukbb_and_phec_clean.txt', sep='\t', header=0)

In [None]:
# Check if all column headers are in dictionary 
cols = list(df.columns) # Create list of column names in our data
col_translations = list(dictionary_df['FieldID']) # List of possible columns names in dictionary 

not_included = set(cols) - set(col_translations) # Find values in columns headers not in dictionary
not_included

- Thyroid cancer and medications cbi do not need to be translated as they are already descriptive.
- The other variants are the genetic principle components from 16-40. Discovered by searching for 22009 in the Biobank  [data showcase](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=22009).
- Clearly, genetic principle compnents 1-16 have been included but the others have been missed and need to be added.

In [None]:
# Determine how existing genetics principle components are described
dictionary_df.loc[dictionary_df['FieldID'] == 'x22009_0_15']

In [None]:
# Add the missing genetic principle components
for item in not_included:
    if item == 'medication_cbi' or item == 'thyroid_cancer': # Add these as they are for simplicity 
        dictionary_df.loc[len(dictionary_df.index)] = [item, item] 
    else:
        dictionary_df.loc[len(dictionary_df.index)] = [item, f'Genetic principal components_{item[-2:]}|{item}'] 

In [None]:
# Check all items have been added 
cols2 = list(df.columns) # Create list of column names in our data
col_translations2 = list(dictionary_df['FieldID']) # List of possible columns names in dictionary 
not_included2 = set(cols2) - set(col_translations2)
len(not_included2)

In [None]:
# Manually check new entries to dicitonary are there
dictionary_df.tail(30)

In [None]:
# Convert the dataframe headers
df = phecode_header_conversion(df, dictionary_df, 'FieldID', 'Field' )

# Missing Values

In [None]:
# Matrix plot giving positional information about missing values
msno.matrix(df)

# Target-Class Imbalance

In [None]:
# Create a dataframe showing thyroid cancer vs no thyroid cancer to plot from
c_imbal = pd.DataFrame([[len(df.loc[df['thyroid_cancer'] == 1]), 'Thyroid_Cancer'], [len(df.loc[df['thyroid_cancer'] != 1]), 'No_Thyroid_Cancer']], columns=['Participant_Count', 'Disease_State'])
c_imbal

In [None]:
# Plot bar chart of imbalance
colours = ['indianred', 'powderblue'] # Colour names
plt.figure(figsize=(6,6)) # Figure size
plt.rcParams['axes.facecolor'] = 'white' # Barchart background colour
plt.grid() # Remove grid background

bars = plt.bar(c_imbal['Disease_State'], c_imbal['Participant_Count'], color=colours) # Create the bars
plt.bar_label(bars) # Add number label on top of bars
plt.xticks(fontsize=13) # Fontsize of x-axis bar labels

#plt.savefig('class_imbalance_bar.png', bbox_inches='tight') # Save figure
plt.show() # Show figure


# Summary Statistics

In [None]:
# Function to get column names for continuous and non-continuous (categorical) features
def find_cat_or_con_columns(dataframe):

    columns = [list(dataframe[i]) for i in dataframe] # Nested list of column values

    uniques = [len(set([i for i in a if pd.notna(i)])) for a in columns] # Num of unique values in a column

    continuous_indexes = [i for i, c in enumerate(uniques) if c > 50] # Indexes of continuous columns

    categorical_indexes = [i for i, c in enumerate(uniques) if c <= 50] # Indexes of categorical columns

    con_cols = [dataframe.columns[x] for x in continuous_indexes] # List containing continuous columns names

    cat_cols = [dataframe.columns[x] for x in categorical_indexes] # List containing categorical columns names

    return con_cols, cat_cols # return two lists of continuous & categorical column names

In [None]:
# Find continuous columns
continuous_cols, categorical_cols = find_cat_or_con_columns(df)

# Number of continuous cols
num_con = len(continuous_cols)

# Summary statistics for continuous columns (transpose for presentation)
con_summary = df[continuous_cols].groupby(df['thyroid_cancer']).describe().T

print(num_con)
print()
con_summary.head(20)

In [None]:
### Boxplots ###
# For continuous columns
customPalette = sns.set_palette(sns.color_palette(colours))

fig_col_num = 5
fig_row_num = math.ceil(len(continuous_cols)/fig_col_num)
fig_height = fig_row_num*3
fig_width = fig_col_num*4

fig, axes = plt.subplots(fig_row_num, fig_col_num, figsize=(fig_width,fig_height))
axes = axes.flatten()
plt.rcParams['axes.facecolor'] = 'white'


for i, col in enumerate(df[continuous_cols]):
    sns.boxplot(x=df['thyroid_cancer'], y=col, data=df, palette=customPalette, ax = axes[i])
    axes[i].set(xlabel=None, ylabel=None)
    axes[i].tick_params(axis=u'both', which=u'both',length=0)
    axes[i].set_title('%s' %(col), fontweight='bold', fontsize=8)
    axes[i].set_xticklabels(['No_Thyroid_Cancer', 'Thyroid_Cancer'], size=8)

plt.tight_layout()
plt.grid()

plt.savefig('continuous_boxplots.png', bbox_inches='tight') # Save figure
plt.show()

# Correlation

Using dython library which calculates correlations for different feature types(continuous & categorical) using different methods for each. Further details in documentation [here](http://shakedzy.xyz/dython/modules/nominal/) and guide [here](https://medium.com/@knoldus/how-to-find-correlation-value-of-categorical-variables-23de7e7a9e26).


In [None]:
# Runs the analysis producing a heatmap and matrix (heatmap too large)
s = timer()
corr_matrix = associations(df, nominal_columns=categorical_cols)
e = timer()
print(f'Time taken for correlations: {(e - s)/60} mins')

In [None]:
# Extract the matrix as heatmap is too large 
df_complete_corr=corr_matrix['corr']
df_complete_corr = df_complete_corr.dropna(axis=1, how='all').dropna(axis=0, how='all')

In [None]:
df_complete_corr

In [None]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]


In [None]:
# Find 100 highest correlations
high_correlations = get_top_abs_correlations(df_complete_corr, 100)

In [None]:
# Look at how correlated the lowest of these correlations are 
high_correlations.tail(10)

In [None]:
most_correlated = [list(x) for x in high_correlations.index]

In [None]:
high_corr = []
for item in most_correlated:
    for x in item:
        if x not in high_corr:
            high_corr.append(x)
        else:
            pass
    

In [None]:
high_corr

In [None]:
# Pairplot to look at correlated values - batched in smaller groups

customPalette = sns.set_palette(sns.color_palette(colours))

data = high_corr[0:5]
data.append('thyroid_cancer')

sns.pairplot(df[data], hue="thyroid_cancer", corner=True, palette=customPalette)

plt.show()


# Look at categorical data 

Will labal encoding or categorical encoding be needed and for which columns.

In [None]:
# Look at categorical clumns to see which should be one hot encoded and which label encoded
categorical_cols

In [None]:
one_hot = ['Weight method|x21_0_0', 'Spirometry method|x23_0_0', 'Sex|x31_0_0', 
           'UK Biobank assessment centre|x54_0_0', 'Birth weight known|x120_0_0', 
           'Type of accommodation lived in|x670_0_0',
           'Own or rent accommodation lived in|x680_0_0', 'Drive faster than motorway speed limit|x1100_0_0',
           'Usual side of head for mobile phone use|x1150_0_0', 'Usual side of head for mobile phone use|x1150_0_0',
           'Morning/evening person (chronotype)|x1180_0_0', 'Nap during day|x1190_0_0', 'Snoring|x1210_0_0',
           'Daytime dozing / sleeping (narcolepsy)|x1220_0_0', 'Current tobacco smoking|x1239_0_0',
           'Past tobacco smoking|x1249_0_0', 'Major dietary changes in the last 5 years|x1538_0_0',
           'Variation in diet|x1548_0_0',  'Alcohol usually taken with meals|x1618_0_0',
           'Alcohol intake versus 10 years previously|x1628_0_0', 'Skin colour|x1717_0_0', 
           'Ease of skin tanning|x1727_0_0', 'Hair colour (natural before greying)|x1747_0_0',
           'Facial ageing|x1757_0_0', 'Father still alive|x1797_0_0', 'Mother still alive|x1835_0_0', 
           'Mood swings|x1920_0_0', 'Miserableness|x1930_0_0', 'Irritability|x1940_0_0', 
           'Sensitivity / hurt feelings|x1950_0_0', 'Fed-up feelings|x1960_0_0', 'Nervous feelings|x1970_0_0',
           'Worrier / anxious feelings|x1980_0_0', "Tense / 'highly strung'|x1990_0_0",
           'Worry too long after embarrassment|x2000_0_0', "Suffer from 'nerves'|x2010_0_0",
           'Loneliness isolation|x2020_0_0', 'Guilty feelings|x2030_0_0', 'Risk taking|x2040_0_0',
           'Seen doctor (GP) for nerves anxiety tension or depression|x2090_0_0',
           'Seen a psychiatrist for nerves anxiety tension or depression|x2100_0_0',
           'Able to confide|x2110_0_0',
           'Answered sexual history questions|x2129_0_0',
           'Ever had same-sex intercourse|x2159_0_0', 'Long-standing illness disability or infirmity|x2188_0_0',
           'Wears glasses or contact lenses|x2207_0_0', 'Other eye problems|x2227_0_0',
           'Plays computer games|x2237_0_0', 'Hearing difficulty/problems|x2247_0_0',
           'Hearing difficulty/problems with background noise|x2257_0_0', 'Use of sun/uv protection|x2267_0_0',
           'Weight change compared with 1 year ago|x2306_0_0',
           'Wheeze or whistling in the chest in last year|x2316_0_0',
           'Chest pain or discomfort|x2335_0_0',
           'Ever had bowel cancer screening|x2345_0_0',
           'Diabetes diagnosed by doctor|x2443_0_0',
           'Cancer diagnosed by doctor|x2453_0_0',
           'Fractured/broken bones in last 5 years|x2463_0_0',
           'Other serious medical condition/disability diagnosed by doctor|x2473_0_0',
           'Taking other prescription medications|x2492_0_0',
           'Pace-maker|x3079_0_0', 'Contra-indications for spirometry|x3088_0_0', 
           'Caffeine drink within last hour|x3089_0_0', 'Used an inhaler for chest within last hour|x3090_0_0',
           'Method of measuring blood pressure|x4081_0_0', 'Qualifications|x6138_0_0',
           'Gas or solid-fuel cooking/heating|x6139_0_0', 
           'How are people in household related to participant|x6141_0_0',
           'Current employment status|x6142_0_0',
           'Never eat eggs dairy wheat sugar|x6144_0_0',
           'Illness injury bereavement stress in last 2 years|x6145_0_0',
           'Attendance/disability/mobility allowance|x6146_0_0',
           'Mouth/teeth dental problems|x6149_0_0',
           'Medication for pain relief constipation heartburn|x6154_0_0',
           'Vitamin and mineral supplements|x6155_0_0', 'Pain type(s) experienced in last month|x6159_0_0',
           'Leisure/social activities|x6160_0_0',
           'Types of transport used (excluding work)|x6162_0_0',
           'Types of physical activity in last 4 weeks|x6164_0_0',
           'Mineral and other dietary supplements|x6179_0_0',
           'Illnesses of father|x20107_0_0',
           'Illnesses of mother|x20110_0_0',
           'Illnesses of siblings|x20111_0_0',
           'Smoking status|x20116_0_0',
           'Alcohol drinker status|x20117_0_0',
           'Home area population density - urban or rural|x20118_0_0',
           'Ever smoked|x20160_0_0',  'Spirometry QC measure|x20255_0_0', 'Genetic sex|x22001_0_0',
           'Genetic kinship to other participants|x22021_0_0',
           'IPAQ activity group|x22032_0_0', 'Summed days activity|x22033_0_0',
           'Above moderate/vigorous recommendation|x22035_0_0',
           'Above moderate/vigorous/walking recommendation|x22036_0_0',
           'Close to major road|x24014_0_0', 'medication_cbi' ]

In [None]:
label = ['Year of birth|x34_0_0', 'Month of birth|x52_0_0', 'Month of attending assessment centre|x55_0_0', 
         'Number of self-reported cancers|x134_0_0', 'Number of self-reported non-cancer illnesses|x135_0_0',
         'Number of operations self-reported|x136_0_0', 'Number of treatments/medications taken|x137_0_0',
         'Number in household|x709_0_0',  'Number of vehicles in household|x728_0_0', 
         'Average total household income before tax|x738_0_0',
         'Number of days/week walked 10+ minutes|x864_0_0', 
         'Number of days/week of moderate physical activity 10+ minutes|x884_0_0',
         'Number of days/week of vigorous physical activity 10+ minutes|x904_0_0', 'Usual walking pace|x924_0_0',
         'Frequency of stair climbing in last 4 weeks|x943_0_0', 'Frequency of friend/family visits|x1031_0_0',
         'Time spend outdoors in summer|x1050_0_0', 'Time spent outdoors in winter|x1060_0_0', 
         'Time spent watching television (TV)|x1070_0_0','Time spent using computer|x1080_0_0', 
         'Time spent driving|x1090_0_0', 'Length of mobile phone use|x1110_0_0',
         'Weekly usage of mobile phone in last 3 months|x1120_0_0',
         'Hands-free device/speakerphone use with mobile phone in last 3 month|x1130_0_0',
         'Difference in mobile phone use compared to two years previously|x1140_0_0', 'Sleep duration|x1160_0_0',
         'Getting up in morning|x1170_0_0', 'Sleeplessness / insomnia|x1200_0_0', 
         'Smoking/smokers in household|x1259_0_0', 'Cooked vegetable intake|x1289_0_0',
         'Salad / raw vegetable intake|x1299_0_0', 'Fresh fruit intake|x1309_0_0', 'Dried fruit intake|x1319_0_0',
         'Oily fish intake|x1329_0_0', 'Non-oily fish intake|x1339_0_0', 'Processed meat intake|x1349_0_0',
         'Poultry intake|x1359_0_0', 'Beef intake|x1369_0_0', 'Lamb/mutton intake|x1379_0_0',
         'Pork intake|x1389_0_0', 'Cheese intake|x1408_0_0', 'Milk type used|x1418_0_0',
         'Spread type|x1428_0_0', 'Bread type|x1448_0_0', 'Cereal intake|x1458_0_0',
         'Cereal type|x1468_0_0', 'Salt added to food|x1478_0_0', 'Coffee intake|x1498_0_0',
         'Coffee type|x1508_0_0', 'Hot drink temperature|x1518_0_0', 'Water intake|x1528_0_0',
         'Alcohol intake frequency.|x1558_0_0', 'Number of full brothers|x1873_0_0',
         'Number of full sisters|x1883_0_0', 'Frequency of depressed mood in last 2 weeks|x2050_0_0',
         'Frequency of unenthusiasm / disinterest in last 2 weeks|x2060_0_0',
         'Frequency of tenseness / restlessness in last 2 weeks|x2070_0_0',
         'Frequency of tiredness / lethargy in last 2 weeks|x2080_0_0', 'Overall health rating|x2178_0_0'
         'Falls in the last year|x2296_0_0', 'Result ranking|x3059_0_0',
         'Ordering of blows|x3065_0_0',  'Seating box height|x3077_0_0', 'Number of measurements made|x3137_0_0',
         'Neuroticism score|x20127_0_0', 'Age when attended assessment centre|x21003_0_0',
         'Age at recruitment|x21022_0_0', 'Genotype measurement batch|x22000_0_0',
         'Total volume of urine samples held by UKB|x30394_0_0']

In [None]:
# Save dataframe with new headers for downstream analsysis
df.to_csv('/data/home/bt211037/dissertation/supervised_ML_data.tsv', sep='\t')

In [50]:
# Look at how correlated the lowest of these correlations are 
high_correlations.tail(10)

Summed minutes activity|x22034_0_0     Summed MET minutes per week for all activity|x22040_0_0    0.994302
Arm predicted mass (left)|x23126_0_0   Trunk fat-free mass|x23129_0_0                             0.994229
Whole body fat mass|x23100_0_0         Arm fat mass (left)|x23124_0_0                             0.994208
Leg predicted mass (right)|x23114_0_0  Trunk fat-free mass|x23129_0_0                             0.994090
Leg fat-free mass (right)|x23113_0_0   Arm fat-free mass (left)|x23125_0_0                        0.994078
Leg predicted mass (right)|x23114_0_0  Arm predicted mass (left)|x23126_0_0                       0.994042
Platelet count|x30080_0_0              Platelet crit|x30090_0_0                                   0.993873
Leg predicted mass (left)|x23118_0_0   Arm fat-free mass (left)|x23125_0_0                        0.993831
Leg predicted mass (right)|x23114_0_0  Trunk predicted mass|x23130_0_0                            0.993688
Leg fat-free mass (right)|x23113_0_0 

In [51]:
most_correlated = [list(x) for x in high_correlations.index]

In [52]:
high_corr = []
for item in most_correlated:
    for x in item:
        if x not in high_corr:
            high_corr.append(x)
        else:
            pass
    

In [53]:
high_corr

['Age when attended assessment centre|x21003_0_0',
 'Age at recruitment|x21022_0_0',
 'Sex|x31_0_0',
 'Genetic sex|x22001_0_0',
 'Average daytime sound level of noise pollution|x24020_0_0',
 'Average 24-hour sound level of noise pollution|x24024_0_0',
 'Average 16-hour sound level of noise pollution|x24023_0_0',
 'Whole body fat-free mass|x23101_0_0',
 'Whole body water mass|x23102_0_0',
 'Trunk fat-free mass|x23129_0_0',
 'Trunk predicted mass|x23130_0_0',
 'Arm fat-free mass (right)|x23121_0_0',
 'Arm predicted mass (right)|x23122_0_0',
 'Leg fat-free mass (right)|x23113_0_0',
 'Leg predicted mass (right)|x23114_0_0',
 'Leg fat-free mass (left)|x23117_0_0',
 'Leg predicted mass (left)|x23118_0_0',
 'Arm fat-free mass (left)|x23125_0_0',
 'Arm predicted mass (left)|x23126_0_0',
 'Arm fat mass (right)|x23120_0_0',
 'Arm fat mass (left)|x23124_0_0',
 'Average evening sound level of noise pollution|x24021_0_0',
 'Arm fat percentage (right)|x23119_0_0',
 'Arm fat percentage (left)|x23123_

In [None]:
# Pairplot to look at correlated values - batched in smaller groups

customPalette = sns.set_palette(sns.color_palette(colours))

data = high_corr[0:5]
data.append('thyroid_cancer')

sns.pairplot(df[data], hue="thyroid_cancer", corner=True, palette=customPalette)

plt.show()


# Look at categorical data 

Will labal encoding or categorical encoding be needed and for which columns.

In [None]:
# Look at categorical clumns to see which should be one hot encoded and which label encoded
categorical_cols

In [None]:
one_hot = ['Weight method|x21_0_0', 'Spirometry method|x23_0_0', 'Sex|x31_0_0', 
           'UK Biobank assessment centre|x54_0_0', 'Birth weight known|x120_0_0', 
           'Type of accommodation lived in|x670_0_0',
           'Own or rent accommodation lived in|x680_0_0', 'Drive faster than motorway speed limit|x1100_0_0',
           'Usual side of head for mobile phone use|x1150_0_0', 'Usual side of head for mobile phone use|x1150_0_0',
           'Morning/evening person (chronotype)|x1180_0_0', 'Nap during day|x1190_0_0', 'Snoring|x1210_0_0',
           'Daytime dozing / sleeping (narcolepsy)|x1220_0_0', 'Current tobacco smoking|x1239_0_0',
           'Past tobacco smoking|x1249_0_0', 'Major dietary changes in the last 5 years|x1538_0_0',
           'Variation in diet|x1548_0_0',  'Alcohol usually taken with meals|x1618_0_0',
           'Alcohol intake versus 10 years previously|x1628_0_0', 'Skin colour|x1717_0_0', 
           'Ease of skin tanning|x1727_0_0', 'Hair colour (natural before greying)|x1747_0_0',
           'Facial ageing|x1757_0_0', 'Father still alive|x1797_0_0', 'Mother still alive|x1835_0_0', 
           'Mood swings|x1920_0_0', 'Miserableness|x1930_0_0', 'Irritability|x1940_0_0', 
           'Sensitivity / hurt feelings|x1950_0_0', 'Fed-up feelings|x1960_0_0', 'Nervous feelings|x1970_0_0',
           'Worrier / anxious feelings|x1980_0_0', "Tense / 'highly strung'|x1990_0_0",
           'Worry too long after embarrassment|x2000_0_0', "Suffer from 'nerves'|x2010_0_0",
           'Loneliness isolation|x2020_0_0', 'Guilty feelings|x2030_0_0', 'Risk taking|x2040_0_0',
           'Seen doctor (GP) for nerves anxiety tension or depression|x2090_0_0',
           'Seen a psychiatrist for nerves anxiety tension or depression|x2100_0_0',
           'Able to confide|x2110_0_0',
           'Answered sexual history questions|x2129_0_0',
           'Ever had same-sex intercourse|x2159_0_0', 'Long-standing illness disability or infirmity|x2188_0_0',
           'Wears glasses or contact lenses|x2207_0_0', 'Other eye problems|x2227_0_0',
           'Plays computer games|x2237_0_0', 'Hearing difficulty/problems|x2247_0_0',
           'Hearing difficulty/problems with background noise|x2257_0_0', 'Use of sun/uv protection|x2267_0_0',
           'Weight change compared with 1 year ago|x2306_0_0',
           'Wheeze or whistling in the chest in last year|x2316_0_0',
           'Chest pain or discomfort|x2335_0_0',
           'Ever had bowel cancer screening|x2345_0_0',
           'Diabetes diagnosed by doctor|x2443_0_0',
           'Cancer diagnosed by doctor|x2453_0_0',
           'Fractured/broken bones in last 5 years|x2463_0_0',
           'Other serious medical condition/disability diagnosed by doctor|x2473_0_0',
           'Taking other prescription medications|x2492_0_0',
           'Pace-maker|x3079_0_0', 'Contra-indications for spirometry|x3088_0_0', 
           'Caffeine drink within last hour|x3089_0_0', 'Used an inhaler for chest within last hour|x3090_0_0',
           'Method of measuring blood pressure|x4081_0_0', 'Qualifications|x6138_0_0',
           'Gas or solid-fuel cooking/heating|x6139_0_0', 
           'How are people in household related to participant|x6141_0_0',
           'Current employment status|x6142_0_0',
           'Never eat eggs dairy wheat sugar|x6144_0_0',
           'Illness injury bereavement stress in last 2 years|x6145_0_0',
           'Attendance/disability/mobility allowance|x6146_0_0',
           'Mouth/teeth dental problems|x6149_0_0',
           'Medication for pain relief constipation heartburn|x6154_0_0',
           'Vitamin and mineral supplements|x6155_0_0', 'Pain type(s) experienced in last month|x6159_0_0',
           'Leisure/social activities|x6160_0_0',
           'Types of transport used (excluding work)|x6162_0_0',
           'Types of physical activity in last 4 weeks|x6164_0_0',
           'Mineral and other dietary supplements|x6179_0_0',
           'Illnesses of father|x20107_0_0',
           'Illnesses of mother|x20110_0_0',
           'Illnesses of siblings|x20111_0_0',
           'Smoking status|x20116_0_0',
           'Alcohol drinker status|x20117_0_0',
           'Home area population density - urban or rural|x20118_0_0',
           'Ever smoked|x20160_0_0',  'Spirometry QC measure|x20255_0_0', 'Genetic sex|x22001_0_0',
           'Genetic kinship to other participants|x22021_0_0',
           'IPAQ activity group|x22032_0_0', 'Summed days activity|x22033_0_0',
           'Above moderate/vigorous recommendation|x22035_0_0',
           'Above moderate/vigorous/walking recommendation|x22036_0_0',
           'Close to major road|x24014_0_0', 'medication_cbi' ]

In [None]:
label = ['Year of birth|x34_0_0', 'Month of birth|x52_0_0', 'Month of attending assessment centre|x55_0_0', 
         'Number of self-reported cancers|x134_0_0', 'Number of self-reported non-cancer illnesses|x135_0_0',
         'Number of operations self-reported|x136_0_0', 'Number of treatments/medications taken|x137_0_0',
         'Number in household|x709_0_0',  'Number of vehicles in household|x728_0_0', 
         'Average total household income before tax|x738_0_0',
         'Number of days/week walked 10+ minutes|x864_0_0', 
         'Number of days/week of moderate physical activity 10+ minutes|x884_0_0',
         'Number of days/week of vigorous physical activity 10+ minutes|x904_0_0', 'Usual walking pace|x924_0_0',
         'Frequency of stair climbing in last 4 weeks|x943_0_0', 'Frequency of friend/family visits|x1031_0_0',
         'Time spend outdoors in summer|x1050_0_0', 'Time spent outdoors in winter|x1060_0_0', 
         'Time spent watching television (TV)|x1070_0_0','Time spent using computer|x1080_0_0', 
         'Time spent driving|x1090_0_0', 'Length of mobile phone use|x1110_0_0',
         'Weekly usage of mobile phone in last 3 months|x1120_0_0',
         'Hands-free device/speakerphone use with mobile phone in last 3 month|x1130_0_0',
         'Difference in mobile phone use compared to two years previously|x1140_0_0', 'Sleep duration|x1160_0_0',
         'Getting up in morning|x1170_0_0', 'Sleeplessness / insomnia|x1200_0_0', 
         'Smoking/smokers in household|x1259_0_0', 'Cooked vegetable intake|x1289_0_0',
         'Salad / raw vegetable intake|x1299_0_0', 'Fresh fruit intake|x1309_0_0', 'Dried fruit intake|x1319_0_0',
         'Oily fish intake|x1329_0_0', 'Non-oily fish intake|x1339_0_0', 'Processed meat intake|x1349_0_0',
         'Poultry intake|x1359_0_0', 'Beef intake|x1369_0_0', 'Lamb/mutton intake|x1379_0_0',
         'Pork intake|x1389_0_0', 'Cheese intake|x1408_0_0', 'Milk type used|x1418_0_0',
         'Spread type|x1428_0_0', 'Bread type|x1448_0_0', 'Cereal intake|x1458_0_0',
         'Cereal type|x1468_0_0', 'Salt added to food|x1478_0_0', 'Coffee intake|x1498_0_0',
         'Coffee type|x1508_0_0', 'Hot drink temperature|x1518_0_0', 'Water intake|x1528_0_0',
         'Alcohol intake frequency.|x1558_0_0', 'Number of full brothers|x1873_0_0',
         'Number of full sisters|x1883_0_0', 'Frequency of depressed mood in last 2 weeks|x2050_0_0',
         'Frequency of unenthusiasm / disinterest in last 2 weeks|x2060_0_0',
         'Frequency of tenseness / restlessness in last 2 weeks|x2070_0_0',
         'Frequency of tiredness / lethargy in last 2 weeks|x2080_0_0', 'Overall health rating|x2178_0_0'
         'Falls in the last year|x2296_0_0', 'Result ranking|x3059_0_0',
         'Ordering of blows|x3065_0_0',  'Seating box height|x3077_0_0', 'Number of measurements made|x3137_0_0',
         'Neuroticism score|x20127_0_0', 'Age when attended assessment centre|x21003_0_0',
         'Age at recruitment|x21022_0_0', 'Genotype measurement batch|x22000_0_0',
         'Total volume of urine samples held by UKB|x30394_0_0']

In [35]:
# Save dataframe with new headers for downstream analsysis
df.to_csv('/data/home/bt211037/dissertation/supervised_ML_data.tsv', sep='\t')