# Install

In [1]:
! pip install memory_profiler
%load_ext memory_profiler 

Defaulting to user installation because normal site-packages is not writeable


# Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearnex import patch_sklearn  # Speeds up sklearn with intel patch
patch_sklearn()  # Activate patch - changes sklearn imports below

from timeit import default_timer as timer # Time how long commands take
from sklearn.model_selection import train_test_split, StratifiedKFold  # test_train split, cross-validation

from sklearn.experimental import enable_iterative_imputer  # Iterative imputer experimental so need to enable it
from sklearn.impute import IterativeImputer  # Once enabled iterative imputer can be imported

from sklearn.linear_model import RidgeClassifier, BayesianRidge  # Imputation
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder  # Normalisation & Encoding
from imblearn.under_sampling import TomekLinks, RandomUnderSampler   # Undersampling
from imblearn.over_sampling import SMOTENC  # Oversampling
from sklearn.feature_selection import RFE, RFECV  # Recursive feature elimination - feature selection
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier  # RFE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import validation_curve

import pickle

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Variables

In [3]:
# Number of CPU cores to use.
# NOTE(review): presumably forwarded as the `n_jobs` argument of the
# scikit-learn estimators/searches imported above — confirm at the call sites.
n_jobs = 10

In [4]:
# Fixed random seed shared across the notebook.
# NOTE(review): presumably forwarded as `random_state` to splitters, samplers
# and estimators so runs are reproducible — confirm at the call sites.
random_state = 14

# General Functions

In [5]:
# Stopwatch to profile function runtimes
class Stopwatch:
    """Stopwatch that starts timing as soon as it is constructed.

    Direct use::

        sw = Stopwatch()
        ...                      # code being timed
        elapsed = sw.stop()

    Context-manager use (the watch is stopped on leaving the block, even
    when the block raises)::

        with Stopwatch() as sw:
            ...                  # code being timed
        sw.runtime

    Attributes:
        start: Timer reading taken when the instance was created.
        end: Timer reading taken by the most recent ``stop()`` call;
            ``None`` until ``stop()`` has been called.
        runtime: ``end - start`` (seconds, as reported by
            ``timeit.default_timer``); ``None`` until ``stop()`` has been
            called.
    """

    def __init__(self):
        # Timing begins at construction, not at a separate start() call
        self.start = timer()
        self.end = None
        self.runtime = None

    def stop(self):
        """Record the current timer reading and return the elapsed time.

        May be called more than once; every call measures from the original
        ``start`` and overwrites ``end`` and ``runtime``.

        Returns:
            The elapsed time ``end - start``.
        """
        self.end = timer()
        self.runtime = self.end - self.start
        return self.runtime

    def __enter__(self):
        """Return the stopwatch itself; ``start`` is left untouched."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop the watch on block exit without suppressing exceptions."""
        self.stop()
        return False

# Manual Feature Selection

In [6]:
# Load the tab-separated dataset: the first row supplies the column names
# and the first column becomes the row index
df = pd.read_csv(
    '/data/home/bt211037/dissertation/supervised_ML_data.tsv',
    sep='\t',
    header=0,
    index_col=0,
)


In [9]:
# Collect every column name of the loaded frame as a plain Python list
features = df.columns.tolist()

In [10]:
# Display the full list of feature names so they can be reviewed by eye
features

['Weight method|x21_0_0',
 'Spirometry method|x23_0_0',
 'Year of birth|x34_0_0',
 'Hand grip strength (left)|x46_0_0',
 'Hand grip strength (right)|x47_0_0',
 'Waist circumference|x48_0_0',
 'Hip circumference|x49_0_0',
 'Standing height|x50_0_0',
 'Month of birth|x52_0_0',
 'UK Biobank assessment centre|x54_0_0',
 'Month of attending assessment centre|x55_0_0',
 'Time since interview start at which blood pressure screen(s) shown|x96_0_0',
 'Pulse rate automated reading|x102_0_0',
 'Birth weight known|x120_0_0',
 'Place of birth in UK - north co-ordinate|x129_0_0',
 'Place of birth in UK - east co-ordinate|x130_0_0',
 'Number of self-reported cancers|x134_0_0',
 'Number of self-reported non-cancer illnesses|x135_0_0',
 'Number of operations self-reported|x136_0_0',
 'Number of treatments/medications taken|x137_0_0',
 'Townsend deprivation index at recruitment|x189_0_0',
 'Type of accommodation lived in|x670_0_0',
 'Own or rent accommodation lived in|x680_0_0',
 'Length of time at curr

In [11]:
# Column names to be excluded from the feature set
bad_feats = [
    'Number of self-reported cancers|x134_0_0',
    'Number of operations self-reported|x136_0_0',
    'Number of treatments/medications taken|x137_0_0',
    'Cancer diagnosed by doctor|x2453_0_0',
    'Taking other prescription medications|x2492_0_0',
    'Other serious medical condition/disability diagnosed by doctor|x2473_0_0',
    'Records in HES inpatient operations dataset|x41149_0_0',
    'Records in HES inpatient diagnoses dataset|x41234_0_0',
    'Records in HES inpatient main dataset|x41259_0_0',
]

In [13]:
# Keep, in their original order, only the features that are not listed in bad_feats
m_sel_features = [
    feature_name
    for feature_name in features
    if feature_name not in bad_feats
]

In [17]:
# Build a new frame holding all rows but only the manually selected feature columns
no_leakage_data = df.loc[:, m_sel_features]

In [18]:
# Write the filtered frame to disk as a tab-separated file, index column included
no_leakage_data.to_csv(
    '/data/home/bt211037/dissertation/no_leakage_data.tsv',
    sep='\t',
    index=True,
)

