# Install 

In [1]:
! pip install memory_profiler
%load_ext memory_profiler 

Defaulting to user installation because normal site-packages is not writeable


# Import 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearnex import patch_sklearn  # Speeds up sklearn with intel patch
patch_sklearn()  # Activate patch - changes sklearn imports below

from timeit import default_timer as timer # Time how long commands take
from sklearn.model_selection import train_test_split, StratifiedKFold  # test_train split, cross-validation

from sklearn.experimental import enable_iterative_imputer  # Iterative imputer experimental so need to enable it
from sklearn.impute import IterativeImputer  # Once enabled iterative imputer can be imported

from sklearn.linear_model import RidgeClassifier, BayesianRidge  # Imputation
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder  # Normalisation & Encoding
from imblearn.under_sampling import TomekLinks  # Undersampling
from imblearn.over_sampling import SMOTENC  # Oversampling
from sklearn.feature_selection import RFE, RFECV  # Recursive feature elimination - feature selection
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier  # RFE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC




Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Random State

In [7]:
random_state=0

# General Functions

In [3]:
# Stopwatch to profile function runtimes
class Stopwatch:

    # Initiate constructor
    def __init__(self):
        self.start = timer()
        self.end = None
        self.runtime = None

    # Stop stopwatch
    def stop(self):
        self.end = timer()
        self.runtime = self.end - self.start
        return self.runtime

# Data Cleaning

In [9]:
df = pd.read_csv('/data/home/bt211037/dissertation/supervised_ML_data.tsv',
                   sep='\t', header=0, index_col=0)


### One Hot Encoding (if required!) - ERROR NANs CONVERTED

In [17]:
# Columns requiring one hot encoding
onehot = ['Weight method|x21_0_0', 'Spirometry method|x23_0_0', 'UK Biobank assessment centre|x54_0_0',
          'Birth weight known|x120_0_0', 'Type of accommodation lived in|x670_0_0',
          'Own or rent accommodation lived in|x680_0_0', 'Drive faster than motorway speed limit|x1100_0_0',
          'Usual side of head for mobile phone use|x1150_0_0', 'Usual side of head for mobile phone use|x1150_0_0',
          'Morning/evening person (chronotype)|x1180_0_0', 'Nap during day|x1190_0_0', 'Snoring|x1210_0_0',
          'Daytime dozing / sleeping (narcolepsy)|x1220_0_0', 'Current tobacco smoking|x1239_0_0',
          'Past tobacco smoking|x1249_0_0', 'Major dietary changes in the last 5 years|x1538_0_0',
          'Variation in diet|x1548_0_0', 'Alcohol usually taken with meals|x1618_0_0',
          'Alcohol intake versus 10 years previously|x1628_0_0', 'Skin colour|x1717_0_0',
          'Ease of skin tanning|x1727_0_0', 'Hair colour (natural before greying)|x1747_0_0',
          'Facial ageing|x1757_0_0', 'Father still alive|x1797_0_0', 'Mother still alive|x1835_0_0',
          'Mood swings|x1920_0_0', 'Miserableness|x1930_0_0', 'Irritability|x1940_0_0',
          'Sensitivity / hurt feelings|x1950_0_0', 'Fed-up feelings|x1960_0_0', 'Nervous feelings|x1970_0_0',
          'Worrier / anxious feelings|x1980_0_0', "Tense / 'highly strung'|x1990_0_0",
          'Worry too long after embarrassment|x2000_0_0', "Suffer from 'nerves'|x2010_0_0",
          'Loneliness isolation|x2020_0_0', 'Guilty feelings|x2030_0_0', 'Risk taking|x2040_0_0',
          'Seen doctor (GP) for nerves anxiety tension or depression|x2090_0_0',
          'Seen a psychiatrist for nerves anxiety tension or depression|x2100_0_0', 'Able to confide|x2110_0_0',
          'Answered sexual history questions|x2129_0_0', 'Ever had same-sex intercourse|x2159_0_0',
          'Long-standing illness disability or infirmity|x2188_0_0', 'Wears glasses or contact lenses|x2207_0_0',
          'Other eye problems|x2227_0_0', 'Plays computer games|x2237_0_0', 'Hearing difficulty/problems|x2247_0_0',
          'Hearing difficulty/problems with background noise|x2257_0_0', 'Use of sun/uv protection|x2267_0_0',
          'Weight change compared with 1 year ago|x2306_0_0', 'Wheeze or whistling in the chest in last year|x2316_0_0',
          'Chest pain or discomfort|x2335_0_0', 'Ever had bowel cancer screening|x2345_0_0',
          'Diabetes diagnosed by doctor|x2443_0_0', 'Cancer diagnosed by doctor|x2453_0_0',
          'Fractured/broken bones in last 5 years|x2463_0_0',
          'Other serious medical condition/disability diagnosed by doctor|x2473_0_0',
          'Taking other prescription medications|x2492_0_0', 'Pace-maker|x3079_0_0',
          'Contra-indications for spirometry|x3088_0_0', 'Caffeine drink within last hour|x3089_0_0',
          'Used an inhaler for chest within last hour|x3090_0_0', 'Method of measuring blood pressure|x4081_0_0',
          'Qualifications|x6138_0_0', 'Gas or solid-fuel cooking/heating|x6139_0_0',
          'How are people in household related to participant|x6141_0_0', 'Current employment status|x6142_0_0',
          'Never eat eggs dairy wheat sugar|x6144_0_0', 'Illness injury bereavement stress in last 2 years|x6145_0_0',
          'Attendance/disability/mobility allowance|x6146_0_0', 'Mouth/teeth dental problems|x6149_0_0',
          'Medication for pain relief constipation heartburn|x6154_0_0', 'Vitamin and mineral supplements|x6155_0_0',
          'Pain type(s) experienced in last month|x6159_0_0', 'Leisure/social activities|x6160_0_0',
          'Types of transport used (excluding work)|x6162_0_0', 'Types of physical activity in last 4 weeks|x6164_0_0',
          'Mineral and other dietary supplements|x6179_0_0', 'Illnesses of father|x20107_0_0',
          'Illnesses of mother|x20110_0_0', 'Illnesses of siblings|x20111_0_0', 'Smoking status|x20116_0_0',
          'Alcohol drinker status|x20117_0_0', 'Home area population density - urban or rural|x20118_0_0',
          'Spirometry QC measure|x20255_0_0', 'Genetic sex|x22001_0_0',
          'Genetic kinship to other participants|x22021_0_0', 'IPAQ activity group|x22032_0_0',
          'Summed days activity|x22033_0_0', 'Above moderate/vigorous recommendation|x22035_0_0',
          'Above moderate/vigorous/walking recommendation|x22036_0_0', 'Close to major road|x24014_0_0',
          'medication_cbi']

In [18]:
# Feature encoding
def feature_encoding(dataframe, Onehot):
    
    encode_targets = []  # Remove binary columns
    for item in Onehot:
        col = dataframe[item]
        unique = len(set(col))
        if unique > 2:
            encode_targets.append(item)

    onehot = OneHotEncoder()
    onehot_data = onehot.fit_transform(dataframe[encode_targets]).toarray()  # Create new cols
    one_hot_names = onehot.get_feature_names_out(encode_targets)  # Get new col names
    onehot_df = pd.DataFrame(onehot_data, columns=one_hot_names, index=dataframe.index)  # Create df of new cols
    dataframe = dataframe.drop(axis=1, labels=Onehot)  # Drop original columns
    dataframe = dataframe.join(onehot_df)

    return dataframe

In [20]:
df = feature_encoding(df, Onehot=onehot)  # One hot encoding


### Test Train Split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'thyroid_cancer'],
                                                        df['thyroid_cancer'],
                                                        test_size=0.2,
                                                        shuffle=True,
                                                        random_state=random_state)

### Convert categorical columns to integers

In [23]:
# Find which columns are categorical and which continuous
def cat_con_cols(df):
    columns = [list(df[i]) for i in df]  # Nested list of column values
    num_unique_vals = [len(set([i for i in a if pd.notna(i)])) for a in columns]  # Num of unique values in a column

    categorical_indexes = [i for i, v in enumerate(num_unique_vals) if v <= 100
                           and set([a % 1 for a in df[df.columns[i]].dropna()]) == {0}]

    continuous_indexes = [i for i, v in enumerate(num_unique_vals) if v > 100 or
                          set([a % 1 for a in df[df.columns[i]].dropna()]) != {0}]

    cat = list(df.columns[categorical_indexes])
    con = list(df.columns[continuous_indexes])
    return cat, con


In [24]:
cat, con = cat_con_cols(X_train)  # Get the column names of the continuous and nominal data
X_train[cat] = X_train[cat].astype('Int64')  # Convert categorical cols values from floats to integers - train
X_test[cat] = X_test[cat].astype('Int64')  # Convert categorical cols values from floats to integers - test



TypeError: not all arguments converted during string formatting

### Normalisation

In [None]:
# Normalised feature scaling
def minmax_scaling(df, continuous_data):

    outdata = df
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[continuous_data])
    outdata[continuous_data] = scaled

    return outdata

In [None]:
X_train = minmax_scaling(X_train, con)

# RFECV