# Install

In [1]:
! pip install memory_profiler
%load_ext memory_profiler 

Defaulting to user installation because normal site-packages is not writeable


# Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearnex import patch_sklearn  # Speeds up sklearn with intel patch
patch_sklearn()  # Activate patch - changes sklearn imports below

from timeit import default_timer as timer # Time how long commands take
from sklearn.model_selection import train_test_split, StratifiedKFold  # test_train split, cross-validation

from sklearn.experimental import enable_iterative_imputer  # Iterative imputer experimental so need to enable it
from sklearn.impute import IterativeImputer  # Once enabled iterative imputer can be imported

from sklearn.linear_model import RidgeClassifier, BayesianRidge  # Imputation
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder  # Normalisation & Encoding
from imblearn.under_sampling import TomekLinks, RandomUnderSampler   # Undersampling
from imblearn.over_sampling import SMOTENC  # Oversampling
from sklearn.feature_selection import RFE, RFECV  # Recursive feature elimination - feature selection
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier  # RFE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import validation_curve

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Variables

In [3]:
# Number of cores being used 
n_jobs = 10

In [4]:
# Random State
random_state = 14

# General Functions

In [5]:
# Stopwatch to profile function runtimes
class Stopwatch:

    # Initiate constructor
    def __init__(self):
        self.start = timer()
        self.end = None
        self.runtime = None

    # Stop stopwatch
    def stop(self):
        self.end = timer()
        self.runtime = self.end - self.start
        return self.runtime

In [6]:
# Find which columns are categorical and which continuous
def cat_con_cols(df):
    
    columns = [list(df[i]) for i in df]  # Nested list of column values
    num_unique_vals = [len(set([i for i in a if pd.notna(i)])) for a in columns]  # Num of unique values in a column

    categorical_indexes = [i for i, v in enumerate(num_unique_vals) if v <= 100
                           and set([a % 1 for a in df[df.columns[i]].dropna()]) == {0}]

    continuous_indexes = [i for i, v in enumerate(num_unique_vals) if v > 100 or
                          set([a % 1 for a in df[df.columns[i]].dropna()]) != {0}]

    cat = list(df.columns[categorical_indexes])
    con = list(df.columns[continuous_indexes])
    return cat, con

# Data Cleaning 

In [7]:
# Read in data
df = pd.read_csv('/data/home/bt211037/dissertation/no_leakage_data.tsv',
                   sep='\t', header=0, index_col=0)

### Convert categorical columns to integers

In [8]:
# Get the column names of the continuous and categorical data
cat, con = cat_con_cols(df)  

# Convert categorical cols values from floats to integers 
df[cat] = df[cat].astype('Int64')  

### Separate categorical and Continuous features in dataframe¶

This makes indexing certain features in later processess easier. Continuous features are the first columns followed by categorical.


In [9]:
df = pd.concat([df[con], df[cat]], axis=1, join="inner")

In [10]:
# Column indexes for categorical and continuous features
# Important to exclude target feature - thryoid_cancer
categorical_indexes = [i for i, x in enumerate(df.columns[0:374]) if x in cat]
continuous_indexes = [i for i, x in enumerate(df.columns) if x in con]


# Test Train Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'thyroid_cancer'],
                                                        df['thyroid_cancer'],
                                                        test_size=0.2,
                                                        shuffle=True,
                                                        random_state=random_state,
                                                        stratify=df['thyroid_cancer'])


# Feature Selection 

Method of choice is Recursive Feature Elimination (RFE) using an ExtraTreesClassifier from sklearn as the estimator. Will be using a subset of the dataset through random undersampling of the majority class for resource efficiency.

### Pipeline

In [12]:
# KNN imputer
knn_imputer = KNNImputer(n_neighbors=1)

In [13]:
# Tomek Links undersampling
tl = TomekLinks(sampling_strategy='majority')

In [14]:
# SMOTE oversampling
smote = SMOTENC(random_state=random_state,
                categorical_features=categorical_indexes,
                sampling_strategy=1)


In [15]:
# Predictive Model 
rfe_model = ExtraTreesClassifier(n_estimators=200,
                                 max_features=20,
                                 max_depth=5,
                                 bootstrap=True,
                                 n_jobs=n_jobs, 
                                 random_state=random_state)

In [16]:
# RFE
rfe = RFE(rfe_model, step=25)

In [17]:
# Final Pipeline
rfe_pipeline = imbpipeline(steps = [('imputer', knn_imputer),
                                ('tomek', tl),
                                ('smotenc', smote),
                                ('rfe', rfe),
                                ('model', rfe_model)])

### Random Undersampling

Randomly undersample majority class to reduce the class imbalance and decrease resource use while retaining the varaiation of the dataset.

Some results are showing that training on this smaller dataset may give better results.

In [18]:
# Configure random undersampler
rus_rfe = RandomUnderSampler(sampling_strategy=0.1,
                         random_state=random_state)

In [19]:
# Create the dataset used for hyperparamter tuning and training
X_res_rfe, y_res_rfe = rus_rfe.fit_resample(X_train, y_train.astype('float64'))


In [20]:
X_res_rfe

Unnamed: 0,Waist circumference|x48_0_0,Hip circumference|x49_0_0,Standing height|x50_0_0,Time since interview start at which blood pressure screen(s) shown|x96_0_0,Pulse rate automated reading|x102_0_0,Place of birth in UK - north co-ordinate|x129_0_0,Place of birth in UK - east co-ordinate|x130_0_0,Townsend deprivation index at recruitment|x189_0_0,Length of time at current address|x699_0_0,Duration of walks|x874_0_0,...,Genotype measurement batch|x22000_0_0,Genetic sex|x22001_0_0,Genetic kinship to other participants|x22021_0_0,IPAQ activity group|x22032_0_0,Summed days activity|x22033_0_0,Above moderate/vigorous recommendation|x22035_0_0,Above moderate/vigorous/walking recommendation|x22036_0_0,Close to major road|x24014_0_0,Total volume of urine samples held by UKB|x30394_0_0,medication_cbi
0,79.0,99.0,163.1,144.0,89.0,564500.0,425500.0,1.441460,12.0,15.0,...,1,0,1,0,12,0,1,0,5100,-1
1,87.0,90.0,174.0,94.0,69.0,188500.0,514500.0,-0.688732,25.0,100.0,...,0,1,0,0,4,0,1,0,5100,-1
2,68.0,101.0,155.0,138.0,54.0,336500.0,452500.0,-3.168330,28.0,60.0,...,1,0,0,2,19,1,1,0,5100,-1
3,68.0,99.0,159.0,51.0,58.0,388500.0,327500.0,-4.493780,31.0,45.0,...,1,0,0,1,16,1,1,0,5100,-1
4,92.0,102.0,166.0,54.0,65.0,393500.0,381500.0,-2.409450,24.0,30.0,...,1,0,0,2,18,1,1,0,5100,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6034,69.0,89.0,160.0,68.0,65.0,172500.0,534500.0,0.577263,59.0,60.0,...,1,0,1,2,16,1,1,0,5100,-1
6035,68.0,90.0,158.0,101.0,65.0,523500.0,446500.0,-4.338350,19.0,30.0,...,1,0,1,2,12,1,1,0,5100,-1
6036,75.0,88.0,158.0,77.0,74.0,401500.0,386500.0,-3.787730,6.0,240.0,...,1,0,0,,,,,1,5100,1
6037,60.0,88.0,153.0,53.0,67.0,181500.0,534500.0,-2.833990,1.0,,...,1,0,0,,,,,0,5100,-1


### Searching for the optimum features

In [21]:
# Parameter to search
rfe_search_grid = {'rfe__n_features_to_select':range(20, 380, 5),
                   'model__max_depth': [5, 10],
                   'model__bootstrap': [True, False]}

In [22]:
# Random search configurations
rf_rfe_hyper_search = RandomizedSearchCV(estimator=rfe_pipeline,
                                    param_distributions=rfe_search_grid,
                                    n_iter=100,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    verbose=1,
                                    random_state=random_state,
                                    scoring='f1')

In [23]:
%%memit

t=Stopwatch()

rf_rfe_hyper_search.fit(X_res_rfe, y_res_rfe)

print(t.stop())

Fitting 3 folds for each of 100 candidates, totalling 300 fits
1506.3540014168248
peak memory: 3418.51 MiB, increment: 773.87 MiB


In [24]:
# Look at results of search
pd.DataFrame(rf_rfe_hyper_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rfe__n_features_to_select,param_model__max_depth,param_model__bootstrap,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,33.468880,2.938826,10.181536,1.262952,340,5,True,"{'rfe__n_features_to_select': 340, 'model__max...",0.132780,0.116223,0.118421,0.122475,0.007342,39
1,30.220109,5.704293,9.457935,2.458464,360,10,True,"{'rfe__n_features_to_select': 360, 'model__max...",0.074324,0.070671,0.088328,0.077775,0.007610,80
2,42.291064,0.802589,10.959171,0.179695,115,10,True,"{'rfe__n_features_to_select': 115, 'model__max...",0.094340,0.063492,0.092879,0.083570,0.014210,71
3,44.222551,0.473822,9.885333,1.775757,45,5,True,"{'rfe__n_features_to_select': 45, 'model__max_...",0.182663,0.129436,0.142857,0.151652,0.022602,7
4,37.450662,1.428466,11.600028,0.743567,250,10,False,"{'rfe__n_features_to_select': 250, 'model__max...",0.060403,0.064057,0.074576,0.066345,0.006008,99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,35.131193,1.671196,10.631540,0.352864,285,10,True,"{'rfe__n_features_to_select': 285, 'model__max...",0.093023,0.064516,0.071197,0.076246,0.012173,83
96,41.030632,0.546870,10.097317,0.281120,140,5,False,"{'rfe__n_features_to_select': 140, 'model__max...",0.154676,0.129176,0.106996,0.130283,0.019481,27
97,35.826725,2.792602,10.327680,0.324707,265,5,True,"{'rfe__n_features_to_select': 265, 'model__max...",0.141667,0.121065,0.099783,0.120838,0.017100,43
98,33.822263,1.098840,10.212557,0.463097,310,5,False,"{'rfe__n_features_to_select': 310, 'model__max...",0.132000,0.104019,0.100656,0.112225,0.014050,53


In [25]:
# Find the optimum number of features
rf_rfe_hyper_search.best_params_

{'rfe__n_features_to_select': 35,
 'model__max_depth': 5,
 'model__bootstrap': True}

# Create New Dataset with Selected Features

In [36]:
# Predictive Model 
rfe_model = ExtraTreesClassifier(n_estimators=200,
                                 max_features=20,
                                 max_depth=5,
                                 bootstrap=True,
                                 n_jobs=n_jobs)
                         

In [37]:
# RFE
rfe = RFE(rfe_model, step=25, n_features_to_select=35)

In [38]:
# Extract the optimum feature indexes
# Final Pipeline
rfe_pipeline_feats = imbpipeline(steps = [('imputer', knn_imputer),
                                ('tomek', tl),
                                ('smotenc', smote),
                                ('rfe', rfe)])

In [39]:
# Fit the feature selection pipeline
feats = rfe_pipeline_feats.fit(X_res_rfe, y_res_rfe)

In [40]:
# Get the column names of the selected features
selected_feats = list(X_res_rfe.columns[feats.named_steps['rfe'].support_])

In [41]:
selected_feats

['Whole body fat-free mass|x23101_0_0',
 'Whole body water mass|x23102_0_0',
 'Leg fat percentage (right)|x23111_0_0',
 'Leg predicted mass (right)|x23114_0_0',
 'Leg fat percentage (left)|x23115_0_0',
 'Leg fat-free mass (left)|x23117_0_0',
 'Leg predicted mass (left)|x23118_0_0',
 'Arm fat-free mass (right)|x23121_0_0',
 'Arm predicted mass (right)|x23122_0_0',
 'Arm fat-free mass (left)|x23125_0_0',
 'Arm predicted mass (left)|x23126_0_0',
 'Trunk fat-free mass|x23129_0_0',
 'Trunk predicted mass|x23130_0_0',
 'Testosterone|x30850_0_0',
 'Number of days/week of vigorous physical activity 10+ minutes|x904_0_0',
 'Hands-free device/speakerphone use with mobile phone in last 3 month|x1130_0_0',
 'Milk type used|x1418_0_0',
 'Alcohol intake versus 10 years previously|x1628_0_0',
 'Childhood sunburn occasions|x1737_0_0',
 'Father still alive|x1797_0_0',
 'Nervous feelings|x1970_0_0',
 "Suffer from 'nerves'|x2010_0_0",
 'Loneliness isolation|x2020_0_0',
 'Answered sexual history questions

In [42]:
selected_feats.append('thyroid_cancer')

In [43]:
# Create new dataframe with only selected features
df_fs = df[selected_feats]

In [44]:
df_fs

Unnamed: 0_level_0,Whole body fat-free mass|x23101_0_0,Whole body water mass|x23102_0_0,Leg fat percentage (right)|x23111_0_0,Leg predicted mass (right)|x23114_0_0,Leg fat percentage (left)|x23115_0_0,Leg fat-free mass (left)|x23117_0_0,Leg predicted mass (left)|x23118_0_0,Arm fat-free mass (right)|x23121_0_0,Arm predicted mass (right)|x23122_0_0,Arm fat-free mass (left)|x23125_0_0,...,Falls in the last year|x2296_0_0,Acceptability of each blow result|x3061_0_0,Number of measurements made|x3137_0_0,Illness injury bereavement stress in last 2 years|x6145_0_0,Types of transport used (excluding work)|x6162_0_0,Illnesses of siblings|x20111_0_0,Spirometry QC measure|x20255_0_0,Genetic sex|x22001_0_0,Above moderate/vigorous/walking recommendation|x22036_0_0,thyroid_cancer
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000020,42.0,30.8,32.5,6.9,33.4,7.1,6.7,2.0,1.9,2.0,...,1,0,3,0,1,0,1,0,1,0
1000037,40.4,29.6,43.9,6.3,43.1,6.8,6.4,2.0,1.9,2.1,...,2,2,3,0,2,8,3,0,,0
1000043,47.1,34.5,40.8,7.5,41.5,7.8,7.3,2.4,2.2,2.5,...,1,0,3,3,1,8,1,0,,0
1000066,65.9,48.2,19.6,9.7,20.4,10.0,9.5,3.8,3.6,4.2,...,2,0,3,0,1,,3,1,,0
1000092,46.2,33.8,45.6,7.7,45.5,8.1,7.6,2.4,2.2,2.5,...,1,0,3,3,1,0,,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025060,75.7,55.4,27.6,12.1,23.5,13.3,12.6,4.5,4.2,4.6,...,1,0,3,0,1,0,1,1,1,0
6025078,57.4,42.0,17.4,9.2,17.6,9.5,9.0,3.2,3.0,3.1,...,1,0,3,3,3,,3,1,1,0
6025087,58.8,43.1,35.7,9.5,33.4,10.2,9.7,3.3,3.1,3.3,...,1,,,4,1,,,1,1,0
6025093,49.8,36.5,44.5,8.3,44.9,8.6,8.1,2.5,2.3,2.6,...,1,0,2,1,3,0,1,0,0,0


In [45]:
# Save new dataframe (tsv) with only the selected features 
# To be used as the dataset from now on
df_fs.to_csv('/data/home/bt211037/dissertation/feats_selected_dataset.tsv',
             sep='\t')  
