# Install

In [1]:
! pip install memory_profiler
%load_ext memory_profiler 

Defaulting to user installation because normal site-packages is not writeable


# Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearnex import patch_sklearn  # Speeds up sklearn with intel patch
patch_sklearn()  # Activate patch - changes sklearn imports below

from timeit import default_timer as timer # Time how long commands take
from sklearn.model_selection import train_test_split, StratifiedKFold  # test_train split, cross-validation

from sklearn.experimental import enable_iterative_imputer  # Iterative imputer experimental so need to enable it
from sklearn.impute import IterativeImputer  # Once enabled iterative imputer can be imported

from sklearn.linear_model import RidgeClassifier, BayesianRidge  # Imputation
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder  # Normalisation & Encoding
from imblearn.under_sampling import TomekLinks, RandomUnderSampler   # Undersampling
from imblearn.over_sampling import SMOTENC  # Oversampling
from sklearn.feature_selection import RFE, RFECV  # Recursive feature elimination - feature selection
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier  # RFE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import validation_curve

import pickle

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Variables

In [3]:
# Number of cores being used 
n_jobs = 10

In [4]:
# Random State
random_state = 14

# General Functions

In [5]:
# Stopwatch to profile function runtimes
class Stopwatch:

    # Initiate constructor
    def __init__(self):
        self.start = timer()
        self.end = None
        self.runtime = None

    # Stop stopwatch
    def stop(self):
        self.end = timer()
        self.runtime = self.end - self.start
        return self.runtime

In [6]:
# Find which columns are categorical and which continuous
def cat_con_cols(df):
    
    columns = [list(df[i]) for i in df]  # Nested list of column values
    num_unique_vals = [len(set([i for i in a if pd.notna(i)])) for a in columns]  # Num of unique values in a column

    categorical_indexes = [i for i, v in enumerate(num_unique_vals) if v <= 100
                           and set([a % 1 for a in df[df.columns[i]].dropna()]) == {0}]

    continuous_indexes = [i for i, v in enumerate(num_unique_vals) if v > 100 or
                          set([a % 1 for a in df[df.columns[i]].dropna()]) != {0}]

    cat = list(df.columns[categorical_indexes])
    con = list(df.columns[continuous_indexes])
    return cat, con

# Data Cleaning

In [7]:
# Read in data
df = pd.read_csv('/data/home/bt211037/dissertation/feats_selected_dataset.tsv',
                   sep='\t', header=0, index_col=0)

### Convert categorical columns to integers

In [8]:
# Get the column names of the continuous and categorical data
cat, con = cat_con_cols(df)  

# Convert categorical cols values from floats to integers 
df[cat] = df[cat].astype('Int64')  

### Separate categorical and Continuous features in dataframe¶

This makes indexing certain features in later processess easier. Continuous features are the first columns followed by categorical.


In [9]:
df = pd.concat([df[con], df[cat]], axis=1, join="inner")

In [9]:
# Column indexes for categorical and continuous features
# Important to exclude target feature - thryoid_cancer
categorical_indexes = [i for i, x in enumerate(df.columns[0:34]) if x in cat]
continuous_indexes = [i for i, x in enumerate(df.columns) if x in con]


# Test Train Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'thyroid_cancer'],
                                                        df['thyroid_cancer'],
                                                        test_size=0.2,
                                                        shuffle=True,
                                                        random_state=random_state,
                                                        stratify=df['thyroid_cancer'])


# Pipeline

### Imputation
Testing three types of imputation:
- Simple Imputation
- Multiple imputation
- KNN imputation

In [11]:
# Simple imputer 
simp_imputer = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(missing_values=np.nan, strategy='median'),
             continuous_indexes),

            ('cat', SimpleImputer(missing_values=np.nan, strategy='constant',
                                  fill_value=4444),
             categorical_indexes)

        ])


In [12]:
# Multiple imputation with iterative imputer
imputer = ColumnTransformer(
    transformers=[
        ('num', IterativeImputer(initial_strategy='median',
                                 max_iter=5,
                                 random_state=random_state), 
         continuous_indexes),
        
        ('cat', IterativeImputer(estimator=RidgeClassifier(),
                                 initial_strategy='most_frequent',
                                 max_iter=10, 
                                 random_state=random_state), 
         categorical_indexes)
    
    ])


In [13]:
# KNN imputer
knn_imp = KNNImputer(n_neighbors=1)

### Sampling Methods
Tomek Links Undersampling and SMOTENC oversampling will be used to address class imbalance. This will be compared with using no sampling methods.

In [14]:
# Tomek Links undersampling
tl = TomekLinks(sampling_strategy='majority')

In [15]:
# SMOTE oversampling
smote = SMOTENC(random_state=random_state,
                categorical_features=categorical_indexes,
                sampling_strategy=1)

### Predictive model
Will be testing ExtraTreesClassifier and RandomForestClassifer from sklearn.

In [16]:
# ExtraTrees Model 
et_model = ExtraTreesClassifier(n_jobs=n_jobs, 
                                random_state=random_state)

In [17]:
# RanfomForest model
rf_model = RandomForestClassifier(n_jobs=n_jobs, 
                                  random_state=random_state)

# Random Undersampling
Undersample the majority class in the dataset to provide a smaller dataset for hyperparamter tuning. Training on this smaller dataset even seems to give better results than on the full dataset.

Reduces the data from ~300,000 to ~6,000 rows/samples.

In [18]:
# Configure random undersampler
rus = RandomUnderSampler(sampling_strategy=0.1,
                         random_state=random_state)


In [19]:
# Create the dataset used for hyperparamter tuning and training
X_res, y_res = rus.fit_resample(X_train, y_train.astype('float64'))


In [20]:
X_res

Unnamed: 0,Whole body fat-free mass|x23101_0_0,Whole body water mass|x23102_0_0,Leg fat percentage (right)|x23111_0_0,Leg predicted mass (right)|x23114_0_0,Leg fat percentage (left)|x23115_0_0,Leg fat-free mass (left)|x23117_0_0,Leg predicted mass (left)|x23118_0_0,Arm fat-free mass (right)|x23121_0_0,Arm predicted mass (right)|x23122_0_0,Arm fat-free mass (left)|x23125_0_0,...,Other eye problems|x2227_0_0,Falls in the last year|x2296_0_0,Acceptability of each blow result|x3061_0_0,Number of measurements made|x3137_0_0,Illness injury bereavement stress in last 2 years|x6145_0_0,Types of transport used (excluding work)|x6162_0_0,Illnesses of siblings|x20111_0_0,Spirometry QC measure|x20255_0_0,Genetic sex|x22001_0_0,Above moderate/vigorous/walking recommendation|x22036_0_0
0,40.1,29.4,38.0,6.3,38.6,6.5,6.2,2.1,1.9,2.1,...,0,2,,,0,1,3,,0,1
1,58.6,42.9,11.7,8.8,11.8,9.0,8.5,3.1,2.9,3.3,...,0,1,0,2,0,1,8,1,1,1
2,37.9,27.7,35.7,6.2,36.6,6.3,6.0,1.8,1.6,1.8,...,1,1,32,2,0,1,9,1,0,1
3,38.3,28.0,37.7,6.0,36.7,6.4,6.0,1.8,1.7,1.9,...,0,1,0,3,0,1,8,1,0,1
4,52.3,38.3,39.4,7.9,39.0,8.3,7.9,2.7,2.5,2.8,...,1,1,,,2,1,,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6034,39.4,28.8,23.4,6.3,26.4,6.3,6.0,2.0,1.9,1.9,...,0,1,0,3,0,2,,2,0,1
6035,40.2,29.4,33.4,6.3,34.4,6.5,6.1,2.0,1.8,2.0,...,1,1,2,3,0,1,0,,0,1
6036,38.6,28.3,41.5,6.1,41.7,6.4,6.0,1.9,1.7,2.0,...,0,3,2,3,0,1,12,,0,
6037,35.9,26.3,33.3,5.8,34.0,6.0,5.6,1.6,1.5,1.6,...,0,1,0,2,0,1,0,,0,


# Simple Imputation

### ExtraTrees with sampling

In [21]:
simp_et_pipe = imbpipeline(steps = [('imputer', simp_imputer),
                                    ('tomek', tl),
                                    ('smotenc', smote),
                                    ('model', et_model)])

In [22]:
# Parameter to search
et_simp_search_grid = {'model__n_estimators': [100, 200, 500],
                   'model__max_features': [5, 10, 15, 20, 40],
                   'model__max_depth': [5, 10, 15, 20],
                   'model__bootstrap': [True, False]}

In [23]:
# Random search configurations
simp_et_hyper_search = GridSearchCV(estimator=simp_et_pipe,
                                    param_grid=et_simp_search_grid,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    scoring='f1')

In [24]:
%%memit

t=Stopwatch()

simp_et_hyper_search.fit(X_res, y_res)

print(t.stop())

238.71750437933952
peak memory: 507.33 MiB, increment: 47.61 MiB


In [25]:
# Look at results of search
simp_et_res = pd.DataFrame(simp_et_hyper_search.cv_results_)

In [26]:
# Find the optimum number of features
simp_et_hyper_search.best_params_

{'model__bootstrap': True,
 'model__max_depth': 5,
 'model__max_features': 40,
 'model__n_estimators': 500}

In [27]:
simp_et_res['mean_test_score'].loc[simp_et_res['rank_test_score'] == 1]

14    0.16652
Name: mean_test_score, dtype: float64

### RandomForest with Sampling

In [28]:
# Parameter to search
rf_simp_search_grid = {'model__n_estimators': [100, 200, 500],
                   'model__max_features': [5, 10, 15, 20, 40],
                   'model__max_depth': [5, 10, 15, 20],
                   'model__bootstrap': [True, False]}

In [29]:
simp_rf_pipe = imbpipeline(steps = [('imputer', simp_imputer),
                                    ('tomek', tl),
                                    ('smotenc', smote),
                                    ('model', rf_model)])

In [30]:
# Random search configurations
simp_rf_hyper_search = GridSearchCV(estimator=simp_rf_pipe,
                                    param_grid=rf_simp_search_grid,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    scoring='f1')

In [31]:
%%memit

t=Stopwatch()

simp_rf_hyper_search.fit(X_res, y_res)

print(t.stop())

72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/imblearn/pipeline.py", line 272, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/daal4py/sklearn/_device_offload.py", line 88, in wrapper_with_self
    return wrapper_impl(self, *args, **kwargs)
  File "/data

224.54473018739372
peak memory: 523.78 MiB, increment: 32.59 MiB


In [32]:
# Look at results of search
simp_rf_res = pd.DataFrame(simp_rf_hyper_search.cv_results_)

In [33]:
# Find the optimum number of features
simp_rf_hyper_search.best_params_

{'model__bootstrap': False,
 'model__max_depth': 5,
 'model__max_features': 5,
 'model__n_estimators': 100}

In [34]:
simp_rf_res['mean_test_score'].loc[simp_rf_res['rank_test_score'] == 1]

60    0.129481
Name: mean_test_score, dtype: float64

### RandomForest with no sampling
*Random Forest outperformed ExtraTrees

In [35]:
rf_ns_pipe = imbpipeline(steps = [('imputer', simp_imputer),
                               ('model', rf_model)])

In [36]:
# Random search configurations
ns_simp_rf_hyper_search = GridSearchCV(estimator=rf_ns_pipe,
                                    param_grid=rf_simp_search_grid,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    scoring='f1')

In [37]:
%%memit

t=Stopwatch()

ns_simp_rf_hyper_search.fit(X_res, y_res)

print(t.stop())

28.230015082284808
peak memory: 527.59 MiB, increment: 3.79 MiB


72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/imblearn/pipeline.py", line 272, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/daal4py/sklearn/_device_offload.py", line 88, in wrapper_with_self
    return wrapper_impl(self, *args, **kwargs)
  File "/data

In [38]:
# Look at results of search
ns_simp_rf_res = pd.DataFrame(ns_simp_rf_hyper_search.cv_results_)

In [39]:
# Find the optimum number of features
ns_simp_rf_hyper_search.best_params_

{'model__bootstrap': False,
 'model__max_depth': 15,
 'model__max_features': 20,
 'model__n_estimators': 100}

In [40]:
ns_simp_rf_res['mean_test_score'].loc[ns_simp_rf_res['rank_test_score'] == 1]

99    0.016135
Name: mean_test_score, dtype: float64

# Multiple Imputation

### ExtraTrees with sampling

In [41]:
mult_et_pipe = imbpipeline(steps = [('imputer', imputer),
                                    ('tomek', tl),
                                    ('smotenc', smote),
                                    ('model', et_model)])

In [42]:
# Parameter to search
et_mult_search_grid = {'model__n_estimators': [100, 200, 500],
                   'model__max_features': [5, 10, 15, 20, 40],
                   'model__max_depth': [5, 10, 15, 20],
                   'model__bootstrap': [True, False]}

In [43]:
# Random search configurations
mult_et_hyper_search = GridSearchCV(estimator=mult_et_pipe,
                                    param_grid=et_mult_search_grid,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    scoring='f1')

In [44]:
%%memit

t=Stopwatch()

mult_et_hyper_search.fit(X_res, y_res)

print(t.stop())



















255.044149113819
peak memory: 556.30 MiB, increment: 28.71 MiB


In [45]:
# Look at results of search
mult_et_res = pd.DataFrame(mult_et_hyper_search.cv_results_)

In [46]:
# Find the optimum number of features
mult_et_hyper_search.best_params_

{'model__bootstrap': False,
 'model__max_depth': 5,
 'model__max_features': 10,
 'model__n_estimators': 100}

In [47]:
mult_et_res['mean_test_score'].loc[mult_et_res['rank_test_score'] == 1]

63    0.150166
Name: mean_test_score, dtype: float64

### RandomForest with sampling

In [48]:
# Parameter to search
rf_mult_search_grid = {'model__n_estimators': [100, 200, 500],
                   'model__max_features': [5, 10, 15, 20, 40],
                   'model__max_depth': [5, 10, 15, 20],
                   'model__bootstrap': [True, False]}

In [49]:
mult_rf_pipe = imbpipeline(steps = [('imputer', imputer),
                                    ('tomek', tl),
                                    ('smotenc', smote),
                                    ('model', rf_model)])

In [50]:
# Random search configurations
mult_rf_hyper_search = GridSearchCV(estimator=mult_rf_pipe,
                                    param_grid=rf_mult_search_grid,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    scoring='f1')

In [51]:
%%memit

t=Stopwatch()

mult_rf_hyper_search.fit(X_res, y_res)

print(t.stop())

















72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/imblearn/pipeline.py", line 272, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/daal4py/sklearn/_device_offload.py", line 88, in wrapper_with_self
    return wrapper_impl(self, *args, **kwargs)
  File "/data



235.9299318967387
peak memory: 561.65 MiB, increment: 9.07 MiB


In [52]:
# Look at results of search
mult_rf_res = pd.DataFrame(mult_rf_hyper_search.cv_results_)

In [53]:
# Find the optimum number of features
mult_rf_hyper_search.best_params_

{'model__bootstrap': True,
 'model__max_depth': 5,
 'model__max_features': 10,
 'model__n_estimators': 200}

In [54]:
mult_rf_res['mean_test_score'].loc[mult_rf_res['rank_test_score'] == 1]

4    0.131412
Name: mean_test_score, dtype: float64

### ExtraTrees with no sampling
*ExtraTrees outperformed RandomForest

In [55]:
et_ns_pipe = imbpipeline(steps = [('imputer', imputer),
                               ('model', et_model)])

In [56]:
# Random search configurations
ns_mult_et_hyper_search = GridSearchCV(estimator=et_ns_pipe,
                                    param_grid=et_mult_search_grid,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    scoring='f1')

In [57]:
%%memit

t=Stopwatch()

ns_mult_et_hyper_search.fit(X_res, y_res)

print(t.stop())



















60.67294938210398
peak memory: 564.48 MiB, increment: 4.43 MiB


In [58]:
# Look at results of search
ns_mult_et_res = pd.DataFrame(ns_mult_et_hyper_search.cv_results_)

In [59]:
# Find the optimum number of features
ns_mult_et_hyper_search.best_params_

{'model__bootstrap': False,
 'model__max_depth': 15,
 'model__max_features': 40,
 'model__n_estimators': 100}

In [60]:
ns_mult_et_res['mean_test_score'].loc[ns_mult_et_res['rank_test_score'] == 1]



102    0.026577
Name: mean_test_score, dtype: float64

# KNN Imputation

### ExtraTrees with Sampling

In [61]:
knn_et_pipe = imbpipeline(steps = [('imputer', knn_imp),
                                    ('tomek', tl),
                                    ('smotenc', smote),
                                    ('model', et_model)])

In [62]:
# Parameter to search
et_knn_search_grid = {'model__n_estimators': [100, 200, 500],
                   'model__max_features': [5, 10, 15, 20, 40],
                   'model__max_depth': [5, 10, 15, 20],
                   'model__bootstrap': [True, False]}

In [63]:
# Grid search params
knn_et_hyper_search = GridSearchCV(estimator=knn_et_pipe,
                                    param_grid=et_knn_search_grid,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    scoring='f1')

In [64]:
%%memit

# Grid search

t=Stopwatch()

knn_et_hyper_search.fit(X_res, y_res)

print(t.stop())



296.52608747407794
peak memory: 985.92 MiB, increment: 420.88 MiB


In [65]:
# Look at results of search
knn_et_res = pd.DataFrame(knn_et_hyper_search.cv_results_)

In [66]:
# Find the optimum number of features
knn_et_hyper_search.best_params_

{'model__bootstrap': False,
 'model__max_depth': 5,
 'model__max_features': 20,
 'model__n_estimators': 500}

In [67]:
knn_et_res['mean_test_score'].loc[knn_et_res['rank_test_score'] == 1]

71    0.158345
Name: mean_test_score, dtype: float64

### RandomForest with sampling

In [68]:
# Parameter to search
rf_knn_search_grid = {'model__n_estimators': [100, 200, 500],
                   'model__max_features': [5, 10, 15, 20, 40],
                   'model__max_depth': [5, 10, 15, 20],
                   'model__bootstrap': [True, False]}

In [69]:
knn_rf_pipe = imbpipeline(steps = [('imputer', knn_imp),
                                    ('tomek', tl),
                                    ('smotenc', smote),
                                    ('model', rf_model)])

In [70]:
# Random search configurations
knn_rf_hyper_search = GridSearchCV(estimator=knn_rf_pipe,
                                    param_grid=rf_knn_search_grid,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    scoring='f1')

In [71]:
%%memit

t=Stopwatch()

knn_rf_hyper_search.fit(X_res, y_res)

print(t.stop())

72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/imblearn/pipeline.py", line 272, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/data/home/bt211037/.conda/envs/ondem_env/lib/python3.10/site-packages/daal4py/sklearn/_device_offload.py", line 88, in wrapper_with_self
    return wrapper_impl(self, *args, **kwargs)
  File "/data

272.49860771000385
peak memory: 993.88 MiB, increment: 419.05 MiB


In [72]:
# Look at results of search
knn_rf_res = pd.DataFrame(knn_rf_hyper_search.cv_results_)

In [73]:
# Find the optimum number of features
knn_rf_hyper_search.best_params_

{'model__bootstrap': False,
 'model__max_depth': 5,
 'model__max_features': 20,
 'model__n_estimators': 500}

In [74]:
knn_rf_res['mean_test_score'].loc[knn_rf_res['rank_test_score'] == 1]

71    0.131446
Name: mean_test_score, dtype: float64

### ExtraTrees with no sampling
*ExtraTrees outperformed RandomForest

In [75]:
knn_et_ns_pipe = imbpipeline(steps = [('imputer', knn_imp),
                               ('model', et_model)])

In [76]:
# Random search configurations
ns_knn_et_hyper_search = GridSearchCV(estimator=knn_et_ns_pipe,
                                    param_grid=et_knn_search_grid,
                                    cv=3,
                                    n_jobs=n_jobs,
                                    scoring='f1')

In [77]:
%%memit

t=Stopwatch()

ns_knn_et_hyper_search.fit(X_res, y_res)

print(t.stop())

103.08188234642148
peak memory: 1006.90 MiB, increment: 419.21 MiB


In [78]:
# Look at results of search
ns_knn_et_res = pd.DataFrame(ns_knn_et_hyper_search.cv_results_)

In [79]:
# Find the optimum number of features
ns_knn_et_hyper_search.best_params_

{'model__bootstrap': False,
 'model__max_depth': 20,
 'model__max_features': 40,
 'model__n_estimators': 100}

In [80]:
ns_knn_et_res['mean_test_score'].loc[ns_knn_et_res['rank_test_score'] == 1]

117    0.023147
Name: mean_test_score, dtype: float64

# Save Model

Saving the model trained on the subset pf data which utilised KNN imputation.

In [99]:
model_file = '/data/home/bt211037/dissertation/rf_model.pkl'

In [100]:
pickle.dump(knn_et_pipe, open(model_file, 'wb'))