# Practice Random Forest Pipeline/Workflow

In [108]:
# Imports
import pandas as pd
import numpy as np
import random
import string
from timeit import default_timer as timer # Time how long commands take

### Create fake/dummy data

In [109]:
# Number of rows in the dummy dataset
row_num = 5000

# Seed both random sources used below (the stdlib `random` module and NumPy's
# global generator, which pandas' `DataFrame.sample` also draws from when no
# random_state is given) so Restart & Run All regenerates the same dummy data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [110]:
# Generate 380 random column names, each 5 characters drawn from A-Z and 0-9
columns = [
    ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
    for _ in range(380)
]

In [111]:
# Add userId as the first column (guarded so re-running the cell does not insert it twice)
if 'userId' not in columns:
    columns.insert(0, 'userId')
# Add thyroid_cancer as the last column, with the same re-run guard
if 'thyroid_cancer' not in columns:
    columns.append('thyroid_cancer')

In [112]:
# Create dataframe columns with different possible data types.
# Each entry is a zero-argument factory producing `row_num` values, so only
# the column type that is actually picked gets generated, rather than building
# all six candidates for every column and discarding five of them.
column_generators = [
    lambda: [random.randint(0, 1) for _ in range(row_num)],      # binary 0/1
    lambda: np.random.randint(0, 10, row_num, dtype=int),        # integers 0-9
    lambda: np.random.randint(0, 3, row_num, dtype=int),         # integers 0-2
    lambda: np.random.random_sample(size=row_num),               # floats in [0, 1)
    lambda: np.random.uniform(low=0, high=3000, size=row_num),   # floats in [0, 3000)
    lambda: np.random.uniform(low=0, high=150, size=row_num),    # floats in [0, 150)
]

# Map every column name to the values from one randomly chosen generator
dictionary = {item: random.choice(column_generators)() for item in columns}


In [113]:
# Assemble the generated columns into a DataFrame (one column per dictionary key)
df = pd.DataFrame(data=dictionary)

In [114]:
# Overwrite the randomly generated userId column with sequential ids 0..row_num-1
df['userId'] = list(range(row_num))

# Promote userId from a regular column to the DataFrame index
df.set_index('userId', inplace=True)

In [115]:
# Add missing values: blank out a fresh random 10% of rows in every column.
# `Series.mask` upcasts integer columns to float when it inserts NaN, which
# avoids assigning NaN into an int64 column through `.loc` (an implicit
# upcast that recent pandas versions deprecate).
for col in df.columns:
    missing_rows = df.sample(frac=0.1).index
    df[col] = df[col].mask(df.index.isin(missing_rows))

In [116]:
# Overwrite thyroid_cancer with a fresh random 0/1 value per row, so it is binary and has no NaN
df['thyroid_cancer'] = [random.randint(0, 1) for _ in range(row_num)]

### Data preprocessing

In [117]:
df


Unnamed: 0_level_0,RJBCG,1JA1I,CQKS4,LG3FK,A20S2,DYCR7,O14A0,RMHYX,E8MSS,2W236,...,DZJMF,3APN4,T9DVI,YW0DC,V7R3B,GJFQW,4N0WS,0TLZD,2380D,thyroid_cancer
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.0,8.0,,0.587932,0.0,1.0,0.0,2.0,52.204670,,...,7.0,0.0,940.011174,0.602125,3.0,1.0,37.049401,2.0,128.787376,0
1,0.0,2.0,815.793827,0.713335,1.0,1.0,,0.0,60.233644,2.0,...,1.0,2.0,217.204104,0.208834,,1.0,,1.0,1.244728,0
2,0.0,7.0,963.985600,0.419220,1.0,0.0,1.0,,82.978706,2.0,...,5.0,0.0,1602.467243,0.338939,3.0,0.0,2597.963340,2.0,82.513029,1
3,1.0,9.0,154.394166,0.796608,,1.0,1.0,1.0,8.893962,0.0,...,,0.0,1863.161642,0.782016,6.0,1.0,360.884714,1.0,141.916203,1
4,2.0,8.0,414.390095,0.277232,1.0,0.0,0.0,0.0,123.794548,2.0,...,6.0,0.0,1404.405778,0.373097,9.0,1.0,1496.204148,0.0,143.432610,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1.0,7.0,1391.723850,0.197264,,,0.0,1.0,28.351367,0.0,...,3.0,2.0,2015.835418,0.968824,7.0,0.0,,1.0,62.513954,0
4996,2.0,2.0,2867.293134,0.761337,2.0,0.0,1.0,0.0,70.964844,0.0,...,0.0,1.0,2377.634268,,9.0,0.0,1023.168856,0.0,25.028542,1
4997,1.0,9.0,138.230651,0.613158,0.0,0.0,,0.0,104.277156,1.0,...,5.0,0.0,475.802309,0.880915,5.0,1.0,1878.366800,1.0,67.160546,1
4998,2.0,8.0,2877.556221,0.651437,2.0,1.0,0.0,2.0,56.841272,1.0,...,8.0,0.0,,0.857142,8.0,,628.149568,2.0,126.627251,1


In [118]:
# Make sure thyroid cancer column has been added

In [119]:
# Make sure disease descriptions are used as the column names rather than just phecodes

In [120]:
# Manual feature selection - use mutual information and consider data leakage.

### Pipeline

In [121]:
# Create test-train split
# TODO: the split is not implemented yet; this cell currently only displays df
df

Unnamed: 0_level_0,RJBCG,1JA1I,CQKS4,LG3FK,A20S2,DYCR7,O14A0,RMHYX,E8MSS,2W236,...,DZJMF,3APN4,T9DVI,YW0DC,V7R3B,GJFQW,4N0WS,0TLZD,2380D,thyroid_cancer
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.0,8.0,,0.587932,0.0,1.0,0.0,2.0,52.204670,,...,7.0,0.0,940.011174,0.602125,3.0,1.0,37.049401,2.0,128.787376,0
1,0.0,2.0,815.793827,0.713335,1.0,1.0,,0.0,60.233644,2.0,...,1.0,2.0,217.204104,0.208834,,1.0,,1.0,1.244728,0
2,0.0,7.0,963.985600,0.419220,1.0,0.0,1.0,,82.978706,2.0,...,5.0,0.0,1602.467243,0.338939,3.0,0.0,2597.963340,2.0,82.513029,1
3,1.0,9.0,154.394166,0.796608,,1.0,1.0,1.0,8.893962,0.0,...,,0.0,1863.161642,0.782016,6.0,1.0,360.884714,1.0,141.916203,1
4,2.0,8.0,414.390095,0.277232,1.0,0.0,0.0,0.0,123.794548,2.0,...,6.0,0.0,1404.405778,0.373097,9.0,1.0,1496.204148,0.0,143.432610,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1.0,7.0,1391.723850,0.197264,,,0.0,1.0,28.351367,0.0,...,3.0,2.0,2015.835418,0.968824,7.0,0.0,,1.0,62.513954,0
4996,2.0,2.0,2867.293134,0.761337,2.0,0.0,1.0,0.0,70.964844,0.0,...,0.0,1.0,2377.634268,,9.0,0.0,1023.168856,0.0,25.028542,1
4997,1.0,9.0,138.230651,0.613158,0.0,0.0,,0.0,104.277156,1.0,...,5.0,0.0,475.802309,0.880915,5.0,1.0,1878.366800,1.0,67.160546,1
4998,2.0,8.0,2877.556221,0.651437,2.0,1.0,0.0,2.0,56.841272,1.0,...,8.0,0.0,,0.857142,8.0,,628.149568,2.0,126.627251,1


# One Hot Encoding

In [122]:
# Find categorical data columns in dataframe
def find_categorical_columns(dataframe, max_unique=10):
    """Locate low-cardinality ("categorical") columns by position.

    A column is treated as categorical when it holds at most ``max_unique``
    distinct non-missing values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.
    max_unique : int, optional
        Largest number of distinct non-missing values a column may have and
        still count as categorical (default 10).

    Returns
    -------
    list of (int, int) tuples
        ``(column position, number of distinct non-missing values)`` for
        every categorical column, in column order.
    """
    # nunique() skips missing values by default, so NaN is never counted as a category
    uniques = dataframe.nunique()

    index_ncats = [
        (position, int(count))
        for position, count in enumerate(uniques)
        if count <= max_unique
    ]

    return index_ncats # indexes of categorical data and the number of categories for that index

# Remove binary columns from categorical column list
def remove_binary(zipped_list):
    """Drop every entry whose category count is exactly 2.

    ``zipped_list`` is an iterable of ``(index, category count)`` pairs. The
    surviving pairs are returned as two-element lists, index first and
    category count second, in their original order.
    """
    kept = []
    for pair in zipped_list:
        if pair[1] == 2:
            continue  # binary column - leave it out
        kept.append(list(pair))
    return kept


In [123]:
# (column position, category count) for every categorical column in df
zl = find_categorical_columns(df)
# Keep only the non-binary ones; binary columns are left out of one-hot encoding
cats = remove_binary(zl)
# Column positions of the categorical columns to one-hot encode
one_hot_col_indexes = [position for position, _ in cats]
# Matching column names
cat_cols = [df.columns[position] for position in one_hot_col_indexes]

In [124]:
from sklearn.preprocessing import OneHotEncoder

# Create the one-hot encoder with its default settings; it is fitted on the
# categorical columns in a later cell
enc = OneHotEncoder()

In [128]:
# Fit the encoder on the categorical columns and encode them in one step;
# .toarray() converts the encoder's sparse output into a dense array
enc_data = enc.fit_transform(X=df[cat_cols]).toarray()
# Names of the generated one-hot columns (used as the encoded frame's column labels)
enc_feat = enc.get_feature_names_out(input_features=cat_cols)

In [129]:
enc_feat

array(['RJBCG_0.0', 'RJBCG_1.0', 'RJBCG_2.0', ..., '0TLZD_1.0',
       '0TLZD_2.0', '0TLZD_nan'], dtype=object)

In [62]:
enc_data

In [130]:
# Wrap the encoded array in a DataFrame, reusing df's userId index so the
# encoded rows stay aligned with (and can be joined back onto) the original rows
encoded_df = pd.DataFrame(enc_data, columns=enc_feat, index=df.index)

In [131]:
encoded_df

Unnamed: 0,RJBCG_0.0,RJBCG_1.0,RJBCG_2.0,RJBCG_nan,1JA1I_0.0,1JA1I_1.0,1JA1I_2.0,1JA1I_3.0,1JA1I_4.0,1JA1I_5.0,...,V7R3B_5.0,V7R3B_6.0,V7R3B_7.0,V7R3B_8.0,V7R3B_9.0,V7R3B_nan,0TLZD_0.0,0TLZD_1.0,0TLZD_2.0,0TLZD_nan
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4996,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4997,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4998,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


# Imputation

MissForest (Random Forest imputation) was selected because it has been described as the most successful algorithm (for example in this [paper](https://www.frontiersin.org/articles/10.3389/fdata.2021.693674/full)), because of its relatively low level of computational 'greediness', and because it is nonparametric. Another description of how it works can also be found [here](https://betterdatascience.com/python-missforest-algorithm/).

See the [documentation](https://github.com/epsilon-machine/missingpy) on github for further details on MissForest.

A basic example of how to use it can be found [here](https://betterdatascience.com/python-missforest-algorithm/).

In [132]:
# Compatibility shim required for missforest: missingpy imports
# `sklearn.neighbors.base`, which newer sklearn versions renamed to
# `sklearn.neighbors._base`. Registering the old module name in sys.modules
# lets the missingpy import below succeed.
import sys  # needed for sys.modules; the notebook's imports cell does not import it

import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
# Import package for MissForest
from missingpy import MissForest

In [135]:
# Initialise imputer
imputer = MissForest(max_iter=5)

# Test imputation on a subset of the dataframe: the first n_test_cols columns
n_test_cols = 100
# Declare every low-cardinality column in the subset as categorical, binary
# ones included (zl, unlike one_hot_col_indexes, still contains them), so that
# binary columns are not imputed as continuous variables.
test_cat_vars = [index for index, _ in zl if index < n_test_cols]

s = timer()
imputed = imputer.fit_transform(df.iloc[:, :n_test_cols], cat_vars=test_cat_vars)
e = timer()
print(f'Imputation time: {(e - s)/60} mins')

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Iteration: 0


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Iteration: 1


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Iteration: 2


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Iteration: 3


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Iteration: 4
936.8586065420022


In [136]:
imputed

array([[2.00000000e+00, 8.00000000e+00, 1.52878152e+03, ...,
        2.88072024e+03, 7.73564044e+01, 1.00000000e+00],
       [0.00000000e+00, 2.00000000e+00, 8.15793827e+02, ...,
        2.55150889e+03, 8.69659853e+01, 0.00000000e+00],
       [0.00000000e+00, 7.00000000e+00, 9.63985600e+02, ...,
        2.69231201e+03, 1.64298536e+01, 1.00000000e+00],
       ...,
       [1.00000000e+00, 9.00000000e+00, 1.38230651e+02, ...,
        1.67440940e+03, 1.17281636e+02, 0.00000000e+00],
       [2.00000000e+00, 8.00000000e+00, 2.87755622e+03, ...,
        1.54674142e+03, 8.35822960e+01, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 4.05763834e+02, ...,
        6.65185649e+02, 7.94790927e+01, 1.00000000e+00]])

In [140]:
df[df.columns[0:100]]

Unnamed: 0_level_0,RJBCG,1JA1I,CQKS4,LG3FK,A20S2,DYCR7,O14A0,RMHYX,E8MSS,2W236,...,198U9,X6LMV,8MW0Z,MNRJK,CN0SK,Y9ODD,72X8S,P8OP0,IBHZM,3PU9N
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.0,8.0,,0.587932,0.0,1.0,0.0,2.0,52.204670,,...,7.0,104.436586,2.0,2.0,5.0,114.565161,690.606417,2880.720237,77.356404,1.0
1,0.0,2.0,815.793827,0.713335,1.0,1.0,,0.0,60.233644,2.0,...,2.0,50.835485,1.0,0.0,7.0,26.559757,1211.993301,2551.508889,86.965985,0.0
2,0.0,7.0,963.985600,0.419220,1.0,0.0,1.0,,82.978706,2.0,...,4.0,58.631180,1.0,0.0,,58.428026,1391.209115,2692.312013,16.429854,1.0
3,1.0,9.0,154.394166,0.796608,,1.0,1.0,1.0,8.893962,0.0,...,8.0,83.759559,,1.0,4.0,44.837327,2857.977135,2419.672751,138.539117,2.0
4,2.0,8.0,414.390095,0.277232,1.0,0.0,0.0,0.0,123.794548,2.0,...,5.0,90.337992,2.0,0.0,4.0,8.594330,499.324722,1916.000736,50.424698,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1.0,7.0,1391.723850,0.197264,,,0.0,1.0,28.351367,0.0,...,4.0,115.279627,0.0,,7.0,140.801634,524.527124,1860.806779,43.118554,2.0
4996,2.0,2.0,2867.293134,0.761337,2.0,0.0,1.0,0.0,70.964844,0.0,...,8.0,47.675602,2.0,0.0,7.0,43.250978,277.630561,341.409446,,0.0
4997,1.0,9.0,138.230651,0.613158,0.0,0.0,,0.0,104.277156,1.0,...,9.0,85.942240,2.0,0.0,7.0,105.567889,2331.388776,1674.409404,117.281636,0.0
4998,2.0,8.0,2877.556221,0.651437,2.0,1.0,0.0,2.0,56.841272,1.0,...,3.0,17.376337,1.0,2.0,0.0,7.963441,2011.368151,,83.582296,0.0


In [139]:
[x for x in one_hot_col_indexes if x < 100]

[0,
 1,
 4,
 7,
 9,
 13,
 14,
 18,
 20,
 25,
 27,
 28,
 30,
 31,
 32,
 33,
 45,
 50,
 52,
 59,
 62,
 63,
 66,
 67,
 71,
 72,
 73,
 78,
 79,
 81,
 84,
 85,
 87,
 88,
 89,
 90,
 92,
 93,
 94,
 99]

Feature selection: MRMR or Recursive Feature Elimination (RFE) or Boruta

You could consider feature selection a hyperparameter. Consequently, you can include it as part of a cross validation grid search to obtain optimum hyperparameters for the model in question.

https://medium.com/data-science-reporter/feature-selection-via-grid-search-in-supervised-models-4dc0c43d7ab1

Preprocessing:

- Clean data
- Add thyroid cancer
- Column names
- Manual feature selection (data leakage - look at mutual information)


Pipeline:

Test-Train split

1. Imputation
2. One-hot encoding
3. Feature scaling (not always)
4. Oversampling (not always)
5. Model selection
6. Cross validation grid search for hyperparameters and feature selection
7. Build Model
8.