# Practice Random Forest Pipeline/Workflow

In [32]:
# Imports
import pandas as pd
import numpy as np
import random
import string

### Create fake/dummy data

In [33]:
# Seed both random number generators used in this notebook (the stdlib `random`
# module and NumPy's global generator) so the dummy data is the same on every
# Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Number of rows in the dummy dataset
row_num = 5000

In [34]:
# Create column names for dataset: 380 random 5-character codes made of
# uppercase letters and digits. Names are drawn until 380 *unique* ones exist,
# because they are later used as dictionary keys, where a repeated name would
# silently overwrite an earlier column.
columns = []
_seen_names = set()
while len(columns) < 380:
    _name = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
    if _name not in _seen_names:
        _seen_names.add(_name)
        columns.append(_name)

In [35]:
# Add userId as the first column name. The membership check keeps the cell
# idempotent: re-running it does not insert a second 'userId'.
if 'userId' not in columns:
    columns.insert(0, 'userId')
# Add thyroid_cancer as the last column name (same re-run guard).
if 'thyroid_cancer' not in columns:
    columns.append('thyroid_cancer')

In [36]:
# Create dataframe columns with different possible data types.
#
# Each entry is a zero-argument factory producing `row_num` values. Only the
# factory chosen for a column is called, so a single array is generated per
# column instead of building all six candidates and discarding five.
column_generators = [
    lambda: [random.randint(0, 1) for _ in range(row_num)],      # 0/1 values (Python list)
    lambda: np.random.randint(0, 10, row_num, dtype=int),        # integers 0..9
    lambda: np.random.randint(0, 3, row_num, dtype=int),         # integers 0..2
    lambda: np.random.random_sample(size=row_num),               # floats in [0, 1)
    lambda: np.random.uniform(low=0, high=3000, size=row_num),   # floats in [0, 3000)
    lambda: np.random.uniform(low=0, high=150, size=row_num),    # floats in [0, 150)
]

dictionary = {}

for item in columns:
    # Pick one kind of data at random for this column, then generate it.
    dictionary[item] = random.choice(column_generators)()


In [37]:
# Build the pandas DataFrame: each dictionary key becomes a column name and
# each value becomes that column's data.
df = pd.DataFrame(data=dictionary)

In [38]:
# Overwrite the userId column with sequential ids 0 .. row_num - 1
df['userId'] = range(row_num)

# Use userId as the row index
df = df.set_index('userId')

In [39]:
# Add missing values: for every column, blank out an independently sampled
# 10% of the rows.
# Series.mask is used instead of assigning NaN through .loc because writing NaN
# into an integer column via .loc relies on silent upcasting, which recent
# pandas versions deprecate. mask returns a new column that is upcast to float
# where needed.
for col in df.columns:
    missing_rows = df.sample(frac=0.1).index
    df[col] = df[col].mask(df.index.isin(missing_rows))

In [40]:
# Replace the thyroid_cancer column with freshly drawn 0/1 values, one per row.
# This overwrites the NaNs introduced above, so the column has no missing values.
df['thyroid_cancer'] = list(random.randint(0, 1) for _ in range(row_num))

### Data preprocessing

In [41]:
# Display the generated frame (the rendered output below is truncated with "...")
df


Unnamed: 0_level_0,L2FXN,4O140,RMMCK,4OUKQ,NRU2J,CVQXU,0TZIL,8WOG4,HTK3E,9WCJ3,...,QAF87,4KYCR,UEVHB,KMWUC,LTI7X,EP99Z,3FG4G,JEHL4,837BZ,thyroid_cancer
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,100.819095,79.533635,1.0,1784.419620,,286.229259,,0.679042,0.0,147.537345,...,1944.946285,2649.818012,90.677019,1572.126339,148.395600,101.422691,91.073179,1.0,2.0,1
1,145.644217,144.144537,,163.339933,1.0,1232.314158,0.557832,0.699832,0.0,23.497094,...,1782.308724,1086.284260,27.633470,517.495775,80.098345,68.667537,80.076787,3.0,2.0,0
2,92.604387,141.714990,2.0,1552.433984,1.0,2872.809990,0.726953,0.160975,2.0,,...,2361.472141,,3.002688,1033.492380,45.923227,128.162543,8.320745,2.0,1.0,0
3,118.852326,19.510764,2.0,1628.358886,0.0,2775.375487,,0.047124,0.0,137.028006,...,2611.594475,2016.356672,127.757900,1707.261930,92.910295,121.077810,,2.0,2.0,0
4,37.878509,102.136994,1.0,2284.578166,0.0,2075.400456,0.687593,0.790735,2.0,139.286712,...,2118.920623,2517.865771,12.079427,2486.923315,83.652358,61.038050,91.896260,,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,117.821933,130.870631,1.0,2276.016279,,2189.537938,0.702033,0.724397,0.0,46.847198,...,,1308.528287,14.784780,2919.125594,97.075243,11.535993,81.180923,7.0,1.0,0
4996,91.440365,147.438777,2.0,31.774301,1.0,2038.503004,0.043304,0.075510,,64.238290,...,2617.416273,2351.554004,105.801503,169.232430,,102.205482,125.912416,7.0,1.0,1
4997,147.570441,,1.0,816.759530,0.0,532.684464,0.093032,,1.0,62.852716,...,885.063202,,,13.897982,88.478633,97.582138,83.229296,,2.0,0
4998,,148.145928,1.0,694.372469,1.0,378.328381,0.243781,0.946571,2.0,35.177676,...,797.730384,2336.917546,55.934323,639.722705,80.290018,27.470108,32.861017,7.0,2.0,1


In [42]:
# Make sure thyroid cancer column has been added

In [43]:
# Make sure disease descriptions are the column names rather than just phecodes

In [44]:
# Manual feature selection - use mutual information and consider data leakage.

### Pipeline

In [45]:
# Create test-train split


# Imputation

# One Hot Encoding

In [46]:
# Find categorical data columns in dataframe
def find_categorical_columns(dataframe, max_categories=10):
    """Find the columns of ``dataframe`` that have few distinct values.

    A column is treated as categorical when the number of distinct non-missing
    values it contains is at most ``max_categories``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.
    max_categories : int, default 10
        Largest number of distinct non-missing values a column may have and
        still be reported as categorical.

    Returns
    -------
    list of (int, int)
        One ``(column_position, n_unique)`` tuple per categorical column, in
        column order. ``column_position`` is the positional index of the
        column in ``dataframe.columns``.
    """
    # nunique(dropna=True) counts distinct values per column while ignoring
    # missing entries, without looping over individual values in Python.
    unique_counts = dataframe.nunique(dropna=True)

    return [
        (position, int(n_unique))
        for position, n_unique in enumerate(unique_counts)
        if n_unique <= max_categories
    ]

# Remove binary columns from categorical column list
def remove_binary(zipped_list):
    """Drop the entries whose category count is exactly 2.

    ``zipped_list`` is an iterable of ``(index, n_categories)`` pairs. Each
    pair that is kept is returned as a two-element list, index first and
    category count second, in the original order.
    """
    kept = []
    for entry in zipped_list:
        if entry[1] == 2:
            # Binary column: leave it out of the result.
            continue
        kept.append(list(entry))
    return kept


In [47]:
# (position, n_unique) pairs for every low-cardinality column of df
zl = find_categorical_columns(df)
# Same pairs with the binary (exactly two categories) columns dropped
cats = remove_binary(zl)
# Positions of the columns that will be one-hot encoded
one_hot_col_indexes = [position for position, _ in cats]
# Names of those columns, looked up by position
cat_cols = df.columns[one_hot_col_indexes].tolist()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Create the one-hot encoder instance; no arguments are passed, so the
# encoder's default settings are used
enc = OneHotEncoder()

In [55]:
# New column names. The encoder has to be fitted before it can report its
# output feature names, so it is fitted here; without this the cell fails on a
# fresh kernel (Restart & Run All) because fitting otherwise only happens in a
# later cell.
enc_feat = enc.fit(df[cat_cols]).get_feature_names_out(cat_cols)

In [56]:
# Inspect the one-hot feature names (the output below includes a "_nan" name)
enc_feat

array(['RMMCK_0.0', 'RMMCK_1.0', 'RMMCK_2.0', ..., '837BZ_1.0',
       '837BZ_2.0', '837BZ_nan'], dtype=object)

In [62]:
# One hot encode the data.
# The feature names are read from the encoder right after fitting, so this
# cell does not depend on a name list computed in another cell. The row index
# of df (userId) is carried over so the encoded rows stay aligned with df.
enc_data = pd.DataFrame(
    enc.fit_transform(df[cat_cols]).toarray(),
    columns=list(enc.get_feature_names_out(cat_cols)),
    index=df.index,
)

In [63]:
# Display the one-hot encoded frame
enc_data

Unnamed: 0,RMMCK_0.0,RMMCK_1.0,RMMCK_2.0,RMMCK_nan,HTK3E_0.0,HTK3E_1.0,HTK3E_2.0,HTK3E_nan,S2X9H_0.0,S2X9H_1.0,...,JEHL4_5.0,JEHL4_6.0,JEHL4_7.0,JEHL4_8.0,JEHL4_9.0,JEHL4_nan,837BZ_0.0,837BZ_1.0,837BZ_2.0,837BZ_nan
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4997,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4998,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Feature selection: MRMR or Recursive Feature Elimination (RFE) or Boruta

You could consider feature selection a hyperparameter. Consequently, you can include it as part of a cross validation grid search to obtain optimum hyperparameters for the model in question.

https://medium.com/data-science-reporter/feature-selection-via-grid-search-in-supervised-models-4dc0c43d7ab1

Preprocessing:

- Clean data
- Add thyroid cancer
- Column names
- Manual feature selection (data leakage - look at mutual information)


Pipeline:

Test-Train split

1. Imputation
2. One-hot encoding
3. Feature scaling (not always)
4. Oversampling (not always)
5. Model selection
6. Cross validation grid search for hyperparameters and feature selection
7. Build Model
8.