# Practice Random Forest Pipeline/Workflow

In [73]:
# Imports
import pandas as pd
import numpy as np
import random
import string

### Create fake/dummy data

In [74]:
# Number of rows to generate for the dummy dataset.
row_num = 5000

# Seed both random number generators used by the cells below (`random` and
# `np.random`) so that a fresh "Restart & Run All" regenerates the same data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [75]:
# Create column names for the dataset: random 5-character codes made of
# uppercase letters and digits.
N_FEATURE_COLUMNS = 380
COLUMN_NAME_LENGTH = 5
name_alphabet = string.ascii_uppercase + string.digits

# Draw names until enough *distinct* ones exist. A duplicate name would
# silently collapse two columns into one when the names are later used as
# dictionary keys, leaving the frame with fewer columns than intended.
columns = []
seen_names = set()
while len(columns) < N_FEATURE_COLUMNS:
    candidate = ''.join(random.choices(name_alphabet, k=COLUMN_NAME_LENGTH))
    if candidate not in seen_names:
        seen_names.add(candidate)
        columns.append(candidate)

In [76]:
# Add userId as the first column name.
# The membership checks keep the cell idempotent: re-running it does not
# insert the same name a second time.
if 'userId' not in columns:
    columns.insert(0, 'userId')
# Add thyroid_cancer as the last column name.
if 'thyroid_cancer' not in columns:
    columns.append('thyroid_cancer')

In [134]:
# Create the data for each column, picking one of several possible value
# distributions at random per column.
#
# Each entry is a zero-argument generator so that only the distribution that
# is actually chosen gets sampled (instead of building all six arrays for
# every column and discarding five of them).
column_generators = [
    lambda: [random.randint(0, 1) for _ in range(row_num)],      # 0/1 values (Python ints)
    lambda: np.random.randint(0, 10, row_num, dtype=int),        # integers in [0, 10)
    lambda: np.random.randint(0, 3, row_num, dtype=int),         # integers in [0, 3)
    lambda: np.random.random_sample(size=row_num),               # floats in [0, 1)
    lambda: np.random.uniform(low=0, high=3000, size=row_num),   # floats in [0, 3000)
    lambda: np.random.uniform(low=0, high=150, size=row_num),    # floats in [0, 150)
]

# Maps column name -> sequence of `row_num` generated values.
dictionary = {}

for item in columns:
    generate_column = random.choice(column_generators)
    dictionary[item] = generate_column()


In [135]:
# Build the DataFrame: one column per key of `dictionary`.
df = pd.DataFrame(data=dictionary)

In [136]:
# Overwrite the generated userId values with sequential ids 0 .. row_num - 1
df['userId'] = list(range(row_num))

# Use userId as the row index of the frame
df = df.set_index('userId')

In [137]:
# Add missing values: in every column, blank out a random 10% of the rows.
# A fresh set of rows is drawn for each column.
MISSING_FRACTION = 0.1

for col in df.columns:
    # Only the row labels are needed, so sample the single column instead of
    # copying a slice of the whole frame on every iteration.
    missing_rows = df[col].sample(frac=MISSING_FRACTION).index
    # NaN cannot be stored in an integer column. Cast to float explicitly
    # rather than relying on silent upcasting during assignment, which newer
    # pandas versions deprecate.
    df[col] = df[col].astype(float)
    df.loc[missing_rows, col] = np.nan

In [138]:
# Regenerate thyroid_cancer as a complete 0/1 column, replacing the version
# that had missing values written into it.
thyroid_labels = [random.randint(0, 1) for _ in range(row_num)]
df['thyroid_cancer'] = thyroid_labels

### Data preprocessing

In [139]:
# Display the generated frame to inspect it before preprocessing
# (the recorded output shows a truncated preview of rows and columns).
df


Unnamed: 0_level_0,WJ8JY,I8TW5,F8YPW,LXWAW,AE2MP,BY8WN,0RAFY,7YM3A,VKIPV,8IYQ0,...,TYCKQ,JGCCC,SLFAN,7S72N,2V6ZN,H4MHW,5O094,PLNIC,F79GD,thyroid_cancer
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.418408,131.953365,2334.364488,1.0,2.0,2013.305890,0.0,1.0,8.0,...,1987.786242,0.0,0.053345,0.0,0.961286,1.0,2.0,60.104376,93.392851,1
1,4.0,0.238648,15.852358,1484.842465,1.0,0.0,,0.0,0.0,1.0,...,2891.726738,2.0,0.994365,2.0,0.822770,1.0,1.0,56.985685,123.352895,0
2,7.0,0.460547,67.491232,701.426396,,0.0,2809.147418,0.0,0.0,4.0,...,2128.510179,0.0,0.058699,7.0,0.169334,0.0,2.0,6.631761,24.476376,1
3,8.0,0.314217,146.885621,2338.942198,0.0,2.0,,1.0,0.0,8.0,...,28.007005,1.0,0.044292,8.0,0.981455,0.0,2.0,131.315222,140.450377,1
4,0.0,0.988836,142.380616,2550.182666,1.0,1.0,1822.446291,0.0,0.0,,...,703.515014,1.0,0.760373,7.0,0.104623,1.0,1.0,144.459508,128.280684,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,8.0,0.969992,,2454.982100,0.0,0.0,,2.0,,2.0,...,,,0.229091,6.0,0.512711,,2.0,58.602944,24.234916,1
4996,4.0,0.252477,109.593193,559.348639,1.0,1.0,249.253127,,0.0,4.0,...,2633.885799,1.0,0.225533,0.0,0.285324,1.0,1.0,39.888530,139.698478,0
4997,6.0,0.724609,122.741152,,1.0,2.0,2405.025080,1.0,0.0,4.0,...,1878.751381,1.0,0.429800,7.0,0.607901,1.0,0.0,78.425190,,0
4998,6.0,0.678043,100.637353,2743.252341,1.0,2.0,2174.972229,1.0,1.0,1.0,...,1798.681846,2.0,0.685401,9.0,0.121107,1.0,1.0,39.294898,126.164221,0


In [140]:
# Make sure thyroid cancer column has been added

In [141]:
# Make sure the column names are disease descriptions rather than just phecodes

In [142]:
# Manual feature selection - use mutual information and consider data leakage.

### Pipeline

In [146]:
# Create test-train split


In [158]:
# Find categorical data columns in dataframe

def find_categorical_columns(dataframe, max_unique=10):
    """Locate low-cardinality ("categorical") columns by position.

    A column is reported when it holds at most ``max_unique`` distinct
    non-missing values. Missing values are never counted as a category.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.
    max_unique : int, default 10
        Largest number of distinct non-missing values a column may have and
        still be reported.

    Returns
    -------
    list of tuple of (int, int)
        ``(column position, number of distinct non-missing values)`` pairs,
        in column order. A column with no non-missing values has a count of
        0 and is therefore included.
    """
    # nunique() counts per column by position, so frames with repeated column
    # labels are handled correctly (label-based access would return a
    # sub-frame for a repeated label and miscount).
    unique_counts = dataframe.nunique(dropna=True)

    return [
        (position, int(count))  # plain Python ints, not NumPy scalars
        for position, count in enumerate(unique_counts)
        if count <= max_unique
    ]


In [159]:
# List (column position, distinct non-missing value count) pairs for the
# low-cardinality columns of the generated frame.
find_categorical_columns(df)

[(0, 10),
 (4, 2),
 (5, 3),
 (7, 3),
 (8, 2),
 (9, 10),
 (12, 2),
 (14, 2),
 (17, 2),
 (19, 2),
 (20, 2),
 (21, 10),
 (23, 2),
 (24, 2),
 (26, 3),
 (27, 2),
 (28, 10),
 (29, 10),
 (30, 3),
 (31, 10),
 (32, 3),
 (33, 2),
 (34, 2),
 (35, 10),
 (36, 2),
 (37, 10),
 (39, 3),
 (40, 10),
 (41, 10),
 (42, 2),
 (45, 3),
 (46, 3),
 (52, 2),
 (53, 10),
 (54, 3),
 (56, 3),
 (58, 2),
 (60, 10),
 (61, 10),
 (63, 3),
 (68, 3),
 (72, 3),
 (75, 2),
 (76, 10),
 (77, 3),
 (79, 2),
 (81, 3),
 (85, 3),
 (86, 2),
 (91, 10),
 (92, 3),
 (99, 10),
 (104, 10),
 (107, 2),
 (108, 3),
 (109, 10),
 (110, 10),
 (111, 2),
 (112, 10),
 (113, 3),
 (114, 3),
 (115, 2),
 (117, 2),
 (119, 2),
 (120, 3),
 (121, 10),
 (122, 3),
 (129, 2),
 (130, 10),
 (137, 3),
 (140, 3),
 (142, 3),
 (143, 10),
 (144, 3),
 (146, 2),
 (147, 10),
 (152, 2),
 (153, 10),
 (154, 2),
 (156, 3),
 (157, 2),
 (158, 3),
 (160, 3),
 (162, 2),
 (165, 10),
 (166, 3),
 (167, 3),
 (171, 10),
 (173, 2),
 (174, 2),
 (176, 3),
 (177, 10),
 (180, 3),
 (181, 

In [89]:
# Try to select categorical data columns by dtype ('category' or 'int').
# NOTE(review): the recorded output lists only `thyroid_cancer`; every other
# column shows up as float64 in the following cell's output (presumably
# because they contain NaN), so a dtype filter alone does not find them.
df.select_dtypes(include=['category','int']).dtypes

thyroid_cancer    int64
dtype: object

In [50]:
# Widen the dtype filter to include floats and show each selected column's dtype
# (in the recorded output this matches all 381 columns).
df.select_dtypes(include=['category','int','float']).dtypes

UPXE0             float64
ESY64             float64
WZ8RS             float64
82D8N             float64
35CIZ             float64
                   ...   
CFR9E             float64
Y0BGK             float64
PPMTO             float64
ZTS4W             float64
thyroid_cancer      int64
Length: 381, dtype: object

Feature selection: MRMR or Recursive Feature Elimination (RFE) or Boruta

You could consider feature selection a hyperparameter. Consequently, you can include it as part of a cross validation grid search to obtain optimum hyperparameters for the model in question.

https://medium.com/data-science-reporter/feature-selection-via-grid-search-in-supervised-models-4dc0c43d7ab1

Preprocessing:

- Clean data
- Add thyroid cancer
- Column names
- Manual feature selection (data leakage - look at mutual information)


Pipeline:

Test-Train split

1. Imputation
2. Feature scaling (not always)
3. Oversampling (not always)
4. Model selection
5. Cross validation grid search for hyperparameters and feature selection
6. Build Model
7.

In [51]:
# Print each column of the frame as a Series, skipping the first column.
# NOTE(review): userId is the index here, not column 0, so the `[1:]` slice
# drops a real feature column — confirm that this is intended.
for column_name in df.columns[1:]:
    print(df[column_name])

userId
0       2.0
1       0.0
2       3.0
3       3.0
4       1.0
       ... 
4995    2.0
4996    3.0
4997    1.0
4998    1.0
4999    2.0
Name: ESY64, Length: 5000, dtype: float64
userId
0        0.0
1        NaN
2        8.0
3       10.0
4        9.0
        ... 
4995     0.0
4996     1.0
4997     9.0
4998    10.0
4999     6.0
Name: WZ8RS, Length: 5000, dtype: float64
userId
0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
4995    0.0
4996    0.0
4997    0.0
4998    0.0
4999    0.0
Name: 82D8N, Length: 5000, dtype: float64
userId
0       1.0
1       0.0
2       NaN
3       0.0
4       1.0
       ... 
4995    0.0
4996    1.0
4997    1.0
4998    NaN
4999    1.0
Name: 35CIZ, Length: 5000, dtype: float64
userId
0        6.0
1        0.0
2       10.0
3       10.0
4        1.0
        ... 
4995     0.0
4996     1.0
4997     0.0
4998     6.0
4999     5.0
Name: 002RD, Length: 5000, dtype: float64
userId
0        89.614320
1        85.435542
2        23.050883
3        

In [57]:
# Show the column labels of the frame.
df.columns

Index(['UPXE0', 'ESY64', 'WZ8RS', '82D8N', '35CIZ', '002RD', 'MH90N', 'IZUP3',
       '1RBIO', 'N02AQ',
       ...
       'BS488', '586B9', '4RNLE', 'BVVUP', '246U4', 'CFR9E', 'Y0BGK', 'PPMTO',
       'ZTS4W', 'thyroid_cancer'],
      dtype='object', length=381)

In [90]:
# Check the element type obtained when a NumPy integer array is wrapped in
# list(): the recorded output is numpy.int64, i.e. the elements remain NumPy
# scalars rather than becoming built-in Python ints.
a = list(np.random.randint(0, 2, row_num, dtype=int))
type(a[0])

numpy.int64