# **3. Preprocessing**

In [74]:
import sys
import os
sys.path.append(os.path.abspath("../src"))
import util

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## 0. Load Config
---

In [75]:
# Load the configuration file.
config = util.load_config()

## 1. Load Data Trainning
---

In [76]:
# Load the train data.
path = config["path_train_set"]

X_train = util.pickle_load(f"../{path[0]}")
y_train = util.pickle_load(f"../{path[1]}")

In [77]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16000 entries, 19882 to 14592
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  16000 non-null  int64  
 1   Gender               16000 non-null  object 
 2   Tumor_Size           16000 non-null  float64
 3   Location             16000 non-null  object 
 4   Histology            16000 non-null  object 
 5   Stage                16000 non-null  object 
 6   Symptom_1            16000 non-null  object 
 7   Symptom_2            16000 non-null  object 
 8   Symptom_3            16000 non-null  object 
 9   Radiation_Treatment  16000 non-null  object 
 10  Surgery_Performed    16000 non-null  object 
 11  Chemotherapy         16000 non-null  object 
 12  Survival_Rate        16000 non-null  float64
 13  Tumor_Growth_Rate    16000 non-null  float64
 14  Family_History       16000 non-null  object 
 15  MRI_Result           16000 non-null  

In [78]:
# Num-Cat Split.
num_cols = list(config["int64_columns"] + config["float64_columns"])
cat_cols = list(config["object_columns"])

print(f"Numerical features  :\n{num_cols}\n")
print(f"Categorical Features:\n{cat_cols}")

Numerical features  :
['Age', 'Tumor_Size', 'Survival_Rate', 'Tumor_Growth_Rate']

Categorical Features:
['Gender', 'Location', 'Histology', 'Stage', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Radiation_Treatment', 'Surgery_Performed', 'Chemotherapy', 'Family_History', 'MRI_Result', 'Follow_Up_Required']


## 2. Split Data
---
1. Numerical-Categorical split

In [79]:
# Filter train input based on column type.
X_train_num = X_train[num_cols]
X_train_cat = X_train[cat_cols]

In [80]:
X_train_num.head()

Unnamed: 0_level_0,Age,Tumor_Size,Survival_Rate,Tumor_Growth_Rate
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
19882,50,7.544846,55.441331,2.380341
17741,46,1.450586,67.185624,1.248645
5362,30,1.108693,50.656056,2.621739
12736,38,4.5766,78.252317,1.696516
16231,57,3.043994,84.844266,1.760073


In [81]:
X_train_cat

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,Male,Parietal,Glioblastoma,II,Nausea,Seizures,Nausea,No,No,No,No,Negative,Yes
17741,Male,Temporal,Medulloblastoma,II,Nausea,Vision Issues,Headache,No,No,No,Yes,Negative,No
5362,Female,Parietal,Medulloblastoma,IV,Seizures,Vision Issues,Seizures,No,Yes,Yes,Yes,Positive,Yes
12736,Female,Occipital,Meningioma,IV,Seizures,Nausea,Vision Issues,Yes,Yes,Yes,Yes,Positive,No
16231,Male,Parietal,Astrocytoma,I,Nausea,Seizures,Seizures,Yes,No,Yes,Yes,Negative,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17946,Male,Temporal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,No,Positive,Yes
7785,Male,Frontal,Meningioma,III,Headache,Seizures,Seizures,No,No,Yes,Yes,Negative,No
12846,Female,Temporal,Astrocytoma,II,Headache,Vision Issues,Vision Issues,Yes,Yes,No,No,Negative,Yes
8011,Male,Temporal,Astrocytoma,I,Headache,Nausea,Vision Issues,No,No,Yes,No,Negative,No


## **3. Encoding**
---
1. Encoding Kolom Boolean menjadi 0/1 -> no: 0, Yes:1
2. Encoding fitur `MRI_Result` menjadi 0/1 -> Negative: 0, Positif: 1
3. Encoding fitur `Gender` -> Male: 0, Female: 1
4. Encode fitu `Stage`
5. Encode fitur `Location`, `Histology`, `Symptom_1-2-3`
6. Encoding target

### 3.1 Encoding Kolom Boolean menjadi 0/1 -> no: 0, Yes:1
---

In [82]:
binary_cols = ['Radiation_Treatment', 'Surgery_Performed', 'Chemotherapy', 'Family_History', 'Follow_Up_Required']

X_train_cat[binary_cols] = X_train_cat[binary_cols].replace({'No': 0, 'Yes': 1})

  X_train_cat[binary_cols] = X_train_cat[binary_cols].replace({'No': 0, 'Yes': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat[binary_cols] = X_train_cat[binary_cols].replace({'No': 0, 'Yes': 1})


In [83]:
X_train_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,Male,Parietal,Glioblastoma,II,Nausea,Seizures,Nausea,0,0,0,0,Negative,1
17741,Male,Temporal,Medulloblastoma,II,Nausea,Vision Issues,Headache,0,0,0,1,Negative,0
5362,Female,Parietal,Medulloblastoma,IV,Seizures,Vision Issues,Seizures,0,1,1,1,Positive,1
12736,Female,Occipital,Meningioma,IV,Seizures,Nausea,Vision Issues,1,1,1,1,Positive,0
16231,Male,Parietal,Astrocytoma,I,Nausea,Seizures,Seizures,1,0,1,1,Negative,1


### 3.2 Encoding fitur `MRI_Result` menjadi 0/1 -> Negative: 0, Positif: 1
---

In [84]:
X_train_cat['MRI_Result'] = X_train_cat['MRI_Result'].map({'Negative': 0, 'Positive': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat['MRI_Result'] = X_train_cat['MRI_Result'].map({'Negative': 0, 'Positive': 1})


In [85]:
X_train_cat

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,Male,Parietal,Glioblastoma,II,Nausea,Seizures,Nausea,0,0,0,0,0,1
17741,Male,Temporal,Medulloblastoma,II,Nausea,Vision Issues,Headache,0,0,0,1,0,0
5362,Female,Parietal,Medulloblastoma,IV,Seizures,Vision Issues,Seizures,0,1,1,1,1,1
12736,Female,Occipital,Meningioma,IV,Seizures,Nausea,Vision Issues,1,1,1,1,1,0
16231,Male,Parietal,Astrocytoma,I,Nausea,Seizures,Seizures,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17946,Male,Temporal,Glioblastoma,II,Headache,Headache,Nausea,1,1,1,0,1,1
7785,Male,Frontal,Meningioma,III,Headache,Seizures,Seizures,0,0,1,1,0,0
12846,Female,Temporal,Astrocytoma,II,Headache,Vision Issues,Vision Issues,1,1,0,0,0,1
8011,Male,Temporal,Astrocytoma,I,Headache,Nausea,Vision Issues,0,0,1,0,0,0


### 3.3 Encoding fitur `Gender` -> Male: 0, Female: 1
---

In [86]:
X_train_cat['Gender'] = X_train_cat['Gender'].map({'Male': 0, 'Female': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat['Gender'] = X_train_cat['Gender'].map({'Male': 0, 'Female': 1})


In [87]:
X_train_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,0,Parietal,Glioblastoma,II,Nausea,Seizures,Nausea,0,0,0,0,0,1
17741,0,Temporal,Medulloblastoma,II,Nausea,Vision Issues,Headache,0,0,0,1,0,0
5362,1,Parietal,Medulloblastoma,IV,Seizures,Vision Issues,Seizures,0,1,1,1,1,1
12736,1,Occipital,Meningioma,IV,Seizures,Nausea,Vision Issues,1,1,1,1,1,0
16231,0,Parietal,Astrocytoma,I,Nausea,Seizures,Seizures,1,0,1,1,0,1


### 3.4 Encoding fitur `Stage`
---

In [88]:
X_train_cat['Stage'] = X_train_cat['Stage'].map({'I': 1, 'II': 2, 'III': 3, 'IV': 4})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat['Stage'] = X_train_cat['Stage'].map({'I': 1, 'II': 2, 'III': 3, 'IV': 4})


In [89]:
X_train_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,0,Parietal,Glioblastoma,2,Nausea,Seizures,Nausea,0,0,0,0,0,1
17741,0,Temporal,Medulloblastoma,2,Nausea,Vision Issues,Headache,0,0,0,1,0,0
5362,1,Parietal,Medulloblastoma,4,Seizures,Vision Issues,Seizures,0,1,1,1,1,1
12736,1,Occipital,Meningioma,4,Seizures,Nausea,Vision Issues,1,1,1,1,1,0
16231,0,Parietal,Astrocytoma,1,Nausea,Seizures,Seizures,1,0,1,1,0,1


### 3.5 Encode fitur `Location`, `Histology`, `Symptom_1-2-3`
---

In [90]:
ohe = OneHotEncoder(handle_unknown='ignore')

In [91]:
nominal_cat = ['Location', 'Histology', 'Symptom_1', 'Symptom_2', 'Symptom_3']
X_train_ohe = pd.get_dummies(X_train_cat, columns=nominal_cat, drop_first=False, dtype=int)

In [92]:
X_train_ohe

Unnamed: 0_level_0,Gender,Stage,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required,Location_Frontal,Location_Occipital,...,Symptom_1_Seizures,Symptom_1_Vision Issues,Symptom_2_Headache,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Headache,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19882,0,2,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
17741,0,2,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
5362,1,4,0,1,1,1,1,1,0,0,...,1,0,0,0,0,1,0,0,1,0
12736,1,4,1,1,1,1,1,0,0,1,...,1,0,0,1,0,0,0,0,0,1
16231,0,1,1,0,1,1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17946,0,2,1,1,1,0,1,1,0,0,...,0,0,1,0,0,0,0,1,0,0
7785,0,3,0,0,1,1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
12846,1,2,1,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
8011,0,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


### 3.6 Encoding Target
---

In [93]:
y_train = y_train.map({'Benign': 0, 'Malignant': 1})

In [94]:
y_train

Patient_ID
19882    0
17741    0
5362     0
12736    0
16231    1
        ..
17946    1
7785     0
12846    0
8011     1
14592    0
Name: Tumor_Type, Length: 16000, dtype: int64

## 4. Scaling

In [95]:
scaler = StandardScaler()

In [96]:
X_train_num_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_num),
    columns=X_train_num.columns,
    index=X_train_num.index
)

In [98]:
X_train_num_scaled.describe()

Unnamed: 0,Age,Tumor_Size,Survival_Rate,Tumor_Growth_Rate
count,16000.0,16000.0,16000.0,16000.0
mean,-2.9309890000000003e-17,6.039613e-17,-3.299583e-16,-1.332268e-16
std,1.000031,1.000031,1.000031,1.000031
min,-1.689869,-1.710593,-1.742232,-1.731053
25%,-0.8857097,-0.8748137,-0.8636811,-0.8554275
50%,-0.02411042,-0.01764688,0.008110031,-0.006017621
75%,0.8949288,0.865574,0.8639931,0.8553647
max,1.699088,1.736234,1.731223,1.739159


## Kesimpulan
- Guankan `X_train_ohe` untuk mempresentasikan `X_train_cat` yang sudah di encoding
- Gunakan `X_train_num_scaled` untuk mempresentasikan `X_train_num` yang sudah di scaling