# **3. Preprocessing**

In [1]:
import sys
import os
sys.path.append(os.path.abspath("../src"))
import util

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## 0. Load Config
---

In [2]:
# Load the configuration file.
config = util.load_config()

## 1. Load Data Trainning
---

In [3]:
# Load the train data.
path_train = config["path_train_set"]
path_test = config["path_test_set"]

X_train = util.pickle_load(f"../{path_train[0]}")
X_test = util.pickle_load(f"../{path_test[0]}")

In [4]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16000 entries, 19882 to 14592
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  16000 non-null  int64  
 1   Gender               16000 non-null  object 
 2   Tumor_Size           16000 non-null  float64
 3   Location             16000 non-null  object 
 4   Histology            16000 non-null  object 
 5   Stage                16000 non-null  object 
 6   Symptom_1            16000 non-null  object 
 7   Symptom_2            16000 non-null  object 
 8   Symptom_3            16000 non-null  object 
 9   Radiation_Treatment  16000 non-null  object 
 10  Surgery_Performed    16000 non-null  object 
 11  Chemotherapy         16000 non-null  object 
 12  Survival_Rate        16000 non-null  float64
 13  Tumor_Growth_Rate    16000 non-null  float64
 14  Family_History       16000 non-null  object 
 15  MRI_Result           16000 non-null  

In [5]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 9801 to 14733
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  4000 non-null   int64  
 1   Gender               4000 non-null   object 
 2   Tumor_Size           4000 non-null   float64
 3   Location             4000 non-null   object 
 4   Histology            4000 non-null   object 
 5   Stage                4000 non-null   object 
 6   Symptom_1            4000 non-null   object 
 7   Symptom_2            4000 non-null   object 
 8   Symptom_3            4000 non-null   object 
 9   Radiation_Treatment  4000 non-null   object 
 10  Surgery_Performed    4000 non-null   object 
 11  Chemotherapy         4000 non-null   object 
 12  Survival_Rate        4000 non-null   float64
 13  Tumor_Growth_Rate    4000 non-null   float64
 14  Family_History       4000 non-null   object 
 15  MRI_Result           4000 non-null   ob

In [6]:
# Num-Cat Split.
num_cols = list(config["int64_columns"] + config["float64_columns"])
cat_cols = list(config["object_columns"])

print(f"Numerical features  :\n{num_cols}\n")
print(f"Categorical Features:\n{cat_cols}")

Numerical features  :
['Age', 'Tumor_Size', 'Survival_Rate', 'Tumor_Growth_Rate']

Categorical Features:
['Gender', 'Location', 'Histology', 'Stage', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Radiation_Treatment', 'Surgery_Performed', 'Chemotherapy', 'Family_History', 'MRI_Result', 'Follow_Up_Required']


## 2. Split Data
---
1. Numerical-Categorical split

In [7]:
# Filter train input based on column type.
X_train_num = X_train[num_cols]
X_train_cat = X_train[cat_cols]

X_test_num = X_test[num_cols]
X_test_cat = X_test[cat_cols]

In [8]:
X_train_num.head()

Unnamed: 0_level_0,Age,Tumor_Size,Survival_Rate,Tumor_Growth_Rate
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
19882,50,7.544846,55.441331,2.380341
17741,46,1.450586,67.185624,1.248645
5362,30,1.108693,50.656056,2.621739
12736,38,4.5766,78.252317,1.696516
16231,57,3.043994,84.844266,1.760073


In [9]:
X_train_cat

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,Male,Parietal,Glioblastoma,II,Nausea,Seizures,Nausea,No,No,No,No,Negative,Yes
17741,Male,Temporal,Medulloblastoma,II,Nausea,Vision Issues,Headache,No,No,No,Yes,Negative,No
5362,Female,Parietal,Medulloblastoma,IV,Seizures,Vision Issues,Seizures,No,Yes,Yes,Yes,Positive,Yes
12736,Female,Occipital,Meningioma,IV,Seizures,Nausea,Vision Issues,Yes,Yes,Yes,Yes,Positive,No
16231,Male,Parietal,Astrocytoma,I,Nausea,Seizures,Seizures,Yes,No,Yes,Yes,Negative,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17946,Male,Temporal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,No,Positive,Yes
7785,Male,Frontal,Meningioma,III,Headache,Seizures,Seizures,No,No,Yes,Yes,Negative,No
12846,Female,Temporal,Astrocytoma,II,Headache,Vision Issues,Vision Issues,Yes,Yes,No,No,Negative,Yes
8011,Male,Temporal,Astrocytoma,I,Headache,Nausea,Vision Issues,No,No,Yes,No,Negative,No


In [10]:
X_test_num.head()

Unnamed: 0_level_0,Age,Tumor_Size,Survival_Rate,Tumor_Growth_Rate
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9801,75,0.969833,57.32444,0.3017
19240,29,4.604909,74.402753,2.044861
10545,37,7.603519,81.713524,2.324049
13982,49,6.27986,76.585017,2.69079
12578,70,6.272735,88.145463,1.924815


In [11]:
X_test_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9801,Female,Parietal,Medulloblastoma,I,Headache,Nausea,Headache,Yes,No,Yes,No,Positive,No
19240,Male,Parietal,Astrocytoma,I,Vision Issues,Nausea,Nausea,Yes,Yes,Yes,Yes,Negative,No
10545,Male,Parietal,Glioblastoma,II,Vision Issues,Vision Issues,Seizures,No,No,No,No,Negative,No
13982,Male,Temporal,Astrocytoma,IV,Seizures,Nausea,Nausea,No,No,No,Yes,Negative,No
12578,Male,Parietal,Meningioma,IV,Headache,Headache,Headache,Yes,Yes,No,Yes,Negative,Yes


## **3. Encoding**
---
1. Encoding Kolom Boolean menjadi 0/1 -> no: 0, Yes:1
2. Encoding fitur `MRI_Result` menjadi 0/1 -> Negative: 0, Positif: 1
3. Encoding fitur `Gender` -> Male: 0, Female: 1
4. Encode fitu `Stage`
5. Encode fitur `Location`, `Histology`, `Symptom_1-2-3`
6. Encoding target

### 3.1 Encoding Kolom Boolean menjadi 0/1 -> no: 0, Yes:1
---

In [12]:
binary_cols = ['Radiation_Treatment', 'Surgery_Performed', 'Chemotherapy', 'Family_History', 'Follow_Up_Required']

X_train_cat[binary_cols] = X_train_cat[binary_cols].replace({'No': 0, 'Yes': 1})
X_test_cat[binary_cols] = X_test_cat[binary_cols].replace({'No': 0, 'Yes': 1})

  X_train_cat[binary_cols] = X_train_cat[binary_cols].replace({'No': 0, 'Yes': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat[binary_cols] = X_train_cat[binary_cols].replace({'No': 0, 'Yes': 1})
  X_test_cat[binary_cols] = X_test_cat[binary_cols].replace({'No': 0, 'Yes': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_cat[binary_cols] = X_test_cat[binary_cols].replace({'No': 0, 'Yes': 1})


In [13]:
X_train_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,Male,Parietal,Glioblastoma,II,Nausea,Seizures,Nausea,0,0,0,0,Negative,1
17741,Male,Temporal,Medulloblastoma,II,Nausea,Vision Issues,Headache,0,0,0,1,Negative,0
5362,Female,Parietal,Medulloblastoma,IV,Seizures,Vision Issues,Seizures,0,1,1,1,Positive,1
12736,Female,Occipital,Meningioma,IV,Seizures,Nausea,Vision Issues,1,1,1,1,Positive,0
16231,Male,Parietal,Astrocytoma,I,Nausea,Seizures,Seizures,1,0,1,1,Negative,1


In [14]:
X_test_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9801,Female,Parietal,Medulloblastoma,I,Headache,Nausea,Headache,1,0,1,0,Positive,0
19240,Male,Parietal,Astrocytoma,I,Vision Issues,Nausea,Nausea,1,1,1,1,Negative,0
10545,Male,Parietal,Glioblastoma,II,Vision Issues,Vision Issues,Seizures,0,0,0,0,Negative,0
13982,Male,Temporal,Astrocytoma,IV,Seizures,Nausea,Nausea,0,0,0,1,Negative,0
12578,Male,Parietal,Meningioma,IV,Headache,Headache,Headache,1,1,0,1,Negative,1


### 3.2 Encoding fitur `MRI_Result` menjadi 0/1 -> Negative: 0, Positif: 1
---

In [15]:
X_train_cat['MRI_Result'] = X_train_cat['MRI_Result'].map({'Negative': 0, 'Positive': 1})
X_test_cat['MRI_Result'] = X_test_cat['MRI_Result'].map({'Negative': 0, 'Positive': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat['MRI_Result'] = X_train_cat['MRI_Result'].map({'Negative': 0, 'Positive': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_cat['MRI_Result'] = X_test_cat['MRI_Result'].map({'Negative': 0, 'Positive': 1})


In [16]:
X_train_cat

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,Male,Parietal,Glioblastoma,II,Nausea,Seizures,Nausea,0,0,0,0,0,1
17741,Male,Temporal,Medulloblastoma,II,Nausea,Vision Issues,Headache,0,0,0,1,0,0
5362,Female,Parietal,Medulloblastoma,IV,Seizures,Vision Issues,Seizures,0,1,1,1,1,1
12736,Female,Occipital,Meningioma,IV,Seizures,Nausea,Vision Issues,1,1,1,1,1,0
16231,Male,Parietal,Astrocytoma,I,Nausea,Seizures,Seizures,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17946,Male,Temporal,Glioblastoma,II,Headache,Headache,Nausea,1,1,1,0,1,1
7785,Male,Frontal,Meningioma,III,Headache,Seizures,Seizures,0,0,1,1,0,0
12846,Female,Temporal,Astrocytoma,II,Headache,Vision Issues,Vision Issues,1,1,0,0,0,1
8011,Male,Temporal,Astrocytoma,I,Headache,Nausea,Vision Issues,0,0,1,0,0,0


In [17]:
X_test_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9801,Female,Parietal,Medulloblastoma,I,Headache,Nausea,Headache,1,0,1,0,1,0
19240,Male,Parietal,Astrocytoma,I,Vision Issues,Nausea,Nausea,1,1,1,1,0,0
10545,Male,Parietal,Glioblastoma,II,Vision Issues,Vision Issues,Seizures,0,0,0,0,0,0
13982,Male,Temporal,Astrocytoma,IV,Seizures,Nausea,Nausea,0,0,0,1,0,0
12578,Male,Parietal,Meningioma,IV,Headache,Headache,Headache,1,1,0,1,0,1


### 3.3 Encoding fitur `Gender` -> Male: 0, Female: 1
---

In [18]:
X_train_cat['Gender'] = X_train_cat['Gender'].map({'Male': 0, 'Female': 1})
X_test_cat['Gender'] = X_test_cat['Gender'].map({'Male': 0, 'Female': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat['Gender'] = X_train_cat['Gender'].map({'Male': 0, 'Female': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_cat['Gender'] = X_test_cat['Gender'].map({'Male': 0, 'Female': 1})


In [19]:
X_train_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,0,Parietal,Glioblastoma,II,Nausea,Seizures,Nausea,0,0,0,0,0,1
17741,0,Temporal,Medulloblastoma,II,Nausea,Vision Issues,Headache,0,0,0,1,0,0
5362,1,Parietal,Medulloblastoma,IV,Seizures,Vision Issues,Seizures,0,1,1,1,1,1
12736,1,Occipital,Meningioma,IV,Seizures,Nausea,Vision Issues,1,1,1,1,1,0
16231,0,Parietal,Astrocytoma,I,Nausea,Seizures,Seizures,1,0,1,1,0,1


In [20]:
X_test_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9801,1,Parietal,Medulloblastoma,I,Headache,Nausea,Headache,1,0,1,0,1,0
19240,0,Parietal,Astrocytoma,I,Vision Issues,Nausea,Nausea,1,1,1,1,0,0
10545,0,Parietal,Glioblastoma,II,Vision Issues,Vision Issues,Seizures,0,0,0,0,0,0
13982,0,Temporal,Astrocytoma,IV,Seizures,Nausea,Nausea,0,0,0,1,0,0
12578,0,Parietal,Meningioma,IV,Headache,Headache,Headache,1,1,0,1,0,1


### 3.4 Encoding fitur `Stage`
---

In [21]:
X_train_cat['Stage'] = X_train_cat['Stage'].map({'I': 1, 'II': 2, 'III': 3, 'IV': 4})
X_test_cat['Stage'] = X_test_cat['Stage'].map({'I': 1, 'II': 2, 'III': 3, 'IV': 4})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat['Stage'] = X_train_cat['Stage'].map({'I': 1, 'II': 2, 'III': 3, 'IV': 4})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_cat['Stage'] = X_test_cat['Stage'].map({'I': 1, 'II': 2, 'III': 3, 'IV': 4})


In [22]:
X_train_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19882,0,Parietal,Glioblastoma,2,Nausea,Seizures,Nausea,0,0,0,0,0,1
17741,0,Temporal,Medulloblastoma,2,Nausea,Vision Issues,Headache,0,0,0,1,0,0
5362,1,Parietal,Medulloblastoma,4,Seizures,Vision Issues,Seizures,0,1,1,1,1,1
12736,1,Occipital,Meningioma,4,Seizures,Nausea,Vision Issues,1,1,1,1,1,0
16231,0,Parietal,Astrocytoma,1,Nausea,Seizures,Seizures,1,0,1,1,0,1


In [23]:
X_test_cat.head()

Unnamed: 0_level_0,Gender,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9801,1,Parietal,Medulloblastoma,1,Headache,Nausea,Headache,1,0,1,0,1,0
19240,0,Parietal,Astrocytoma,1,Vision Issues,Nausea,Nausea,1,1,1,1,0,0
10545,0,Parietal,Glioblastoma,2,Vision Issues,Vision Issues,Seizures,0,0,0,0,0,0
13982,0,Temporal,Astrocytoma,4,Seizures,Nausea,Nausea,0,0,0,1,0,0
12578,0,Parietal,Meningioma,4,Headache,Headache,Headache,1,1,0,1,0,1


### 3.5 Encode fitur `Location`, `Histology`, `Symptom_1-2-3`
---

In [24]:
ohe = OneHotEncoder(handle_unknown='ignore')

In [25]:
nominal_cat = ['Location', 'Histology', 'Symptom_1', 'Symptom_2', 'Symptom_3']
X_train_ohe = pd.get_dummies(X_train_cat, columns=nominal_cat, drop_first=False, dtype=int)
X_test_ohe = pd.get_dummies(X_test_cat, columns=nominal_cat, drop_first=False, dtype=int)

In [26]:
X_train_ohe

Unnamed: 0_level_0,Gender,Stage,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required,Location_Frontal,Location_Occipital,...,Symptom_1_Seizures,Symptom_1_Vision Issues,Symptom_2_Headache,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Headache,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19882,0,2,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
17741,0,2,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
5362,1,4,0,1,1,1,1,1,0,0,...,1,0,0,0,0,1,0,0,1,0
12736,1,4,1,1,1,1,1,0,0,1,...,1,0,0,1,0,0,0,0,0,1
16231,0,1,1,0,1,1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17946,0,2,1,1,1,0,1,1,0,0,...,0,0,1,0,0,0,0,1,0,0
7785,0,3,0,0,1,1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
12846,1,2,1,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
8011,0,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [27]:
X_test_ohe

Unnamed: 0_level_0,Gender,Stage,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,MRI_Result,Follow_Up_Required,Location_Frontal,Location_Occipital,...,Symptom_1_Seizures,Symptom_1_Vision Issues,Symptom_2_Headache,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Headache,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9801,1,1,1,0,1,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,0
19240,0,1,1,1,1,1,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
10545,0,2,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
13982,0,4,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
12578,0,4,1,1,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18092,0,4,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
9016,0,1,1,0,1,1,1,0,0,0,...,0,1,1,0,0,0,0,0,0,1
9054,1,4,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
19217,0,4,1,0,0,0,1,1,0,0,...,0,0,0,1,0,0,1,0,0,0


### 3.6 Encoding Target
---

In [28]:
# y_train_final = y_train.map({'Benign': 0, 'Malignant': 1})

In [29]:
# y_train_final

## 4. Scaling

In [30]:
scaler = StandardScaler()

In [31]:
scaler.fit(X_train_num)

X_train_num_scaled = pd.DataFrame(
    scaler.transform(X_train_num),
    columns=X_train_num.columns,
    index=X_train_num.index
)

# Gunakan scaler yang sudah dilatih (tanpa fit)
X_test_num_scaled = pd.DataFrame(
    scaler.transform(X_test_num),
    columns=X_test_num.columns,
    index=X_test_num.index
)

In [32]:
X_train_num_scaled.describe()

Unnamed: 0,Age,Tumor_Size,Survival_Rate,Tumor_Growth_Rate
count,16000.0,16000.0,16000.0,16000.0
mean,-2.9309890000000003e-17,6.039613e-17,-3.299583e-16,-1.332268e-16
std,1.000031,1.000031,1.000031,1.000031
min,-1.689869,-1.710593,-1.742232,-1.731053
25%,-0.8857097,-0.8748137,-0.8636811,-0.8554275
50%,-0.02411042,-0.01764688,0.008110031,-0.006017621
75%,0.8949288,0.865574,0.8639931,0.8553647
max,1.699088,1.736234,1.731223,1.739159


In [33]:
X_test_num_scaled.describe()

Unnamed: 0,Age,Tumor_Size,Survival_Rate,Tumor_Growth_Rate
count,4000.0,4000.0,4000.0,4000.0
mean,0.004006,0.039687,0.010526,0.00464
std,0.994181,0.996722,0.999432,1.003603
min,-1.689869,-1.710661,-1.741191,-1.732015
25%,-0.82827,-0.811408,-0.85053,-0.860897
50%,-0.02411,0.035738,0.022009,-0.007853
75%,0.837489,0.907899,0.87273,0.859705
max,1.699088,1.735793,1.730632,1.739197


## Gabungkan ke-2 X_train.

In [34]:
X_train_combined = pd.concat([X_train_num_scaled, X_train_ohe], axis=1)
X_test_combined = pd.concat([X_test_num_scaled, X_test_ohe], axis=1)

## Kesimpulan
- Guankan `X_train_ohe` untuk mempresentasikan `X_train_cat` yang sudah di encoding
- Gunakan `X_train_num_scaled` untuk mempresentasikan `X_train_num` yang sudah di scaling

In [35]:
# Serialize the train data.
util.pickle_dump(X_train_combined, f"../{config['path_set_final'][0]}")
util.pickle_dump(X_test_combined, f"../{config['path_set_final'][1]}")

Data serialized.
Data serialized.


In [36]:
X_train_combined

Unnamed: 0_level_0,Age,Tumor_Size,Survival_Rate,Tumor_Growth_Rate,Gender,Stage,Radiation_Treatment,Surgery_Performed,Chemotherapy,Family_History,...,Symptom_1_Seizures,Symptom_1_Vision Issues,Symptom_2_Headache,Symptom_2_Nausea,Symptom_2_Seizures,Symptom_2_Vision Issues,Symptom_3_Headache,Symptom_3_Nausea,Symptom_3_Seizures,Symptom_3_Vision Issues
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19882,0.033330,0.845546,-0.848394,0.997635,0,2,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
17741,-0.196430,-1.365872,-0.168458,-0.357055,0,2,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
5362,-1.115469,-1.489934,-1.125438,1.286599,1,4,0,1,1,1,...,1,0,0,0,0,1,0,0,1,0
12736,-0.655950,-0.231538,0.472249,0.179066,1,4,1,1,1,1,...,1,0,0,1,0,0,0,0,0,1
16231,0.435409,-0.787674,0.853890,0.255147,0,1,1,0,1,1,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17946,-0.138990,1.478380,-1.393425,-0.180023,0,2,1,1,1,0,...,0,0,1,0,0,0,0,1,0,0
7785,1.699088,-1.644460,1.326931,-0.146320,0,3,0,0,1,1,...,0,0,0,0,1,0,0,0,1,0
12846,0.492849,-0.005219,-1.705539,1.562141,1,2,1,1,0,0,...,0,0,0,0,0,1,0,0,0,1
8011,-0.483630,0.479210,-0.397255,-0.295015,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1


In [38]:
print(f"Shape of X_test_combined before exporting: {X_test_combined.shape}")

Shape of X_test_combined before exporting: (4000, 32)
