# **1 - Data Preparation**
---

In [1]:
# Import required libraries.
import sys
import os
sys.path.append(os.path.abspath("../src"))
import util

import pandas as pd

from sklearn.model_selection import train_test_split

## 0. Load Config
---

In [2]:
# Load the project configuration through the `util` helper module (made
# importable from ../src above). Later cells read dataset paths, the
# identifier column and the label name from this object.
config = util.load_config()

## 1. Load Data
---

In [3]:
# Function to load data.
def load_data(data_dir: str, index_col: str = None) -> pd.DataFrame:
    """
    Load the csv data and index it by the identifier column.

    Parameters:
    ----------
    data_dir : str
        The location of the csv file (passed unchanged to `pd.read_csv`).

    index_col : str, default = None
        The column to use as the index. When None, the column name is
        taken from `config["identifier_column"]`, so existing calls that
        pass only `data_dir` behave exactly as before.

    Returns:
    -------
    data : pd.DataFrame
        The loaded data, indexed by the identifier column.

    Raises:
    ------
    KeyError
        Raised by `DataFrame.set_index` when the identifier column is not
        one of the csv columns.
    """
    # Only fall back to the notebook-level config when the caller did not
    # name a column; this keeps the function usable (and testable) without
    # the global `config` object.
    if index_col is None:
        index_col = config["identifier_column"]

    # Load the csv data.
    data = pd.read_csv(data_dir)

    # Promote the identifier column to the index so it is not treated as a feature.
    data = data.set_index(index_col)

    # Print the data shape.
    print(f"Raw data shape: {data.shape}")

    # Return the loaded data as Pandas DataFrame.
    return data

In [4]:
# Load the csv data and check the top 5 data.
# The dataset path in the config is prefixed with "../" here. Single quotes
# are used inside the f-string because reusing the enclosing double quote is
# only valid syntax on Python 3.12+ and is a SyntaxError on older versions.
DATA_DIR = f"../{config['path_dataset']}"

data = load_data(DATA_DIR)

# Keep this as the last expression so the notebook renders the preview table.
data.head()

Raw data shape: (20000, 18)


Unnamed: 0_level_0,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
2,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
3,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
4,29,Male,Malignant,1.4366,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
5,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes


In [5]:
# Persist the dataset exactly as loaded, before any splitting, so the raw
# snapshot can be reloaded later.
raw_path = config["path_raw_dataset"]
raw_dump_target = f"../{raw_path}"
util.pickle_dump(data, raw_dump_target)

Data serialized.


## 2. Split Data
---
1. `X`-`y` split
2. Train-Test split

In [6]:
# Function to split input (X) and output (y).
def split_input_output(data: pd.DataFrame, target_col: str):
    """
    Separate a dataset into its input features and its target column.

    The passed DataFrame is not modified; new objects are returned.

    Parameters:
    ----------
    data : pd.DataFrame
        The full dataset, including the target column.

    target_col : str
        The name of the column to use as the target.

    Returns:
    -------
    X : pd.DataFrame
        Every column of `data` except `target_col`.

    y : pd.Series
        The `target_col` column of `data`.
    """
    # Everything except the target column is treated as input.
    features = data.drop(columns = target_col)

    # The target column on its own is the output.
    target = data[target_col]

    # Report the shapes so the column count can be checked at a glance.
    print(f"Original data shape : {data.shape}")
    print(f"X data shape        : {features.shape}")
    print(f"y data shape        : {target.shape}")

    return features, target

In [7]:
# Separate the input features from the label column named in the config.
TARGET_COL = config['label']

X, y = split_input_output(data, TARGET_COL)

Original data shape : (20000, 18)
X data shape        : (20000, 17)
y data shape        : (20000,)


In [8]:
# Function to split train and test data.
def split_train_test(X: pd.DataFrame, y: pd.Series, test_size: float, random_state: int = None):
    """
    Split the inputs and target into train and test sets, stratified on the target.

    `y` is passed as the `stratify` argument of `train_test_split`, so the
    target proportions are kept the same before and after the split.

    Parameters:
    ----------
    X : pd.DataFrame
        The input data.

    y : pd.Series
        The output data; also used as the stratification key.

    test_size : float
        The proportion of test data.

    random_state : int, default = None
        Seed forwarded to `train_test_split`, for reproducibility.

    Returns:
    -------
    X_train : pd.DataFrame
        The input data train.

    X_test : pd.DataFrame
        The input data test.

    y_train : pd.Series
        The output data train.

    y_test : pd.Series
        The output data test.
    """
    # One stratified shuffle-split of both the inputs and the target.
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state, stratify = y
    )

    # Report the shapes of the four resulting pieces.
    print(f"X_train shape: {features_train.shape}")
    print(f"y_train shape: {target_train.shape}")
    print(f"X_test shape : {features_test.shape}")
    print(f"y_test shape : {target_test.shape}")

    # Return the split data in the same order train_test_split produces it.
    return features_train, features_test, target_train, target_test

In [9]:
# Split train-test from X and y: hold out 20% of the rows as the test set,
# with a fixed seed so the same split is produced on every run.
TEST_SIZE = 0.2
RANDOM_STATE = 123

X_train, X_test, y_train, y_test = split_train_test(
    X, y, test_size = TEST_SIZE, random_state = RANDOM_STATE
)

X_train shape: (16000, 17)
y_train shape: (16000,)
X_test shape : (4000, 17)
y_test shape : (4000,)


## 3. Serialize Data
---

In [10]:
# The config stores each split's output paths as a sequence: index 0 is used
# for the inputs (X) and index 1 for the target (y). The lookups are done
# outside the f-strings because reusing the enclosing double quote inside a
# replacement field is only valid on Python 3.12+ (SyntaxError before that).

# Serialize the train data.
train_paths = config["path_train_set"]
util.pickle_dump(X_train, f"../{train_paths[0]}")
util.pickle_dump(y_train, f"../{train_paths[1]}")

# Serialize the test data.
test_paths = config["path_test_set"]
util.pickle_dump(X_test, f"../{test_paths[0]}")
util.pickle_dump(y_test, f"../{test_paths[1]}")

Data serialized.
Data serialized.
Data serialized.
Data serialized.
