In [1]:
# Import libraries for preprocessing and encoding
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the step-1 output written by the previous notebook
input_csv = 'breast_msk_2018_processed_step1.csv'
df = pd.read_csv(input_csv)

# Report the dimensions as a quick sanity check on the load
print(f"Dataset loaded. Shape: {df.shape}")

Dataset loaded. Shape: (1918, 59)


In [2]:
# Columns removed before modeling: unique identifiers or redundant fields
cols_to_drop = ['Study_ID', 'Patient_ID', 'Sample_ID', 'Oncotree_Code']

# drop() returns a new frame, so the loaded `df` is left untouched
df_reduced = df.drop(cols_to_drop, axis=1)

print(f"Shape after dropping redundant columns: {df_reduced.shape}")

Shape after dropping redundant columns: (1918, 55)


In [3]:
# Numeric columns: impute each column's gaps with that column's own median
num_cols = df_reduced.select_dtypes(include=[np.number]).columns
column_medians = df_reduced[num_cols].median()
df_reduced[num_cols] = df_reduced[num_cols].fillna(value=column_medians)

# Object-dtype (categorical) columns: use an explicit 'Unknown' placeholder
cat_cols = df_reduced.select_dtypes(include=['object']).columns
df_reduced[cat_cols] = df_reduced[cat_cols].fillna(value='Unknown')

print("Missing values handled.")

Missing values handled.


In [4]:
# Standardizing the mapping: Metastasis = 1, Primary = 0
# This avoids confusion in all subsequent notebooks and metrics
mapping = {'Primary': 0, 'Metastasis': 1}

# Only encode while the column still holds the raw string labels. Re-running
# this cell on an already-encoded column would push 0/1 through .map() and
# turn every label into NaN, so the numeric check makes the cell idempotent.
if not pd.api.types.is_numeric_dtype(df_reduced['Sample_Type']):
    encoded_target = df_reduced['Sample_Type'].map(mapping)

    # Series.map yields NaN for any value that is not a key of the dict
    # (for example the 'Unknown' placeholder used for missing categoricals).
    # Fail loudly here rather than silently writing NaN into the target.
    unmapped_mask = encoded_target.isna()
    if unmapped_mask.any():
        unexpected = df_reduced.loc[unmapped_mask, 'Sample_Type'].unique().tolist()
        raise ValueError(
            f"Sample_Type contains values outside the mapping {list(mapping)}: {unexpected}"
        )

    # Every label is mapped at this point, so an integer dtype is safe
    df_reduced['Sample_Type'] = encoded_target.astype('int64')

print("Mapping Fixed Successfully:")
print(df_reduced['Sample_Type'].value_counts())
# Now 1 should represent Metastasis and 0 represents Primary

Mapping Fixed Successfully:
Sample_Type
1    1000
0     918
Name: count, dtype: int64


In [5]:
# Every remaining object-dtype column is treated as a categorical feature.
# The target (Sample_Type) is expected to be numeric by this point, so the
# dtype filter leaves it out.
categorical_features = df_reduced.select_dtypes(include=['object']).columns

# One-hot encode those columns; drop_first=True omits the first level of each
# feature so the dummy columns are not perfectly collinear.
df_final = pd.get_dummies(
    df_reduced,
    columns=categorical_features,
    drop_first=True,
)

print(f"Final shape after One-Hot Encoding: {df_final.shape}")
df_final.head()

Final shape after One-Hot Encoding: (1918, 527)


Unnamed: 0,Disease_Free_Event,Disease_Free_Months,Fraction_Genome_Altered,Invasive_Carcinoma_Diagnosis_Age,Time_to_Diagnosis_Invasive_Carcinoma,Last_Communication_Contact,Metastatic_Recurrence_Time,Mutation_Count,NGS_Sample_Collection_Time_Period,Overall_Survival_Months,...,T_Stage_T3,T_Stage_T4,T_Stage_T4a,T_Stage_T4b,T_Stage_T4c,T_Stage_T4d,T_Stage_TX,T_Stage_Tis,T_Stage_unk,Patient's_Vital_Status_Deceased
0,1,1.1,0.29735,37,444.87,14484,446.0,4.0,445,31.5,...,False,False,False,False,False,False,False,False,False,False
1,0,218.0,0.6891,43,516.48,22336,655.0,1.0,517,218.0,...,False,False,False,False,False,False,False,False,False,False
2,1,68.9,0.29735,37,449.8,16656,519.0,7.0,534,98.0,...,False,False,False,False,False,False,False,False,False,True
3,1,90.2,0.6367,43,513.68,19364,604.0,10.0,618,123.1,...,False,False,False,False,False,False,False,False,False,True
4,1,0.5,0.5093,38,458.36,15871,459.0,5.0,489,63.6,...,False,False,False,False,False,False,True,False,False,True


In [6]:
# Persist the fully encoded frame for the modeling notebook.
# index=False keeps the row index out of the CSV.
ml_ready_csv = 'breast_cancer_ready_for_ml.csv'
df_final.to_csv(ml_ready_csv, index=False)
print("Final preprocessed data saved as 'breast_cancer_ready_for_ml.csv'")

Final preprocessed data saved as 'breast_cancer_ready_for_ml.csv'
