In [102]:
%pip install kagglehub numpy pandas seaborn matplotlib scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [103]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [104]:
dataset_path = "./Breast_Cancer.csv"

In [105]:
df = pd.read_csv(dataset_path)

df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [106]:
# Drop columns: "6th Stage", "Race", "Marital Status"
df = df.drop(columns=["6th Stage", "Race", "Marital Status", "Status", "differentiate"])

In [107]:
# Create a new column "Regional Node Positive" / "Regional Node Examined" ratio
df["Node Positive Ratio"] = df["Reginol Node Positive"] / df["Regional Node Examined"]
df.drop(columns=["Reginol Node Positive", "Regional Node Examined"])

# Rename column "A Stage" to "M Stage"
df = df.rename(columns={"A Stage": "M Stage"})

In [108]:
df.head()

Unnamed: 0,Age,T Stage,N Stage,Grade,M Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Node Positive Ratio
0,68,T1,N1,3,Regional,4,Positive,Positive,24,1,60,0.041667
1,50,T2,N2,2,Regional,35,Positive,Positive,14,5,62,0.357143
2,58,T3,N3,2,Regional,63,Positive,Positive,14,7,75,0.5
3,58,T1,N1,3,Regional,18,Positive,Positive,2,1,84,0.5
4,47,T2,N1,3,Regional,41,Positive,Positive,3,1,50,0.333333


In [109]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# 0 -> T1, 1 -> T2, 2 -> T3, 3 -> T4
t_stage = df["T Stage "].unique().tolist()
t_stage_ordinal_encoder = OrdinalEncoder(categories=[t_stage])
df["T Stage Encoded"] = t_stage_ordinal_encoder.fit_transform(df[["T Stage "]])

# 0 -> Distant, 1 -> Regional
df["M Stage Encoded"] = LabelEncoder().fit_transform(df["M Stage"])

# 0 -> N1, 1 -> N2, 2 -> N3
n_stage = df["N Stage"].unique().tolist()
n_stage_ordinal_encoder = OrdinalEncoder(categories=[n_stage])
df["N Stage Encoded"] = n_stage_ordinal_encoder.fit_transform(df[["N Stage"]])

# 0 -> Negative, 1 -> Positive
df['Estrogen Status Encoded'] = LabelEncoder().fit_transform(df['Estrogen Status'])
df['Progesterone Status Encoded'] = LabelEncoder().fit_transform(df['Progesterone Status'])

In [110]:
df["Survival More Than 6 Years"] = (df["Survival Months"] > 6 * 12).astype(int)

df["Survival More Than 6 Years"].value_counts()

Survival More Than 6 Years
1    2033
0    1991
Name: count, dtype: int64

In [111]:
df

Unnamed: 0,Age,T Stage,N Stage,Grade,M Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Node Positive Ratio,T Stage Encoded,M Stage Encoded,N Stage Encoded,Estrogen Status Encoded,Progesterone Status Encoded,Survival More Than 6 Years
0,68,T1,N1,3,Regional,4,Positive,Positive,24,1,60,0.041667,0.0,1,0.0,1,1,0
1,50,T2,N2,2,Regional,35,Positive,Positive,14,5,62,0.357143,1.0,1,1.0,1,1,0
2,58,T3,N3,2,Regional,63,Positive,Positive,14,7,75,0.500000,2.0,1,2.0,1,1,1
3,58,T1,N1,3,Regional,18,Positive,Positive,2,1,84,0.500000,0.0,1,0.0,1,1,1
4,47,T2,N1,3,Regional,41,Positive,Positive,3,1,50,0.333333,1.0,1,0.0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,62,T1,N1,2,Regional,9,Positive,Positive,1,1,49,1.000000,0.0,1,0.0,1,1,0
4020,56,T2,N2,2,Regional,46,Positive,Positive,14,8,69,0.571429,1.0,1,1.0,1,1,0
4021,68,T2,N1,2,Regional,22,Positive,Negative,11,3,69,0.272727,1.0,1,0.0,1,0,0
4022,58,T2,N1,2,Regional,44,Positive,Positive,11,1,72,0.090909,1.0,1,0.0,1,1,0
