# Cleaned Data

In [1]:
%pip install kagglehub numpy pandas seaborn matplotlib scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
dataset_path = "./Breast_Cancer.csv"

df = pd.read_csv(dataset_path)

df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [4]:
# Drop columns: "6th Stage", "Race", "Marital Status"
df = df.drop(columns=["6th Stage", "Race", "Marital Status", "Status", "Grade"])

In [5]:
# Create a new column "Regional Node Positive" / "Regional Node Examined" ratio
df["Node Positive Ratio"] = df["Reginol Node Positive"] / df["Regional Node Examined"]
df.drop(columns=["Reginol Node Positive", "Regional Node Examined"])

# Rename column "A Stage" to "M Stage"
df = df.rename(columns={"A Stage": "M Stage"})

In [6]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# 0 -> T1, 1 -> T2, 2 -> T3, 3 -> T4
t_stage = df["T Stage "].unique().tolist()
t_stage_ordinal_encoder = OrdinalEncoder(categories=[t_stage])
df["T Stage Encoded"] = t_stage_ordinal_encoder.fit_transform(df[["T Stage "]])

# 0 -> Distant, 1 -> Regional
df["M Stage Encoded"] = LabelEncoder().fit_transform(df["M Stage"])

# 0 -> N1, 1 -> N2, 2 -> N3
n_stage = df["N Stage"].unique().tolist()
n_stage_ordinal_encoder = OrdinalEncoder(categories=[n_stage])
df["N Stage Encoded"] = n_stage_ordinal_encoder.fit_transform(df[["N Stage"]])

# 0 -> Negative, 1 -> Positive
df['Estrogen Status Encoded'] = LabelEncoder().fit_transform(df['Estrogen Status'])
df['Progesterone Status Encoded'] = LabelEncoder().fit_transform(df['Progesterone Status'])

In [7]:
df["Survival More Than 6 Years"] = (df["Survival Months"] > 6 * 12).astype(int)

df["Survival More Than 6 Years"].value_counts()

Survival More Than 6 Years
1    2033
0    1991
Name: count, dtype: int64

In [8]:
df["differentiate"] = df["differentiate"].replace({
    "Well differentiated": 1,
    "Moderately differentiated": 2,
    "Poorly differentiated": 3,
    "Undifferentiated": 4
}).astype(int)


  df["differentiate"] = df["differentiate"].replace({


In [9]:
df = df.drop(columns=["Reginol Node Positive", "Regional Node Examined"])

In [10]:
cleaned_data = df.drop(
    columns=[
        "T Stage ",
        "N Stage",
        "M Stage",
        "Estrogen Status",
        "Progesterone Status",
    ]
)

regression_df = cleaned_data.drop(columns=["Survival More Than 6 Years"])
classification_df = cleaned_data.drop(columns=["Survival Months"])

In [11]:
cleaned_data.dtypes

Age                              int64
differentiate                    int64
Tumor Size                       int64
Survival Months                  int64
Node Positive Ratio            float64
T Stage Encoded                float64
M Stage Encoded                  int64
N Stage Encoded                float64
Estrogen Status Encoded          int64
Progesterone Status Encoded      int64
Survival More Than 6 Years       int64
dtype: object

In [12]:
# Save cleaned datasets
cleaned_data.to_csv("cleaned_breast_cancer_data.csv", index=False)

In [13]:
classification_df = cleaned_data.drop(columns=["Survival Months"])
classification_df.to_csv("classification_breast_cancer_data.csv", index=False)

regression_df = cleaned_data.drop(columns=["Survival More Than 6 Years"])
regression_df.to_csv("regression_breast_cancer_data.csv", index=False)