In [2]:
%pip install kagglehub numpy pandas seaborn matplotlib scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Relative path of the raw CSV that the cleaning cells below read.
dataset_path = "./Breast_Cancer.csv"

In [5]:
# Load the raw CSV into the working frame.
df = pd.read_csv(dataset_path)

# Preview the first rows to check the columns and value formats.
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [6]:
# Drop columns: "6th Stage", "Race", "Marital Status", "Status", "differentiate"
df = df.drop(columns=["6th Stage", "Race", "Marital Status", "Status", "differentiate"])

In [7]:
# Create a new column: "Reginol Node Positive" / "Regional Node Examined" ratio.
# ("Reginol" is how the column is spelled in the source CSV header.)
df["Node Positive Ratio"] = df["Reginol Node Positive"] / df["Regional Node Examined"]
# NOTE: the two raw node-count columns are deliberately kept at this point.
# A bare `df.drop(...)` returns a new frame and leaves `df` untouched, so the
# call that used to sit here never removed anything. The raw columns are
# dropped together with the other raw columns in the later cleanup cell.

# Rename column "A Stage" to "M Stage"
df = df.rename(columns={"A Stage": "M Stage"})

In [8]:
# Preview the frame after adding the ratio column and renaming "A Stage".
df.head()

Unnamed: 0,Age,T Stage,N Stage,Grade,M Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Node Positive Ratio
0,68,T1,N1,3,Regional,4,Positive,Positive,24,1,60,0.041667
1,50,T2,N2,2,Regional,35,Positive,Positive,14,5,62,0.357143
2,58,T3,N3,2,Regional,63,Positive,Positive,14,7,75,0.5
3,58,T1,N1,3,Regional,18,Positive,Positive,2,1,84,0.5
4,47,T2,N1,3,Regional,41,Positive,Positive,3,1,50,0.333333


In [9]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# The stage orderings are spelled out explicitly. Deriving them from
# `Series.unique()` yields first-appearance order, which only matches the
# intended ranking if the rows happen to introduce the stages in order.
# With explicit categories, an unexpected stage value raises during fit
# instead of silently receiving an arbitrary rank.

# 0 -> T1, 1 -> T2, 2 -> T3, 3 -> T4
# NOTE: the column name "T Stage " carries a trailing space.
t_stage = ["T1", "T2", "T3", "T4"]
t_stage_ordinal_encoder = OrdinalEncoder(categories=[t_stage])
df["T Stage Encoded"] = t_stage_ordinal_encoder.fit_transform(df[["T Stage "]]).ravel()

# 0 -> Distant, 1 -> Regional
df["M Stage Encoded"] = LabelEncoder().fit_transform(df["M Stage"])

# 0 -> N1, 1 -> N2, 2 -> N3
n_stage = ["N1", "N2", "N3"]
n_stage_ordinal_encoder = OrdinalEncoder(categories=[n_stage])
df["N Stage Encoded"] = n_stage_ordinal_encoder.fit_transform(df[["N Stage"]]).ravel()

# 0 -> Negative, 1 -> Positive
df["Estrogen Status Encoded"] = LabelEncoder().fit_transform(df["Estrogen Status"])
df["Progesterone Status Encoded"] = LabelEncoder().fit_transform(
    df["Progesterone Status"]
)

# Grade value is string, and it has " anaplastic; Grade IV" in 1 row for some reason
grade_map = {"1": 1, "2": 2, "3": 3, " anaplastic; Grade IV": 4}
grade_encoded = df["Grade"].map(grade_map)
# `.map` turns any value missing from the dictionary into NaN; fail loudly
# instead of writing NaNs into the cleaned dataset (this also catches an
# accidental second run of this cell on already-mapped integers).
if grade_encoded.isna().any():
    unmapped = df.loc[grade_encoded.isna(), "Grade"].unique().tolist()
    raise ValueError(f"Unmapped Grade values: {unmapped}")
df["Grade"] = grade_encoded

In [10]:
# Binary target: 1 when "Survival Months" exceeds six years (72 months), else 0.
months_threshold = 6 * 12
exceeds_threshold = df["Survival Months"].gt(months_threshold)
df["Survival More Than 6 Years"] = exceeds_threshold.astype(int)

# Class balance of the new target.
df["Survival More Than 6 Years"].value_counts()

Survival More Than 6 Years
1    2033
0    1991
Name: count, dtype: int64

In [11]:
# Raw columns that now have an encoded or derived counterpart.
raw_columns = [
    "T Stage ",
    "N Stage",
    "M Stage",
    "Estrogen Status",
    "Progesterone Status",
    "Regional Node Examined",
    "Reginol Node Positive",
]
df = df.drop(raw_columns, axis="columns")

# Write the cleaned frame to disk without the index column.
df.to_csv(
    "Cleaned_Breast_Cancer.csv",
    sep=",",
    encoding="utf-8",
    index=False,
    header=True,
)

# Show the final dtypes, then the frame itself as the cell output.
print(df.dtypes)
df

Age                              int64
Grade                            int64
Tumor Size                       int64
Survival Months                  int64
Node Positive Ratio            float64
T Stage Encoded                float64
M Stage Encoded                  int64
N Stage Encoded                float64
Estrogen Status Encoded          int64
Progesterone Status Encoded      int64
Survival More Than 6 Years       int64
dtype: object


Unnamed: 0,Age,Grade,Tumor Size,Survival Months,Node Positive Ratio,T Stage Encoded,M Stage Encoded,N Stage Encoded,Estrogen Status Encoded,Progesterone Status Encoded,Survival More Than 6 Years
0,68,3,4,60,0.041667,0.0,1,0.0,1,1,0
1,50,2,35,62,0.357143,1.0,1,1.0,1,1,0
2,58,2,63,75,0.500000,2.0,1,2.0,1,1,1
3,58,3,18,84,0.500000,0.0,1,0.0,1,1,1
4,47,3,41,50,0.333333,1.0,1,0.0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4019,62,2,9,49,1.000000,0.0,1,0.0,1,1,0
4020,56,2,46,69,0.571429,1.0,1,1.0,1,1,0
4021,68,2,22,69,0.272727,1.0,1,0.0,1,0,0
4022,58,2,44,72,0.090909,1.0,1,0.0,1,1,0


## Classification

In [12]:
# Importing stuff
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
# Install a global "ignore" filter: no warning of any category is shown from here on.
# NOTE(review): this also hides pandas/scikit-learn deprecation and assignment
# warnings raised by later cells — confirm the blanket suppression is intended.
warnings.filterwarnings('ignore')

In [14]:
# Reload the raw CSV, then separate the object-typed columns from all others.
cancer = pd.read_csv('./Breast_Cancer.csv')

ob, non_ob = (
    cancer.select_dtypes(include='object'),
    cancer.select_dtypes(exclude='object'),
)

# Preview the object-typed part.
ob.head()

Unnamed: 0,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Estrogen Status,Progesterone Status,Status
0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,Positive,Positive,Alive
1,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,Positive,Positive,Alive
2,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,Positive,Positive,Alive
3,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,Positive,Positive,Alive
4,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,Positive,Positive,Alive


In [16]:
# One encoder instance is re-fitted per column; after the loop it holds the
# fit for the last column only.
la = LabelEncoder()

for column_position in range(ob.shape[1]):
    current_column = ob.iloc[:, column_position]
    ob.iloc[:, column_position] = la.fit_transform(current_column)

# Cast the whole frame to integers once every column has been encoded.
ob = ob.astype('int')
ob.head()

Unnamed: 0,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Estrogen Status,Progesterone Status,Status
0,2,1,0,0,0,1,3,1,1,1,0
1,2,1,1,1,2,0,2,1,1,1,0
2,2,0,2,2,4,0,2,1,1,1,0
3,2,1,0,0,0,1,3,1,1,1,0
4,2,1,1,0,1,1,3,1,1,1,0


In [18]:
# Put the encoded object columns and the remaining columns back side by side.
cancer = pd.concat(objs=[ob, non_ob], axis="columns")
cancer

Unnamed: 0,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Estrogen Status,Progesterone Status,Status,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months
0,2,1,0,0,0,1,3,1,1,1,0,68,4,24,1,60
1,2,1,1,1,2,0,2,1,1,1,0,50,35,14,5,62
2,2,0,2,2,4,0,2,1,1,1,0,58,63,14,7,75
3,2,1,0,0,0,1,3,1,1,1,0,58,18,2,1,84
4,2,1,1,0,1,1,3,1,1,1,0,47,41,3,1,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,1,1,0,0,0,0,2,1,1,1,0,62,9,1,1,49
4020,2,0,1,1,2,0,2,1,1,1,0,56,46,14,8,69
4021,2,1,1,0,1,0,2,1,1,0,0,68,22,11,3,69
4022,0,0,1,0,1,0,2,1,1,1,0,58,44,11,1,72


In [None]:
# Give the label-encoded stage columns explicit "... Encoded" names.
# NOTE: the key "T Stage " keeps its trailing space, matching the spelling
# used for that column elsewhere in this notebook.
encoded_column_names = {
    "T Stage ": "T Stage Encoded",
    # Fixed typo: this was "N Stage Encoderd", which produced a misspelled
    # column in the exported CSV.
    "N Stage": "N Stage Encoded",
    "A Stage": "M Stage Encoded",
    "6th Stage": "6th Stage Encoded",
}
cancer = cancer.rename(columns=encoded_column_names)
cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Race                    4024 non-null   int64
 1   Marital Status          4024 non-null   int64
 2   T Stage Encoded         4024 non-null   int64
 3   N Stage Encoded         4024 non-null   int64
 4   6th Stage Encoded       4024 non-null   int64
 5   differentiate           4024 non-null   int64
 6   Grade                   4024 non-null   int64
 7   M Stage Encoded         4024 non-null   int64
 8   Estrogen Status         4024 non-null   int64
 9   Progesterone Status     4024 non-null   int64
 10  Status                  4024 non-null   int64
 11  Age                     4024 non-null   int64
 12  Tumor Size              4024 non-null   int64
 13  Regional Node Examined  4024 non-null   int64
 14  Reginol Node Positive   4024 non-null   int64
 15  Survival Months      

In [23]:
# Write the label-encoded frame to disk without the index column.
cancer.to_csv(
    path_or_buf="Label_Encoded_Breast_Cancer.csv",
    sep=",",
    encoding="utf-8",
    index=False,
    header=True,
)