In [1196]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier


Read data and show first 5 rows

In [1197]:
df = pd.read_csv('./Breast_Cancer.csv')

df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


Data information

In [1198]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

Data descriptions

In [1199]:
# Categorical data
df.describe(include=['object'])

Unnamed: 0,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Estrogen Status,Progesterone Status,Status
count,4024,4024,4024,4024,4024,4024,4024,4024,4024,4024,4024
unique,3,5,4,3,5,4,4,2,2,2,2
top,White,Married,T2,N1,IIA,Moderately differentiated,2,Regional,Positive,Positive,Alive
freq,3413,2643,1786,2732,1305,2351,2351,3932,3755,3326,3408


In [1200]:
# Numerical data
df.describe()

Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months
count,4024.0,4024.0,4024.0,4024.0,4024.0
mean,53.972167,30.473658,14.357107,4.158052,71.297962
std,8.963134,21.119696,8.099675,5.109331,22.92143
min,30.0,1.0,1.0,1.0,1.0
25%,47.0,16.0,9.0,1.0,56.0
50%,54.0,25.0,14.0,2.0,73.0
75%,61.0,38.0,19.0,5.0,90.0
max,69.0,140.0,61.0,46.0,107.0


Convert and encode non numerical columns

In [1201]:
categorical_cols = list(df.select_dtypes(exclude=[np.number]).columns)
for cat in categorical_cols:
    # Label Encoding
    label_encoder = LabelEncoder()
    encode_label = cat + '_encoded'
    df[encode_label] = label_encoder.fit_transform(df[cat])
    df.drop(columns=[cat], inplace=True)


Normalize columns based on min-max

In [1202]:
column_names_list = df.columns.tolist()
features = df[column_names_list]
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)
data_frame = pd.DataFrame(scaled_features, columns=df.columns)

Split train and test data

In [1203]:
X_train, X_test, y_train, y_test = train_test_split(df.values[:,:-1], df.values[:,-1], test_size=0.2, random_state=42)

Decision Tree Classifier

In [1204]:
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    min_samples_leaf=60
)

# Train the Naive Bayes Classifier 
decision_tree.fit(X_train, y_train)

# Predict the labels for the test data
y_test_pred_dt = decision_tree.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_test_pred_dt))
print(accuracy_score(y_test, y_test_pred_dt))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95       685
           1       0.90      0.53      0.66       120

    accuracy                           0.92       805
   macro avg       0.91      0.76      0.81       805
weighted avg       0.92      0.92      0.91       805

0.9204968944099379


Naive Bayes Classifier

In [1205]:
naive_bayes = GaussianNB(var_smoothing=1e-1, priors=[0.74, 0.26])

# Train the Naive Bayes Classifier 
naive_bayes.fit(X_train, y_train)

# Predict the labels for the test data
y_test_pred_dt = naive_bayes.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_test_pred_dt))
print(accuracy_score(y_test, y_test_pred_dt))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       685
           1       0.70      0.50      0.58       120

    accuracy                           0.89       805
   macro avg       0.81      0.73      0.76       805
weighted avg       0.88      0.89      0.89       805

0.893167701863354


Voting Classifier

In [1206]:
voting_clf = VotingClassifier(estimators=[('nb', naive_bayes), ('dt', decision_tree)])

# Train the Voting Classifier
voting_clf.fit(X_train, y_train)

# Predict the labels for the test data
y_pred = voting_clf.predict(X_test)

# Evaluate the performance
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.9142857142857143
