In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import pickle as pkl

# Load the dataset from a CSV file resolved relative to the current working directory.
df = pd.read_csv('breast_cancer_self_check_data.csv')

# Marker that the imports and the CSV load completed without raising.
print("done ...")

: 

In [None]:
# Descriptive statistics of the loaded frame, shown as the cell output.
df.describe()

In [None]:
# Column dtypes and non-null counts; useful for spotting missing values before cleaning.
df.info()

# DATA CLEANING & FEATURE ENGINEERING

In [None]:
# Replace the raw `physical_activity` column with integer codes stored under
# `PhysicalActivity`. The fitted encoder is kept in `encoder` so the same
# mapping can be reused later.
encoder = LabelEncoder()

df = (
    df
    .assign(PhysicalActivity=encoder.fit_transform(df['physical_activity']))
    .drop(columns=['physical_activity'])
)

df.head()

In [None]:
# Whole years elapsed between today and each row's `last_mammogram` date.
# Rows without a date come out as NaN here and receive the sentinel below.
df['mammogram_recency'] = (
    (pd.to_datetime('today') - pd.to_datetime(df['last_mammogram']))
    .dt.days / 365.25   # convert days to years
)
df['mammogram_recency'] = np.trunc(df['mammogram_recency'])  # Truncate decimals

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on a temporary object, which pandas
# warns about and, with Copy-on-Write enabled, leaves `df` unchanged.
df['mammogram_recency'] = df['mammogram_recency'].fillna(-1)  # -1 for no history

# The raw date is no longer needed once the recency feature exists.
df = df.drop(columns=['last_mammogram'])

df.head(20)

In [None]:
# Target: the diagnosis label. Features: every remaining column.
y = df['diagnosis']
x = df.drop(columns=['diagnosis'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Displayed as the cell output to check the size of the training split.
X_train.shape

In [None]:
# Size of the held-out split, for comparison with the training split.
X_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits so the test rows do not influence it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# DATA VISUALIZATION

In [None]:
# Positive cases (diagnosis == 1) counted per age, ordered by age.
age_counts = df.loc[df['diagnosis'] == 1, 'age'].value_counts().sort_index()
plt.bar(x=age_counts.index, height=age_counts.values, color='red')
plt.title('Count of Positive cases by Age')
plt.xlabel('age')
plt.ylabel('count of diagnosis==1')
plt.show()

From the figure we can tell that ages from 20 to around 47 are the least affected, while older ages are the most affected.

In [None]:
# Normal / not affected cases (diagnosis == 0) counted per age, ordered by age.
age_counts = df.loc[df['diagnosis'] == 0, 'age'].value_counts().sort_index()
plt.bar(x=age_counts.index, height=age_counts.values, color='green')
plt.title('Count of Normal/not affected by Age')
plt.xlabel('age')
plt.ylabel('count of diagnosis==0')
plt.show()

From the figure we can tell that most normal (unaffected) cases fall between ages 20 and 60, and the count keeps decreasing as age increases.

In [None]:
# At-risk cases (diagnosis == 2) counted per age, ordered by age.
age_counts = df.loc[df['diagnosis'] == 2, 'age'].value_counts().sort_index()
plt.bar(x=age_counts.index, height=age_counts.values, color='blue')
plt.title('Count of At Risk by Age')
plt.xlabel('age')
plt.ylabel('count of diagnosis==2')
plt.show()

We can tell that ages below 40 are the least at risk, but the number of at-risk cases increases as age increases.

# Modeling

In [None]:
# Initialise one instance of every candidate classifier.
#
# The decision tree is seeded like the random forest: without a random_state
# its fitted splits, and therefore its reported accuracy, can change from one
# run of the notebook to the next.
decision_tree = DecisionTreeClassifier(random_state=21)
random_forest = RandomForestClassifier(random_state=21)
svc = SVC()
neighbors = KNeighborsClassifier()
naive = GaussianNB()
logistic = LogisticRegression()

In [None]:
# Train every candidate on the same training split, in a fixed order.
for estimator in (decision_tree, random_forest, svc, neighbors, naive, logistic):
    estimator.fit(X_train, Y_train)

# Keep the fitted logistic model as the cell's displayed value.
logistic

In [None]:
def runtest(model_path="self_check_breast_cancer_model.pkl"):
    """Score the fitted classifiers on the test split, chart them, and save the SVC.

    Reads the notebook-level estimators (``decision_tree``, ``svc``,
    ``random_forest``, ``neighbors``, ``naive``, ``logistic``) together with
    ``X_test`` and ``Y_test``. Prints one accuracy line per model, shows a bar
    chart comparing the accuracies, and pickles the fitted ``svc``.

    Parameters
    ----------
    model_path : str, optional
        File the pickled SVC is written to. Defaults to the file name this
        notebook has always used.

    Returns
    -------
    None
    """
    # (printed label, chart label, estimator), in reporting order. The two
    # labels are kept separate because the printed and plotted spellings of
    # the decision tree differ.
    evaluated = [
        ('Decision tree', 'Decision Tree', decision_tree),
        ('SVC', 'SVC', svc),
        ('Random Forest', 'Random Forest', random_forest),
        ('K Nearest', 'K Nearest', neighbors),
        ('Naive Bayes', 'Naive Bayes', naive),
        ('Logistic Regression', 'Logistic Regression', logistic),
    ]

    accuracies = []
    for printed_label, _, estimator in evaluated:
        # Score once and reuse the value for both the printout and the chart.
        acc = accuracy_score(Y_test, estimator.predict(X_test))
        print(f'{printed_label} : {acc}')
        accuracies.append(acc)

    # Bar chart of model accuracies
    models = [chart_label for _, chart_label, _ in evaluated]
    plt.figure(figsize=(8, 5))
    bars = plt.bar(models, accuracies, color=['orange', 'blue', 'green', 'purple', 'red', 'cyan'])
    plt.ylim(0, 1)
    plt.ylabel('Accuracy Score')
    plt.title('Model Accuracy Comparison')
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Write each accuracy just above its bar.
    for bar, acc in zip(bars, accuracies):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01, f'{acc:.3f}', ha='center', va='bottom')

    plt.show()

    # NOTE(review): only the SVC is persisted, whichever model scored best.
    # It was fit on X_train after the notebook's scaler transformed it, and
    # that scaler is not saved here. Confirm it is persisted elsewhere for
    # inference.
    with open(model_path, 'wb') as file:
        pkl.dump(svc, file)


runtest()