# Breast Cancer Detector

In [None]:
# import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# read the raw data set from the CSV file in the working directory

df = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# lift the column display limit so no column is elided, then preview the first 7 rows

pd.options.display.max_columns = None
df.head(n=7)

In [None]:
# count the number of rows and columns (shape is a (rows, columns) tuple)

df.shape

In [None]:
# number of missing (NaN) values in each column

df.isnull().sum()

In [None]:
# drop every column that contains at least one missing value
# (dropna defaults to how='any', so a single NaN is enough to remove a column)

df = df.dropna(axis='columns')

In [None]:
# how many samples are Malignant (M) and how many are Benign (B)

df.diagnosis.value_counts()

In [None]:
# bar chart with one bar per diagnosis class

sns.countplot(data=df, x='diagnosis', label='count')

In [None]:
# look at the data type of every column to see which (non-numeric) columns need to be encoded

df.dtypes

In [None]:
# encode the categorical data values
# (LabelEncoder numbers the sorted class labels, so with 'B'/'M' labels: B -> 0, M -> 1)

from sklearn.preprocessing import LabelEncoder

labelencoder_y = LabelEncoder()

# Replace the whole column by label instead of assigning through .iloc:
# on recent pandas versions `df.iloc[:, 1] = ...` writes into the existing
# object-dtype column in place, so the encoded integers stay stored as
# `object` and scikit-learn later rejects them as an unknown label type.
# Assigning by column label swaps in a new integer column.
df[df.columns[1]] = labelencoder_y.fit_transform(df.iloc[:, 1].values)

In [None]:
# create a pair plot of columns 1-5 (the diagnosis plus the next four columns),
# colored by diagnosis

sns.pairplot(data=df.iloc[:, 1:6], hue='diagnosis')

In [None]:
# preview the first 5 rows of the updated data (5 is the default for head)

df.head()

In [None]:
# pairwise Pearson correlation between columns 1-11

df.iloc[:, 1:12].corr(method='pearson')

In [None]:
# draw the correlation matrix of columns 1-11 as a heatmap,
# annotating each cell with its value formatted as a whole percentage
plt.figure(figsize=(11, 9))

sns.heatmap(data=df.iloc[:, 1:12].corr(), annot=True, fmt='.0%')

In [None]:
# split the data set into independent (X) and dependent (Y) data sets

# Use an open-ended slice for the features: the previous hard-coded stop
# index (2:31) silently left out any column at position 31 or later.
# NOTE(review): assumes every column after the diagnosis column is a feature
# — confirm against the columns of data.csv.
X = df.iloc[:, 2:].values

# column 1 holds the diagnosis label
Y = df.iloc[:, 1].values

In [None]:
# hold out 25% of the samples for testing and train on the remaining 75%;
# the fixed random_state makes the split reproducible

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# scale the data (feature scaling)

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# learn the per-feature mean and standard deviation from the training data only
X_train = sc.fit_transform(X_train)

# apply the *training* statistics to the test data; re-fitting the scaler on
# the test set would put the two sets on different scales and leak test-set
# information into the evaluation
X_test = sc.transform(X_test)

In [None]:
# create a function for the models

def models(X_train, Y_train):
    """Fit three classifiers on the training data and print their training accuracy.

    Args:
        X_train: training feature matrix passed to each estimator's ``fit``.
        Y_train: training labels passed to each estimator's ``fit``.

    Returns:
        A ``(log, tree, forest)`` tuple holding the fitted logistic regression,
        decision tree and random forest, in that order.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    # (report label, estimator) pairs, listed in the order they are returned
    candidates = [
        ('[0] Logistic Regression Training Accuracy:\t',
         LogisticRegression(random_state=0)),
        ('[1] Decision Tree Classifier Training Accuracy:\t',
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        ('[2] Random Forest Classifier Training Accuracy:\t',
         RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    ]

    # fit every estimator first ...
    for _, estimator in candidates:
        estimator.fit(X_train, Y_train)

    # ... then report the accuracy each one reaches on the data it was trained on
    for label, estimator in candidates:
        print(label, estimator.score(X_train, Y_train))

    log, tree, forest = (estimator for _, estimator in candidates)
    return log, tree, forest

In [None]:
import numpy as np

# the label arrays must be plain integer arrays, otherwise fitting throws an error
# (presumably they come out of the DataFrame with a non-integer dtype — TODO confirm)
Y_train, Y_test = (np.array(labels, dtype=int) for labels in (Y_train, Y_test))

# fit all three classifiers; `model` is the (log, tree, forest) tuple
model = models(X_train, Y_train)

In [None]:
# test model accuracy on the test data using the confusion matrix
from sklearn.metrics import confusion_matrix

model_names = { 0: "Logistic Regression", 1: "Decision Tree Classifier", 2: "Random Forest Classifier" }

print("--------------------------------------")

for i, clf in enumerate(model):
    print("Model:", model_names[i])
    cm = confusion_matrix(Y_test, clf.predict(X_test))

    # scikit-learn puts the true class on the rows and the predicted class on
    # the columns, with labels in sorted order, so treating label 1 as the
    # positive class the layout is [[TN, FP],
    #                               [FN, TP]]
    # NOTE(review): assumes label 1 is the malignant (positive) class — confirm
    # against the label encoding.
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TP = cm[1][1]

    print(cm)
    print('Testing Accuracy =',(TP + TN)/(TP + TN + FP + FN))
    print("--------------------------------------")

In [None]:
# let scikit-learn compute the per-class metrics and the accuracy for each model
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

print("-----------------------------------------------------")

for i, clf in enumerate(model):
    print("Model:", model_names[i], "\n")

    # predict once and reuse the result for both metrics
    predictions = clf.predict(X_test)

    print(classification_report(Y_test, predictions))
    print("Accuracy:", accuracy_score(Y_test, predictions))

    print("-----------------------------------------------------")

# Model Selection Justification

After evaluating three different models - Logistic Regression, Decision Tree Classifier, and Random Forest Classifier - on the breast cancer classification task, the **Random Forest Classifier** was identified as the most suitable model for the following reasons:

## 1. Performance Metrics

- **Accuracy**: The Random Forest model achieved the highest accuracy (96.5%), indicating its superior ability to correctly classify both cancerous and non-cancerous cases compared to the other models tested.
- **Precision and Recall**: This model demonstrated very high precision and recall across both classes. Specifically, it showed a high ability to minimize false positives (precision) and false negatives (recall), which are crucial in medical diagnosis contexts. High recall is especially important for cancer detection to ensure that as few cases as possible are missed.
- **F1-Score**: The balanced F1-scores across classes suggest that the model maintains a balanced performance between precision and recall, which is essential for the sensitive nature of cancer classification tasks.

## 2. Importance of Recall in Medical Diagnostics

Given the critical importance of minimizing false negatives (missing a cancer diagnosis) in medical diagnostics, a model's recall is highly valued. The Random Forest Classifier demonstrated excellent recall, making it a prudent choice for applications where missing a positive case could have significant consequences.

## 3. Model Complexity vs. Interpretability

While the Random Forest Classifier is more complex and slightly less interpretable than simpler models like Logistic Regression, its superior performance metrics justify its selection. However, it's essential to balance model complexity with the need for interpretability in a clinical setting. Advanced techniques and tools can be employed to interpret Random Forest models, making them more understandable to healthcare professionals.

## 4. Conclusion

Considering the high stakes involved in cancer diagnosis, the Random Forest Classifier's superior ability to accurately identify cancerous cases, coupled with its robust performance across various metrics, makes it the best choice among the evaluated models. Future work could involve exploring model interpretability improvements to enhance clinical usability and patient outcomes further.


In [None]:
# persist the preferred model - the random forest, the third entry of `model` - for external use
# NOTE: the forest was fitted on features standardized with `sc`, so external callers
# have to apply the same scaling before calling predict

import joblib

joblib.dump(value=model[2], filename='random_forest_classifier.joblib')