In [None]:
# importing libraries, etc...

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# apply seaborn's default theme to all matplotlib figures drawn below
sns.set()

# base URL of the workshop data folder; dataset file names are appended to it
path = "https://raw.githubusercontent.com/LennardVaarten/ML-Workshops/main/data/"

Breast cancer is a cancer in which breast cells grow uncontrollably. The resulting cells form a tumor that can be malignant (dangerous) or benign (not dangerous). The goal of breast cancer diagnosis is to determine whether a tumor is malignant or benign.

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe the characteristics of the cell nuclei present in the image.

# **Loading And Viewing The Data**

In [None]:
# loading the dataset: read breast_cancer.csv from the workshop data folder
# (the `path` URL defined in the first cell) into a DataFrame

bc = pd.read_csv(path+"breast_cancer.csv")

In [None]:
# viewing the DataFrame; a bare last expression is rendered by the notebook

bc

In [None]:
# how often does each target label appear?
# counts the rows per distinct value of the "diagnosis_M" column
# (presumably 1 = malignant, 0 = benign — confirm against the data)

bc["diagnosis_M"].value_counts()

In [None]:
# scaling the features, so that each feature ranges from 0 to 1
# Column 0 is skipped here; the split cell below uses it as the target.
# NOTE(review): the scaler is fitted on ALL rows, i.e. before the train/test
# split, so the min/max of the future test rows influence the training
# features (data leakage). For a strict evaluation, split first and fit the
# scaler on the training rows only.

from sklearn.preprocessing import MinMaxScaler

# learn the per-column min and max of every feature column
scaler = MinMaxScaler().fit(bc.iloc[:,1:])

# overwrite the feature columns of `bc` in place with their scaled values
bc.iloc[:,1:] = scaler.transform(bc.iloc[:,1:])

In [None]:
# inspect the DataFrame again after the feature columns were rescaled
bc

In [None]:
# splitting the data into a training set and test set
# (column 0 is the target, every remaining column is a feature)

from sklearn.model_selection import train_test_split

features = bc.iloc[:, 1:]
target = bc.iloc[:, 0]

# fixed random_state so the split is reproducible across runs
(features_train, features_test,
 target_train, target_test) = train_test_split(features, target, random_state=99)

# **Overfitting and Underfitting**

In [None]:
# training k-NN with different values of k to show overfitting and underfitting
# (odd k from 1 to 39; small k tends to overfit, large k tends to underfit)

from sklearn.neighbors import KNeighborsClassifier

neighbors_settings = list(range(1, 41, 2))
neighbors, training_accuracy, test_accuracy = [], [], []

for k in neighbors_settings:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(features_train, target_train)
    neighbors.append(k)
    training_accuracy.append(knn.score(features_train, target_train))
    test_accuracy.append(knn.score(features_test, target_test))

# three rows (k, train, test) transposed into three columns
results = pd.DataFrame([neighbors, training_accuracy, test_accuracy]).T
results.columns = ["k", "training accuracy", "test accuracy"]

# long format: one row per (k, train/test) pair, as seaborn's `hue` expects
results = results.melt(id_vars='k', var_name="train_test", value_name="score")

sns.lineplot(data=results, x='k', y='score', hue='train_test')

# **Decision Tree Classifier**

In [None]:
# training the decision tree classifier (no depth limit, so it can grow
# until the training data is fit)

from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=99)
tree.fit(features_train, target_train)

print(f"Accuracy on training set: {tree.score(features_train, target_train):.3f}")
print(f"Accuracy on test set: {tree.score(features_test, target_test):.3f}")

In [None]:
# how does our decision tree classifier come to a decision?

from sklearn.tree import plot_tree

# The tree was fitted on features_train, so the names must come from those
# columns. Using bc.columns would include the target column at position 0
# and shift every node label by one feature.
fn = list(features_train.columns)
# class names in ascending order of the target values
# (assumes 0 = benign, 1 = malignant — confirm against the data)
cn = ["B", "M"]
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(17,17), dpi=150)
plot_tree(tree, feature_names=fn, class_names=cn, filled=True, fontsize=6, ax=axes);

## Pruning with max_depth

In [None]:
# pruning by limiting the maximum depth of the decision tree
# (at most 3 levels of splits below the root)

tree = DecisionTreeClassifier(max_depth=3, random_state=99)
tree.fit(features_train, target_train)

print(f"Accuracy on training set: {tree.score(features_train, target_train):.3f}")
print(f"Accuracy on test set: {tree.score(features_test, target_test):.3f}")

In [None]:
# plotting our pruned decision tree

# Feature names must match the columns the tree was trained on; bc.columns
# also contains the target column and would mislabel every split.
fn = list(features_train.columns)
# class names in ascending order of the target values
# (assumes 0 = benign, 1 = malignant — confirm against the data)
cn = ["B", "M"]
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(7,7), dpi=150)
plot_tree(tree, feature_names=fn, class_names=cn, filled=True, fontsize=6, ax=axes);

## Pruning with min_samples_split

In [None]:
# another way of pruning is by only allowing a split to be made when a node
# holds at least `min_samples_split` samples; here this is combined with a
# depth limit and a cap on the number of leaves

tree = DecisionTreeClassifier(
    max_depth=6,
    min_samples_split=5,
    max_leaf_nodes=10,
    random_state=99,
)
tree.fit(features_train, target_train)

print(f"Accuracy on training set: {tree.score(features_train, target_train):.3f}")
print(f"Accuracy on test set: {tree.score(features_test, target_test):.3f}")

In [None]:
# plotting the tree pruned with min_samples_split / max_leaf_nodes

# Feature names must match the columns the tree was trained on; bc.columns
# also contains the target column and would mislabel every split.
fn = list(features_train.columns)
# class names in ascending order of the target values
# (assumes 0 = benign, 1 = malignant — confirm against the data)
cn = ["B", "M"]
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(15,15), dpi=100)
plot_tree(tree, feature_names=fn, class_names=cn, filled=True, fontsize=6, ax=axes);

## Feature Importances

In [None]:
# looking at feature importances of the decision tree classifier

# feature_importances_ holds one value per training feature, in the column
# order of features_train. It must NOT be sliced with [1:]: that drops the
# first feature's importance and pairs the remaining values with wrong names.
ranked = sorted(zip(tree.feature_importances_, features_train.columns), reverse=True)
fn_sorted = [name for _, name in ranked]
fi_sorted = [importance for importance, _ in ranked]

# most important feature first
for name, importance in zip(fn_sorted, fi_sorted):
    print(f"{name:25} {importance:.3f}")

In [None]:
# sanity check: add up all feature importances of the current tree
# (scikit-learn normalizes them, so the total should be ~1.0)
print(sum(tree.feature_importances_))

# **Random Forest Classifier**

In [None]:
# training the random forest classifier: 1000 trees, each limited to depth 4

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=4,
    random_state=99,
)
rf.fit(features_train, target_train)

print(f"Accuracy on training set: {rf.score(features_train, target_train):.3f}")
print(f"Accuracy on test set: {rf.score(features_test, target_test):.3f}")

## Feature Importances

In [None]:
# looking at feature importances of random forest classifier

# pair every importance with its feature name (column 0 of `bc` is the
# target and is skipped), then rank from most to least important
ranked = sorted(zip(rf.feature_importances_, list(bc.columns)[1:]), reverse=True)
fi_sorted = [importance for importance, _ in ranked]
fn_sorted = [name for _, name in ranked]

for name, importance in zip(fn_sorted, fi_sorted):
    print(f"{name:25} {importance:.3f}")

# **Gradient Boosting Classifier**

In [None]:
# training a gradient boosting classifier; subsample=0.4 means every tree
# is fitted on a random 40% of the training rows
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=3,
    subsample=0.4,
    learning_rate=0.1,
    random_state=99,
)
gbc.fit(features_train, target_train)

print(f"Accuracy on training set: {gbc.score(features_train, target_train):.3f}")
print(f"Accuracy on test set: {gbc.score(features_test, target_test):.3f}")

# **Cross-Validation**

In [None]:
# k-fold cross-validation of a random forest on the training set
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# one constant drives both the CV call and the printed label, so the message
# can no longer disagree with the number of folds actually used
N_FOLDS = 10

# cross_val_score fits a fresh clone of the estimator on every fold, so
# fitting `rf` beforehand would only cost time without affecting the scores
rf = RandomForestClassifier(n_estimators=500, random_state=99)

rfScores = cross_val_score(rf, features_train, target_train, cv=N_FOLDS)
print(rfScores)
print(f"Random Forest mean {N_FOLDS}-fold Cross-Validation score: {np.mean(rfScores):.3f}")

In [None]:
# k-fold cross-validation of k-NN (k=5) on the training set

# one constant drives both the CV call and the printed label
N_FOLDS = 10

# No .fit() is needed here: cross_val_score fits a fresh clone per fold.
# The earlier refit of the unrelated gradient boosting model was removed as
# well; it had no effect on these scores.
knn = KNeighborsClassifier(n_neighbors=5)

knnScores = cross_val_score(knn, features_train, target_train, cv=N_FOLDS)
print(knnScores)
print(f"k-NN mean {N_FOLDS}-fold Cross-Validation score: {np.mean(knnScores):.3f}")

## Leave-One-Out Cross-Validation

In [None]:
# leave-one-out CV: every training sample is held out once as a validation
# set of size one
from sklearn.model_selection import LeaveOneOut

knn = KNeighborsClassifier(n_neighbors=5)
knnScores = cross_val_score(
    estimator=knn,
    X=features_train,
    y=target_train,
    cv=LeaveOneOut(),
)

print(f"k-NN mean Leave-One-Out Cross-Validation score: {np.mean(knnScores):.3f}")

In [None]:
# one score per held-out sample, so the count equals the training set size
print(f"Total models trained: {len(knnScores)}")
print("Score for each model:", knnScores, sep="\n")

# **Grid Search**

In [None]:
# exhaustive search over k and the neighbor weighting scheme for k-NN
from sklearn.model_selection import GridSearchCV

params = {
    "n_neighbors": list(range(1, 21, 2)),   # odd k from 1 to 19
    "weights": ["uniform", "distance"],
}

knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params)
knn.fit(features_train, target_train)

# best_score_ is the mean cross-validated score of the best parameter set;
# .score() evaluates the model refitted with those parameters
print(f"Best CV score on training set: {knn.best_score_:.3f}")
print(f"Score on test set: {knn.score(features_test, target_test):.3f}")

In [None]:
# Check out which model parameters performed best
# (`knn` is the fitted GridSearchCV object from the previous cell)

knn.best_params_

# **Evaluation Metrics**

## Confusion Matrix

In [None]:
# logistic regression; C=10 is weaker regularization than the default C=1
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=10, random_state=99)
logreg.fit(features_train, target_train)

print(f"Accuracy on training set: {logreg.score(features_train, target_train):.4f}")
print(f"Accuracy on test set: {logreg.score(features_test, target_test):.4f}")

In [None]:
# confusion matrix of the logistic regression on the test set
# (rows = true labels, columns = predicted labels)
from sklearn.metrics import confusion_matrix

pred_logreg = logreg.predict(features_test)

confusion_matrix(y_true=target_test, y_pred=pred_logreg)

## Accuracy, Precision, Recall, F-Score

In [None]:
# summary metrics for the default-threshold predictions
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

for label, metric in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F-Score", f1_score),
):
    print(f"{label}: {metric(target_test, pred_logreg):.3f}")

### Manually Influencing Precision and Recall

In [None]:
# hard class predictions of the logistic regression for the test set
logreg.predict(features_test)

In [None]:
# predicted probability of the second class (column 1 of predict_proba),
# shown with three decimals
[f"{p:.3f}" for p in logreg.predict_proba(features_test)[:, 1]]

In [None]:
# lower the decision threshold from the default 0.5 to 0.25: more samples
# are flagged as positive, trading precision for recall
proba_positive = logreg.predict_proba(features_test)[:, 1]
pred_logreg_thresh = proba_positive > 0.25

confusion_matrix(y_true=target_test, y_pred=pred_logreg_thresh)

In [None]:
# same summary metrics, now for the predictions made with the 0.25 threshold
for label, metric in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F-Score", f1_score),
):
    print(f"{label}: {metric(target_test, pred_logreg_thresh):.3f}")

## Precision-Recall Curve

In [None]:
# precision-recall curve of the logistic regression on the test set
from sklearn.metrics import precision_recall_curve

plt.figure(figsize=(8,6))
precision, recall, thresholds = precision_recall_curve(target_test, logreg.decision_function(features_test))

# A decision_function value of 0 corresponds to a predicted probability of
# 0.5, so the threshold closest to 0 marks the default operating point.
close_zero = np.argmin(np.abs(thresholds))
plt.plot(precision[close_zero], recall[close_zero], 'o', markersize=10, label="threshold 0.5", fillstyle="none", c='k')
plt.plot(precision, recall, label="precision recall curve")
plt.xlabel("Precision")
plt.ylabel("Recall")
# without a legend the labels above are never shown and the marker is unexplained
plt.legend(loc="best");

In [None]:
# gradient boosting again, this time with subsample=1 (every tree sees all
# training rows), for the comparison with logistic regression below
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=3,
    subsample=1,
    learning_rate=0.1,
    random_state=99,
)
gbc.fit(features_train, target_train)

print(f"Accuracy on training set: {gbc.score(features_train, target_train):.3f}")
print(f"Accuracy on test set: {gbc.score(features_test, target_test):.3f}")

In [None]:
# plot precision-recall curves of LogReg vs GBC

plt.figure(figsize=(8,6))

# logistic regression: scored with decision_function, whose default cut-off is 0
precision, recall, thresholds = precision_recall_curve(
    target_test, logreg.decision_function(features_test)
)
close_zero = np.argmin(np.abs(thresholds))

# gradient boosting: scored with class-1 probabilities, whose default cut-off is 0.5
precision_gbc, recall_gbc, thresholds_gbc = precision_recall_curve(
    target_test, gbc.predict_proba(features_test)[:, 1]
)
close_default_gbc = np.argmin(np.abs(thresholds_gbc - 0.5))

plt.plot(precision, recall, label="logreg")
plt.plot(
    precision[close_zero], recall[close_zero], 'o',
    markersize=10, label="threshold 0.5 logreg", fillstyle="none", c='k',
)
plt.plot(precision_gbc, recall_gbc, label="gbc")
plt.plot(
    precision_gbc[close_default_gbc], recall_gbc[close_default_gbc], '^',
    c='k', markersize=10, label="threshold 0.5 gbc", fillstyle="none", mew=2,
)

plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="best")

## Area Under Precision-Recall Curve

In [None]:
# average precision summarizes the precision-recall curve in a single number
from sklearn.metrics import average_precision_score

# The original stored these as auc_rf / auc_logreg but printed the undefined
# names ap_rf / ap_logreg, which raised a NameError. The first score also
# belongs to the gradient boosting model, not a random forest.
ap_gbc = average_precision_score(target_test, gbc.predict_proba(features_test)[:, 1])
ap_logreg = average_precision_score(target_test, logreg.decision_function(features_test))

print("PRC AUC of Gradient Boosting Classifier: {:.3f}".format(ap_gbc))
print("PRC AUC of Logistic Regression: {:.3f}".format(ap_logreg))