Table of contents:
- What is ROC
- What is AUC
- What does it mean to be balanced? -> exactly 50/50 or what?
- xaxis in ROC when dataset is:
    - imbalanced : Precision as x-axis
    - balanced: FPR as x-axis
    - explain why a certain x-axis is preferred

Previously, in <a href="https://medium.com/analytics-vidhya/classification-performance-metric-with-python-sklearn-d8342ac25898">Classification Performance Metric with python Sklearn</a>, we covered various classification performance metrics, including the ROC curve and AUC; however, they were only briefly mentioned.

Readers are assumed to understand the confusion matrix, precision, recall, TPR, and FPR. If you don't, it is recommended that you read the previous blog post first.

We will dive deeper into the ROC curve to understand its pros and cons, what AUC is, and when ROC should be replaced with the PR curve.

We will use the same dataset as before: the breast cancer dataset from sklearn.

For my own convenience, I've labelled malignant as 1 and benign as 0, which is the opposite of the labelling used in the previous blog post.

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


import plotly.graph_objects as go
import numpy as np
import pandas as pd

# Load the breast-cancer data into a DataFrame; the label is stored in the last column.
bc = load_breast_cancer()
df = pd.DataFrame(data=bc.data, columns=bc.feature_names)
df["target"] = bc.target
# Flip the labels so that malignant becomes the positive class (1) and benign 0.
df["target"] = df["target"].map({0:1, 1:0})

print("memory size before ", df.memory_usage(deep=True).sum())
# Standardise every feature column (everything except the trailing "target").
df.iloc[:, :-1] = StandardScaler().fit_transform(df.iloc[:, :-1])
# Downcast with DataFrame.astype: assigning through df.iloc[...] writes the values
# into the existing columns and (in recent pandas versions) keeps their original
# dtype, so the memory footprint would not actually shrink.
dtype_map = {col: np.float16 for col in df.columns[:-1]}
dtype_map[df.columns[-1]] = np.int8
df = df.astype(dtype_map)
print("memory size after ", df.memory_usage(deep=True).sum())

In [None]:
class ConfusionMatrix:
    """
    Count the four cells of a binary confusion matrix:
    true positive, false positive, false negative, and true negative.

    Labels are compared against 1 (positive) and 0 (negative); a pair whose
    predicted or true label is neither 0 nor 1 is not counted in any cell.
    The two inputs are walked in lockstep, so extra elements of the longer
    one are ignored.

    The rate helpers (tpr/recall, fpr, precision) all return 0 when their
    denominator is zero instead of raising.

    Parameters
    ----------
    true_y : iterable
             each element is true label
    pred_y : iterable
             each element is predicted label

    Attributes
    ----------
    tp, fp, fn, tn : int
             counts of the respective confusion-matrix cells
    """
    def __init__(self, true_y, pred_y):
        self.tp = 0
        self.fp = 0
        self.fn = 0
        self.tn = 0

        for tr_y, pr_y in zip(true_y, pred_y):
            # Predicted positive
            if pr_y == 1:
                if tr_y == 1:
                    self.tp += 1
                elif tr_y == 0:
                    self.fp += 1
            # Predicted negative
            elif pr_y == 0:
                if tr_y == 1:
                    self.fn += 1
                elif tr_y == 0:
                    self.tn += 1

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0 when the denominator is 0."""
        if denominator == 0:
            return 0
        return numerator / denominator

    def calc_tpr(self):
        """Calculate tpr a.k.a. recall: tp / (tp + fn); 0 if there are no actual positives."""
        return self._ratio(self.tp, self.tp + self.fn)

    def calc_fpr(self):
        """Calculate fpr: fp / (fp + tn); 0 if there are no actual negatives."""
        return self._ratio(self.fp, self.fp + self.tn)

    def calc_precision(self):
        """Calculate precision: tp / (tp + fp); 0 if nothing was predicted positive."""
        return self._ratio(self.tp, self.tp + self.fp)

# Comparing df with different fraction of class 1
- df with different levels of class imbalance

In [None]:
# Fractions of the class-1 (malignant) rows to keep; every class-0 row is always kept.
class1_frac = [0.1, 0.3, 0.5, 0.7, 1]
# Maps each fraction to a DataFrame of test-set probabilities plus the true labels.
results_dfs = dict()
# Fixed seed so the sub-sampling and the train/test split are reproducible between runs.
RANDOM_STATE = 42

for frac in class1_frac:
    malignant_subset_df = df.loc[df["target"]==1].sample(frac=frac, random_state=RANDOM_STATE)
    new_df = pd.concat([malignant_subset_df, df.loc[df["target"] == 0]])

    y = new_df["target"].values
    X = new_df.drop(columns=["target"]).values
    # Stratify so the test split keeps the class ratio this fraction is meant to
    # produce; with few positives an unstratified split can drift far from it.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.35, stratify=y, random_state=RANDOM_STATE
    )
    log_reg = LogisticRegression().fit(X_train, y_train)
    probs = log_reg.predict_proba(X_test)

    # Column "0" / "1" hold the predicted probability of the respective class.
    result_df = pd.DataFrame(probs, columns=["0", "1"])
    result_df["y_test"] = y_test

    results_dfs[frac] = result_df

For each result_df, we will see how a __larger number of true negatives lowers the FPR__, making the model look as if it is performing well when it is not.
This shows why ROC is not a good performance metric when the classes are imbalanced.

In [None]:
# Evaluate the model trained with 10% of the class-1 rows at a 0.5 probability cut-off.
result_df = results_dfs[0.1]
y_pred = np.where(result_df["1"] > 0.5, 1, 0)
result_df["y_pred"] = y_pred
# Score against this split's own labels. The global y_test is whatever the last
# training-loop iteration left behind and belongs to a different split.
cm_10 = ConfusionMatrix(result_df["y_test"], y_pred)

# value_counts orders by frequency, so index by label to guarantee class 0, class 1 order.
class_share = result_df["y_test"].value_counts(normalize=True).reindex([0, 1], fill_value=0.0)
class0_r, class1_r = class_share.round(2).tolist()
print(f"class0:class_1 ratio = {class0_r}:{class1_r}" )
print("% of correctly classifying class 0 = ", round(cm_10.tn / (cm_10.tn + cm_10.fp), 3))
print("% of correctly classifying class 1 = ", round(cm_10.tp / (cm_10.tp + cm_10.fn), 3))

In [None]:
# Evaluate the model trained with 30% of the class-1 rows at a 0.5 probability cut-off.
result_df = results_dfs[0.3]
y_pred = np.where(result_df["1"] > 0.5, 1, 0)
result_df["y_pred"] = y_pred
# Score against this split's own labels. The global y_test is whatever the last
# training-loop iteration left behind and belongs to a different split.
cm_30 = ConfusionMatrix(result_df["y_test"], y_pred)

# value_counts orders by frequency, so index by label to guarantee class 0, class 1 order.
class_share = result_df["y_test"].value_counts(normalize=True).reindex([0, 1], fill_value=0.0)
class0_r, class1_r = class_share.round(2).tolist()
print(f"class0:class_1 ratio = {class0_r}:{class1_r}" )
print("% of correctly classifying class 0 = ", round(cm_30.tn / (cm_30.tn + cm_30.fp), 3))
print("% of correctly classifying class 1 = ", round(cm_30.tp / (cm_30.tp + cm_30.fn), 3))

In [None]:
# Evaluate the model trained with 70% of the class-1 rows at a 0.5 probability cut-off.
result_df = results_dfs[0.7]
y_pred = np.where(result_df["1"] > 0.5, 1, 0)
result_df["y_pred"] = y_pred
# Score against this split's own labels. The global y_test is whatever the last
# training-loop iteration left behind and belongs to a different split.
cm_70 = ConfusionMatrix(result_df["y_test"], y_pred)

# value_counts orders by frequency, so index by label to guarantee class 0, class 1 order.
class_share = result_df["y_test"].value_counts(normalize=True).reindex([0, 1], fill_value=0.0)
class0_r, class1_r = class_share.round(2).tolist()
print(f"class0:class_1 ratio = {class0_r}:{class1_r}" )
print("% of correctly classifying class 0 = ", round(cm_70.tn / (cm_70.tn + cm_70.fp), 3))
print("% of correctly classifying class 1 = ", round(cm_70.tp / (cm_70.tp + cm_70.fn), 3))

In [None]:
# ROC using custom tpr, fpr calculator
# Start an empty ROC figure that already carries the chance diagonal.
fig = go.Figure()

# Six evenly spaced points from (0, 0) to (1, 1) form the dashed reference line.
chance_axis = np.linspace(0, 1, 6)
random_guess_trace = go.Scatter(
    name="random guessing",
    x=chance_axis,
    y=np.linspace(0, 1, 6),
    line={"dash": "dash"},
)
fig.add_trace(random_guess_trace)

# Titles for the plot and both axes.
roc_layout = {
    "title": "ROC",
    "xaxis_title": "False Positive Rate",
    "yaxis_title": "True Positive Rate",
}
fig.update_layout(**roc_layout)

def add_to_roc_plot(fig, frac, thresholds, tpr, fpr, *, show_thresholds=False):
    """
    Append one ROC curve (fpr on x, tpr on y) to an existing figure.

    Parameters
    ----------
    fig : plotly figure
          figure the new trace is added to (modified in place)
    frac : float
          fraction of class-1 rows used for this curve; only used in the trace name
    thresholds : array-like
          cut-off used for each (fpr, tpr) point; only read when
          ``show_thresholds`` is True
    tpr : list
          true positive rate per threshold
    fpr : list
          false positive rate per threshold
    show_thresholds : bool, default False
          when True, print each point's threshold (rounded to 2 decimals)
          next to its marker
    """
    trace_kwargs = dict(
        name=f"frac={frac} - Logistic regression",
        x=fpr,
        y=tpr,
        # No text is supplied by default, so only lines and markers are requested.
        mode='lines+markers',
    )
    if show_thresholds:
        trace_kwargs.update(
            mode='lines+markers+text',
            text=np.round(thresholds, 2),
            textposition='top right',
            textfont=dict(
                family="sans serif",
                size=18,
                color="LightSeaGreen"
            ),
        )
    fig.add_trace(go.Scatter(**trace_kwargs))

In [None]:
# Ten evenly spaced probability cut-offs between 0 and 1 (inclusive).
thresholds = np.linspace(0, 1, 10)

# For every class-1 fraction, sweep the cut-offs and add the resulting ROC curve.
# NOTE: tpr, fpr, precision_list and rdf keep the values of the last iteration
# after the loop finishes; later cells read them.
for frac, rdf in results_dfs.items():
    y_test = rdf["y_test"]
    positive_prob = rdf["1"]

    # One confusion matrix per cut-off: predict 1 when P(class 1) exceeds it.
    matrices = [
        ConfusionMatrix(y_test, np.where(positive_prob > cutoff, 1, 0))
        for cutoff in thresholds
    ]

    tpr = [matrix.calc_tpr() for matrix in matrices]
    fpr = [matrix.calc_fpr() for matrix in matrices]
    precision_list = [matrix.calc_precision() for matrix in matrices]

    add_to_roc_plot(fig, frac, thresholds, tpr, fpr)

In [None]:
# Display the ROC figure populated by the preceding cells.
fig.show()

# Single df

In [None]:
# ROC for a single result set, drawn from the tpr / fpr lists that the
# threshold sweep left behind, with each point labelled by its threshold.
threshold_font = {
    "family": "sans serif",
    "size": 18,
    "color": "LightSeaGreen",
}

model_trace = go.Scatter(
    name="Logistic regression",
    x=fpr,
    y=tpr,
    mode='lines+markers+text',
    text=np.round(thresholds, 2),
    textposition='top right',
    textfont=threshold_font,
)

# Dashed diagonal from (0, 0) to (1, 1) as the chance reference.
baseline_trace = go.Scatter(
    name="random guessing",
    x=np.linspace(0, 1, 6),
    y=np.linspace(0, 1, 6),
    line={"dash": 'dash'},
)

fig = go.Figure()
for trace in (model_trace, baseline_trace):
    fig.add_trace(trace)

fig.update_layout(
    title="ROC",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
)

fig.show()

In [None]:
# Custom PR Curve
# tpr (recall), precision_list and rdf are the values the threshold sweep left
# behind on its last iteration, so they all describe the same result set.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        name = "Logistic regression",
        x=tpr,
        y=precision_list,
        mode='lines+markers+text',
        text=np.round(thresholds, 2),
        textposition='top right',
        textfont = dict(
            family="sans serif",
            size=18,
            color="LightSeaGreen"
        )
    )
)

# A no-skill classifier's precision equals the share of positives at every
# recall level, so the PR baseline is a horizontal line at that prevalence
# (the y = x diagonal is the ROC baseline, not the PR one).
positive_share = float(rdf["y_test"].mean())
fig.add_trace(
    go.Scatter(
        name = "random guessing",
        x = [0, 1],
        y = [positive_share, positive_share],
        line = dict(dash='dash')
    )
)

fig.update_layout(
    title="Precision-Recall Curve",
    xaxis_title="Recall",
    yaxis_title="Precision"
)

fig.show()