Table of contents:
- What is ROC
- What is AUC
- What does it mean to be balanced? -> exactly 50/50 or what?
- xaxis in ROC when dataset is:
    - imbalanced : Precision as x-axis
    - balanced: FPR as x-axis
    - explain why a certain x-axis is preferred

Previously, in <a href="https://medium.com/analytics-vidhya/classification-performance-metric-with-python-sklearn-d8342ac25898">Classification Performance Metric with python Sklearn</a>, we covered various classification performance metrics, including the ROC curve and AUC; however, they were only briefly mentioned.

Readers are assumed to understand the confusion matrix, precision, recall, TPR, and FPR. If you don't, it is recommended to read the previous blog post first.

We will dive deeper into ROC to understand its pros and cons, AUC, and when it should be replaced with the PR curve.

We will use the same dataset as before: the breast cancer dataset from sklearn.

Just for my own sake, I've labelled malignant as 1 and benign as 0, which is the opposite of the labelling used in the previous blog post.

In [25]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


import plotly.graph_objects as go
import numpy as np
import pandas as pd

# Load the breast cancer dataset into a DataFrame, one column per feature.
bc = load_breast_cancer()
df = pd.DataFrame(data=bc.data, columns=bc.feature_names)
df["target"] = bc.target
# Swap the dataset's labels (0 -> 1, 1 -> 0).
df["target"] = df["target"].map({0: 1, 1: 0})

# Every column except the trailing "target" column is a feature.
feature_cols = df.columns[:-1]

print("memory size before ", df.memory_usage(deep=True).sum())
# NOTE(review): the scaler is fitted on the full dataset, before any train/test split.
df[feature_cols] = StandardScaler().fit_transform(df[feature_cols])
# Down-cast with DataFrame.astype and rebind `df`: assigning through
# `df.iloc[...] = ...` writes into the existing columns on recent pandas
# versions and keeps their original dtype, so the cast would silently be lost.
df = df.astype({**dict.fromkeys(feature_cols, np.float16), "target": np.int8})
print("memory size after ", df.memory_usage(deep=True).sum())

memory size before  141240
memory size after  34837


In [None]:
# Separate the labels and the feature matrix of `new_df`.
y = new_df["target"].values
X = new_df.drop(columns=["target"]).values

# Stratify so the train and test sets keep the same class ratio (the subsets
# built in this notebook are deliberately imbalanced), and fix the seed so the
# curves plotted below are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [26]:
# Fractions of the target == 1 rows to keep in each subset.
class1_frac = [0.1, 0.3, 0.5, 0.7, 1]

is_class1 = df["target"] == 1
class0_df = df.loc[df["target"] == 0]

# Maps each fraction to a DataFrame holding that share of the class-1 rows
# plus every class-0 row, i.e. datasets with different levels of imbalance.
dfs = {}

for frac in class1_frac:
    malignant_subset_df = df.loc[is_class1].sample(frac=frac)
    # `new_df` stays bound to the last subset after the loop; other cells read it.
    new_df = pd.concat([malignant_subset_df, class0_df])
    dfs[frac] = new_df

In [27]:
# Print the class 0 : class 1 ratio of every subset in `dfs`.
# The loop variable is deliberately not called `df`, so the full dataset bound
# to `df` is not overwritten by the last subset.
for frac, subset_df in dfs.items():
    ratios = subset_df["target"].value_counts(normalize=True).round(2)
    # Look the ratios up by label: value_counts orders by frequency, so
    # positional unpacking would swap the two whenever class 1 is the majority.
    class0_r = float(ratios.get(0, 0.0))
    class1_r = float(ratios.get(1, 0.0))
    print(frac)
    print(f"class0:class_1 ratio = {class0_r}:{class1_r}" )
    print("----------")

0.1
class0:class_1 ratio = 0.94:0.06
----------
0.3
class0:class_1 ratio = 0.85:0.15
----------
0.5
class0:class_1 ratio = 0.77:0.23
----------
0.7
class0:class_1 ratio = 0.71:0.29
----------
1
class0:class_1 ratio = 0.63:0.37
----------


In [2]:
# Fit a logistic regression with default settings on the training split.
log_reg = LogisticRegression().fit(X_train, y_train)

# Per-class probabilities and hard label predictions for the test split.
probas = log_reg.predict_proba(X_test)
y_pred = log_reg.predict(X_test)

In [3]:
# One row per test sample: probability of class 0, probability of class 1,
# and the true label.
result_df = pd.DataFrame(probas, columns=["0", "1"]).assign(true_y=y_test)

In order to plot the ROC curve, we need to measure FPR and TPR at different thresholds.

In [4]:
class ConfusionMatrix:
    """
    Tally the four cells of a binary confusion matrix and derive rates from them.

    Labels are compared against 1 (positive) and 0 (negative). A pair in which
    either label is neither 0 nor 1 is left out of every count. The two inputs
    are walked in lockstep, so extra elements of the longer one are ignored.

    Parameters
    ----------
    true_y : iterable
             each element is true label
    pred_y : iterable
             each element is predicted label

    Attributes
    ----------
    tp, fp, fn, tn : int
        Counts of true positives, false positives, false negatives and
        true negatives.
    """
    def __init__(self, true_y, pred_y):
        self.tp = 0
        self.fp = 0
        self.fn = 0
        self.tn = 0

        for tr_y, pr_y in zip(true_y, pred_y):
            # Predicted positive
            if pr_y == 1:
                if tr_y == 1:
                    self.tp += 1
                elif tr_y == 0:
                    self.fp += 1
            # Predicted negative
            elif pr_y == 0:
                if tr_y == 1:
                    self.fn += 1
                elif tr_y == 0:
                    self.tn += 1

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0 when the denominator is 0."""
        return numerator / denominator if denominator else 0

    def calc_tpr(self):
        """Calculate tpr a.k.a. recall: tp / (tp + fn); 0 if there are no actual positives."""
        return self._ratio(self.tp, self.tp + self.fn)

    def calc_fpr(self):
        """Calculate fpr: fp / (fp + tn); 0 if there are no actual negatives."""
        # Guarded like the other rates, so a sample without any negatives
        # yields 0 instead of raising ZeroDivisionError.
        return self._ratio(self.fp, self.fp + self.tn)

    def calc_precision(self):
        """Calculate precision: tp / (tp + fp); 0 if nothing was predicted positive."""
        return self._ratio(self.tp, self.tp + self.fp)

In [None]:
# Sweep ten evenly spaced thresholds over [0, 1] and record TPR, FPR and
# precision of the class-1 probability at each of them.
thresholds = np.linspace(0, 1, 10)
class1_proba = result_df["1"]

per_threshold = []
for thld in thresholds:
    # Predict 1 only when the class-1 probability is strictly above the threshold.
    pred_y = np.where(class1_proba > thld, 1, 0)
    cm = ConfusionMatrix(y_test, pred_y)
    per_threshold.append((cm.calc_tpr(), cm.calc_fpr(), cm.calc_precision()))

# Transpose the (tpr, fpr, precision) rows into three parallel lists.
tpr, fpr, precision_list = (list(column) for column in zip(*per_threshold))

In [None]:
# ROC using custom tpr, fpr calculator
# Each point is labelled with the (rounded) threshold that produced it.
threshold_label_font = dict(
    family="sans serif",
    size=18,
    color="LightSeaGreen"
)
model_trace = go.Scatter(
    name="Logistic regression",
    x=fpr,
    y=tpr,
    mode='lines+markers+text',
    text=np.round(thresholds, 2),
    textposition='top right',
    textfont=threshold_label_font
)

# Diagonal from (0, 0) to (1, 1) used as the reference line.
diagonal = np.linspace(0, 1, 6)
chance_trace = go.Scatter(
    name="random guessing",
    x=diagonal,
    y=diagonal,
    line=dict(dash='dash')
)

fig = go.Figure()
for trace in (model_trace, chance_trace):
    fig.add_trace(trace)

fig.update_layout(
    title="ROC",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate"
)

fig.show()

In [None]:
# TODO: check how the ROC curve changes as the classes go from imbalanced to balanced, with the same % of wrongly predicted positives.

In [None]:
# Custom PR Curve
fig = go.Figure()

# Recall (the custom TPR list) on x, precision on y, each point labelled with
# the rounded threshold that produced it.
fig.add_trace(
    go.Scatter(
        name = "Logistic regression",
        x=tpr,
        y=precision_list,
        mode='lines+markers+text',
        text=np.round(thresholds, 2),
        textposition='top right',
        textfont = dict(
            family="sans serif",
            size=18,
            color="LightSeaGreen"
        )
    )
)

# A classifier that guesses at random has, at every recall, a precision equal
# to the share of positives in the data. Its PR baseline is therefore a
# horizontal line at that prevalence, not the diagonal used for ROC.
positive_rate = float(np.mean(y_test))  # assumes y_test holds 0/1 labels

fig.add_trace(
    go.Scatter(
        name = "random guessing",
        x = [0, 1],
        y = [positive_rate, positive_rate],
        line = dict(dash='dash')
    )
)

fig.update_layout(
    title="Precision-Recall Curve",
    xaxis_title="Recall",
    yaxis_title="Precision"
)

fig.show()

In [None]:
# ROC using sklearn library
scores = probas[:, 1]
# Use dedicated names for sklearn's output: rebinding `fpr`, `tpr` and
# `thresholds` would overwrite the custom lists from the threshold sweep, and
# re-running the custom ROC / PR cells would then plot mismatched data.
sk_fpr, sk_tpr, sk_thresholds = roc_curve(y_test, scores)

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        name = "Logistic regression",
        x=sk_fpr,
        y=sk_tpr,
        text=sk_thresholds
    )
)

# Diagonal reference line from (0, 0) to (1, 1).
fig.add_trace(
    go.Scatter(
        name = "random guessing",
        x = np.linspace(0, 1, 6),
        y = np.linspace(0, 1, 6),
        line = dict(dash='dash')
    )
)

fig.update_layout(
    title="ROC",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate"
)

fig.show()

In [None]:
# Preview the first rows of result_df.
result_df.head()

In [None]:
# Count the test rows whose predicted label matches the true label.
# `result_df` only has the columns "0", "1" and "true_y", so the true label is
# read from "true_y" and the predicted label from `y_pred`.
correct = int((result_df["true_y"].to_numpy() == np.asarray(y_pred)).sum())

In [None]:
# Accuracy: `correct` as a fraction of the number of rows in result_df.
correct / len(result_df)

In [None]:
# Rows where the predicted label differs from the true label. `result_df` has
# no "target" or "pred_t" column, so compare its "true_y" column with `y_pred`.
result_df.loc[result_df["true_y"].to_numpy() != np.asarray(y_pred)]

In [None]:
# Display the whole result_df.
result_df

In [None]:
# Label 0 where the class-0 probability is above 0.5, label 1 everywhere else.
np.where(result_df["0"] > 0.5, 0, 1)

In [None]:
# Rows whose class-0 probability lies in [0.4, 0.6] (both ends included).
near_half = result_df["0"].between(0.4, 0.6)
result_df.loc[near_half]

In [None]:
def sigmoid(z):
    """
    Return the logistic sigmoid 1 / (1 + e^(-z)).

    Parameters
    ----------
    z : scalar or array-like accepted by ``np.exp``
        Input value(s); arrays are handled element-wise.

    Returns
    -------
    Value(s) in (0, 1); 0.5 at z = 0, approaching 1 as z grows.
    """
    # The exponent must be negated: 1 / (1 + e^z) is sigmoid(-z), which
    # decreases with z instead of increasing.
    return 1 / (1 + np.exp(-z))

In [None]:
# Inspect the coefficients of the fitted logistic regression.
log_reg.coef_

In [None]:
# Inspect the intercept of the fitted logistic regression.
log_reg.intercept_

In [None]:
# Scatter of the class-1 probabilities.
# NOTE(review): x and y are the same column, so every point lies on the
# diagonal — confirm which quantity was meant for the x-axis.
class1_proba = result_df["1"]
proba_trace = go.Scatter(
    x=class1_proba,
    y=class1_proba,
    mode="markers"
)

fig = go.Figure()
fig.add_trace(proba_trace)
fig.show()

In [None]:
# List the column names of new_df.
new_df.columns

## What is ROC

- show how ROC is created under the hood of sklearn

## What is AUC

balanced => exactly 50/50?

## ROC Vs. PR curve

<b>References</b>

- https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc