In [35]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt

In [36]:
def my_roc_curve(labels, scores):
    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(scores, kind="mergesort")[::-1]
    scores = scores[desc_score_indices]
    labels = labels[desc_score_indices]

    # scores typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(scores))[0]
    threshold_idxs = np.r_[distinct_value_indices, labels.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = np.cumsum(labels, dtype=np.float64)[threshold_idxs]
    fps = 1 + threshold_idxs - tps
    thresholds = scores[threshold_idxs]

    optimal_idxs = np.where(
        np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True]
    )[0]
    fps = fps[optimal_idxs]
    tps = tps[optimal_idxs]
    thresholds = thresholds[optimal_idxs]

    # Add an extra threshold position
    # to make sure that the curve starts at (0, 0)
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]

    fpr = fps / fps[-1]
    tpr = tps / tps[-1]
    thresholds = np.r_[np.inf, thresholds]

    return fpr, tpr, thresholds



def roc_range(scores, labels, maxfpr, minfpr = 0):
  """
    Inputs:
    scores - predictions from any given classifier
    labels - the true label (either 0, 1) of the data
    maxfpr - the maximum false positive rate (FPR)
    minfpr (optional, default = 0) - the minimum false positive rate (FPR)

    Outputs:
    fpr_in_range - list of FPR values from minpfa to maxpfa
    tpr_in_range - list of true positive rate (TPR) values from minfpr to maxfpr
    auc_in_range - single value of area under curve from minpfa to maxpfa
  """
  fpr, tpr, _ = my_roc_curve(labels, scores)
  
  auc_in_range = 0
  fpr_in_range = []
  tpr_in_range = []
  for i in range(len(fpr)):
    if fpr[i] >= minfpr and fpr[i] <= maxfpr:
      fpr_in_range.append(fpr[i])
      tpr_in_range.append(tpr[i])
      auc_in_range += (fpr[i] - fpr[i-1]) * tpr[i]

  return fpr_in_range, tpr_in_range, auc_in_range

In [37]:
# Sample Usage
TEST_MODE = False # Change to True to test on small dataset
data = np.load("assignment6.npz")

scores_small = data['scores_small']
scores_large = data['scores_large']
labels_small = data['labels_small']
labels_large = data['labels_large']

if TEST_MODE:
  scores = scores_small
  labels = labels_small
else:
  scores = scores_large
  labels = labels_large

"""
a) PF A ∈ [0, 1.0], the full range of thresholds
b) PF A ∈ [0, 0.4]
c) PF A ∈ [0, 0.75]
d) PF A ∈ [0.25, 0.75]
"""

ranges = [[0, 1.0], [0, 0.4], [0, 0.75], [0.25, 0.75]]
for idx, r in enumerate(ranges):
  fpr, tpr, auc = roc_range(scores, labels, r[1], r[0])
  plt.plot(fpr, tpr, label='FPR ∈ [0.25, 0.75] (AUC = %0.2f)' % auc)
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('ROC Curve')
  plt.legend(loc="lower right")
  plt.savefig(f"roc_curve_{chr(ord('a') + idx)}.png")
  plt.clf()
  print(f"Range: FPR ∈ [{r[0]}, {r[1]}], AUC = {auc}")

Range: FPR ∈ [0, 1.0], AUC = 0.824412
Range: FPR ∈ [0, 0.4], AUC = 0.2522660000000001
Range: FPR ∈ [0, 0.75], AUC = 0.574101
Range: FPR ∈ [0.25, 0.75], AUC = 0.44496199999999997


<Figure size 640x480 with 0 Axes>

In [62]:
# Get precision for the full range of thresholds
# What thresholds provide a precision of 0.9?

from sklearn.metrics import precision_score, accuracy_score, roc_curve

def precision_range(scores, labels):
  _, _, thresholds = roc_curve(labels, scores)
  precision = []
  accuracy = []

  for i in range(len(thresholds)):
    y_pred = scores > thresholds[i]
    precision.append(precision_score(labels, y_pred, zero_division=0))
    accuracy.append(accuracy_score(labels, y_pred))

  return precision, accuracy, thresholds

# Sample Usage
precision, accuracy, thresholds = precision_range(scores_large, labels_large)

# Find the threshold that provides a precision of 0.9
for i in range(len(precision)):
  if 0.91 > precision[i] >= 0.9:
    print(f"Threshold: {thresholds[i]}, Precision: {precision[i]}")
    print(f"Accuracy: {accuracy[i]}")
    break

plt.plot(thresholds, precision)
plt.xlabel('Thresholds')
plt.ylabel('Precision')
plt.title('Precision vs Thresholds')
plt.savefig("precision_thresholds.png")
plt.clf()

plt.plot(thresholds, accuracy)
plt.xlabel('Thresholds')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Thresholds')
plt.savefig("accuracy_thresholds.png")
plt.clf()


Threshold: 1.1259688898342692, Precision: 0.9096209912536443
Accuracy: 0.6405


<Figure size 640x480 with 0 Axes>