In [1]:
import pandas as pd

# Pima Indians Diabetes dataset, read directly from its GitHub-hosted CSV
DATASET_URL = 'https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv'
data = pd.read_csv(DATASET_URL)

# Preview the first rows to check that the columns were parsed as expected
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Size of the loaded frame as (rows, columns)
data.shape

(768, 9)

In [3]:
# Separate the feature matrix X from the target vector y

target_column = 'Outcome'
y = data[target_column]
X = data.drop(columns=[target_column])

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable

from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.2, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Size of the held-out test features as (rows, columns)
X_test.shape

(154, 8)

In [6]:
# Fit a logistic regression classifier; it provides predict_proba, whose
# class probabilities are used as scores for the ROC analysis

from sklearn.linear_model import LogisticRegression

lr_params = {'max_iter': 1000}
model = LogisticRegression(**lr_params)
model.fit(X_train, y_train)

In [7]:
# Predicted class probabilities for the test set; keep only column 1
# (the second class in model.classes_) as the score of each row

test_probabilities = model.predict_proba(X_test)
y_scores = test_probabilities[:, 1]

In [14]:
# Display the per-row scores taken from column 1 of predict_proba (one value per test sample)
y_scores

array([0.04954171, 0.17461728, 0.09365462, 0.25515628, 0.63559626,
       0.11671163, 0.06567497, 0.42198811, 0.04864644, 0.57568054,
       0.33866945, 0.41291293, 0.69854675, 0.19967318, 0.02000902,
       0.82464922, 0.86661139, 0.03101471, 0.25508003, 0.89492496,
       0.95243128, 0.83474384, 0.11741276, 0.4468076 , 0.08923271,
       0.06880745, 0.6511607 , 0.41200622, 0.17865604, 0.28690263,
       0.24550833, 0.43894269, 0.00942   , 0.24258682, 0.35159954,
       0.96164341, 0.34359719, 0.80920389, 0.29382291, 0.0508366 ,
       0.18184821, 0.08675016, 0.42209491, 0.18739704, 0.03012082,
       0.03949675, 0.24744293, 0.42776148, 0.10530832, 0.37970258,
       0.99355988, 0.10670125, 0.39102192, 0.76878514, 0.36865297,
       0.45771265, 0.95035633, 0.41991816, 0.39159445, 0.04376447,
       0.37290548, 0.90126219, 0.88176436, 0.88842814, 0.33088435,
       0.06931983, 0.95262224, 0.200482  , 0.2605429 , 0.35459099,
       0.10359205, 0.0816923 , 0.4504409 , 0.09221062, 0.08648

In [15]:
# Sweep the decision threshold over the scores and record the resulting
# false positive rate and true positive rate at each threshold

from sklearn.metrics import roc_curve

roc_points = roc_curve(y_test, y_scores)
fpr, tpr, thresholds = roc_points

In [20]:
# Thresholds returned by roc_curve, one per (fpr, tpr) point, listed in decreasing order.
# The first entry is not an observed score: in the output below it is 1.9936, i.e. the
# largest score (0.9936) plus one, marking the point where nothing is predicted positive.

thresholds

array([1.99355988e+00, 9.93559880e-01, 9.52622236e-01, 9.52431281e-01,
       8.24649217e-01, 7.68785139e-01, 7.62084165e-01, 6.83892849e-01,
       6.75401963e-01, 6.74194852e-01, 6.51160703e-01, 6.50852686e-01,
       6.35596265e-01, 5.86285586e-01, 5.75680539e-01, 5.69730880e-01,
       5.25254827e-01, 4.70273418e-01, 4.61218138e-01, 4.50440904e-01,
       4.38942687e-01, 4.38655977e-01, 4.22094906e-01, 4.19918162e-01,
       4.12912934e-01, 4.12006218e-01, 4.04764937e-01, 3.79702578e-01,
       3.68652975e-01, 3.10181073e-01, 3.08805269e-01, 2.77743854e-01,
       2.71997928e-01, 2.58474746e-01, 2.55156282e-01, 2.47442932e-01,
       2.45508327e-01, 2.36554726e-01, 2.28894872e-01, 1.81848208e-01,
       1.78656037e-01, 1.74617283e-01, 1.73313406e-01, 1.46364532e-01,
       1.40805420e-01, 1.16711628e-01, 1.14930426e-01, 1.10000756e-01,
       1.06701251e-01, 8.92327115e-02, 8.67501626e-02, 2.00090231e-02,
       1.93062126e-02, 1.60776532e-03])

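##### It's worth noting that the number of thresholds is usually not the same as the number of samples. `roc_curve` takes its candidate thresholds from the distinct values present in `y_scores`, so duplicate scores collapse into a single threshold. In addition, with the default `drop_intermediate=True` it discards thresholds that do not change the shape of the curve, and it prepends one extra threshold above the largest score (the 1.99 at the start of the array above) so that the curve starts at (0, 0). Hence the number of thresholds can be much smaller than the number of samples: here there are 54 thresholds for 154 test samples.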

In [25]:
import plotly.graph_objects as go
import numpy as np
from sklearn.metrics import roc_auc_score

# Recompute the ROC points so this cell can be re-run on its own
# (roc_curve itself was imported in an earlier cell).
fpr, tpr, thresholds = roc_curve(y_test, y_scores)

# Calculate the AUC (Area Under the Curve)
roc_auc = roc_auc_score(y_test, y_scores)

# Generate a trace for ROC curve
trace0 = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC curve (Area = {roc_auc:.2f})'
)

# Only label every nth point to avoid cluttering.
n = 10
positions = np.arange(len(thresholds))
# Position 0 is skipped: thresholds[0] is the sentinel roc_curve places above the
# largest score (max + 1, or inf in newer scikit-learn), not a usable cutoff, so
# labelling it would print a misleading "Thr=1.99" / "Thr=inf" at (0, 0).
indices = (positions % n == 0) & (positions > 0)

trace1 = go.Scatter(
    x=fpr[indices],
    y=tpr[indices],
    mode='markers+text',
    name='Threshold points',
    text=[f"Thr={thr:.2f}" for thr in thresholds[indices]],
    textposition='top center'
)

# Diagonal line: the expected curve of a classifier that guesses at random
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dict(dash='dash')
)

# Keep the traces under their own name. Reusing `data` here would overwrite the
# DataFrame loaded at the top of the notebook and break re-running earlier cells.
roc_traces = [trace0, trace1, trace2]

# Define layout with square aspect ratio
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=800,
    height=800,
    showlegend=True
)

# Define figure and add data
fig = go.Figure(data=roc_traces, layout=layout)

# Show figure
fig.show()

In [26]:
# Choose the threshold where the gap between TPR and FPR is largest
# (Youden's J statistic)

youden_j = tpr - fpr
optimal_idx = youden_j.argmax()
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold is:", optimal_threshold)

Optimal threshold is: 0.3686529748384333


### AUC-ROC

##### Compare two models using AUC-ROC

In [24]:
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Relies on X_train, X_test, y_train, y_test from the train-test split cell.

# SVM requires feature scaling for better performance.
# The scaler is fitted on the training rows only and then applied to the test rows,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression model.
# NOTE: it is fitted on the unscaled features, while the SVM below uses the scaled ones.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_scores = lr_model.predict_proba(X_test)[:,1]

# SVM model.
# probability=True calibrates probabilities with an internal, randomly shuffled
# cross-validation; fixing random_state makes the scores and the AUC reproducible
# between runs (same seed as the train-test split).
svm_model = SVC(probability=True, random_state=2)
svm_model.fit(X_train_scaled, y_train)
svm_scores = svm_model.predict_proba(X_test_scaled)[:,1]

# Generate ROC curve data for logistic regression model
lr_fpr, lr_tpr, lr_thresholds = roc_curve(y_test, lr_scores)
lr_auc = roc_auc_score(y_test, lr_scores)

# Generate ROC curve data for SVM model
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_scores)
svm_auc = roc_auc_score(y_test, svm_scores)

# Generate a trace for the Logistic Regression ROC curve
trace0 = go.Scatter(
    x=lr_fpr,
    y=lr_tpr,
    mode='lines',
    name=f'Logistic Regression (Area = {lr_auc:.2f})'
)

# Generate a trace for the SVM ROC curve
trace1 = go.Scatter(
    x=svm_fpr,
    y=svm_tpr,
    mode='lines',
    name=f'SVM (Area = {svm_auc:.2f})'
)

# Diagonal line: the expected curve of a classifier that guesses at random
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dict(dash='dash')
)

# Keep the traces under their own name. Reusing `data` here would overwrite the
# DataFrame loaded at the top of the notebook and break re-running earlier cells.
comparison_traces = [trace0, trace1, trace2]

# Define layout with square aspect ratio
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=800,
    height=800,
    showlegend=True
)

# Define figure and add data
fig = go.Figure(data=comparison_traces, layout=layout)

# Show figure
fig.show()

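##### From the above graph, the SVM is the better classifier across all threshold levels, i.e. its ROC curve lies on or above the logistic regression curve at every false positive rate. Both curves are estimated from only 154 test samples, so small gaps between them should be read with caution.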