In [None]:
import numpy as np
np.random.seed(0)  # seed NumPy's global RNG for this session
from sklearn import preprocessing
import statsmodels.api as sm
import pandas as pd
# Optional third-party ROC helper; both lines are disabled, so `BinaryClassification` is not imported here.
#!pip install plot_metric
#from plot_metric.functions import BinaryClassification
# NOTE(review): `plot_roc_curve` was removed from sklearn.metrics in scikit-learn 1.2 and is not
# called in the visible cells — confirm the installed version, otherwise this line raises ImportError.
from sklearn.metrics import confusion_matrix, roc_auc_score, plot_roc_curve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')  # suppress every warning category from here on
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression  # NOTE(review): duplicate of the line above


In [None]:
# Toy dataset: 20 patients, age in years and whether they had a stroke ('Y'/'N').
x = np.array([
    25, 66, 77, 55, 70, 30, 42, 37, 18, 26,
    21, 29, 34, 90, 48, 50, 52, 55, 57, 61,
])
y = np.array([
    'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'N',
    'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'N',
])

# One row per patient; the column order is Age, then Stroke_y.
df = pd.DataFrame(data={'Age': x, 'Stroke_y': y})


In [None]:
# Encode the string labels as integers; LabelEncoder assigns codes in sorted
# class order, so 'N' becomes 0 and 'Y' becomes 1.
label_encoder = preprocessing.LabelEncoder()
df['Stroke_y'] = label_encoder.fit_transform(df['Stroke_y'])


In [None]:
# X is a DataFrame (intercept column + Age); y is a pandas Series of encoded labels.
# y must be captured before the rename below, while the column is still 'Stroke_y'.
y = df['Stroke_y']
X = sm.add_constant(df[['Age']])
df.rename(columns={'Stroke_y': 'y'}, inplace=True)


In [None]:
# Inspect the design matrix: the constant column added by sm.add_constant plus Age.
X

In [None]:
# Inspect the integer-encoded target taken from the Stroke_y column.
y

### 1. Using statsmodels

In [None]:
# Fit a logistic regression of the stroke label on [const, Age] by maximum likelihood.
model = sm.Logit(endog=y, exog=X).fit()

In [None]:
# Show the fitted model's summary table.
model.summary()

In [None]:
# Reproduce the model's predicted probabilities by hand from the fitted coefficients.
# The coefficients are read from the fitted result instead of being copied (and
# rounded) from the summary table, so they stay correct if the data or fit changes.
b0 = model.params['const']   # b0 is intercept
b1 = model.params['Age']     # b1 is coefficient of Age

# Linear predictor (log-odds), then the logistic function: P(y=1) = e^z / (1 + e^z)
z = b0 + b1 * X.Age
df['prob_y=1'] = np.exp(z) / (1 + np.exp(z))

In [None]:
# Predictions from the fitted model on the same design matrix, for comparison
# with the hand-computed 'prob_y=1' column.
model.predict(X) # probabilities for Y=1

In [None]:
# Complement rule: P(y=0) = 1 - P(y=1)
df['prob_y=0'] = df['prob_y=1'].rsub(1)
df

In [None]:
# Placeholder column for the per-row likelihood. It is created as float because it
# will hold probabilities; starting from the integer labels would force pandas to
# upcast the column when the first float is written into it.
df['likelihood'] = df['y'].astype(float)

In [None]:
# likelihood is the predicted probability of the label that was actually observed:
# P(y=1) for rows where y == 1, and P(y=0) otherwise.
# Vectorised with np.where instead of a row loop with chained assignment
# (df[col][i] = ...), which may write to a temporary copy rather than to df.
df['likelihood'] = np.where(df['y'] == 1, df['prob_y=1'], df['prob_y=0'])

In [None]:
# Inspect the frame after the 'likelihood' column has been filled in.
df

In [None]:
# Natural log of each row's likelihood; these terms are summed in the next cell.
df['log_of_likelihood'] = df['likelihood'].pipe(np.log)
df

In [None]:
# Total log-likelihood of the observed labels under the hand-computed probabilities.
df['log_of_likelihood'].sum()

In [None]:
# Store the fitted model's predicted P(y=1); the threshold sweep below reads this column.
df['prob'] = model.predict(X)

# A notebook cell renders only its last expression, so the filtered rows
# (predicted probability below 0.5) are shown explicitly with display().
display(df[df.prob < 0.5])

# Akaike information criterion of the fitted model.
model.aic


In [None]:
# Empty table that will hold one row of confusion-matrix counts and rates per threshold.
df_roc = pd.DataFrame(
    columns=[
        'Threshold',
        'TP', 'TN', 'FP', 'FN',
        'TPR', 'FPR',
    ]
)

### The ROC (receiver operating characteristic) curve is a graph showing the performance of a classification model at all classification thresholds.

In [None]:
# Sweep the decision threshold over 0.0, 0.1, ..., 1.0 and record the confusion-matrix
# counts and the resulting rates for each threshold.
# Rows are collected in a list and the frame is built once: DataFrame.append was removed
# in pandas 2.0, and rebuilding from scratch keeps the cell idempotent on re-run.
roc_rows = []
# linspace + round gives exact one-decimal thresholds (no 0.30000000000000004).
for threshold in np.round(np.linspace(0.0, 1.0, 11), 1):
    y_pred = (df['prob'] > threshold).astype(int)
    # labels=[0, 1] pins the 2x2 layout even when every prediction is the same class.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    TPR = TP / (TP + FN)   # true positive rate (sensitivity)
    FPR = FP / (FP + TN)   # false positive rate (1 - specificity)
    roc_rows.append({'Threshold': threshold, 'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN, 'TPR': TPR, 'FPR': FPR})

df_roc = pd.DataFrame(roc_rows, columns=['Threshold', 'TP', 'TN', 'FP', 'FN', 'TPR', 'FPR'])
df_roc

In [None]:
# AUC = Area Under the ROC Curve, computed from the true labels and the
# predicted probabilities (no threshold involved).
auc = roc_auc_score(y_true=y, y_score=df['prob'])
auc

In [None]:
# Draw the ROC curve from the threshold-sweep table (df_roc), with the AUC in the legend.
fig, ax = plt.subplots()
ax.plot(df_roc['FPR'], df_roc['TPR'], label=f'AUC={auc:.2f}')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

In [None]:
# Recompute both rates column-wise from the stored counts.
df_roc['TPR'] = df_roc['TP'] / (df_roc['TP'] + df_roc['FN'])
df_roc['FPR'] = df_roc['FP'] / (df_roc['TN'] + df_roc['FP'])

In [None]:
# Inspect the threshold table after the TPR/FPR columns were recomputed.
df_roc

In [None]:
# Confusion-matrix counts for one chosen threshold.
Threshold = 0.5
# Thresholds are floats produced by a numeric sweep, so match with a tolerance
# instead of ==, which silently returns no rows for values such as 0.3.
threshold_match = np.isclose(df_roc['Threshold'].astype(float), Threshold)
df_roc.loc[threshold_match, ['Threshold', 'TP', 'TN', 'FP', 'FN']]


In [None]:
# One way to pick a threshold: the larger TPR - FPR (Youden's J statistic) is,
# the better the threshold separates the two classes.
df_roc['(TPR-FPR)'] = df_roc['TPR'].sub(df_roc['FPR'])
df_roc

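### The threshold of 0.3 yields the highest value for the difference between the true positive rate and the false positive rate (Youden's J statistic). Because the two classes are balanced in this dataset (10 positives and 10 negatives), accuracy equals 0.5 + (TPR − FPR)/2, so using a threshold of 0.3 in this model also gives the highest accuracy among the thresholds tested. With imbalanced classes this would not necessarily hold.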

In [None]:
# Plot histogram of predicted probabilities for each actual class.
# The legend names the actual class of each group: rows with y == 1 are the positives
# and rows with y == 0 the negatives (the labels were previously swapped).
plt.hist(df.loc[y == 1, 'prob'], alpha=0.5, label='Actual Positive (y=1)')
plt.hist(df.loc[y == 0, 'prob'], alpha=0.5, label='Actual Negative (y=0)')
plt.legend()
plt.xlabel('Predicted Probability')
plt.ylabel('Count')
plt.title('Distribution of Predicted Probabilities')
plt.show()

### ROC curve using a library

In [None]:
# Visualisation with scikit-learn's roc_curve.
# plot_metric's BinaryClassification is never imported in this notebook (its import is
# commented out), so calling it raised NameError. roc_curve is imported inside the cell
# so that it runs regardless of the state of the top-level import cell.
from sklearn.metrics import roc_curve

y_score = model.predict(X)  # predicted P(y=1) from the fitted model
fpr, tpr, roc_thresholds = roc_curve(y, y_score)

# Figures
plt.figure(figsize=(5,5))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc_score(y, y_score):.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', label='Random guess')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

### 2. Using sklearn library

In [None]:
# Features and target for scikit-learn.
# Only Age is a predictor: including 'y' among the features leaks the target into
# the model and makes every downstream metric meaningless.
# No constant column is needed; LogisticRegression fits an intercept by default.
X = df[['Age']]
# 1-D Series, as scikit-learn expects (a one-column DataFrame triggers a shape warning).
y = df['y']

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit scikit-learn's logistic regression with its default settings.
# fit() returns the estimator itself, so `model` and `lr` are the same object.
lr = LogisticRegression()
lr.fit(X, y)
model = lr

In [None]:
# Independent copy of the two columns needed for the sklearn section. Without
# .copy(), df2 is a slice of df and adding a column to it later raises
# SettingWithCopyWarning.
df2 = df[['Age', 'y']].copy()

In [None]:
# predict_proba returns one column per class; column 1 is the probability of label 1.
class_probabilities = model.predict_proba(X)
df2['prob'] = class_probabilities[:, 1]

In [None]:
# Inspect Age, the label, and the sklearn-predicted probability side by side.
df2

In [None]:
# Empty threshold table for the sklearn model, with the same columns as the statsmodels one.
df_ROC = pd.DataFrame(
    columns=[
        'Threshold',
        'TP', 'TN', 'FP', 'FN',
        'TPR', 'FPR',
    ]
)

In [None]:
# Loop through thresholds 0.0, 0.1, ..., 1.0 and calculate confusion-matrix metrics
# for the sklearn model's probabilities.
# Rows are collected in a list and the frame is built once: DataFrame.append was removed
# in pandas 2.0, and rebuilding from scratch keeps the cell idempotent on re-run.
roc_rows = []
# linspace + round gives exact one-decimal thresholds (no 0.30000000000000004).
for threshold in np.round(np.linspace(0.0, 1.0, 11), 1):
    y_pred = (df2['prob'] > threshold).astype(int)
    # labels=[0, 1] pins the 2x2 layout even when every prediction is the same class.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    TPR = TP / (TP + FN)   # true positive rate (sensitivity)
    FPR = FP / (FP + TN)   # false positive rate (1 - specificity)
    roc_rows.append({'Threshold': threshold, 'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN, 'TPR': TPR, 'FPR': FPR})

df_ROC = pd.DataFrame(roc_rows, columns=['Threshold', 'TP', 'TN', 'FP', 'FN', 'TPR', 'FPR'])

In [None]:
# Print the threshold table, then the AUC of the sklearn model's probabilities.
print(df_ROC)
auc = roc_auc_score(y_true=y, y_score=df2['prob'])
print('AUC:', auc)

In [None]:
# Draw the sklearn model's ROC curve from df_ROC, with the AUC in the legend.
fig, ax = plt.subplots()
ax.plot(df_ROC['FPR'], df_ROC['TPR'], label=f'AUC={auc:.2f}')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()