In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix, classification_report
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight

import matplotlib.pyplot as plt

In [2]:
X, y= make_classification(n_classes=2,
                          weights=[.95, .05], # majority are non-fraud, minority are fraud
                          n_features=20,
                          n_samples=1000,
                          random_state=23
                         )

In [3]:
X.shape

(1000, 20)

In [4]:
pd.Series(y).value_counts()

0    949
1     51
dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, stratify=y, random_state=123)

# normalize the data after the splitting
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
tmp = pd.Series(y_train).value_counts()

#portion of minority in train
tmp[1]/sum(tmp)

0.050666666666666665

In [7]:
tmp = pd.Series(y_test).value_counts()

#portion of minority in test
tmp[1]/sum(tmp)

0.052

In [8]:
# We train a simple model
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [10]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

#show how y_pred is related to y_prob
pd.DataFrame({'label':y_pred[:20] ,'prob(y=0)': y_prob[:20, 1]})

Unnamed: 0,label,prob(y=0)
0,0,0.015959
1,0,8.6e-05
2,0,0.000157
3,0,3.8e-05
4,0,0.007308
5,0,0.000673
6,0,0.000306
7,0,4.3e-05
8,0,0.000307
9,0,0.000184


In [11]:
a = y_prob[:,1]>0.5
b = y_pred 

a==b

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [13]:
#confusion matrix
confusion_matrix(y_test, y_pred)


array([[235,   2],
       [  7,   6]])

In [15]:
#confusion matrix with .5 threshold
confusion_matrix(y_test, y_prob[:,1] > 0.5)

array([[235,   2],
       [  7,   6]])

In [16]:
#confusion matrix with .1 threshold
confusion_matrix(y_test, y_prob[:,1] > 0.1)

array([[223,  14],
       [  3,  10]])

In [17]:
print(classification_report(y_test, y_prob[:,1] > 0.5))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       237
           1       0.75      0.46      0.57        13

    accuracy                           0.96       250
   macro avg       0.86      0.73      0.78       250
weighted avg       0.96      0.96      0.96       250

