In [3]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    auc,
    average_precision_score,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

In [4]:
# Seed NumPy's legacy global RNG so stochastic steps that draw from it
# are repeatable across "Restart & Run All".
np.random.seed(seed=42)

In [5]:
# Load variables from a .env file into the process environment.
# override=True: values from the .env file take precedence over variables
# that are already set in the environment (python-dotenv behaviour).
load_dotenv(override=True)

True

In [6]:
# Directory holding the processed dataset, taken from the DIR_DATA_PROCESSED
# environment variable. Fail fast with a clear message when the variable is
# missing, instead of the opaque TypeError that Path(None) would raise.
_dir_data_processed = os.getenv("DIR_DATA_PROCESSED")
if _dir_data_processed is None:
    raise RuntimeError(
        "Environment variable DIR_DATA_PROCESSED is not set; "
        "define it in the .env file or in the shell before running this notebook."
    )
# The value is a Path object (the previous `str` annotation was inaccurate).
DIR_DATA_PROCESSED: Path = Path(_dir_data_processed)

In [7]:
# Read the processed dataset (data.csv inside DIR_DATA_PROCESSED) into memory.
csv_path = f"{DIR_DATA_PROCESSED}/data.csv"
df = pd.read_csv(csv_path)

In [8]:
# Separate the binary target column from the feature columns.
y = df["isFraud"]
X = df.drop(columns=["isFraud"])

In [9]:
# Fraction of rows whose label equals 1, i.e. the positive-class rate of y.
n_positive = y[y == 1].shape[0]
n_rows = y.shape[0]
print(f"skew = {n_positive / n_rows}")

skew = 0.002964544224336551


In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the
# class ratio the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [1]:
# Logistic Regression

In [None]:
# Standardise every feature column except "type".
# ColumnTransformer needs an estimator *instance*; passing the bare
# StandardScaler class fails when the transformer is fitted.
# remainder="drop" is the ColumnTransformer default, spelled out here to make
# explicit that the unlisted "type" column is not passed on to later steps.
numeric_columns = [c for c in X.columns if c != "type"]
numeric_preprocessor = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), numeric_columns),
    ],
    remainder="drop",
)

In [None]:
# imbalanced-learn pipeline: scale features, oversample the minority class
# with SMOTE, then fit a logistic-regression classifier.
pipeline = Pipeline([
    ("preprocessor", numeric_preprocessor),
    # sampling_strategy must be passed by keyword (it is keyword-only in
    # current imbalanced-learn releases). 0.1 is the requested
    # minority/majority ratio after resampling. random_state makes the
    # synthetic samples reproducible.
    ("smote", SMOTE(sampling_strategy=0.1, random_state=42)),
    ("lr", LogisticRegression(solver="lbfgs", max_iter=1000)),
])

In [None]:
# Fit all pipeline steps (preprocessor, SMOTE, logistic regression) in order
# on the training split only; the test split stays untouched for evaluation.
pipeline.fit(X_train, y_train)

In [None]:
# Class-probability matrix for the test split (one column per class).
y_pred = pipeline.predict_proba(X_test)
# Hard labels: flag a row as positive when its column-1 probability exceeds 0.5.
labels_pred = np.greater(y_pred[:, 1], 0.5)

In [None]:
# Area under the precision-recall curve (average precision), computed from
# the column-1 probabilities of y_pred.
auprc = average_precision_score(y_test, y_pred[:, 1])
print('AUPRC = {}'.format(auprc))

In [None]:
# ROC curve for the positive class.
# roc_curve expects a 1-D score vector, while y_pred is the 2-D
# predict_proba matrix, so select column 1 (the same column used for the
# AUPRC and the 0.5-threshold labels).
fpr, tpr, thresholds = roc_curve(y_test, y_pred[:, 1])
roc_auc = auc(fpr, tpr)

# Plot ROC
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Precision/recall pairs over all decision thresholds for the positive class.
# precision_recall_curve needs a 1-D score vector, so pass column 1 of the
# predict_proba matrix rather than the whole 2-D array.
precision_recall_curve(y_test, y_pred[:, 1])

In [None]:
# Confusion matrix of the thresholded predictions against the true test labels.
confusion_matrix(y_true=y_test, y_pred=labels_pred)

In [None]:
# classification_report returns a pre-formatted multi-line string. Print it so
# the table renders with real line breaks instead of a repr full of "\n".
print(classification_report(y_test, labels_pred))

In [2]:
# XGBoost

In [None]:
# Hyperparameter optimization