In [None]:
# Credit Card Fraud Detection
# Author: Geisiana Maurício
# Objective:
# Build a complete data analysis and machine learning pipeline
# to detect fraudulent credit card transactions using Python.


In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE


In [None]:
import pandas as pd
# Read the raw transactions file and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../data/raw/creditcard.csv")
df.head(n=5)

In [None]:
# df.info() prints its report itself, but a cell only renders its FINAL
# expression: a bare df.describe() in the middle of the cell is computed and
# thrown away. Display it explicitly; df.shape stays last so it is rendered too.
df.info()
display(df.describe())  # `display` is provided by the IPython/Jupyter kernel
df.shape

In [None]:
# A cell only renders its final expression, so the two bare value_counts()
# calls were computed and silently discarded. Show both tables explicitly:
# absolute counts first, then the percentage share of each class.
class_counts = df['Class'].value_counts()
class_share_pct = df['Class'].value_counts(normalize=True) * 100
display(class_counts, class_share_pct)  # `display` is provided by the IPython/Jupyter kernel

sns.countplot(x='Class', data=df)
plt.title("Distribution of Legitimate vs. Fraudulent Transactions")
plt.show()

### Class Imbalance Insight

The dataset is highly imbalanced, with fraudulent transactions representing a very small percentage of the total.
This characteristic requires specific techniques such as resampling before applying machine learning models.


In [None]:
# Feature matrix: every column except the label; target: the 'Class' column.
x = df.drop(columns='Class')
y = df.loc[:, 'Class']


In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)



In [None]:
x.head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42,
    stratify=y_resampled
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 100-tree forest with a fixed seed; n_jobs=1 keeps training on a single worker.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=1)

In [None]:
# Fit the forest on the training split (the fitted estimator is the cell output).
rf_model.fit(X=x_train, y=y_train)

In [None]:
# Hard class predictions for the held-out split.
y_pred = rf_model.predict(X=x_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 as plain text.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Rows of the matrix are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# fmt="d" prints the cell counts as integers instead of scientific notation.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
# Title typos fixed ("Confunsion" / "Randon").
plt.title("Confusion Matrix - Random Forest")
plt.show()

### Random Forest Model Evaluation

The Random Forest classifier demonstrated strong performance in detecting fraudulent transactions.
Given the business context, recall was prioritized to minimize undetected fraud cases.


In [None]:
import pandas as pd
import numpy as np

# Pair each training column with the importance the fitted forest assigned to it.
importances = rf_model.feature_importances_
features = x_train.columns

feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
# Most important feature first.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [None]:
# Ten highest-ranked features (the frame is already sorted by importance).
feature_importance_df.iloc[:10]


In [None]:
# Horizontal bar chart of the ten highest-ranked features.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df.head(10), x='Importance', y='Feature')
plt.title("Top 10 Most Important Features - Random Forest")
plt.show()

### Feature Importance Analysis

The Random Forest model highlights specific transformed features (e.g., V14, V10) as key contributors to fraud detection.
This aligns with known fraud patterns and supports model interpretability for stakeholders.


In [None]:
import shap



In [None]:
# Explain a fixed random sample of 1000 test rows rather than the whole split
# (random_state makes the sample reproducible).
# NOTE(review): DataFrame.sample raises if x_test has fewer than 1000 rows.
x_test_sample = x_test.sample(1000, random_state=42)

explainer = shap.TreeExplainer(rf_model)
# NOTE(review): for a classifier, the structure returned by shap_values
# (per-class list vs. a single array with a class axis) depends on the installed
# shap version - confirm the summary plot below is showing the fraud class as intended.
shap_values = explainer.shap_values(x_test_sample)
shap.summary_plot(shap_values, x_test_sample)




In [None]:
# Keep only the second probability column - presumably the fraud class (label 1);
# verify against rf_model.classes_.
y_proba = rf_model.predict_proba(X=x_test)[:, 1]


In [None]:
import numpy as np
from sklearn.metrics import recall_score

# Candidate cut-offs on a 0.05-step grid starting at 0.05 (0.9 is excluded).
thresholds = np.arange(0.05, 0.9, 0.05)

# Recall obtained when a transaction is flagged whenever its fraud probability
# is at or above the cut-off, one value per candidate.
recalls = [
    recall_score(y_test, (y_proba >= cutoff).astype(int))
    for cutoff in thresholds
]


In [None]:
import matplotlib.pyplot as plt

# Recall curve across the candidate thresholds, with the 95% target as a reference line.
plt.figure(figsize=(8, 5))
plt.plot(thresholds, recalls, marker='o')
plt.axhline(y=0.95, color='red', linestyle='--', label='Recall target = 95%')
plt.title("Recall vs Threshold")
plt.xlabel("Threshold")
plt.ylabel("Recall")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import warnings

# Recall can only stay equal or drop as the threshold rises, so the thresholds
# that meet the target form a prefix of the grid. Taking the FIRST of them always
# returns the lowest grid value; the LAST one is the strictest cut-off that
# still meets the recall target, i.e. the one with the fewest false positives.
qualifying = np.where(np.array(recalls) >= 0.95)[0]

if qualifying.size:
    optimal_threshold = thresholds[qualifying[-1]]
else:
    # Nothing on the grid reaches the target: instead of failing with an
    # IndexError, fall back to the best-recall threshold and say so.
    warnings.warn(
        "No threshold reaches the 95% recall target; "
        "falling back to the threshold with the highest recall."
    )
    optimal_threshold = thresholds[int(np.argmax(recalls))]

optimal_threshold


### Threshold Optimization

The classification threshold was tuned to reach a recall of at least 95% (the 0.95 target line in the plot above), prioritizing the detection of fraudulent transactions.
This approach minimizes false negatives at the cost of increased false positives, which is acceptable in fraud detection contexts.


In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points from the fraud probabilities; the returned thresholds are not needed.
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_proba)

# Area under that curve.
roc_auc = auc(x=fpr, y=tpr)

roc_auc


In [None]:
# ROC curve with the diagonal drawn as the chance-level reference.
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot((0, 1), (0, 1), linestyle='--')
plt.title("ROC Curve - Credit Card Fraud Detection")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid(True)
plt.legend()
plt.show()


### ROC Curve and AUC

The ROC-AUC score demonstrates the model’s strong discriminative power in distinguishing fraudulent from legitimate transactions, even under severe class imbalance.


In [None]:
from pathlib import Path

# Summary table of the two headline numbers produced above.
metrics_df = pd.DataFrame({
    "Metric": ["ROC_AUC", "Optimal_Threshold"],
    "Value": [roc_auc, optimal_threshold]
})

# to_csv does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails here on "Run All".
metrics_path = Path("../outputs/metrics/model_metrics.csv")
metrics_path.parent.mkdir(parents=True, exist_ok=True)

metrics_df.to_csv(metrics_path, index=False)


In [None]:
from pathlib import Path

# savefig does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails here on "Run All".
roc_figure_path = Path("../outputs/figures/roc_curve.png")
roc_figure_path.parent.mkdir(parents=True, exist_ok=True)

# Re-draw the ROC curve and write it to disk instead of showing it.
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Credit Card Fraud Detection")
plt.legend()
plt.savefig(roc_figure_path, dpi=300, bbox_inches="tight")
# Close the figure so it is not also rendered inline and its memory is released.
plt.close()


In [None]:
import pandas as pd

def load_data(path):
    """Read the CSV file at ``path`` and return its contents as a DataFrame.

    Parameters
    ----------
    path : anything accepted by ``pandas.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with pandas' default options.
    """
    frame = pd.read_csv(path)
    return frame


In [None]:
from imblearn.over_sampling import SMOTE

def apply_smote(x, y, random_state=42):
    """Oversample ``(x, y)`` with SMOTE and return the resampled data.

    Parameters
    ----------
    x : feature matrix handed to ``SMOTE.fit_resample``.
    y : labels handed to ``SMOTE.fit_resample``.
    random_state : seed forwarded to the ``SMOTE`` constructor (default 42).

    Returns
    -------
    Whatever ``SMOTE.fit_resample(x, y)`` returns (the resampled features and
    labels).

    Notes
    -----
    Pass training data only: resampling before a train/test split lets
    synthetic samples built from test rows leak into training.
    """
    sampler = SMOTE(random_state=random_state)
    resampled = sampler.fit_resample(x, y)
    return resampled


In [None]:
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(x_train, y_train, n_estimators=100, random_state=42, n_jobs=-1):
    """Fit a ``RandomForestClassifier`` on the given training data.

    Parameters
    ----------
    x_train : training features passed to ``fit``.
    y_train : training labels passed to ``fit``.
    n_estimators : number of trees (default 100).
    random_state : seed for the forest (default 42, the previously hard-coded
        value), exposed so callers can vary it like ``apply_smote`` allows.
    n_jobs : parallelism setting forwarded to the classifier (default -1, the
        previously hard-coded value).

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=n_jobs
    )
    model.fit(x_train, y_train)
    return model


In [None]:
import numpy as np
from sklearn.metrics import roc_curve, auc, recall_score

def evaluate_model(model, x_test, y_test, recall_target=0.95):
    """Pick a decision threshold for a recall target and compute the ROC AUC.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``; the second probability
        column is used as the positive-class score.
    x_test : features to score.
    y_test : true labels for ``x_test``.
    recall_target : minimum recall the chosen threshold should deliver
        (default 0.95).

    Returns
    -------
    tuple
        ``(optimal_threshold, roc_auc)``. ``optimal_threshold`` is the highest
        value on the 0.05-step grid whose recall is at least ``recall_target``.
        If no grid value reaches the target, a warning is emitted and the grid
        value with the highest recall is returned instead.
    """
    import warnings

    y_proba = model.predict_proba(x_test)[:, 1]

    thresholds = np.arange(0.05, 0.9, 0.05)
    recalls = np.array(
        [recall_score(y_test, (y_proba >= t).astype(int)) for t in thresholds]
    )

    # Recall never increases with the threshold, so the qualifying thresholds are
    # a prefix of the grid: the first one is always the lowest grid value, while
    # the last one is the strictest cut-off that still meets the target.
    qualifying = np.where(recalls >= recall_target)[0]
    if qualifying.size:
        optimal_threshold = thresholds[qualifying[-1]]
    else:
        # Previously this case raised an opaque IndexError.
        warnings.warn(
            f"No threshold reaches recall_target={recall_target}; "
            "falling back to the threshold with the highest recall."
        )
        optimal_threshold = thresholds[int(np.argmax(recalls))]

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    return optimal_threshold, roc_auc


In [None]:
from pathlib import Path

# One row per test transaction: true label, fraud probability, and the label
# obtained by applying the tuned threshold to that probability.
pred_df = pd.DataFrame({
    "Actual": y_test,
    "Probability_Fraud": y_proba,
    "Prediction": (y_proba >= optimal_threshold).astype(int)
})

# to_csv does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails here on "Run All".
predictions_path = Path("../outputs/metrics/predictions.csv")
predictions_path.parent.mkdir(parents=True, exist_ok=True)

pred_df.to_csv(predictions_path, index=False)
