In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Step 1: Generate synthetic fraud detection data
# A seeded generator makes the dataset (and therefore the score printed at the
# end of the cell) identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

n_samples = 10000
n_features = 30

# Create features: i.i.d. standard-normal noise, one column per feature
X = rng.standard_normal((n_samples, n_features))

# Create imbalanced class labels (0 = not fraud, 1 = fraud), ~2% positives.
# NOTE: the labels are drawn independently of X, so the features carry no
# signal about the class; any classifier trained on this data is expected to
# score a ROC AUC close to 0.5 (chance level).
y = rng.choice([0, 1], size=n_samples, p=[0.98, 0.02])

# Create a DataFrame to simulate 'creditcard.csv' structure:
# feature columns V1..V30 followed by the 'Class' target column
columns = [f'V{i}' for i in range(1, n_features + 1)]
df = pd.DataFrame(X, columns=columns)
df['Class'] = y

# Step 2: Separate the predictors from the target, then hold out a test set
y = df['Class']
X = df.drop(columns=['Class'])

# 80/20 split with a fixed random_state; stratifying on the target keeps the
# rare positive class represented in the same proportion in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 3: Fit a logistic-regression classifier on the training partition.
# The solver is allowed up to 1000 iterations; fit() returns the fitted
# estimator, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Step 4: Score the held-out partition with ROC AUC.
# predict_proba yields one probability column per class; column 1 is used as
# the score for the positive (fraud) label.
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

print(f"ROC AUC Score: {roc_auc:.4f}")


ROC AUC Score: 0.5188
