In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve
from joblib import Parallel, delayed
import plotly.express as px
import plotly.graph_objects as go

# Read the transactions table from the CSV file in the working directory
data = pd.read_csv('creditcard.csv')

# Preview the leading rows as a quick sanity check of the columns
print(data.head())

# Optional: work on a reproducible 10% sample to keep initial runs fast
data_sample = data.sample(frac=0.1, random_state=42)

# 'Class' is the target column; every remaining column is used as a feature
y = data_sample['Class']
X = data_sample.drop(columns=['Class'])

# Hold out 20% of the rows for testing; stratify on the target so both
# parts are split with respect to the class labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the training and the test rows
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Histogram of the target column, coloured and grouped by class value
fig = px.histogram(
    data_sample,
    x='Class',
    color='Class',
    barmode='group',
    title='Class Distribution',
)
fig.show()

# Histogram of the 'Amount' column using 50 bins
fig = px.histogram(
    data_sample,
    x='Amount',
    nbins=50,
    title='Distribution of Transaction Amounts',
)
fig.show()

# Define models
# The data sample and the train/test split are both seeded with
# random_state=42; the random forest is seeded the same way so that its
# reported metrics and curves are reproducible from run to run.
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    # Reduced number of estimators for speed
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42),
}

# Function to train and evaluate a model
def train_and_evaluate(model, model_name, X_train, X_test, y_train, y_test):
    """Fit ``model``, print a classification report, and return curve data.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        model_name: Label printed with the report and returned to the caller.
        X_train: Training features passed to ``model.fit``.
        X_test: Features the fitted model is evaluated on.
        y_train: Training labels passed to ``model.fit``.
        y_test: True labels for ``X_test``.

    Returns:
        A 3-tuple ``(model_name, (fpr, tpr, roc_auc), (precision, recall))``
        holding the ROC curve points with their area under the curve, and
        the precision-recall curve points.
    """
    model.fit(X_train, y_train)

    # Hard label predictions feed the printed per-class report
    hard_labels = model.predict(X_test)
    print(f"Model: {model_name}")
    print(classification_report(y_test, hard_labels))

    # Column 1 of predict_proba is used as the score for both curves
    # (presumably the positive class -- confirm against model.classes_)
    positive_scores = model.predict_proba(X_test)[:, 1]

    # ROC curve and its area
    roc_fpr, roc_tpr, _ = roc_curve(y_test, positive_scores)
    roc_data = (roc_fpr, roc_tpr, auc(roc_fpr, roc_tpr))

    # Precision-Recall curve
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)

    return model_name, roc_data, (pr_precision, pr_recall)

# Train and evaluate models in parallel using batches
batch_size = 10000  # Upper bound on the number of training rows per batch

# Round the batch count up (and never below 1). Plain floor division
# silently discarded the trailing partial batch, and produced zero batches
# (hence empty results and empty plots) when the training set was smaller
# than batch_size.
n_batches = max(1, (len(X_train) + batch_size - 1) // batch_size)

results = []

# Split the row positions into n_batches near-equal contiguous groups so
# that every training row is used and no batch ends up disproportionately
# small.
batch_positions = np.array_split(np.arange(len(X_train)), n_batches)

for i, positions in enumerate(batch_positions):
    X_train_batch = X_train_scaled[positions]
    y_train_batch = y_train.iloc[positions]

    # Each model is refit from scratch on this batch. When there is more
    # than one batch, tag the name with the batch number so the printed
    # reports and plot legends can be told apart.
    batch_results = Parallel(n_jobs=-1)(
        delayed(train_and_evaluate)(
            model,
            name if n_batches == 1 else f"{name} (batch {i + 1})",
            X_train_batch,
            X_test_scaled,
            y_train_batch,
            y_test,
        )
        for name, model in models.items()
    )
    results.extend(batch_results)

# Flatten each result into per-curve tuples: (name, fpr, tpr, auc) for ROC
# and (name, precision, recall) for precision-recall
roc_curves = [(label, *roc_part) for label, roc_part, _ in results]
pr_curves = [(label, *pr_part) for label, _, pr_part in results]

# ROC figure: one line per result, plus the diagonal as a reference
fig = go.Figure()

for label, false_pos, true_pos, area in roc_curves:
    roc_trace = go.Scatter(
        x=false_pos,
        y=true_pos,
        mode='lines',
        name=f'{label} (AUC = {area:.2f})',
    )
    fig.add_trace(roc_trace)

chance_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random Chance',
    line=dict(dash='dash'),
)
fig.add_trace(chance_trace)

fig.update_layout(
    showlegend=True,
    title='ROC Curves',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
)

fig.show()

# Precision-Recall figure: recall on the x axis, precision on the y axis
fig = go.Figure()

for label, prec, rec in pr_curves:
    pr_trace = go.Scatter(x=rec, y=prec, mode='lines', name=label)
    fig.add_trace(pr_trace)

fig.update_layout(
    showlegend=True,
    title='Precision-Recall Curves',
    xaxis_title='Recall',
    yaxis_title='Precision',
)

fig.show()


   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 