In [8]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tensorflow.keras import layers, models

In [9]:
# Load and preprocess data
def load_and_preprocess_data(file_path):
    """Read the transactions CSV and encode the transaction type numerically.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded table without the ``isFlaggedFraud`` column and with the
        ``type`` labels replaced by numeric codes. Labels that are not in the
        mapping become NaN, because ``Series.map`` leaves unmatched values
        missing.
    """
    type_codes = {"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5}
    transactions = pd.read_csv(file_path).drop(columns=['isFlaggedFraud'])
    transactions["type"] = transactions["type"].map(type_codes)
    return transactions


In [10]:
# Feature Engineering
def engineer_features(df):
    """Derive ratio, per-account and time-of-day features for each transaction.

    The input frame is not modified: all new columns are written to a
    separate frame, so calling this function repeatedly on the same ``df``
    always starts from the same state.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``step``, ``amount``, ``nameOrig``, ``nameDest``,
        ``oldbalanceOrg``, ``newbalanceOrig``, ``oldbalanceDest`` and
        ``newbalanceDest``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every input column except ``nameOrig`` and
        ``nameDest``, followed by the engineered columns in this order:
        ``amount_to_oldbalanceOrg``, ``amount_to_newbalanceOrig``,
        ``balance_change_ratio_orig``, ``balance_change_ratio_dest``,
        ``transaction_freq_orig``, ``transaction_freq_dest``,
        ``avg_amount_orig``, ``avg_amount_dest``, ``amount_to_avg_orig``,
        ``amount_to_avg_dest``, ``hour``, ``is_night``.
    """
    # ``drop`` returns a new frame, so the assignments below never touch the
    # caller's ``df``. The identifier columns are only needed for grouping.
    features = df.drop(['nameOrig', 'nameDest'], axis=1)

    amount = df['amount']
    by_origin = df.groupby('nameOrig')
    by_destination = df.groupby('nameDest')

    # Amount and balance movement relative to the balance; the +1 keeps the
    # denominator non-zero when a balance is 0.
    features['amount_to_oldbalanceOrg'] = amount / (df['oldbalanceOrg'] + 1)
    features['amount_to_newbalanceOrig'] = amount / (df['newbalanceOrig'] + 1)
    features['balance_change_ratio_orig'] = (
        (df['newbalanceOrig'] - df['oldbalanceOrg']) / (df['oldbalanceOrg'] + 1)
    )
    features['balance_change_ratio_dest'] = (
        (df['newbalanceDest'] - df['oldbalanceDest']) / (df['oldbalanceDest'] + 1)
    )

    # Per-account aggregates, broadcast back to one value per transaction.
    features['transaction_freq_orig'] = by_origin['step'].transform('count')
    features['transaction_freq_dest'] = by_destination['step'].transform('count')
    features['avg_amount_orig'] = by_origin['amount'].transform('mean')
    features['avg_amount_dest'] = by_destination['amount'].transform('mean')

    # NOTE: a group mean of 0 is not guarded against here; the division then
    # produces NaN/inf exactly as plain pandas division does.
    features['amount_to_avg_orig'] = amount / features['avg_amount_orig']
    features['amount_to_avg_dest'] = amount / features['avg_amount_dest']

    # ``step`` modulo 24 is used as the hour of day; 22..23 and 0..6 count as night.
    hour = df['step'] % 24
    features['hour'] = hour
    features['is_night'] = ((hour >= 22) | (hour <= 6)).astype(int)
    return features

In [4]:
# Build VAE model
def build_vae(input_dim, encoding_dim):
    """Assemble and compile a small dense variational autoencoder.

    The encoder is one 32-unit ReLU layer feeding two linear heads (latent
    mean and latent log-variance). A latent sample is drawn from those heads
    and decoded by a 32-unit ReLU layer followed by a linear output layer of
    width ``input_dim``.

    Parameters
    ----------
    input_dim : int
        Number of input features; also the width of the reconstruction.
    encoding_dim : int
        Width of the latent space.

    Returns
    -------
    keras Model
        Model mapping inputs to reconstructions, compiled with the Adam
        optimizer. The loss is attached through ``add_loss``, so ``fit`` is
        called without targets.
    """
    inputs = layers.Input(shape=(input_dim,))
    encoder_hidden = layers.Dense(32, activation='relu')(inputs)
    z_mean = layers.Dense(encoding_dim)(encoder_hidden)
    z_log_var = layers.Dense(encoding_dim)(encoder_hidden)

    def sampling(args):
        """Draw ``mean + std * noise`` with standard-normal noise per batch row."""
        mean, log_var = args
        batch_size = tf.keras.backend.shape(mean)[0]
        noise = tf.keras.backend.random_normal(shape=(batch_size, encoding_dim))
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

    z = layers.Lambda(sampling)([z_mean, z_log_var])

    decoder_hidden = layers.Dense(32, activation='relu')(z)
    outputs = layers.Dense(input_dim)(decoder_hidden)

    vae = models.Model(inputs, outputs)

    # Reconstruction term: squared error averaged over every element.
    squared_error = tf.square(inputs - outputs)
    reconstruction_loss = tf.reduce_mean(squared_error)
    # KL term: summed over the latent axis, giving one value per row.
    kl_terms = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
    kl_loss = -0.5 * tf.reduce_sum(kl_terms, axis=-1)
    vae_loss = tf.reduce_mean(reconstruction_loss + kl_loss)

    # NOTE(review): this passes symbolic tensors to add_loss — confirm the
    # installed Keras version supports that.
    vae.add_loss(vae_loss)
    vae.compile(optimizer='adam')
    return vae


In [5]:
# Load the raw transactions, derive features, then discard rows with any NaN
data = load_and_preprocess_data("onlinefraud .csv")
data_engineered = engineer_features(data)
data_cleaned = data_engineered.dropna()

# Separate the fraud label from the predictors
y = data_cleaned['isFraud']
X = data_cleaned.drop(columns='isFraud')

# Hold out 20% of the rows, keeping the class proportions in both splits
# NOTE(review): engineer_features computes its per-account aggregates on the
# whole table before this split, so held-out rows contribute to the training
# features — confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise with statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Define resampling strategy
# Look up the position of the encoded 'type' column by name instead of
# hard-coding index 1. Scaling keeps the column order of X, so the index found
# here is also valid for X_train_scaled.
type_column_index = list(X.columns).index('type')
undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
oversampler = SMOTENC(categorical_features=[type_column_index], sampling_strategy=0.7, random_state=42)

# Create resampling pipeline: undersample first, then oversample with SMOTENC
resampling_pipeline = Pipeline([
    ('undersampler', undersampler),
    ('oversampler', oversampler)
])

# Apply resampling to training data only
# NOTE(review): a later cell resamples X_train_with_re and reassigns
# X_train_resampled / y_train_resampled, so this result is only used by the
# shape/distribution printout that follows.
X_train_resampled, y_train_resampled = resampling_pipeline.fit_resample(X_train_scaled, y_train)



In [None]:
# Report how resampling changed the size of the training matrix
for label, array in (
    ("Original training dataset shape:", X_train_scaled),
    ("Resampled training dataset shape:", X_train_resampled),
):
    print(label, array.shape)

# Class counts after resampling
print(
    "\nResampled training set distribution:\n",
    pd.Series(y_train_resampled).value_counts(),
)


In [None]:
# Train VAE on the scaled training features (no targets: the loss is attached
# inside build_vae)
encoding_dim = 10
input_dim = X_train_scaled.shape[1]
vae = build_vae(input_dim, encoding_dim)
vae.fit(
    X_train_scaled,
    epochs=50,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

# Per-row mean squared reconstruction error on the training split
reconstructions = vae.predict(X_train_scaled)
mse = np.square(X_train_scaled - reconstructions).mean(axis=1)
# 95th percentile of the training error; not referenced by the visible cells
threshold = np.percentile(mse, 95)  # Adjust this percentile as needed

# Append the reconstruction error as one extra, last feature column
X_train_with_re = np.column_stack((X_train_scaled, mse))
X_test_with_re = np.column_stack((
    X_test_scaled,
    np.square(X_test_scaled - vae.predict(X_test_scaled)).mean(axis=1),
))

In [None]:
# Define resampling strategy
# Look up the position of the encoded 'type' column by name instead of
# hard-coding index 1. X_train_with_re keeps the column order of X and only
# appends the reconstruction error as the last column, so the index carries over.
type_column_index = list(X.columns).index('type')
undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
oversampler = SMOTENC(categorical_features=[type_column_index], sampling_strategy=0.7, random_state=42)

# Create resampling pipeline: undersample first, then oversample with SMOTENC
resampling_pipeline = Pipeline([
    ('undersampler', undersampler),
    ('oversampler', oversampler)
])

# Apply resampling to the training matrix that includes the reconstruction error
X_train_resampled, y_train_resampled = resampling_pipeline.fit_resample(X_train_with_re, y_train)

print("Original dataset shape:", X_train_with_re.shape)
print("Resampled dataset shape:", X_train_resampled.shape)

In [None]:
# Fit an RBF-kernel SVM on the resampled training set
svm_classifier = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Hard labels plus the probability column at index 1 for the held-out split
# (presumably the isFraud == 1 class — verify against svm_classifier.classes_)
y_pred = svm_classifier.predict(X_test_with_re)
y_pred_proba = svm_classifier.predict_proba(X_test_with_re)[:, 1]

# Evaluate the model: each heading is followed by its metric on the next line
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_proba), sep="\n")

In [None]:
# Draw the test-set confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Feature importance
# The classifier uses kernel='rbf', and SVC only exposes `coef_` for a linear
# kernel, so reading coefficients here would raise AttributeError. Permutation
# importance works for any fitted estimator: it measures how much the ROC AUC
# drops when one feature column is shuffled.
perm_result = permutation_importance(
    svm_classifier,
    X_test_with_re,
    y_test,
    scoring='roc_auc',
    n_repeats=5,
    # Score on a random subsample so repeated SVM predictions stay affordable;
    # max_samples must not exceed the number of available rows.
    max_samples=min(20000, len(y_test)),
    random_state=42,
)

# Feature names follow the column order of X, plus the appended error column
feature_importance = pd.DataFrame({
    'feature': list(X.columns) + ['reconstruction_error'],
    'importance': perm_result.importances_mean
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(15))
plt.title('Top 15 Features by Permutation Importance (ROC AUC drop)')
plt.tight_layout()
plt.show()