In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.combine import SMOTEENN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
import pickle

# Load both labelled transaction files and merge them into one frame.
# ignore_index=True renumbers the combined rows; exact duplicate rows are then
# dropped (the surviving rows keep their index values).
df1 = pd.read_csv("../data/old.csv")
df2 = pd.read_csv("../data/new_labeled_data.csv")
df = pd.concat([df1, df2], ignore_index=True)
df = df.drop_duplicates()

# Work on a stratified 90% subset so the is_fraud class ratio is preserved.
# The other part of the split is bound to `_` and never used.
subset_fraction = 0.90  # Adjust as needed
df_subset, _ = train_test_split(
    df,
    stratify=df['is_fraud'],
    train_size=subset_fraction,
    random_state=42,
)

# Move the row index into a regular column so each row can be traced back to `df`
df_subset = df_subset.reset_index(drop=False)
df_subset = df_subset.rename(columns={'index': 'original_index'})

# Keep only the traceability column, the model inputs, and the target
selected_features = [
    'original_index',
    'first', 'last', 'cc_num', 'amt', 'gender', 'city', 'state',
    'zip', 'dob', 'job', 'category',
    'is_fraud',
]
df_selected = df_subset.loc[:, selected_features].copy()

# Data Preprocessing
# Derive age in fractional years from the date of birth.
# NOTE(review): age is measured against the wall-clock time of this run, so the
# feature values differ between runs (and between training and later scoring)
# -- confirm that this is intended.
df_selected['dob'] = pd.to_datetime(df_selected['dob'])
df_selected['age'] = (pd.Timestamp.now() - df_selected['dob']).dt.days.div(365.25)

# The raw birth date is redundant once age exists
df_selected = df_selected.drop(columns=['dob'])

# Integer-encode every categorical column, keeping one fitted encoder per
# column so the same mapping can be persisted and reused later.
categorical_columns = ['first', 'last', 'gender', 'city', 'state', 'job', 'category']
label_encoders = {}
for column in categorical_columns:
    le = label_encoders[column] = LabelEncoder()
    df_selected[column] = le.fit_transform(df_selected[column])

# Separate features, target, and indices
X = df_selected.drop(['is_fraud', 'original_index'], axis=1)
y = df_selected['is_fraud']
indices = df_selected['original_index']

# Split the data first (before scaling and before oversampling).
# Fitting the scaler on all rows would let test-set means/variances leak into
# training and into the persisted scaler, so the split has to come first.
# The same random_state/stratify arguments are used, so the rows assigned to
# train and test are unchanged.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y, indices, test_size=0.2, random_state=42, stratify=y
)

# Scale numerical features: learn the statistics from the training rows only,
# then apply that same transform to the held-out rows.
# NOTE(review): cc_num and zip are scaled as if they were continuous
# quantities -- confirm that treating these identifiers numerically is intended.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept for backward compatibility: the full feature matrix in the
# train-fitted scale (nothing in this script's later stages reads it).
X_scaled = scaler.transform(X)

# Handle class imbalance with SMOTE + ENN, applied to the training split only
# so the test split keeps its original class distribution.
smote_enn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)

# Conv1D consumes (samples, timesteps, channels): treat each feature as one
# timestep and append a single trailing channel axis.
X_train_resampled = X_train_resampled[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]

# Build CNN model layer by layer: two Conv1D/MaxPooling1D stages over the
# feature axis, then a dense head ending in one sigmoid unit (binary output).
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, activation='relu',
                 input_shape=(X_train_resampled.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile for binary classification
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture
model.summary()

# Train the model
# Keras takes the `validation_split` fraction from the END of the arrays,
# before any shuffling. The resampled data is not randomly ordered (the
# oversampler is expected to place synthetic minority rows after the original
# rows), so an unshuffled tail would give a validation set with a very
# different class mix from the training part. Shuffle once, reproducibly,
# before handing the data to fit().
# NOTE(review): validation rows are still drawn from resampled data, so
# val_accuracy is optimistic compared with the untouched test split.
shuffle_order = np.random.default_rng(42).permutation(len(X_train_resampled))
X_train_shuffled = X_train_resampled[shuffle_order]
# np.asarray drops any pandas index so the labels are reordered positionally
y_train_shuffled = np.asarray(y_train_resampled)[shuffle_order]

history = model.fit(
    X_train_shuffled, y_train_shuffled,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Score the held-out (non-resampled) test split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)

print(f"\nTest Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# Test on 100 fraud examples
# One row per test sample: its original row id, its true label, and its
# (already reshaped) feature array stored as a single object cell.
test_df = pd.DataFrame({
    'original_index': indices_test,
    'true_label': y_test,
    'features': list(X_test),
})

# Keep only the fraud rows, then take at most the first 100 of them
fraud_test_df = test_df[test_df['true_label'].eq(1)]
fraud_sample = fraud_test_df.iloc[:100]
if len(fraud_sample) < 100:
    print(f"Warning: Only {len(fraud_sample)} fraud cases available in test set.")

# Stack the per-row arrays back into one batch with an explicit channel axis
fraud_features = np.array(fraud_sample['features'].tolist())
fraud_features_reshaped = fraud_features.reshape(
    (fraud_features.shape[0], fraud_features.shape[1], 1)
)

# Predict probabilities and threshold them at 0.5 to get class labels
predictions = model.predict(fraud_features_reshaped)
predicted_classes = (predictions.flatten() > 0.5).astype(int)

# Tabulate the row id and true label next to the model's output
results_df = fraud_sample[['original_index', 'true_label']].assign(
    predicted_label=predicted_classes,
    prediction_probability=predictions.flatten(),
)

# Every sampled row is a true fraud case, so this is the share of them the
# model labelled as fraud.
fraud_accuracy = results_df['true_label'].eq(results_df['predicted_label']).mean()
print(f"\nAccuracy on {len(results_df)} fraud cases: {fraud_accuracy:.4f}")

# Show the per-case table
print("\nDetailed Results for Fraud Cases:")
print(results_df[['original_index', 'true_label', 'predicted_label', 'prediction_probability']])

# Persist the per-case table without the pandas index column
results_df.to_csv('fraud_test_results.csv', index=False)
print("\nFraud test results saved to 'fraud_test_results.csv'")

# Persist the trained network in HDF5 format
model.save('credit_fraud_cnn_model_resampled.h5')

# Pickle the fitted preprocessing objects so new data can be transformed the
# same way the training data was: the scaler first, then the label encoders.
for artifact_path, artifact in (
    ('scaler_resampled.pkl', scaler),
    ('label_encoders_resampled.pkl', label_encoders),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("Model and preprocessing objects saved successfully with SMOTE + ENN!")