In [39]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE

In [40]:
# Load Multiple .pkl Files from a Directory
def load_data_from_directory(directory_path):
    all_data = []
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.pkl'):
            file_path = os.path.join(directory_path, file_name)
            print(f"Loading file: {file_path}")
            df = pd.read_pickle(file_path)
            all_data.append(df)
    combined_data = pd.concat(all_data, ignore_index=True)
    print(f"Combined Data Shape: {combined_data.shape}")
    return combined_data

In [41]:
import zipfile
import os

zip_path = 'dataset/data/dataset.zip'
extract_dir = 'dataset/data/extracted_data'

# Create the directory to extract files if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Extract the contents of the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"Files extracted to: {extract_dir}")


Files extracted to: dataset/data/extracted_data


In [42]:
import os

directory_path = 'dataset/data/extracted_data/data'

# List all files in the directory
files_in_directory = os.listdir(directory_path)
print("Files in directory:", files_in_directory)

# Filter for .pkl files
pkl_files = [file for file in files_in_directory if file.endswith('.pkl')]
print("PKL files found:", pkl_files)

if not pkl_files:
    print("No .pkl files found in the directory.")


Files in directory: ['2018-04-17.pkl', '2018-04-18.pkl', '2018-08-15.pkl', '2018-09-08.pkl', '2018-05-13.pkl', '2018-07-23.pkl', '2018-05-06.pkl', '2018-07-16.pkl', '2018-09-10.pkl', '2018-07-10.pkl', '2018-09-26.pkl', '2018-09-17.pkl', '2018-08-17.pkl', '2018-07-09.pkl', '2018-06-18.pkl', '2018-07-31.pkl', '2018-09-22.pkl', '2018-08-12.pkl', '2018-07-24.pkl', '2018-06-24.pkl', '2018-07-14.pkl', '2018-04-03.pkl', '2018-09-01.pkl', '2018-05-22.pkl', '2018-09-29.pkl', '2018-04-30.pkl', '2018-09-20.pkl', '2018-07-26.pkl', '2018-05-10.pkl', '2018-08-22.pkl', '2018-05-09.pkl', '2018-08-30.pkl', '2018-05-30.pkl', '2018-09-25.pkl', '2018-06-19.pkl', '2018-05-25.pkl', '2018-09-19.pkl', '2018-06-11.pkl', '2018-04-22.pkl', '2018-08-06.pkl', '2018-05-31.pkl', '2018-07-29.pkl', '2018-07-17.pkl', '2018-06-10.pkl', '2018-05-17.pkl', '2018-09-16.pkl', '2018-05-05.pkl', '2018-07-20.pkl', '2018-06-08.pkl', '2018-05-01.pkl', '2018-09-13.pkl', '2018-06-01.pkl', '2018-04-07.pkl', '2018-07-06.pkl', '2018-0

In [43]:
# Feature Engineering
data['TX_DATETIME'] = pd.to_datetime(data['TX_DATETIME'])
data['TX_DAY'] = data['TX_DATETIME'].dt.day
data['TX_HOUR'] = data['TX_DATETIME'].dt.hour
data['AMOUNT_FLAG'] = (data['TX_AMOUNT'] > 220).astype(int)
data = data.sort_values(by='TX_DATETIME')

data['TERMINAL_FRAUD_COUNT'] = data.groupby('TERMINAL_ID')['TX_FRAUD'] \
    .rolling(window=28, min_periods=1).sum().reset_index(0, drop=True)
customer_avg_spend = data.groupby('CUSTOMER_ID')['TX_AMOUNT'].mean()
data['CUSTOMER_AVG_SPEND'] = data['CUSTOMER_ID'].map(customer_avg_spend)
data.fillna(0, inplace=True)

In [44]:
# Define Features (X) and Target (y)
X = data.drop(['TRANSACTION_ID', 'TX_DATETIME', 'TX_FRAUD'], axis=1)
y = data['TX_FRAUD']

In [45]:
# Handle Imbalance Using SMOTE
print("Balancing the dataset using SMOTE...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print(f"Resampled Dataset Shape: {X_resampled.shape}")

Balancing the dataset using SMOTE...
Resampled Dataset Shape: (38110, 12)


In [46]:
# Split into Train and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [47]:
# Hyperparameter Tuning with GridSearchCV
print("Tuning model hyperparameters...")
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced']
}

model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=3, verbose=2)
grid_search.fit(X_train, y_train)

Tuning model hyperparameters...
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.9s
[CV] END class_weight=balanced, max_de

In [48]:
# Use the best model from GridSearchCV
best_model = grid_search.best_estimator_
print(f"Best Model Parameters: {grid_search.best_params_}")

Best Model Parameters: {'class_weight': 'balanced', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [49]:
# Evaluate the Model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC-AUC Score: {roc_auc:.2f}")


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3794
           1       1.00      1.00      1.00      3828

    accuracy                           1.00      7622
   macro avg       1.00      1.00      1.00      7622
weighted avg       1.00      1.00      1.00      7622

Accuracy: 1.00
ROC-AUC Score: 1.00


In [50]:
# Save the Trained Model to a .pkl File
if accuracy >= 0.9:
    model_file_path = 'optimized_fraud_detection_model.pkl'
    with open(model_file_path, 'wb') as file:
        pickle.dump(best_model, file)
    print(f"\nModel saved to {model_file_path}")
else:
    print("\nThe model did not achieve the desired 90% accuracy.")

# Load and Test the Saved Model (Optional)
with open(model_file_path, 'rb') as file:
    loaded_model = pickle.load(file)

sample_data = X_test.iloc[:5]
sample_predictions = loaded_model.predict(sample_data)
print("\nSample Predictions:")
print(sample_predictions)


Model saved to optimized_fraud_detection_model.pkl

Sample Predictions:
[0 1 0 0 1]


In [51]:
import joblib
joblib.dump(best_model, 'optimized_fraud_detection_model.joblib')

['optimized_fraud_detection_model.joblib']

In [52]:
import joblib
loaded_model = joblib.load('optimized_fraud_detection_model.joblib')

In [53]:
# Prepare the data (replace with your full dataset)
X_full = data.drop(['TRANSACTION_ID', 'TX_DATETIME', 'TX_FRAUD'], axis=1)

In [54]:
y_pred_full = loaded_model.predict(X_full)

In [55]:
accuracy_full = accuracy_score(data['TX_FRAUD'], y_pred_full)
print(f"Accuracy on Full Dataset: {accuracy_full:.4f}")

Accuracy on Full Dataset: 1.0000


In [56]:
print("\nFull Dataset Classification Report:")
print(classification_report(data['TX_FRAUD'], y_pred_full))


Full Dataset Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19055
           1       1.00      1.00      1.00        16

    accuracy                           1.00     19071
   macro avg       1.00      1.00      1.00     19071
weighted avg       1.00      1.00      1.00     19071



In [57]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(data['TX_FRAUD'], y_pred_full)
print("\nConfusion Matrix:")
print(cm)


Confusion Matrix:
[[19055     0]
 [    0    16]]


In [59]:
data['PREDICTED_TX_FRAUD'] = y_pred_full

# Save the results to a CSV
data.to_csv('fraud_detection_results.csv', index=False)
print("Predictions saved to 'fraud_detection_results.csv'.")

Predictions saved to 'fraud_detection_results.csv'.
