In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest
from kafka import KafkaProducer, KafkaConsumer
import json
import time
import numpy as np
import joblib
from multiprocessing import Process

# Load the dataset
file_path = 'C:/Users/User/fraud_detection_project/data/creditcard.csv'
data = pd.read_csv(file_path)

# Check for missing values
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Scale 'Amount' and 'Time' features
scaler = StandardScaler()
data[['Amount', 'Time']] = scaler.fit_transform(data[['Amount', 'Time']])

# Feature Engineering: Create new features or transform existing features
data['Amount_log'] = np.log1p(data['Amount'])
data['Time_hour'] = data['Time'] // 3600 % 24
data = data.drop(['Time'], axis=1)

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Anomaly detection using Isolation Forest
iso_forest = IsolationForest(contamination=0.01, random_state=42)
iso_forest.fit(X_train)
anomalies = iso_forest.predict(X_test)

# Convert anomalies to binary classification (1: anomaly, 0: normal)
anomalies = [1 if x == -1 else 0 for x in anomalies]
report_iso_forest = classification_report(y_test, anomalies)
conf_matrix_iso_forest = confusion_matrix(y_test, anomalies)
print("Isolation Forest Classification Report:\n", report_iso_forest)
print("Isolation Forest Confusion Matrix:\n", conf_matrix_iso_forest)

# Calculate AUPRC for Isolation Forest
auprc_iso_forest = average_precision_score(y_test, anomalies)
print(f"Isolation Forest AUPRC: {auprc_iso_forest:.4f}")

# Initialize and train the Logistic Regression model with class weights
log_reg = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
report_log_reg = classification_report(y_test, y_pred_log_reg)
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
print("Logistic Regression Classification Report:\n", report_log_reg)
print("Logistic Regression Confusion Matrix:\n", conf_matrix_log_reg)

# Calculate AUPRC for Logistic Regression
auprc_log_reg = average_precision_score(y_test, log_reg.predict_proba(X_test)[:, 1])
print(f"Logistic Regression AUPRC: {auprc_log_reg:.4f}")

# Save the Logistic Regression model for real-time predictions
joblib.dump(log_reg, 'log_reg_model.pkl')
print("Logistic Regression model saved for real-time predictions.")

# Kafka Producer for Real-time Monitoring (Simulating streaming data)
def kafka_producer():
    producer = KafkaProducer(bootstrap_servers='localhost:9092', value_serializer=lambda x: json.dumps(x).encode('utf-8'))
    batch_size = 100
    for i in range(0, len(data), batch_size):
        batch = data.iloc[i:i+batch_size]
        for _, row in batch.iterrows():
            producer.send('transaction_topic', value=row.to_dict())
        time.sleep(0.1)  # Simulate the delay between batches
    print("Kafka Producer finished sending data.")

# Kafka Consumer for Real-time Predictions
def kafka_consumer():
    consumer = KafkaConsumer('transaction_topic', bootstrap_servers='localhost:9092', value_deserializer=lambda x: json.loads(x.decode('utf-8')))
    model = joblib.load('log_reg_model.pkl')
    batch_size = 100
    transactions = []
    for message in consumer:
        transaction = pd.DataFrame([message.value])
        transactions.append(transaction)
        if len(transactions) >= batch_size:
            batch = pd.concat(transactions)
            batch['Amount_log'] = np.log1p(batch['Amount'])
            batch['Time_hour'] = batch['Time'] // 3600 % 24
            batch = batch.drop(['Time'], axis=1)
            predictions = model.predict(batch.drop('Class', axis=1))
            for i, prediction in enumerate(predictions):
                if prediction == 1:
                    print(f"Fraudulent transaction detected: {batch.iloc[i].to_dict()}")
            transactions = []
    print("Kafka Consumer finished processing data.")

# Start Kafka Producer and Consumer
if __name__ == "__main__":
    producer_process = Process(target=kafka_producer)
    consumer_process = Process(target=kafka_consumer)
    producer_process.start()
    consumer_process.start()
    producer_process.join()
    consumer_process.join()


Missing values in each column:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Isolation Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     56864
           1       0.09      0.55      0.16        98

    accuracy                           0.99     56962
   macro avg       0.55      0.77      0.58     56962
weighted avg       1.00      0.99      0.99     56962

Isolation Forest Confusion Matrix:
 [[56334   530]
 [   44    54]]
Isolation Forest AUPRC: 0.0517
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0      