# In [None]:
# Step 1: Install Required Libraries
# You may need to install these packages first if you haven't already:
# pip install pandas scikit-learn matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve
import joblib

# Step 2: Load and Preprocess Data
# Load the dataset (replace 'creditcard.csv' with the path to your copy).
# Here we use the Kaggle credit card fraud detection dataset
data = pd.read_csv('creditcard.csv')

# Display the first few rows of the dataset
print(data.head())

# Step 2.1: Handling Missing Values
print("Missing values in each column:\n", data.isnull().sum())
# Forward fill. DataFrame.ffill() is the supported spelling; passing
# method='ffill' to fillna() is deprecated in recent pandas releases.
# NOTE: forward fill cannot fill gaps in the very first row(s); those stay NaN.
data.ffill(inplace=True)

# Step 2.2: Feature Engineering
# Create new features based on existing ones, e.g., transaction amounts, time intervals
# log1p(x) == log(x + 1), but is more accurate for amounts close to zero.
data['TransactionAmount_log'] = np.log1p(data['Amount'])  # Log transformation
# Whole hours via floor division. NOTE(review): if `Time` is seconds elapsed
# since the first transaction (as in the Kaggle dataset), this is elapsed
# hours, not hour-of-day -- confirm that is the intended feature.
data['TransactionTime_hour'] = data['Time'] // 3600  # Convert seconds to hours

# Step 2.3: Feature Selection
# Scaling is deferred until after the train/test split (Step 4) so the scaler
# never sees test-set statistics.
features = data[['TransactionAmount_log', 'TransactionTime_hour']]  # Select relevant features

# Step 3: Exploratory Data Analysis (EDA)
plt.figure(figsize=(10, 6))
sns.countplot(x='Class', data=data)
plt.title('Distribution of Classes (0: Legitimate, 1: Fraud)')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

# Step 4: Train-Test Split and Scaling
# Split the *unscaled* features first; stratify keeps the fraud ratio equal in
# both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features,
    data['Class'],
    test_size=0.2,
    random_state=42,
    stratify=data['Class'],
)

# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split. Fitting on the full dataset would leak
# test-set mean/variance into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any code that still expects the full scaled matrix; it uses the
# train-fitted scaler, so it is consistent with X_train / X_test.
features_scaled = scaler.transform(features)

# Step 5: Implement Anomaly Detection Algorithms
# Initialize models
models = {
    "Isolation Forest": IsolationForest(contamination=0.01, random_state=42),
    # novelty=True lets a LOF model fitted on the training data score unseen
    # samples via predict(); without it only fit_predict() exists, which
    # refits on whatever data it is given.
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, contamination=0.01, novelty=True),
    "One-Class SVM": OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
}

results = {}

for model_name, model in models.items():
    # Every detector is fitted exactly once, on the training split only.
    model.fit(X_train)

    if isinstance(model, LocalOutlierFactor):
        # In novelty mode predict() is meant for unseen data only, so derive
        # the training labels from the fitted outlier factors and threshold
        # (the same rule fit_predict applies): below offset_ -> outlier (-1).
        y_train_pred = np.where(model.negative_outlier_factor_ < model.offset_, -1, 1)
    else:
        y_train_pred = model.predict(X_train)

    # The test split is only ever scored, never used for fitting.
    y_test_pred = model.predict(X_test)

    # Convert sklearn's convention (-1 = outlier, 1 = inlier) to the dataset's
    # labels (1 = fraud, 0 = legitimate) for evaluation
    y_train_pred = np.where(y_train_pred == -1, 1, 0)
    y_test_pred = np.where(y_test_pred == -1, 1, 0)

    # Store results
    results[model_name] = {
        "y_train_pred": y_train_pred,
        "y_test_pred": y_test_pred,
        "confusion_matrix": confusion_matrix(y_test, y_test_pred),
        "classification_report": classification_report(y_test, y_test_pred)
    }

# Step 6: Model Evaluation
# For each fitted detector, print its test-set classification report followed
# by its confusion matrix.
for model_name in results:
    result = results[model_name]
    print(f"\n{model_name} - Classification Report:\n", result["classification_report"])
    print(f"{model_name} - Confusion Matrix:\n", result["confusion_matrix"])

# Step 7: ROC and Precision-Recall Curves
# Left panel: ROC curves; right panel: precision-recall curves, which are the
# more informative view when the positive (fraud) class is rare.
fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 6))

for model_name, model in models.items():
    # Skip detectors that cannot score unseen samples (Local Outlier Factor
    # only exposes decision_function when built with novelty=True).
    if not hasattr(model, "decision_function"):
        continue

    # decision_function is larger for inliers, while the metrics below expect
    # larger scores for the positive class (Class == 1, fraud). Negate it so
    # the score means "how anomalous"; otherwise the curves are inverted.
    y_scores = -model.decision_function(X_test)

    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = roc_auc_score(y_test, y_scores)
    roc_ax.plot(fpr, tpr, label=f'{model_name} (area = {roc_auc:.2f})')

    precision, recall, _ = precision_recall_curve(y_test, y_scores)
    pr_ax.plot(recall, precision, label=model_name)

# Diagonal = performance of a random classifier.
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc='lower right')

pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend(loc='upper right')

plt.tight_layout()
plt.show()

# Step 8: Save the Best Model
# Persist the Isolation Forest (chosen for this example) together with the
# fitted scaler, since new data must be scaled the same way before prediction.
artifacts = {
    'fraud_detection_model.pkl': models["Isolation Forest"],
    'scaler.pkl': scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

# Step 9: Load the Model and Test with New Data (Example)
# To load the model later, you can use:
# loaded_model = joblib.load('fraud_detection_model.pkl')
# loaded_scaler = joblib.load('scaler.pkl')

# New data example (the columns must be engineered exactly as in Step 2.2:
# log(Amount + 1) and Time // 3600, before scaling)
# new_data = pd.DataFrame({'TransactionAmount_log': [2.5], 'TransactionTime_hour': [5]})
# new_data_scaled = loaded_scaler.transform(new_data)
# prediction = loaded_model.predict(new_data_scaled)
# print(f'Prediction: {prediction}')  # -1 = anomaly (possible fraud), 1 = inlier (legitimate)