In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from pylab import rcParams

# Plot defaults and shared constants
rcParams['figure.figsize'] = 14, 8  # default figure size as (width, height)
RANDOM_SEED = 42  # single seed reused wherever a random_state is needed
LABELS = ["Normal", "Fraud"]  # display names for Class 0 and Class 1

# Load the dataset.
# The path must be a raw string: in an ordinary string literal "\U" begins a
# \UXXXXXXXX unicode escape, so "C:\Users\..." fails to compile in Python 3.
data = pd.read_csv(r"C:\Users\Saaketh\Desktop\creditcard\creditcard.csv")

# Report whether any cell of the frame is null
if data.isnull().values.any():
    print("The dataset contains missing values")
else:
    print("No missing values found")

# Plot the class distribution.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
# Reindexing to [0, 1] pins the bar order to LABELS (Normal, Fraud) instead of
# relying on frequency order, and keeps a zero-height bar for an absent class.
count_classes = data['Class'].value_counts().reindex([0, 1], fill_value=0)
count_classes.plot(kind='bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xticks(range(len(LABELS)), LABELS)
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# Split the frame by label: Class == 1 rows are fraud, Class == 0 rows are normal
fraud = data.loc[data['Class'] == 1]
normal = data.loc[data['Class'] == 0]

# Print summary statistics of the Amount column, fraud first, then normal
for heading, subset in (
    ("Fraud Transactions - Amount Statistics:\n", fraud),
    ("Normal Transactions - Amount Statistics:\n", normal),
):
    print(heading, subset['Amount'].describe())

# Histogram of transaction amounts, one panel per class on a shared x axis
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Amount per transaction by class')
for axis, title, subset in ((ax1, 'Fraud', fraud), (ax2, 'Normal', normal)):
    axis.hist(subset['Amount'], bins=50)
    axis.set_title(title)
# NOTE(review): these pyplot calls act on the current axes only (presumably the
# bottom panel) -- confirm the top panel is meant to stay linear and unlabelled.
plt.xlabel('Amount ($)')
plt.ylabel('Number of Transactions')
plt.yscale('log')
plt.show()

# Scatter of transaction time against amount, again one panel per class
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')
for axis, title, subset in ((ax1, 'Fraud', fraud), (ax2, 'Normal', normal)):
    axis.scatter(subset['Time'], subset['Amount'])
    axis.set_title(title)
plt.xlabel('Time (in Seconds)')
plt.ylabel('Amount')
plt.show()

# Sample 10% of the data for faster computation
data_sample = data.sample(frac=0.1, random_state=RANDOM_SEED)

# Split the sample by label
fraud_sample = data_sample[data_sample['Class'] == 1]
valid_sample = data_sample[data_sample['Class'] == 0]

# Outlier fraction = share of fraud rows in the whole sample.
# This value is passed to the detectors as `contamination`, which scikit-learn
# defines as the proportion of outliers in the data set, so the denominator is
# the full sample size rather than the number of valid rows.
outlier_fraction = len(fraud_sample) / len(data_sample)
print("Outlier Fraction:", outlier_fraction)

# Correlation heatmap over every column of the sample
corrmat = data_sample.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corrmat, annot=True, cmap="RdYlGn")
plt.show()

# Feature matrix (X) is every column except the label; target vector (Y) is the label
X = data_sample.drop(['Class'], axis=1)
Y = data_sample['Class']

# Hold out 20% of the sample for evaluation
xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_SEED)

# Define outlier detection models, keyed by display name
classifiers = {
    # max_samples is sized to the training split, which is what the forest is
    # fitted on; len(X) exceeds it, which makes scikit-learn warn and clamp.
    "Isolation Forest": IsolationForest(n_estimators=100, max_samples=len(xTrain),
                                       contamination=outlier_fraction, random_state=RANDOM_SEED),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, contamination=outlier_fraction),
    "One-Class SVM": OneClassSVM(kernel='rbf', gamma=0.1, nu=0.05)
}

# Evaluate each model on the held-out split
for clf_name, clf in classifiers.items():
    if clf_name == "Local Outlier Factor":
        # LOF is constructed with the default novelty setting, so it is fitted
        # and scored on the same data in a single fit_predict call.
        y_pred = clf.fit_predict(xTest)
        scores_prediction = clf.negative_outlier_factor_
    else:
        clf.fit(xTrain)
        y_pred = clf.predict(xTest)

    # Every detector here reports +1 for inliers and -1 for outliers, while the
    # ground truth uses 0 (valid) / 1 (fraud). Convert the predictions of ALL
    # models -- skipping LOF would score its +1/-1 output against 0/1 labels.
    y_pred = np.where(y_pred == -1, 1, 0)

    # Calculate metrics
    n_errors = (y_pred != yTest).sum()
    print(f"{clf_name}: Number of errors = {n_errors}")
    print(f"{clf_name} - Accuracy: {accuracy_score(yTest, y_pred)}")
    print(f"{clf_name} - Classification Report:\n{classification_report(yTest, y_pred)}")

    # Confusion matrix; labels=[0, 1] forces a 2x2 result so the two LABELS
    # tick names always line up, even if one class is absent from the split.
    conf_matrix = confusion_matrix(yTest, y_pred, labels=[0, 1])
    plt.figure(figsize=(8, 8))
    sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
    plt.title(f"Confusion Matrix for {clf_name}")
    plt.ylabel('True Class')
    plt.xlabel('Predicted Class')
    plt.show()
