In [15]:
import pandas as pd
import numpy as np
import re
import os
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Location of the input CSV. The CREDITCARDS_CSV environment variable, when
# set, overrides the path so the notebook can run on another machine without
# editing this cell; otherwise the original local path is used.
csv_path = os.environ.get(
    "CREDITCARDS_CSV",
    r"C:\Users\niraj\Downloads\creditcards.csv",
)

# Load the raw dataset; every later cell reads from `df`.
df = pd.read_csv(csv_path)

print("Dataset Loaded Successfully ✅")
print("Shape:", df.shape)

# Last expression of the cell: rich preview of the first rows.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'creditcards.csv'

In [None]:
# Share of missing cells in each column, expressed as a percentage (0-100).
missing_percent = df.isna().mean().mul(100)

# Single summary figure: the unweighted mean of the per-column percentages.
missing_score = missing_percent.mean()

print("Average Missing %:", missing_score)

In [13]:
# Number of rows that exactly repeat an earlier row; the first occurrence of
# each repeated row is not counted (keep="first" is the pandas default).
duplicate_count = df.duplicated(keep="first").sum()
print("Duplicate Rows:", duplicate_count)

Duplicate Rows: 1081


In [14]:
def detect_outliers_iqr(data):
    """Count IQR outliers across all numeric columns of ``data``.

    For each numeric column the fences are ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``; a value is an outlier when it lies strictly outside
    them. Missing values compare as False on both sides and are therefore
    never counted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan; non-numeric columns are ignored.

    Returns
    -------
    int
        Total number of outlying cells, summed over the numeric columns
        (a row can contribute once per column in which it is extreme).
    """
    total = 0
    numeric_columns = data.select_dtypes(include=np.number).columns

    for name in numeric_columns:
        series = data[name]
        first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
        spread = third_quartile - first_quartile

        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        # Explicit lt/gt comparisons so NaN cells evaluate to False.
        outside = series.lt(low_fence) | series.gt(high_fence)
        total += outside.sum()

    return total

# Total number of IQR-outlying cells over every numeric column of the dataset.
outlier_count = detect_outliers_iqr(data=df)
print("Total Outliers:", outlier_count)

Total Outliers: 385104


In [None]:
# Raw weighted penalty, kept under its original name and definition.
# Note that it mixes units: `missing_score` is a percentage (0-100) while the
# other two terms are counts.
quality_score = (
    0.4 * missing_score +
    0.3 * duplicate_count +
    0.3 * outlier_count
)

# Normalise each component to a [0, 1] ratio *before* weighting, so the three
# terms are comparable and the result stays in [0, 1] like the other *_norm
# values that are blended into the final risk score.
row_denominator = df.shape[0] + 1  # +1 keeps the division defined for an empty frame

# Outliers are counted per cell (row x numeric column), so the matching
# denominator is the number of numeric cells, not the number of rows.
numeric_column_count = max(df.select_dtypes(include=np.number).shape[1], 1)

missing_ratio = missing_score / 100
duplicate_ratio = duplicate_count / row_denominator
outlier_ratio = min(outlier_count / (row_denominator * numeric_column_count), 1.0)

quality_norm = (
    0.4 * missing_ratio +
    0.3 * duplicate_ratio +
    0.3 * outlier_ratio
)

print("Quality Norm:", quality_norm)

In [None]:
def detect_sensitive_data(data):
    """Count cells that look like they contain an e-mail address or card number.

    Every column is converted to text and searched with two regular
    expressions (e-mail address, 16-digit card number with optional ``-`` or
    space separators). A cell that matches both patterns is counted twice.

    The search is vectorised with ``Series.str.contains`` rather than a
    Python-level loop over every cell, which matters on frames with millions
    of cells. Missing values are treated as "no match".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan; all columns are inspected regardless of dtype.

    Returns
    -------
    int
        Number of (cell, pattern) matches found.
    """
    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    credit_card_pattern = r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b'

    sensitive_count = 0

    # items() yields each column as a Series even when labels are duplicated.
    for _, column in data.items():
        text = column.astype(str)
        for pattern in (email_pattern, credit_card_pattern):
            matches = text.str.contains(pattern, regex=True, na=False)
            sensitive_count += int(matches.sum())

    return sensitive_count

# Number of (cell, pattern) hits for e-mail addresses / card numbers.
leakage_count = detect_sensitive_data(data=df)

# Hits per row (+1 in the denominator avoids dividing by zero on an empty
# frame). NOTE(review): hits are counted per cell, so this ratio is not
# strictly bounded by 1 when several cells of a row match.
leakage_norm = leakage_count / (len(df) + 1)

print("Leakage Norm:", leakage_norm)

In [None]:
# Numeric feature matrix with missing values replaced by 0.
# This cell writes an "ML_Anomaly" column into `df` further down; exclude it
# here so that re-running the cell does not feed the previous predictions back
# in as a feature (errors="ignore" makes the first run a no-op).
numeric_df = (
    df.select_dtypes(include=np.number)
    .drop(columns=["ML_Anomaly"], errors="ignore")
    .fillna(0)
)

# Standardise features before fitting the isolation forest.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# contamination=0.05 asks the forest to label roughly 5% of rows as anomalies;
# random_state pins the tree construction for reproducibility.
model_if = IsolationForest(contamination=0.05, random_state=42)
model_if.fit(scaled_data)

# predict() returns -1 for anomalies and 1 for inliers.
df["ML_Anomaly"] = model_if.predict(scaled_data)

ml_anomaly_count = (df["ML_Anomaly"] == -1).sum()

# Fraction of rows flagged (+1 in the denominator avoids dividing by zero).
ml_norm = ml_anomaly_count / (df.shape[0] + 1)

print("ML Norm:", ml_norm)

In [None]:
# Rescale every numeric feature with MinMaxScaler (default range [0, 1]),
# which matches the sigmoid output range of the autoencoder's final layer.
scaler_dl = MinMaxScaler()
scaled_dl_data = scaler_dl.fit(numeric_df).transform(numeric_df)

# float32 copy of the scaled matrix for PyTorch.
data_tensor = torch.tensor(data=scaled_dl_data, dtype=torch.float32)

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder used for reconstruction-error scoring.

    Architecture: ``input_dim -> hidden_dim -> latent_dim -> hidden_dim ->
    input_dim`` with ReLU activations and a final Sigmoid, so every output
    value lies in (0, 1).

    Parameters
    ----------
    input_dim : int
        Number of input (and reconstructed) features.
    hidden_dim : int, default 32
        Width of the intermediate layer in both encoder and decoder.
    latent_dim : int, default 16
        Width of the bottleneck representation.
    """

    def __init__(self, input_dim, hidden_dim=32, latent_dim=16):
        super().__init__()

        # Layers are created encoder-first so parameter initialisation order
        # (and therefore seeded results) is stable.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),
        )

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.decoder(self.encoder(x))

In [None]:
# Seed PyTorch's RNG before the model is built so the random weight
# initialisation - and therefore the reconstruction errors and the final
# score - are the same on every run. 42 matches the random_state already
# used for the IsolationForest in this notebook.
torch.manual_seed(42)

# One input unit per scaled feature column.
input_dim = scaled_dl_data.shape[1]

model = Autoencoder(input_dim)

# Mean squared reconstruction error, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
epochs = 20

# Full-batch training: each epoch is one forward/backward pass over the whole
# tensor followed by a single optimiser step.
for epoch in range(epochs):
    outputs = model(data_tensor)
    loss = criterion(outputs, data_tensor)

    # Clear gradients left from the previous step before back-propagating.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss on every fifth epoch (0, 5, 10, ...).
    if not epoch % 5:
        print(f"Epoch [{epoch}/{epochs}] Loss: {loss.item():.4f}")

In [None]:
# Per-row reconstruction error; gradients are not needed for scoring.
with torch.no_grad():
    reconstructed = model(data_tensor)
    mse = (data_tensor - reconstructed).pow(2).mean(dim=1)

# Rows whose error exceeds the 95th percentile of the errors are flagged.
# NOTE(review): the threshold is taken from the same errors it is applied to,
# so roughly 5% of rows are flagged regardless of the data.
threshold = torch.quantile(mse, 0.95)

deep_anomalies = torch.gt(mse, threshold)
deep_anomaly_count = deep_anomalies.sum().item()

# Fraction of rows flagged (+1 in the denominator avoids dividing by zero).
dl_norm = deep_anomaly_count / (df.shape[0] + 1)

print("Deep Learning Norm:", dl_norm)

In [None]:
# Weighted blend of the four normalised indicators, scaled by 100.
# Weights: data quality 0.3, sensitive-data leakage 0.2,
# isolation-forest anomalies 0.2, autoencoder anomalies 0.3.
final_risk_score = 100 * sum(
    weight * component
    for weight, component in (
        (0.3, quality_norm),
        (0.2, leakage_norm),
        (0.2, ml_norm),
        (0.3, dl_norm),
    )
)

print("Final Risk Score:", round(final_risk_score, 2))

In [None]:
# Distribution of per-row reconstruction errors, with the anomaly threshold
# drawn as a red vertical line.
fig, ax = plt.subplots()
ax.hist(mse.numpy(), bins=50)
ax.axvline(threshold.item(), color='r')
ax.set_title("Autoencoder Reconstruction Error")
plt.show()