### Imports

In [5]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

### Load dataset

In [6]:
# Load corrupted dataset
import joblib

# np.load on an .npz archive returns a lazily-read NpzFile that keeps the
# underlying file open. Reading the array inside a `with` block closes the
# archive as soon as "X" has been materialised, instead of leaking the handle
# for the lifetime of the kernel.
with np.load("dataset_corrupted.npz") as data:
    X_corrupt = data["X"]

# Load pre-trained SVM model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
# The path is relative to the current working directory; a FileNotFoundError
# here means "best_svm_model.pkl" is not next to where the notebook is run.
svm = joblib.load("best_svm_model.pkl")


FileNotFoundError: [Errno 2] No such file or directory: 'best_svm_model.pkl'

In [None]:
# Ask the SVM for its per-class probability estimates on every sample.
probs = svm.predict_proba(X_corrupt)

# The model's confidence in a sample is its largest class probability
# (maximum taken along axis 1 of the probability matrix).
confidence = probs.max(axis=1)

# Turn confidence into a corruption score: the less certain the model is,
# the closer the score gets to 1.
scores = np.subtract(1, confidence)


In [None]:
# Derive two cut-offs from the score distribution itself.

# Statistical rule: anything more than two standard deviations above the mean.
score_mean = scores.mean()
score_std = scores.std()
std_thr = score_mean + 2 * score_std

# Rank-based rule: the value that separates the top 10% of scores.
percentile_thr = np.percentile(scores, 90)

print(f"90th percentile threshold: {percentile_thr:.3f}")
print(f"Mean + 2σ threshold: {std_thr:.3f}")


## Plot

In [None]:
# Indices of the samples whose corruption score reaches each threshold.
# np.flatnonzero returns the positions of the True entries directly, which is
# what np.where(mask)[0] yields for a 1-D mask.
pred_corrupt_percentile = np.flatnonzero(scores >= percentile_thr)
pred_corrupt_std = np.flatnonzero(scores >= std_thr)

# Report how many samples each rule flags ("Topp" typo corrected to "Top").
print(f"Top 10% (percentile): {len(pred_corrupt_percentile)} flagged")
print(f"Z-score (mean+2σ): {len(pred_corrupt_std)} flagged")


In [None]:
# Histogram of the corruption scores with both thresholds drawn on top.
plt.figure(figsize=(8, 5))
plt.hist(scores, bins=30, color="lightgray", edgecolor="black")

# Each entry: (threshold value, line colour, legend label).
threshold_lines = (
    (percentile_thr, "red", "90th percentile"),
    (std_thr, "blue", "mean + 2σ"),
)
for thr_value, line_color, legend_label in threshold_lines:
    plt.axvline(thr_value, color=line_color, linestyle="--", label=legend_label)

plt.xlabel("Corruption score (1 – confidence)")
plt.ylabel("Frequency")
plt.title("Distribution of corruption scores – SVM model")
plt.legend()
plt.show()


In [None]:
# Preview the first ten samples flagged by the percentile rule, one figure each.
flagged_preview = pred_corrupt_percentile[:10]
for idx in flagged_preview:
    # Each sample is reshaped into a 20x20 grid for display.
    image = X_corrupt[idx].reshape(20, 20)
    plt.imshow(image, cmap="gray")
    plt.title(f"Score = {scores[idx]:.3f}")
    plt.axis("off")
    plt.show()


In [None]:
expected_corrupt = 89  # Known number of corrupted images

n_flagged = len(pred_corrupt_percentile)
# Images that are NOT corrupted; this is the population a false positive rate
# is measured against (FPR = false positives / actual negatives).
n_clean = len(X_corrupt) - expected_corrupt

# Without per-image ground truth the exact false-positive count is unknown.
# What is certain is a lower bound: every flag beyond the known number of
# corrupted images must have landed on a clean image. Flagging FEWER images
# than expected produces no guaranteed false positives, so the excess is
# clamped at zero rather than taken as an absolute value.
min_false_pos = max(n_flagged - expected_corrupt, 0)

# Guard against a dataset with no clean images (would divide by zero).
false_pos_rate = min_false_pos / n_clean if n_clean > 0 else 0.0
print(f"Approx. false positive rate ≈ {false_pos_rate:.2%}")
