In [None]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from skimage.feature import hog
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, precision_recall_curve, confusion_matrix, roc_auc_score, RocCurveDisplay, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import label_binarize

In [None]:
# Root folder that contains one sub-folder of images per category.
# Set the DATASET_PATH environment variable to run the notebook on another
# machine; the original local path is kept as the fallback.
DATASET_PATH = os.environ.get("DATASET_PATH", r"D:\Download\archive\seg_train\seg_train")

In [None]:
def load_dataset(dataset_path, img_size=(64, 64)):
    """Load every readable image below ``dataset_path`` with its folder label.

    Each immediate sub-directory of ``dataset_path`` is treated as one
    category; entries that are not directories are ignored.

    Args:
        dataset_path: Directory containing one sub-directory per category.
        img_size: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays: the resized images as
        returned by OpenCV, and the category folder name for each image.
    """
    images = []
    labels = []
    skipped = 0

    # os.listdir order is arbitrary; sorting keeps the sample order (and so
    # every seeded split downstream) reproducible.
    for category in sorted(os.listdir(dataset_path)):
        category_path = os.path.join(dataset_path, category)
        if not os.path.isdir(category_path):
            continue

        for file in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file)
            img = cv2.imread(file_path)
            # cv2.imread signals failure by returning None instead of raising;
            # passing that on to cv2.resize would crash the whole load.
            if img is None:
                skipped += 1
                continue
            images.append(cv2.resize(img, img_size))
            labels.append(category)

    if skipped:
        print(f"load_dataset: skipped {skipped} unreadable file(s)")

    return np.array(images), np.array(labels)

In [None]:
# Read all images (X) and their folder-name labels (y) from disk.
X, y = load_dataset(dataset_path=DATASET_PATH)

In [None]:
# Learn the category-name -> integer-id mapping, then apply it to the labels.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [None]:
def extract_features(images):
    """Compute one HOG descriptor per image.

    Each image is converted from BGR to grayscale and described with
    9-orientation HOG over 8x8-pixel cells and 2x2-cell blocks.

    Args:
        images: Iterable of BGR images.

    Returns:
        NumPy array built from the list of per-image descriptors.
    """
    def _describe(bgr_image):
        # HOG is computed on intensity only, so drop the colour channels first.
        grayscale = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
        return hog(
            grayscale,
            orientations=9,
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            feature_vector=True,
        )

    return np.array([_describe(image) for image in images])

In [None]:
# Turn every loaded image into its HOG descriptor.
X_features = extract_features(images=X)

In [None]:
# Standardize each HOG feature to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split, so test-set statistics influence the scaling.
scaler = StandardScaler().fit(X_features)
X_scaled = scaler.transform(X_features)

In [None]:
def plot_sample_images(X, y, label_encoder, num_samples=10):
    """Show a random sample of images with their decoded labels.

    Images are laid out five per row; the number of rows grows with
    ``num_samples`` (the default of 10 gives the original 2x5 grid).

    Args:
        X: Indexable collection of BGR images.
        y: Encoded labels aligned with ``X``.
        label_encoder: Object whose ``inverse_transform`` maps encoded labels
            back to their names.
        num_samples: Number of images to draw without replacement; capped at
            ``len(X)``.
    """
    # Sampling without replacement cannot return more items than exist.
    num_samples = min(num_samples, len(X))
    if num_samples <= 0:
        return

    n_cols = 5
    n_rows = (num_samples + n_cols - 1) // n_cols  # ceiling division
    plt.figure(figsize=(12, 3 * n_rows))
    indices = np.random.choice(len(X), num_samples, replace=False)

    for i, idx in enumerate(indices):
        plt.subplot(n_rows, n_cols, i + 1)
        # OpenCV stores channels as BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(X[idx], cv2.COLOR_BGR2RGB))
        plt.title(f"Label: {label_encoder.inverse_transform([y[idx]])[0]}")
        plt.axis("off")

    plt.tight_layout()
    plt.show()

In [None]:
def plot_hog_feature(img):
    """Display a BGR image side by side with its rendered HOG descriptor.

    Args:
        img: BGR image to visualise.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # visualize=True makes hog() return (descriptor, rendering); only the
    # rendering is needed here.
    _, hog_render = hog(
        grayscale,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=True,
    )

    # (picture, caption, colormap) for the left and right panel.
    panels = (
        (cv2.cvtColor(img, cv2.COLOR_BGR2RGB), "Original Image", None),
        (hog_render, "HOG Features", "gray"),
    )

    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    for axis, (picture, caption, colormap) in zip(axes, panels):
        axis.imshow(picture, cmap=colormap)
        axis.set_title(caption)
        axis.axis("off")

    plt.show()

In [None]:
# Sanity-check the loaded data: a random sample of labelled images, then the
# HOG rendering of the first image.
plot_sample_images(X=X, y=y_encoded, label_encoder=label_encoder, num_samples=10)
plot_hog_feature(img=X[0])

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyper-parameter grid for the Isolation Forest.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_samples': ['auto', 100, 200, 300],
    # contamination must be 'auto' or lie in (0, 0.5]; the previous value 0.
    # is rejected by IsolationForest, so every fit using it failed.
    'contamination': [0.01, 0.05, 0.1],
    'max_features': [1.0, 0.8, 0.5, 0.3],
}

# NOTE(review): IsolationForest.predict returns +1 (inlier) / -1 (outlier),
# so 'accuracy' against the multi-class y_train is not a meaningful selection
# criterion — confirm the intended target or use an unsupervised scorer.
grid_search = GridSearchCV(
    # Seeded so the random trees, and therefore the search result, are
    # reproducible.
    IsolationForest(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Estimator refitted by GridSearchCV with the best-scoring parameter set.
best_model = grid_search.best_estimator_

# IsolationForest.predict labels each sample +1 (inlier) or -1 (outlier).
y_pred = best_model.predict(X_test)
print(y_pred)

In [None]:
# Cross-tabulate the held-out labels against the model's predictions and
# render the table as a colour-coded matrix.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

plt.matshow(matrix)
plt.colorbar()
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Same confusion matrix as an annotated seaborn heatmap (cell counts shown,
# no colour bar).
heatmap_ax = sns.heatmap(data=matrix, annot=True, square=True, cbar=False)
heatmap_ax.set_xlabel('predicted value')
heatmap_ax.set_ylabel('true value')

In [None]:
# Headline accuracy followed by the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:")
print(report_text)

In [None]:
# 5-fold cross-validation of the selected model on the training split.
# (The original referenced an undefined lower-case `x_train`.)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))

In [None]:
# Scatter the first two standardized HOG features of the test set, coloured by
# the Isolation Forest prediction. X_test is a NumPy array, so it is indexed
# directly (it has no `.iloc`).
plt.figure(figsize=(10, 6))
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred, cmap='coolwarm', label='Prediction')
plt.title("Isolation Forest - Outlier Detection (2D projection)")
plt.xlabel('HOG feature 0 (standardized)')
plt.ylabel('HOG feature 1 (standardized)')
# IsolationForest.predict encodes outliers as -1 and inliers as +1.
plt.colorbar(label='Outlier (-1) / Inlier (1)')
plt.legend()
plt.show()

In [None]:
# Distribution of decision-function scores over the full standardized feature
# matrix (lower = more anomalous). The original referenced an undefined `x`.
anomaly_scores = best_model.decision_function(X_scaled)
sns.histplot(anomaly_scores, kde=True)
plt.xlabel('Anomaly Scores')
plt.title('Anomaly Score Distribution')
plt.show()

In [None]:
# IsolationForest does not expose `feature_importances_`, and the features are
# an unnamed NumPy matrix. As a proxy, count how often each feature is chosen
# as a split variable across all trees and normalise the counts to sum to 1.
# This is a usage frequency, not an impurity-based importance.
n_features = X_train.shape[1]
split_counts = np.zeros(n_features)
for tree, feature_idx in zip(best_model.estimators_, best_model.estimators_features_):
    node_features = tree.tree_.feature
    used = node_features[node_features >= 0]  # negative entries mark leaves
    # Each tree may see only a feature subset (max_features < 1.0); map its
    # local split indices back to columns of the full matrix.
    np.add.at(split_counts, np.asarray(feature_idx)[used], 1)

feature_importances = pd.DataFrame({
    'Feature': [f"hog_{i}" for i in range(n_features)],
    'Importance': split_counts / max(split_counts.sum(), 1),
}).sort_values(by='Importance', ascending=False)

print(feature_importances.head(20))

In [None]:
# The HOG descriptor has far more features than fit in one readable bar chart,
# so only the highest-scoring ones are drawn.
top_n = 20
top_features = feature_importances.nlargest(top_n, 'Importance')

plt.figure(figsize=(6, 6))
sns.barplot(x=top_features['Importance'], y=top_features['Feature'], orient="h", palette="viridis")
plt.xlabel("Feature Importance Score")
plt.ylabel("Feature")
plt.title(f"Top {len(top_features)} features")
plt.tight_layout()

In [None]:
# Build a SHAP explainer for the selected model with the training features as
# background data, then explain every held-out sample.
explainer = shap.Explainer(best_model, masker=X_train)
shap_values = explainer(X_test)

In [None]:
# X is the raw image array and has no `.columns`; the model's inputs are
# unnamed HOG components, so generate positional names for the plot.
hog_feature_names = [f"hog_{i}" for i in range(X_test.shape[1])]
shap.summary_plot(shap_values, X_test, feature_names=hog_feature_names)
shap.plots.bar(shap_values)