In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans

# ----------------------------
# 1. Load and Preprocess Data
# ----------------------------

# Load CSV file with edge_HOG features.
# Assumes the first column is the label and the rest are features.
data = pd.read_csv('../svm model/edge_hog_features.csv', header=None)
labels = data.iloc[:, 0].values       # First column as labels
features = data.iloc[:, 1:].values      # Remaining columns as features

# Ensure labels are of the correct type (convert to strings here)
labels = labels.astype(str)

# Filter out classes with fewer than 2 samples
label_counts = pd.Series(labels).value_counts()
valid_labels = label_counts[label_counts >= 2].index
filtered_indices = [i for i, label in enumerate(labels) if label in valid_labels]
features = features[filtered_indices]
labels = labels[filtered_indices]

# Impute any missing values using the mean strategy.
imputer = SimpleImputer(strategy='mean')
features = imputer.fit_transform(features)

# Normalize the features using StandardScaler.
scaler = StandardScaler()
features = scaler.fit_transform(features)

# ----------------------------
# 2. KMeans + SVM Pipeline
# ----------------------------
# Define a range for number of clusters K.
k_values = list(range(10, 101, 5))  # Example: K from 10 to 100 in steps of 5

train_accuracies = []
test_accuracies = []

best_k = None
best_test_acc = 0

for k in k_values:
    print(f"\nEvaluating for K = {k}")
    
    # Step 1: Perform KMeans clustering on the entire (scaled) feature set.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(features)
    
    # Step 2: Append cluster labels as an additional feature column.
    # This creates a new feature matrix with original features and the cluster ID.
    X_clustered = np.hstack([features, cluster_labels.reshape(-1, 1)])
    
    # Step 3: Split the data into training and testing sets (80% train, 20% test) with stratification.
    X_train, X_test, y_train, y_test = train_test_split(
        X_clustered, labels, test_size=0.2, random_state=42, stratify=labels)
    
    # Step 4: Train an SVM model using a linear kernel.
    svm_model = SVC(kernel='linear', C=1.0, random_state=42)
    svm_model.fit(X_train, y_train)
    
    # Step 5: Make predictions on training and test sets.
    y_train_pred = svm_model.predict(X_train)
    y_test_pred = svm_model.predict(X_test)
    
    # Step 6: Compute accuracies.
    acc_train = accuracy_score(y_train, y_train_pred)
    acc_test = accuracy_score(y_test, y_test_pred)
    
    print(f"K = {k} | Training Accuracy: {acc_train:.4f}, Test Accuracy: {acc_test:.4f}")
    
    train_accuracies.append(acc_train)
    test_accuracies.append(acc_test)
    
    # Track the best performing K based on test accuracy.
    if acc_test > best_test_acc:
        best_test_acc = acc_test
        best_k = k

print("\n----- Summary -----")
print("Best K:", best_k)
print("Best Test Accuracy:", best_test_acc)
print("Training Accuracy for Best K:", train_accuracies[k_values.index(best_k)])

# Optionally: Retrain an SVM using the best K, then print a detailed classification report.
print(f"\nRetraining SVM with Best K = {best_k}")
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(features)
X_clustered = np.hstack([features, cluster_labels.reshape(-1, 1)])
X_train, X_test, y_train, y_test = train_test_split(
    X_clustered, labels, test_size=0.2, random_state=42, stratify=labels)

svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)
y_test_pred = svm_model.predict(X_test)

print("\nClassification Report for Best K:")
print(classification_report(y_test, y_test_pred))


  data = pd.read_csv('../svm model/edge_hog_features.csv', header=None)



Evaluating for K = 10
K = 10 | Training Accuracy: 1.0000, Test Accuracy: 0.8167

Evaluating for K = 15
K = 15 | Training Accuracy: 1.0000, Test Accuracy: 0.8125

Evaluating for K = 20
K = 20 | Training Accuracy: 1.0000, Test Accuracy: 0.8042

Evaluating for K = 25
K = 25 | Training Accuracy: 1.0000, Test Accuracy: 0.8083

Evaluating for K = 30
K = 30 | Training Accuracy: 1.0000, Test Accuracy: 0.8167

Evaluating for K = 35
K = 35 | Training Accuracy: 1.0000, Test Accuracy: 0.8042

Evaluating for K = 40
K = 40 | Training Accuracy: 1.0000, Test Accuracy: 0.8125

Evaluating for K = 45
K = 45 | Training Accuracy: 1.0000, Test Accuracy: 0.8042

Evaluating for K = 50
K = 50 | Training Accuracy: 1.0000, Test Accuracy: 0.8083

Evaluating for K = 55
K = 55 | Training Accuracy: 1.0000, Test Accuracy: 0.8125

Evaluating for K = 60
K = 60 | Training Accuracy: 1.0000, Test Accuracy: 0.8250

Evaluating for K = 65
K = 65 | Training Accuracy: 1.0000, Test Accuracy: 0.8208

Evaluating for K = 70
K = 7