### Step 1
Load and Prepare the Data.

In [2]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Define the base directory.
# The default is the original author's local checkout; set the
# IMAGE_CHALLENGE_DIR environment variable to run the notebook elsewhere
# without editing this cell.
base_dir = os.environ.get(
    "IMAGE_CHALLENGE_DIR",
    "/Users/zhao0725/Desktop/COMP3314/Assignment3/image-classification-challenge",
)

# Paths relative to the base directory
train_csv_path = os.path.join(base_dir, "train.csv")
train_images_folder = os.path.join(base_dir, "train_ims")
test_images_folder = os.path.join(base_dir, "test_ims")
test_csv_path = os.path.join(base_dir, "test.csv")

# Load training labels (the 'im_name' and 'label' columns are read from this frame below)
train_csv = pd.read_csv(train_csv_path)

# Load images into arrays using Pillow
def load_images(image_folder, image_ids, img_size=(64, 64)):
    """Load, resize and flatten images into one array, one row per image.

    Parameters
    ----------
    image_folder : str
        Directory that contains the image files.
    image_ids : iterable of str
        File names, relative to ``image_folder``, to load in order.
    img_size : tuple of int, default (64, 64)
        Target ``(width, height)`` passed to ``Image.resize``.

    Returns
    -------
    numpy.ndarray
        The flattened pixel arrays of every image that loaded successfully,
        stacked in input order. Each row holds ``width * height * 3`` values
        because every image is converted to RGB first.

    Notes
    -----
    An image that cannot be loaded is reported with ``print`` and skipped,
    so the result may have fewer rows than ``image_ids``. Callers that pair
    the rows with labels or file names should compare lengths.
    """
    images = []
    for image_id in tqdm(image_ids, desc="Loading images"):
        img_path = os.path.join(image_folder, image_id)
        try:
            # The context manager closes the underlying file handle promptly.
            with Image.open(img_path) as img:
                # Force three channels so grayscale / RGBA / palette images
                # produce rows of the same length as ordinary RGB images.
                resized = img.convert("RGB").resize(img_size)
                img_array = np.array(resized).flatten()
            images.append(img_array)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error loading image {image_id}: {e}")
    return np.array(images)

# Prepare training data
X_train_images = load_images(train_images_folder, train_csv['im_name'])
y_train = train_csv['label']

# load_images reports and skips files it cannot read, which would shift every
# later feature row against its label. Fail here with a clear message instead
# of training on misaligned data.
if len(X_train_images) != len(y_train):
    raise ValueError(
        f"Loaded {len(X_train_images)} training images but found {len(y_train)} labels; "
        "fix or remove the unreadable images reported above."
    )

# Encode labels as integer class ids; label_encoder is kept so that
# predictions can be mapped back to the original label values later.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)


Loading images: 100%|██████████| 50000/50000 [00:11<00:00, 4235.08it/s]


### Step 2
Train the SVM Model

In [4]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Step 1: Split the dataset into training and validation sets
# Assuming X_train_images and y_train_encoded are prepared.
# stratify keeps the class proportions the same in both splits, so the
# validation accuracy is not skewed by an unlucky class mix.
# NOTE: this rebinds y_train (previously the raw label column) to the encoded
# training-split labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_images,
    y_train_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_train_encoded,
)

# Step 2: Apply PCA to reduce dimensions
# Retain 10 principal components for faster SVM training.
# PCA is fitted on the training split only and then applied to the validation
# split, so no validation information leaks into the projection.
pca = PCA(n_components=10, random_state=42)
X_train_reduced = pca.fit_transform(X_train)
X_val_reduced = pca.transform(X_val)

# Step 3: Train the SVM Classifier
# Use a linear kernel for faster training and add verbose for progress updates.
# The classifier is fitted on PCA-reduced features, so anything passed to
# classifier.predict must first go through pca.transform.
classifier = SVC(kernel='linear', random_state=42, verbose=True)
classifier.fit(X_train_reduced, y_train)

# Step 4: Validate the Model
# Make predictions on the validation set
y_val_pred = classifier.predict(X_val_reduced)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

[LibSVM]................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

### Step 3
Load the test images and predict their labels

In [None]:
# Load test images
# os.listdir returns entries in arbitrary order and includes hidden files
# (e.g. macOS .DS_Store) and sub-directories; sort for a reproducible order
# and keep regular, non-hidden files only.
test_image_ids = sorted(
    name
    for name in os.listdir(test_images_folder)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(test_images_folder, name))
)
X_test_images = load_images(test_images_folder, test_image_ids)

# load_images skips files it cannot read; make sure every file name still has
# exactly one feature row before pairing names with predictions.
if len(X_test_images) != len(test_image_ids):
    raise ValueError(
        f"Loaded {len(X_test_images)} test images for {len(test_image_ids)} files; "
        "fix or remove the unreadable images reported above."
    )

# Project the raw pixels onto the PCA components fitted on the training split.
# The classifier was trained on these reduced features, not on raw pixels.
X_test_reduced = pca.transform(X_test_images)

# Predict test labels
test_predictions = classifier.predict(X_test_reduced)

# Decode the predicted labels
decoded_predictions = label_encoder.inverse_transform(test_predictions)

# Save predictions to test_csv
# NOTE(review): train.csv names its image column 'im_name' while this writes
# 'image_id', and test_csv_path may be a provided file that gets overwritten
# here -- confirm the expected submission format and output location.
test_csv = pd.DataFrame({'image_id': test_image_ids, 'label': decoded_predictions})
test_csv.to_csv(test_csv_path, index=False)
print(f"Predictions saved to {test_csv_path}")