In [None]:
# Assignment 1: Comparative Study of Classification Algorithms.
# Scratch cell: prints a fixed string and has no effect on the analysis below.
print("Hola mundo")

In [None]:
# Scratch cell: emits three fixed lines of text, one per print call.
for message in (
    "Mundo dice ciao",
    "Ciao dice mundo",
    "Mundo dice que ciao lava piati",
):
    print(message)

In [None]:
# Scratch cell: prints a fixed string; nothing later in the notebook reads it.
print("Mundo gioca calcio")

In [None]:
# Scratch cell: prints a fixed string; nothing later in the notebook reads it.
print("Giusseppe se puso la peluca")

In [None]:
import kagglehub
import os
import shutil

# Dataset directory, relative to the working directory like every other path
# used by the later cells (source_dir, train_dir, kfolds, descriptors).
# On Colab the working directory is /content, so this still resolves to
# /content/data/raw there.
dataset_dir = "data/raw"

# Download the dataset; kagglehub returns the local folder that holds it.
path = kagglehub.dataset_download("jiayuanchengala/aid-scene-classification-datasets")
src_dir = os.path.join(path, "AID")

if os.path.exists(dataset_dir):
    # NOTE: only the existence of the folder is checked. If an earlier copy
    # was interrupted, delete the folder by hand and re-run this cell.
    print(f"Dataset already exists at {dataset_dir}")
elif not os.path.isdir(src_dir):
    # Fail with a clear message instead of a copytree traceback when the
    # downloaded archive does not have the expected layout.
    raise FileNotFoundError(
        f"Expected an 'AID' folder inside the downloaded dataset, but {src_dir} does not exist"
    )
else:
    print(f"Copying dataset to {dataset_dir}...")
    # Copy rather than move: moving the files out of the kagglehub folder
    # would leave that folder inconsistent for later dataset_download calls.
    shutil.copytree(src_dir, dataset_dir)

print("✅ Dataset ready at:", dataset_dir)


**Manual Data Separation**

In [None]:
# Build two parallel lists: the path of every image and its class label.
# The label of an image is the name of the folder that contains it.
image_paths = []
labels = []
source_dir = 'data/raw'

# os.listdir returns entries in arbitrary order. Sorting keeps the order of
# image_paths identical across sessions, which matters because later cells
# split this list with a fixed seed and store per-image descriptors on disk
# under the image's position in the split.
categories = sorted(
    d for d in os.listdir(source_dir)
    if os.path.isdir(os.path.join(source_dir, d))
)

for category in categories:
    category_path = os.path.join(source_dir, category)
    files = sorted(
        os.path.join(category_path, f) for f in os.listdir(category_path)
        if os.path.isfile(os.path.join(category_path, f))
    )
    image_paths.extend(files)
    labels.extend([category] * len(files))

print(f"Found {len(image_paths)} images in {len(categories)} categories")

In [None]:
import random
from pathlib import Path

# Set random seed for reproducibility
random.seed(42)

# Source directory containing all the category folders
source_dir = 'data/raw'

# Create train and test directories
train_dir = 'data/train'
test_dir = 'data/test'
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)


def _copy_files(file_names, from_dir, to_dir):
    """Copy every file in `file_names` from `from_dir` into `to_dir`."""
    for file_name in file_names:
        shutil.copy2(os.path.join(from_dir, file_name),
                     os.path.join(to_dir, file_name))


for category in categories:
    category_path = os.path.join(source_dir, category)

    if not os.path.exists(category_path):
        print(f"Warning: {category} folder not found, skipping...")
        continue

    # Get all files in the category folder. They are sorted first because
    # os.listdir order is arbitrary: a seeded shuffle is only reproducible
    # when its input order is fixed. Without this, re-running the cell could
    # copy the same image into both train and test.
    files = sorted(f for f in os.listdir(category_path)
                   if os.path.isfile(os.path.join(category_path, f)))

    if len(files) == 0:
        print(f"Warning: {category} folder is empty, skipping...")
        continue

    # Shuffle files randomly
    random.shuffle(files)

    # Calculate split point (90% for train)
    split_idx = int(len(files) * 0.9)
    train_files = files[:split_idx]
    test_files = files[split_idx:]

    # Create category subfolders in train and test
    train_category_dir = os.path.join(train_dir, category)
    test_category_dir = os.path.join(test_dir, category)
    os.makedirs(train_category_dir, exist_ok=True)
    os.makedirs(test_category_dir, exist_ok=True)

    # Copy (not move) so that data/raw stays intact for the other cells
    _copy_files(train_files, category_path, train_category_dir)
    _copy_files(test_files, category_path, test_category_dir)

    print(f"{category}: {len(train_files)} files to train, {len(test_files)} files to test")

print("\nSplit complete!")

In [None]:
from collections import defaultdict

# Manual k-fold split on disk for cross-validation with frameworks that need
# the folds materialised as folders (PyTorch, TensorFlow).
# scikit-learn does not need this: it splits the training data itself.

# Configuration
k_folds = 5
output_base = 'kfolds'
os.makedirs(output_base, exist_ok=True)

print(f"Creating {k_folds}-fold cross-validation split...")

# Dedicated, seeded generator: the fold assignment no longer depends on how
# many random numbers earlier cells consumed from the global generator.
fold_rng = random.Random(42)

# Collect all files by category
category_files = {}
for category in categories:
    category_path = os.path.join(source_dir, category)
    # Sorted first, because os.listdir order is arbitrary and a seeded
    # shuffle is only reproducible on a fixed input order.
    files = sorted(f for f in os.listdir(category_path)
                   if os.path.isfile(os.path.join(category_path, f)))
    fold_rng.shuffle(files)  # Shuffle within each category
    category_files[category] = files

# Create fold assignments for each category: fold index -> category -> files
fold_assignments = defaultdict(lambda: defaultdict(list))

for category, files in category_files.items():
    n_files = len(files)
    fold_size = n_files // k_folds

    # Every fold gets fold_size files of this category
    for fold_idx in range(k_folds):
        start_idx = fold_idx * fold_size
        # Last fold gets any remaining files
        end_idx = start_idx + fold_size if fold_idx < k_folds - 1 else n_files
        fold_assignments[fold_idx][category] = files[start_idx:end_idx]

# Create fold directories with class subdirectories only
for fold_idx in range(k_folds):
    fold_name = f'fold_{fold_idx + 1}'
    fold_dir = os.path.join(output_base, fold_name)

    # Create class subdirectories in each fold
    for category in categories:
        category_fold_dir = os.path.join(fold_dir, category)
        os.makedirs(category_fold_dir, exist_ok=True)

        # Copy the files assigned to this fold for this category
        for file in fold_assignments[fold_idx][category]:
            src = os.path.join(source_dir, category, file)
            dst = os.path.join(category_fold_dir, file)
            shutil.copy2(src, dst)

    print(f"\n{fold_name}:")
    print("=" * 50)

    total_files = 0
    for category in categories:
        files_in_fold = fold_assignments[fold_idx][category]
        total_files += len(files_in_fold)
        print(f"  {category}: {len(files_in_fold)} files")

    print(f"  Total in fold: {total_files} files")

# Final summary (printed once)
print("\n\nK-fold split complete!")
print(f"Output directory: {output_base}/")
print(f"Structure: fold_1/, fold_2/, ..., fold_{k_folds}/")
print(f"Each fold contains: {', '.join(categories)} subdirectories with their respective files")

# **Training**

In [None]:
# Since scikit-learn has no built-in feature extraction for images,
# we also use OpenCV to build our dictionary of features.
from sklearn.model_selection import train_test_split
import cv2 as cv
import numpy as np
from sklearn.decomposition import PCA


X_train, X_test, y_train, y_test = train_test_split(image_paths, labels, test_size=0.2, random_state=0)

# Descriptors are computed once for the whole training set, before the k-fold
# loop. Each descriptor depends only on its own image, so this is not data
# leakage; it just avoids recomputing them k times.
orb = cv.ORB_create()  # Fast but poor accuracy
# cv.SIFT_create(nfeatures=2000) was tried: 13 GB of RAM was not enough (40+ min)
sift = cv.SIFT_create()
# cv.xfeatures2d.SURF_create() is non-free, so it is not used
akaze = cv.AKAZE_create(descriptor_type=cv.AKAZE_DESCRIPTOR_MLDB,
                         descriptor_size=0,   # 486 bits
                         threshold=0.001)     # Slightly faster, same results
fast = cv.FastFeatureDetector_create()

# Experiment notes (SIFT with a capped number of features):
# -  500 features: faster than AKAZE with only ~2% less accuracy
#                  (10 min training and faster k-means)
# - 1000 features: ~5 GB RAM, ~31 min, k-means still fails afterwards
# - 1500 features: ~6.7 GB RAM, ~35 min, still not enough memory for k-means
# - 2000 features: ~8 GB RAM, 35-40 min, k-means cannot run afterwards
# K-means itself needs around 6 GB of RAM, so any detector whose descriptors
# take more than ~6 GB will probably crash it.
# More descriptors also mean a longer k-means run.

image_descriptors = []
pca = PCA(n_components=64)  # Would halve the descriptor size (128 -> 64); currently unused
output_dir = "descriptors"
os.makedirs(output_dir, exist_ok=True)

# Descriptors go to disk (one .npy file per training image, named after the
# image's position in X_train) instead of being kept in image_descriptors.
# Casting to float16 and applying PCA were tried here and left disabled.
for idx, x in enumerate(X_train):
    img = cv.imread(x, cv.IMREAD_GRAYSCALE)
    descriptors = None

    if img is None:
        # cv.imread returns None for unreadable files; passing that to SIFT
        # would raise and abort the whole extraction.
        print(f"Warning: could not read {x}; saving empty descriptors")
    else:
        keypoints = sift.detect(img, None)
        keypoints, descriptors = sift.compute(img, keypoints)

    if descriptors is None:
        # Store an empty (0, 128) float array rather than None: np.save(None)
        # writes a pickled object array that cannot be cast back to float32.
        descriptors = np.empty((0, 128), dtype=np.float32)

    np.save(f"{output_dir}/desc_{idx}.npy", descriptors)





In [None]:
from sklearn.preprocessing import normalize
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.svm import SVC

# For cross-validation we use StratifiedKFold: a KFold variant that keeps the
# proportion of each class the same in every split. random_state makes the
# shuffled folds (and therefore the reported accuracy) reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
k = 3
# Results so far:
# 30 clusters:          23% accuracy, 17 min training
# 30 clusters, AKAZE:   35% accuracy, 51 min training
# 30 clusters, SIFT500: 38% accuracy, 22 min training
accuracies = []


def load_saved_descriptors(index):
    """Load the descriptors stored for X_train[index].

    The extraction cell writes descriptors to `output_dir` and never fills
    `image_descriptors`, so they are read back from disk here.

    Returns:
        A 2D float32 array, or None when the file holds no usable descriptors.
    """
    # allow_pickle is needed for files written with np.save(path, None);
    # these files are produced by this notebook, not by an external source.
    desc = np.load(os.path.join(output_dir, f"desc_{index}.npy"), allow_pickle=True)
    if desc.ndim != 2 or desc.size == 0:
        return None
    return desc.astype(np.float32, copy=False)


def compute_histograms_in_memory(descriptor_list, kmeans, n_words):
    """Turn each image's descriptors into an L2-normalised visual-word histogram."""
    hists = []
    for desc in descriptor_list:
        if desc is not None:
            # Assign each descriptor to nearest cluster center
            cluster_labels = kmeans.predict(desc)
            # Create histogram for this image (length = number of clusters)
            hist, _ = np.histogram(cluster_labels, bins=np.arange(n_words + 1))
        else:
            # Image with no descriptors -> zero histogram
            hist = np.zeros(n_words, dtype=int)
        hists.append(hist)
    return normalize(np.array(hists), norm='l2')


for train_idx, val_idx in skf.split(X_train, y_train):
    train_descriptors = [load_saved_descriptors(i) for i in train_idx]
    val_descriptors = [load_saved_descriptors(i) for i in val_idx]

    # Stack all non-empty descriptors for training KMeans
    useful_descriptors = np.vstack([d for d in train_descriptors if d is not None])

    # Fit KMeans on the training descriptors only. The fitted model is then
    # used to label every descriptor with its nearest cluster, which gives
    # the per-image word counts.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(useful_descriptors)

    # BoVW histograms
    X_tr_hist = compute_histograms_in_memory(train_descriptors, kmeans, k)
    X_val_hist = compute_histograms_in_memory(val_descriptors, kmeans, k)

    # Labels
    y_tr = [y_train[i] for i in train_idx]
    y_val = [y_train[i] for i in val_idx]

    # Train classifier
    clf = SVC(kernel='linear')
    clf.fit(X_tr_hist, y_tr)

    # Validate
    acc = clf.score(X_val_hist, y_val)
    accuracies.append(acc)

print("Mean CV accuracy:", np.mean(accuracies))
print("Std CV accuracy:", np.std(accuracies))


In [None]:
def load_desc_safe(file, folder=None):
    """Load one descriptor file and return it only if it is usable.

    Args:
        file: File name of the descriptor array, e.g. "desc_12.npy".
        folder: Directory that contains the file. When omitted, the
            module-level `desc_folder` setting is used.

    Returns:
        A float32 array of shape (n_descriptors, 128), or None when the file
        holds no descriptors or something that is not an (n, 128) matrix.
    """
    if folder is None:
        folder = desc_folder

    path = os.path.join(folder, file)
    # allow_pickle is required for placeholder files written with
    # np.save(path, None). Only load files produced by this notebook:
    # unpickling untrusted files can execute arbitrary code.
    desc = np.load(path, allow_pickle=True)

    # Check the shape BEFORE converting. A saved None comes back as a 0-d
    # object array of size 1, which passes a size check but cannot be cast
    # to float32.
    if desc.ndim != 2 or desc.size == 0 or desc.shape[1] != 128:
        return None

    try:
        return np.asarray(desc, dtype=np.float32)
    except (TypeError, ValueError):
        # Object array whose elements are not numbers
        return None

# Repair pass over the saved descriptor files: every file that does not hold
# an (n, 128) matrix is rewritten in place.
desc_dir = "descriptors"

# Placeholder for "this image has no descriptors". An empty float matrix is
# used instead of None because np.save(path, None) writes a pickled object
# array that cannot be converted to float32 when it is loaded again.
EMPTY_DESCRIPTORS = np.empty((0, 128), dtype=np.float32)

for fname in sorted(os.listdir(desc_dir)):
    # np.save appends ".npy" to any other name, which would create a new
    # file next to the original instead of repairing it.
    if not fname.endswith(".npy"):
        continue

    path = os.path.join(desc_dir, fname)

    try:
        # allow_pickle: some files were written with np.save(path, None).
        # These files are produced by this notebook, not by an external source.
        desc = np.load(path, allow_pickle=True)
    except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
        print("Unreadable file:", fname, "— replacing with empty.", f"({exc})")
        np.save(path, EMPTY_DESCRIPTORS)
        continue

    if desc.ndim == 0:
        # Case 0: Scalar (e.g., nan, or a saved None)
        print("Scalar descriptor:", fname, "— fixing to empty.")
        fixed = EMPTY_DESCRIPTORS
    elif desc.ndim == 2 and desc.shape[1] == 128:
        # Already valid (possibly with zero rows): leave untouched so that
        # re-running this cell does not rewrite good files.
        continue
    elif desc.size == 0:
        # Case 1: empty array with the wrong shape
        print("Fixing empty:", fname)
        fixed = EMPTY_DESCRIPTORS
    elif desc.ndim == 1:
        # Case 2: 1D vector
        if desc.shape[0] == 128:  # single descriptor
            print("Fixing single vector:", fname)
            fixed = desc.reshape(1, 128)
        else:
            print("Invalid 1D descriptor:", fname, "— fixing to empty.")
            fixed = EMPTY_DESCRIPTORS
    elif desc.ndim == 2:
        # Case 3: 2D but wrong feature length
        print("Invalid 2D shape:", fname, desc.shape, "— fixing to empty.")
        fixed = EMPTY_DESCRIPTORS
    else:
        # More than two dimensions: not a descriptor matrix at all
        print("Invalid shape:", fname, desc.shape, "— fixing to empty.")
        fixed = EMPTY_DESCRIPTORS

    np.save(path, fixed)




In [None]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import normalize

class VLADEncoder:
    """
    Encoder producing VLAD (Vector of Locally Aggregated Descriptors) features.

    Each image is described by the per-cluster sums of residuals between its
    local descriptors and the cluster centers they are assigned to.
    """

    def __init__(self, n_clusters=64):
        """
        Args:
            n_clusters: Size of the visual vocabulary (number of clusters).
        """
        self.n_clusters = n_clusters
        # Both are filled in by fit()
        self.kmeans = None
        self.centers = None

    def fit(self, descriptors):
        """
        Cluster the given descriptors to obtain the visual vocabulary.

        Args:
            descriptors: np.array of shape (n_descriptors, descriptor_dim)
        """
        print(f"Training VLAD vocabulary with {self.n_clusters} clusters...")
        clusterer = MiniBatchKMeans(
            n_clusters=self.n_clusters,
            random_state=42,
            batch_size=1000,
            max_iter=100
        )
        self.kmeans = clusterer
        clusterer.fit(descriptors)
        self.centers = clusterer.cluster_centers_
        print(f"Vocabulary trained. Centers shape: {self.centers.shape}")

    def encode_single(self, descriptors):
        """
        Compute the VLAD vector of one image.

        Args:
            descriptors: np.array of shape (n_desc, descriptor_dim), or None

        Returns:
            vlad: np.array of shape (n_clusters * descriptor_dim,)
        """
        # No descriptors at all -> all-zero vector of the right length
        if descriptors is None or len(descriptors) == 0:
            return np.zeros(self.n_clusters * self.centers.shape[1])

        # Nearest cluster for every descriptor
        assignments = self.kmeans.predict(descriptors)

        dim = descriptors.shape[1]
        residual_sums = np.zeros((self.n_clusters, dim))

        for word in range(self.n_clusters):
            members = descriptors[assignments == word]
            if len(members) == 0:
                # Nothing assigned to this cluster: its row stays zero
                continue
            # Sum of (descriptor - center) over the cluster's members
            residual_sums[word] = np.sum(members - self.centers[word], axis=0)

        # Intra-normalization: every cluster row is L2-normalised on its own
        per_cluster = normalize(residual_sums, norm='l2', axis=1)
        flat = per_cluster.reshape(-1)

        # Power normalization: signed square root of every component
        flat = np.sign(flat) * np.sqrt(np.abs(flat))

        # Global L2 normalization of the whole vector
        return normalize(flat.reshape(1, -1), norm='l2')[0]

    def encode_batch(self, file_list, load_desc_func):
        """
        Compute the VLAD vector of every file in a list.

        Args:
            file_list: List of file paths
            load_desc_func: Callable returning the descriptors of one file

        Returns:
            vlads: np.array of shape (n_images, n_clusters * descriptor_dim)
        """
        encoded = []
        for position, name in enumerate(file_list):
            # Progress message every 100 files
            if position % 100 == 0:
                print(f"Encoding {position}/{len(file_list)}...")

            encoded.append(self.encode_single(load_desc_func(name)))

        return np.array(encoded)


# Example usage compatible with your code:
def compute_vlad_features(file_list, load_desc_safe, n_clusters=64,
                          encoder=None, max_vocab_files=1000):
    """
    Encode every file in `file_list` as a VLAD vector.

    IMPORTANT: without `encoder`, a NEW vocabulary is trained from
    `file_list` on every call. Features from two such calls are expressed
    against different cluster centers and are not comparable. To encode a
    validation or test set, pass the encoder that was fitted on the training
    set.

    Args:
        file_list: List of descriptor file paths
        load_desc_safe: Callable that loads the descriptors of one file and
            returns None when the file has none
        n_clusters: Number of clusters; ignored when `encoder` is given
        encoder: Optional, already fitted encoder exposing `encode_batch`.
            When given, no vocabulary is trained.
        max_vocab_files: How many files (taken from the start of `file_list`)
            are used to train the vocabulary

    Returns:
        vlad_features: Normalized VLAD representations, one row per file

    Raises:
        ValueError: if a vocabulary has to be trained but none of the sampled
            files contains descriptors.
    """
    if encoder is None:
        # Step 1: Collect descriptors for vocabulary training
        print("Collecting descriptors for vocabulary training...")
        sampled = []
        for file in file_list[:max_vocab_files]:  # Subset for efficiency
            desc = load_desc_safe(file)
            if desc is not None:
                sampled.append(desc)

        if not sampled:
            raise ValueError(
                "No usable descriptors found to train the VLAD vocabulary"
            )

        all_descriptors = np.vstack(sampled)
        print(f"Collected {len(all_descriptors)} descriptors")

        # Step 2: Train VLAD encoder
        encoder = VLADEncoder(n_clusters=n_clusters)
        encoder.fit(all_descriptors)

    # Step 3: Encode all images
    return encoder.encode_batch(file_list, load_desc_safe)

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import normalize
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

k = 256              # number of clusters
# Results with BoVW histograms on top of MiniBatchKMeans:
# k=3:     around 12% (same as plain KMeans)
# k=30:    around 43.5%, higher than with the reduced SIFT features
# k=200:   around 55%, about 5 min per CV run
# k=2000:  around 66%, but 22 min training
# k=2000 with batch size 10k: 66%, 23 min training
# k=5000 with batch size 10k: 69%
# MiniBatchKMeans scales much better with the number of clusters than KMeans,
# so k can be increased without a large time penalty.
batch_size = 8000        # minibatch size
desc_folder = "descriptors"
use_vlad = True          # False -> plain BoVW histograms instead of VLAD
vocab_sample_files = 1000  # training files used to learn the VLAD vocabulary

# random_state makes the shuffled folds (and the accuracies) reproducible
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
accuracies = []


def fit_bovw_vocabulary(file_list):
    """Fit MiniBatchKMeans by streaming descriptor files in chunks of ~batch_size rows."""
    kmeans = MiniBatchKMeans(
        n_clusters=k,
        batch_size=batch_size,
        random_state=0
    )
    batch = []
    current_size = 0

    for file in file_list:
        desc = load_desc_safe(file)

        # Ignore empty descriptors
        if desc is None or desc.size == 0:
            continue

        batch.append(desc)
        current_size += desc.shape[0]

        # Enough descriptors collected -> one incremental update
        if current_size >= batch_size:
            kmeans.partial_fit(np.vstack(batch))
            batch = []
            current_size = 0

    # Fit final batch if any
    if batch:
        kmeans.partial_fit(np.vstack(batch))
    return kmeans


def compute_histograms(file_list, kmeans):
    """Return one L2-normalised visual-word histogram per descriptor file."""
    hists = []
    for file in file_list:
        desc = load_desc_safe(file)
        if desc is None:
            hist = np.zeros(k, dtype=int)
        else:
            labels = kmeans.predict(desc)
            hist, _ = np.histogram(labels, bins=np.arange(k + 1))
        hists.append(hist)
    return normalize(np.array(hists), norm='l2')


def fit_vlad_encoder(file_list):
    """Fit a VLADEncoder on the first `vocab_sample_files` files of `file_list`."""
    sample = []
    for file in file_list[:vocab_sample_files]:
        desc = load_desc_safe(file)
        if desc is not None:
            sample.append(desc)
    if not sample:
        raise ValueError("No usable descriptors found to train the VLAD vocabulary")
    encoder = VLADEncoder(n_clusters=k)
    encoder.fit(np.vstack(sample))
    return encoder


for train_idx, val_idx in skf.split(X_train, y_train):

    # --- 1) Build list of descriptor filenames for this fold ---
    train_files = [f"desc_{i}.npy" for i in train_idx]
    val_files   = [f"desc_{i}.npy" for i in val_idx]

    # --- 2) Learn the vocabulary on the TRAINING files only, then encode
    #        both sets with that same vocabulary. Fitting a second vocabulary
    #        on the validation files would give features whose dimensions do
    #        not correspond to the ones the classifier was trained on.
    if use_vlad:
        encoder = fit_vlad_encoder(train_files)
        X_tr_hist = encoder.encode_batch(train_files, load_desc_safe)
        X_val_hist = encoder.encode_batch(val_files, load_desc_safe)
    else:
        kmeans = fit_bovw_vocabulary(train_files)
        X_tr_hist = compute_histograms(train_files, kmeans)
        X_val_hist = compute_histograms(val_files, kmeans)

    # Labels
    y_tr = [y_train[i] for i in train_idx]
    y_val = [y_train[i] for i in val_idx]

    # --- 3) Train the classifier ---
    clf = SVC(kernel='linear')
    clf.fit(X_tr_hist, y_tr)

    acc = clf.score(X_val_hist, y_val)
    accuracies.append(acc)
    print("Fold accuracy:", acc)

print("Mean CV accuracy:", np.mean(accuracies))
print("Std CV accuracy:", np.std(accuracies))


In [None]:
class BagOfVisualWords:
    """Visual vocabulary learned by clustering local image descriptors."""

    def __init__(self, n_clusters=64, batch_size = 5000):
        """
        Args:
            n_clusters: Number of visual words (clusters).
            batch_size: Minibatch size for MiniBatchKMeans. Any value <= 0
                (e.g. -1) selects standard KMeans instead.
        """
        self.n_clusters = n_clusters
        # Kept so that calculate_centers_batch knows when to flush a chunk
        self.batch_size = batch_size
        # By default we use MiniBatchKMeans
        # To use standard KMeans set batch_size to -1
        if batch_size > 0:
            self.kmeans = MiniBatchKMeans(
                n_clusters=n_clusters,
                batch_size=batch_size,
                random_state=0
            )
        else:
            self.kmeans = KMeans(
                n_clusters=n_clusters,
                random_state=0
            )
        self.centers = None

    def calculate_centers(self, descriptors):
        """Fit the vocabulary on one (n_descriptors, dim) array held in memory."""
        self.kmeans.fit(descriptors)
        self.centers = self.kmeans.cluster_centers_

    def calculate_centers_batch(self, descriptors):
        """Fit the vocabulary from an iterable of per-image descriptor arrays.

        With MiniBatchKMeans the arrays are accumulated and fed to
        `partial_fit` in chunks, so the full set never has to be stacked in
        memory at once. With standard KMeans, which has no `partial_fit`,
        everything is stacked and fitted in one go.

        Args:
            descriptors: Iterable of 2D arrays (one per image). None and
                empty entries are skipped.

        Raises:
            ValueError: if no entry contains any descriptor.
        """
        if not hasattr(self.kmeans, "partial_fit"):
            usable = [d for d in descriptors if d is not None and len(d) > 0]
            if not usable:
                raise ValueError("No descriptors available to fit the vocabulary")
            self.calculate_centers(np.vstack(usable))
            return

        # A chunk must hold at least n_clusters rows: the first partial_fit
        # call initialises the centers from the rows it receives.
        flush_at = max(self.batch_size, self.n_clusters)

        batch = []
        current_size = 0
        fitted = False
        for desc in descriptors:
            if desc is None or len(desc) == 0:
                continue
            batch.append(desc)
            current_size += desc.shape[0]  # number of rows, not the shape tuple

            if current_size >= flush_at:
                self.kmeans.partial_fit(np.vstack(batch))
                fitted = True
                batch = []
                current_size = 0

        # Remaining descriptors that did not fill a whole chunk
        if batch:
            self.kmeans.partial_fit(np.vstack(batch))
            fitted = True

        if not fitted:
            raise ValueError("No descriptors available to fit the vocabulary")

        self.centers = self.kmeans.cluster_centers_

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting on whatever X_tr_hist / X_val_hist currently hold
# (i.e. the features left behind by the cell that was run before this one).
clf = GradientBoostingClassifier(random_state=0, n_estimators=100).fit(X_tr_hist, y_tr)

acc = clf.score(X_val_hist, y_val)
print("Fold accuracy:", acc)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on whatever X_tr_hist / X_val_hist currently hold
# (i.e. the features left behind by the cell that was run before this one).
classifier = RandomForestClassifier(n_estimators=150, random_state=42)
classifier.fit(X_tr_hist, y_tr)

# Score the forest that was just fitted. Scoring `clf` here would report the
# accuracy of a different, previously trained model.
acc = classifier.score(X_val_hist, y_val)
print("Fold accuracy:", acc)