In [1]:
from main import *
from bovw import *

In [2]:
import os
import random
import numpy as np

%matplotlib inline
from matplotlib import pyplot as plt

In [3]:
# One seed shared by every source of randomness used in this notebook.
SEED = 42

# Seed the NumPy and stdlib global generators (independent of each other).
np.random.seed(SEED)
random.seed(SEED)

# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
# cannot change hashing in the already-running kernel; it is only visible to
# processes that inherit this environment.
os.environ["PYTHONHASHSEED"] = f"{SEED}"

In [4]:
# Load the two image folders of the reduced Places dataset: `train` is used
# for cross-validation below, `val` is kept as `data_test`.
data_train, data_test = (
    Dataset(ImageFolder=folder)
    for folder in (
        "../data/places_reduced/train",
        "../data/places_reduced/val",
    )
)

# Bare last expression so the notebook displays the (train, test) sizes.
tuple(len(split) for split in (data_train, data_test))

(8700, 2200)

In [5]:
# Shuffle both splits in place using the stdlib `random` generator.
# The call order matters: each shuffle consumes the shared RNG stream, so
# swapping the two lines would yield different permutations.
# NOTE: this mutates the datasets, so the cell is not idempotent — re-running
# it without re-running the loading cell reshuffles already-shuffled data.
random.shuffle(data_train)
random.shuffle(data_test)

# Dimensionality Reduction Experiments

This notebook explores different dimensionality reduction techniques applied to SIFT descriptors before building the visual vocabulary.

## Methods:

### 1. PCA (Principal Component Analysis)
- **Key Parameter**: `n_components` - number of components to keep
- **Additional**: `whiten` - whether to whiten the components (normalize variance)
- **Use case**: Unsupervised, finds directions of maximum variance

### 2. TruncatedSVD (Singular Value Decomposition)
- **Key Parameter**: `n_components` - number of components to keep
- **Additional**: `n_iter` - number of iterations for randomized solver
- **Use case**: Works with sparse data, similar to PCA but doesn't center data

### 3. LDA (Linear Discriminant Analysis)
- **Key Parameter**: `n_components` - number of components (max: n_classes - 1)
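- **Additional**: `solver` - {'svd', 'lsqr', 'eigen'}; only 'svd' and 'eigen' implement `transform`, so 'lsqr' cannot be used for dimensionality reduction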
- **Use case**: Supervised, maximizes class separability
- **Note**: Requires labels, limited to n_classes - 1 components (10 for 11 classes)

### 4. t-SNE (t-Distributed Stochastic Neighbor Embedding)
- **Key Parameters**: 
  - `n_components` - typically 2 or 3 for visualization
  - `perplexity` - balance between local and global structure (5-50)
  - `learning_rate` - gradient descent step size (10-1000)
- **Use case**: Visualization, preserves local structure
- **Warning**: Very slow, and scikit-learn's implementation has no `transform` for unseen data, so it is not recommended as a dimensionality-reduction step in classification pipelines

## Baseline: No Dimensionality Reduction

In [6]:
# Reference run: the BoVW pipeline with the dimensionality-reduction step
# disabled. `classifier_params` defined here is reused by every later
# experiment, and `baseline_scores` is the row they are compared against.
print("=" * 60, "BASELINE: NO DIMENSIONALITY REDUCTION", "=" * 60, sep="\n")

# BoVW configuration: SIFT detector, 512 visual words, no reduction.
baseline_params = dict(
    detector_type="SIFT",
    codebook_size=512,
    detector_kwargs={"nfeatures": 1000},
    dimensionality_reduction=None,
)

# Linear SVM with class re-weighting; seeded for reproducibility.
classifier_params = dict(
    kernel="linear",
    C=1.0,
    class_weight="balanced",
    random_state=SEED,
)

# 5-fold cross-validation on the training split only.
baseline_scores = cross_validate_bovw(
    dataset=data_train,
    bovw_kwargs=baseline_params,
    classifier_cls=SVC,
    classifier_kwargs=classifier_params,
    n_splits=5,
)

print(
    f"Baseline -> Train: {baseline_scores.train.accuracy.mean:.4f}"
    f" ± {baseline_scores.train.accuracy.std:.4f}"
)
print(
    f"Baseline -> Test:  {baseline_scores.test.accuracy.mean:.4f}"
    f" ± {baseline_scores.test.accuracy.std:.4f}"
)

BASELINE: NO DIMENSIONALITY REDUCTION


Phase [Setup]: Extracting the descriptors:  51%|█████     | 4434/8700 [00:01<00:01, 2330.40it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2313.47it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [01:23<00:00, 16.61s/it]

Baseline -> Train: 0.4752 ± 0.0056
Baseline -> Test:  0.3262 ± 0.0083





## Experiment 1: PCA - n_components

In [7]:
# PCA sweep: vary how many principal components are kept before the codebook
# is built. Results are stored per n_components in `pca_results`.
pca_results = {}

print("\n" + "=" * 60, "PCA: n_components", "=" * 60, sep="\n")

# SIFT descriptors are 128-dimensional, so 128 keeps every component.
for n_comp in (32, 64, 96, 128):
    print(f"\n=== Testing PCA with n_components={n_comp} ===")

    bovw_params = dict(
        detector_type="SIFT",
        codebook_size=512,
        detector_kwargs={"nfeatures": 1000},
        dimensionality_reduction="PCA",
        dimensionality_reduction_kwargs={
            "n_components": n_comp,
            "random_state": SEED,
        },
    )

    # Same classifier and fold count as the baseline, for a fair comparison.
    pca_results[n_comp] = scores = cross_validate_bovw(
        dataset=data_train,
        bovw_kwargs=bovw_params,
        classifier_cls=SVC,
        classifier_kwargs=classifier_params,
        n_splits=5,
    )

    print(
        f"n_components={n_comp} -> Test Accuracy: "
        f"{scores.test.accuracy.mean:.4f} ± {scores.test.accuracy.std:.4f}"
    )

# Recap table: one line per tested n_components.
print("\n" + "=" * 60, "PCA SUMMARY", "=" * 60, sep="\n")
for n_comp, result in pca_results.items():
    print(
        f"n_components={n_comp:3d} -> "
        f"Train: {result.train.accuracy.mean:.4f} ± {result.train.accuracy.std:.4f} | "
        f"Test: {result.test.accuracy.mean:.4f} ± {result.test.accuracy.std:.4f}"
    )


PCA: n_components

=== Testing PCA with n_components=32 ===


Phase [Setup]: Extracting the descriptors:  50%|█████     | 4374/8700 [00:01<00:01, 2713.70it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2665.01it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [01:20<00:00, 16.19s/it]


n_components=32 -> Test Accuracy: 0.3324 ± 0.0029

=== Testing PCA with n_components=64 ===


Phase [Setup]: Extracting the descriptors:  50%|█████     | 4379/8700 [00:01<00:01, 2721.95it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2677.46it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [01:33<00:00, 18.78s/it]


n_components=64 -> Test Accuracy: 0.3250 ± 0.0129

=== Testing PCA with n_components=96 ===


Phase [Setup]: Extracting the descriptors:  50%|████▉     | 4314/8700 [00:01<00:01, 2602.52it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2611.78it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [01:37<00:00, 19.45s/it]


n_components=96 -> Test Accuracy: 0.3302 ± 0.0086

=== Testing PCA with n_components=128 ===


Phase [Setup]: Extracting the descriptors:  50%|████▉     | 4317/8700 [00:01<00:01, 2683.42it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2639.70it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [01:40<00:00, 20.12s/it]

n_components=128 -> Test Accuracy: 0.3219 ± 0.0061

PCA SUMMARY
n_components= 32 -> Train: 0.4868 ± 0.0028 | Test: 0.3324 ± 0.0029
n_components= 64 -> Train: 0.4824 ± 0.0040 | Test: 0.3250 ± 0.0129
n_components= 96 -> Train: 0.4753 ± 0.0049 | Test: 0.3302 ± 0.0086
n_components=128 -> Train: 0.4748 ± 0.0057 | Test: 0.3219 ± 0.0061





## Experiment 2: PCA with Whitening

In [8]:
# PCA whitening ablation: n_components is fixed at 64 and only the `whiten`
# flag changes. Results are stored per flag value in `pca_whitening_results`.
pca_whitening_results = {}

print("\n" + "=" * 60)
print("PCA: Whitening Effect (n_components=64)")
print("=" * 60)

for whiten in [False, True]:
    print(f"\n=== Testing PCA with whiten={whiten} ===")

    bovw_params = {
        "detector_type": "SIFT",
        "codebook_size": 512,
        "detector_kwargs": {"nfeatures": 1000},
        "dimensionality_reduction": "PCA",
        "dimensionality_reduction_kwargs": {
            "n_components": 64,
            "whiten": whiten,
            "random_state": SEED,
        },
    }

    # Same classifier and fold count as the other experiments.
    scores = cross_validate_bovw(
        dataset=data_train,
        bovw_kwargs=bovw_params,
        classifier_cls=SVC,
        classifier_kwargs=classifier_params,
        n_splits=5
    )

    pca_whitening_results[whiten] = scores
    print(f"whiten={whiten} -> Test Accuracy: {scores.test.accuracy.mean:.4f} ± {scores.test.accuracy.std:.4f}")

print("\n" + "=" * 60)
print("PCA WHITENING SUMMARY")
print("=" * 60)
for whiten, result in pca_whitening_results.items():
    # `whiten` is a bool: with a bare width spec (`{whiten:5}`) it is formatted
    # as an int and prints 0/1. `!s` converts it to "False"/"True" first.
    print(f"whiten={whiten!s:5} -> Train: {result.train.accuracy.mean:.4f} ± {result.train.accuracy.std:.4f} | Test: {result.test.accuracy.mean:.4f} ± {result.test.accuracy.std:.4f}")


PCA: Whitening Effect (n_components=64)

=== Testing PCA with whiten=False ===


Phase [Setup]: Extracting the descriptors:  49%|████▉     | 4290/8700 [00:01<00:01, 2665.91it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2619.38it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [01:29<00:00, 17.85s/it]


whiten=False -> Test Accuracy: 0.3323 ± 0.0074

=== Testing PCA with whiten=True ===


Phase [Setup]: Extracting the descriptors:  50%|█████     | 4376/8700 [00:01<00:01, 2725.27it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2673.95it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [01:29<00:00, 17.98s/it]

whiten=True -> Test Accuracy: 0.3246 ± 0.0029

PCA WHITENING SUMMARY
whiten=    0 -> Train: 0.4786 ± 0.0047 | Test: 0.3323 ± 0.0074
whiten=    1 -> Train: 0.4688 ± 0.0046 | Test: 0.3246 ± 0.0029





## Experiment 3: SVD - n_components

In [9]:
# SVD sweep: same protocol as the PCA sweep, with the "SVD" reducer.
# Results are stored per n_components in `svd_results`.
svd_results = {}

print("\n" + "=" * 60, "TRUNCATED SVD: n_components", "=" * 60, sep="\n")

# The largest setting is 127, not 128, to stay strictly below the descriptor
# dimensionality. NOTE(review): the strict bound applies to the 'arpack'
# solver; confirm which algorithm the pipeline's SVD wrapper uses.
for n_comp in (32, 64, 96, 127):
    print(f"\n=== Testing SVD with n_components={n_comp} ===")

    bovw_params = dict(
        detector_type="SIFT",
        codebook_size=512,
        detector_kwargs={"nfeatures": 1000},
        dimensionality_reduction="SVD",
        dimensionality_reduction_kwargs={
            "n_components": n_comp,
            "random_state": SEED,
        },
    )

    # Same classifier and fold count as the baseline, for a fair comparison.
    svd_results[n_comp] = scores = cross_validate_bovw(
        dataset=data_train,
        bovw_kwargs=bovw_params,
        classifier_cls=SVC,
        classifier_kwargs=classifier_params,
        n_splits=5,
    )

    print(
        f"n_components={n_comp} -> Test Accuracy: "
        f"{scores.test.accuracy.mean:.4f} ± {scores.test.accuracy.std:.4f}"
    )

# Recap table: one line per tested n_components.
print("\n" + "=" * 60, "SVD SUMMARY", "=" * 60, sep="\n")
for n_comp, result in svd_results.items():
    print(
        f"n_components={n_comp:3d} -> "
        f"Train: {result.train.accuracy.mean:.4f} ± {result.train.accuracy.std:.4f} | "
        f"Test: {result.test.accuracy.mean:.4f} ± {result.test.accuracy.std:.4f}"
    )


TRUNCATED SVD: n_components

=== Testing SVD with n_components=32 ===


Phase [Setup]: Extracting the descriptors:  51%|█████     | 4415/8700 [00:01<00:01, 2736.08it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2708.02it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [02:15<00:00, 27.14s/it]


n_components=32 -> Test Accuracy: 0.3311 ± 0.0063

=== Testing SVD with n_components=64 ===


Phase [Setup]: Extracting the descriptors:  51%|█████     | 4403/8700 [00:01<00:01, 2689.12it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2671.24it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [03:06<00:00, 37.35s/it]


n_components=64 -> Test Accuracy: 0.3302 ± 0.0085

=== Testing SVD with n_components=96 ===


Phase [Setup]: Extracting the descriptors:  50%|█████     | 4357/8700 [00:01<00:01, 2700.68it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2669.94it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [03:58<00:00, 47.68s/it]


n_components=96 -> Test Accuracy: 0.3284 ± 0.0068

=== Testing SVD with n_components=127 ===


Phase [Setup]: Extracting the descriptors:  50%|█████     | 4367/8700 [00:01<00:01, 2710.73it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2674.83it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [04:59<00:00, 59.97s/it]

n_components=127 -> Test Accuracy: 0.3259 ± 0.0067

SVD SUMMARY
n_components= 32 -> Train: 0.4810 ± 0.0045 | Test: 0.3311 ± 0.0063
n_components= 64 -> Train: 0.4804 ± 0.0047 | Test: 0.3302 ± 0.0085
n_components= 96 -> Train: 0.4784 ± 0.0033 | Test: 0.3284 ± 0.0068
n_components=127 -> Train: 0.4792 ± 0.0045 | Test: 0.3259 ± 0.0067





## Experiment 4: LDA - n_components

**Note**: LDA is supervised and requires labels. Maximum n_components = n_classes - 1 = 10 for this dataset.

In [10]:
# LDA sweep over the number of discriminant components. LDA is supervised and
# capped at n_classes - 1 components, i.e. 10 here, so 10 is the largest value.
# Results are stored per n_components in `lda_results`.
lda_results = {}

print("\n" + "=" * 60, "LDA: n_components (supervised)", "=" * 60, sep="\n")

for n_comp in (5, 8, 10):
    print(f"\n=== Testing LDA with n_components={n_comp} ===")

    # No random_state is passed for LDA, unlike the PCA/SVD configurations.
    bovw_params = dict(
        detector_type="SIFT",
        codebook_size=512,
        detector_kwargs={"nfeatures": 1000},
        dimensionality_reduction="LDA",
        dimensionality_reduction_kwargs={
            "n_components": n_comp,
        },
    )

    # Same classifier and fold count as the baseline, for a fair comparison.
    lda_results[n_comp] = scores = cross_validate_bovw(
        dataset=data_train,
        bovw_kwargs=bovw_params,
        classifier_cls=SVC,
        classifier_kwargs=classifier_params,
        n_splits=5,
    )

    print(
        f"n_components={n_comp} -> Test Accuracy: "
        f"{scores.test.accuracy.mean:.4f} ± {scores.test.accuracy.std:.4f}"
    )

# Recap table: one line per tested n_components.
print("\n" + "=" * 60, "LDA SUMMARY", "=" * 60, sep="\n")
for n_comp, result in lda_results.items():
    print(
        f"n_components={n_comp:2d} -> "
        f"Train: {result.train.accuracy.mean:.4f} ± {result.train.accuracy.std:.4f} | "
        f"Test: {result.test.accuracy.mean:.4f} ± {result.test.accuracy.std:.4f}"
    )


LDA: n_components (supervised)

=== Testing LDA with n_components=5 ===


Phase [Setup]: Extracting the descriptors:  49%|████▉     | 4297/8700 [00:01<00:01, 2691.06it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2655.72it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [04:17<00:00, 51.49s/it]


n_components=5 -> Test Accuracy: 0.3312 ± 0.0047

=== Testing LDA with n_components=8 ===


Phase [Setup]: Extracting the descriptors:  48%|████▊     | 4210/8700 [00:01<00:02, 1527.91it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors:  98%|█████████▊| 8568/8700 [00:06<00:00, 872.05it/s] 

Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:06<00:00, 1382.81it/s]
100%|██████████| 5/5 [04:15<00:00, 51.06s/it]


n_components=8 -> Test Accuracy: 0.3355 ± 0.0097

=== Testing LDA with n_components=10 ===


Phase [Setup]: Extracting the descriptors:  52%|█████▏    | 4499/8700 [00:02<00:02, 2087.65it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:04<00:00, 2123.56it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [04:09<00:00, 49.89s/it]

n_components=10 -> Test Accuracy: 0.3450 ± 0.0128

LDA SUMMARY
n_components= 5 -> Train: 0.4819 ± 0.0034 | Test: 0.3312 ± 0.0047
n_components= 8 -> Train: 0.5055 ± 0.0071 | Test: 0.3355 ± 0.0097
n_components=10 -> Train: 0.5025 ± 0.0048 | Test: 0.3450 ± 0.0128





## Experiment 5: LDA Solvers

In [None]:
# LDA solver comparison at the maximum n_components=10.
# Results are stored per solver name in `lda_solver_results`.
lda_solver_results = {}

print("\n" + "=" * 60, "LDA: Solver Comparison (n_components=10)", "=" * 60, sep="\n")

# 'lsqr' is deliberately left out: it does not implement `transform`
# (it raises NotImplementedError), so it cannot project the descriptors.
for solver in ("svd", "eigen"):
    print(f"\n=== Testing LDA with solver={solver} ===")

    bovw_params = dict(
        detector_type="SIFT",
        codebook_size=512,
        detector_kwargs={"nfeatures": 1000},
        dimensionality_reduction="LDA",
        dimensionality_reduction_kwargs={
            "n_components": 10,
            "solver": solver,
        },
    )

    # Same classifier and fold count as the baseline, for a fair comparison.
    lda_solver_results[solver] = scores = cross_validate_bovw(
        dataset=data_train,
        bovw_kwargs=bovw_params,
        classifier_cls=SVC,
        classifier_kwargs=classifier_params,
        n_splits=5,
    )

    print(
        f"solver={solver} -> Test Accuracy: "
        f"{scores.test.accuracy.mean:.4f} ± {scores.test.accuracy.std:.4f}"
    )

# Recap table: one line per tested solver.
print("\n" + "=" * 60, "LDA SOLVER SUMMARY", "=" * 60, sep="\n")
for solver, result in lda_solver_results.items():
    print(
        f"solver={solver:5s} -> "
        f"Train: {result.train.accuracy.mean:.4f} ± {result.train.accuracy.std:.4f} | "
        f"Test: {result.test.accuracy.mean:.4f} ± {result.test.accuracy.std:.4f}"
    )


LDA: Solver Comparison (n_components=10)

=== Testing LDA with solver=svd ===


Phase [Setup]: Extracting the descriptors:  52%|█████▏    | 4507/8700 [00:02<00:01, 2558.81it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2332.20it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


100%|██████████| 5/5 [04:08<00:00, 49.70s/it]


solver=svd -> Test Accuracy: 0.3426 ± 0.0118

=== Testing LDA with solver=lsqr ===


Phase [Setup]: Extracting the descriptors:  50%|████▉     | 4316/8700 [00:01<00:01, 2646.22it/s]

Could not compute descriptors for image ../data/places_reduced/train/water_ice_snow/iceberg_00000023.jpg of class 9.


Phase [Setup]: Extracting the descriptors: 100%|██████████| 8700/8700 [00:03<00:00, 2642.70it/s]


Could not compute descriptors for image ../data/places_reduced/train/mountains_hills_desert_sky/sky_00001410.jpg of class 5.


  0%|          | 0/5 [00:04<?, ?it/s]


NotImplementedError: transform not implemented for 'lsqr' solver (use 'svd' or 'eigen').

## Final Comparison: All Methods

In [None]:
# Side-by-side recap: the baseline plus one configuration per reduction method.
# Requires the baseline, PCA, SVD and LDA n_components cells to have been run.
print(
    "\n" + "=" * 60,
    "FINAL COMPARISON: ALL DIMENSIONALITY REDUCTION METHODS",
    "=" * 60,
    sep="\n",
)

# NOTE(review): the PCA and SVD rows are pinned to n=64 regardless of which
# n_components scored best in their sweeps.
comparison = {
    "Baseline (None)": baseline_scores,
    "PCA (n=64)": pca_results[64],
    "SVD (n=64)": svd_results[64],
    "LDA (n=10)": lda_results[10],
}

for method_name, result in comparison.items():
    print(
        f"{method_name:20s} -> "
        f"Train: {result.train.accuracy.mean:.4f} ± {result.train.accuracy.std:.4f} | "
        f"Test: {result.test.accuracy.mean:.4f} ± {result.test.accuracy.std:.4f}"
    )

# Winner = entry with the highest mean cross-validated test accuracy
# (`max` keeps the first entry on ties). `best_method` is a (name, scores) pair.
best_method = max(
    comparison.items(),
    key=lambda entry: entry[1].test.accuracy.mean,
)
print(
    f"\nBest method: {best_method[0]} "
    f"with test accuracy {best_method[1].test.accuracy.mean:.4f}"
)