# Parameter Impact Analysis - BoVW Image Classification

Comprehensive visualization of how different parameters affect model performance

In [None]:
import ast
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Global look: seaborn whitegrid style, "talk" context and the husl palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("talk")
sns.set_palette("husl")

# Series colours shared by the figures in this notebook.
TRAIN_COLOR, VAL_COLOR, TEST_COLOR = '#1c3b4b', '#2b8183', '#F18F01'

# Typography overrides, set after the seaborn context so they are the final values.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.size': 11,
    'axes.labelsize': 13,
    'axes.titlesize': 14,
    'axes.titleweight': 'bold',
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'legend.fontsize': 11,
    'figure.titlesize': 16,
    'figure.titleweight': 'bold',
})

## 1. Codebook Size Impact

How does vocabulary size affect model performance?

In [None]:
# Codebook-size sweep results; the plot below reads 'codebook_size' and the
# mean/std train and validation accuracy columns.
df_codebook = pd.read_csv('Week1/results/codebook_size_experiment_results.csv')

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

df_plot = df_codebook.sort_values('codebook_size')
x = df_plot['codebook_size']

# One accuracy curve plus a mean ± std band per split (training drawn first).
for split, marker, color, label in (
    ('train', 'o', TRAIN_COLOR, 'Training'),
    ('val', 's', VAL_COLOR, 'Validation'),
):
    mean = df_plot[f'mean_{split}_acc']
    std = df_plot[f'std_{split}_acc']
    ax.plot(x, mean, marker=marker, linewidth=2.5, markersize=6,
            color=color, label=label, alpha=0.9)
    ax.fill_between(x, mean - std, mean + std, color=color, alpha=0.2)

# Mark the codebook size with the highest mean validation accuracy.
best_idx = df_plot['mean_val_acc'].idxmax()
best_k = df_plot.loc[best_idx, 'codebook_size']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
ax.axvline(best_k, color='red', linestyle='--', linewidth=1.5, alpha=0.5,
           label=f'Optimal k={int(best_k)}')
ax.scatter([best_k], [best_acc], color='red', s=150, zorder=5, marker='*')

ax.set_title('Impact of Vocabulary Size on Performance', fontweight='bold', pad=15)
ax.set_xlabel('Codebook Size (k)', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_xlim([x.min() - 100, x.max() + 100])
ax.grid(True, alpha=0.3, linestyle=':')
ax.legend(loc='lower right', frameon=True, shadow=True)

plt.tight_layout()
plt.savefig('Week1/results/codebook_size_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## 2. Classifier Comparison

Comparing different classification algorithms

In [None]:
# Classifier comparison results; rows carry the 'classifier' and 'clf_params'
# columns that get_classifier_name reads.
df_clf = pd.read_csv('Week1/results/classifier_comparison_results.csv')

def get_classifier_name(row):
    """Return a two-line display label for the classifier described by ``row``.

    'LogisticRegression' and 'HistIntersectionSVM' map to fixed labels. Any
    other ``row['classifier']`` is labelled as an SVM whose kernel is read
    from the dict literal in ``row['clf_params']`` ('unknown' if the dict has
    no 'kernel' key), upper-cased.
    """
    fixed_labels = {
        'LogisticRegression': 'Logistic\nRegression',
        'HistIntersectionSVM': 'Histogram\nIntersection SVM',
    }
    clf = row['classifier']
    if clf in fixed_labels:
        return fixed_labels[clf]

    kernel = ast.literal_eval(row['clf_params']).get('kernel', 'unknown')
    return f'SVM\n({kernel.upper()})'

# Per-row display label; used as the x-tick labels of the classifier plots.
df_clf['clf_name'] = df_clf.apply(get_classifier_name, axis=1)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Classifiers ranked best-first by mean validation accuracy.
df_plot = df_clf.sort_values('mean_val_acc', ascending=False)
x_pos = np.arange(len(df_plot))
width = 0.35

# Styling common to both bar series; only position, data, colour and label differ.
bar_style = dict(edgecolor='black', linewidth=1.5,
                 error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5})

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'], width,
               yerr=df_plot['std_train_acc'], color=TRAIN_COLOR,
               label='Training (±std)', **bar_style)
bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'], width,
               yerr=df_plot['std_val_acc'], color=VAL_COLOR,
               label='Validation (±std)', **bar_style)

# Write each bar's height (three decimals) just above it.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02, f'{height:.3f}',
            ha='center', va='bottom', fontsize=9)

ax.set_title('Classifier Performance Comparison', fontweight='bold', pad=15)
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_ylim([0, 1.0])
ax.set_xticks(x_pos)
ax.set_xticklabels(df_plot['clf_name'], rotation=0, ha='center')
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.legend(loc='upper right', frameon=True, shadow=True)

plt.tight_layout()
plt.savefig('Week1/results/classifier_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. SVM Kernel Impact

How do different SVM kernels perform?

In [None]:
# SVM kernel experiment results; rows carry 'classifier' and 'clf_params',
# which extract_kernel reads.
df_kernel = pd.read_csv('Week1/results/svm_kernel_experiment_results.csv')

def extract_kernel(row):
    """Return the kernel label for ``row``.

    Rows whose ``classifier`` is 'HistIntersectionSVM' get a fixed two-line
    label. Otherwise the 'kernel' entry of the dict literal in
    ``row['clf_params']`` is returned upper-cased ('UNKNOWN' when absent).
    """
    if row['classifier'] != 'HistIntersectionSVM':
        parsed = ast.literal_eval(row['clf_params'])
        return parsed.get('kernel', 'unknown').upper()
    return 'Histogram\nIntersection'

# Per-row kernel label; used as the x-tick labels of the kernel plots.
df_kernel['kernel'] = df_kernel.apply(extract_kernel, axis=1)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Kernels ranked best-first by mean validation accuracy.
df_plot = df_kernel.sort_values('mean_val_acc', ascending=False)
x_pos = np.arange(len(df_plot))
width = 0.35

# Styling common to both bar series; only position, data, colour and label differ.
bar_style = dict(edgecolor='black', linewidth=1.5,
                 error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5})

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'], width,
               yerr=df_plot['std_train_acc'], color=TRAIN_COLOR,
               label='Training (±std)', **bar_style)
bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'], width,
               yerr=df_plot['std_val_acc'], color=VAL_COLOR,
               label='Validation (±std)', **bar_style)

# Write each bar's height (three decimals) just above it.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02, f'{height:.3f}',
            ha='center', va='bottom', fontsize=9)

ax.set_title('Impact of SVM Kernel Type', fontweight='bold', pad=15)
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_ylim([0, 1.0])
ax.set_xticks(x_pos)
ax.set_xticklabels(df_plot['kernel'], rotation=0, ha='center')
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.legend(loc='upper right', frameon=True, shadow=True)

plt.tight_layout()
plt.savefig('Week1/results/svm_kernel_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Class Weight Impact

Does balancing classes improve performance?

In [None]:
# Class-weight experiment results; each row's 'clf_params' is parsed by
# extract_class_weight.
df_weight = pd.read_csv('Week1/results/svm_class_weight_experiment_results.csv')

def extract_class_weight(row):
    """Return 'Balanced' when the row's clf_params set class_weight='balanced'.

    ``row['clf_params']`` is parsed as a dict literal; any other
    ``class_weight`` value, or a missing key, yields the string 'None'.
    """
    class_weight = ast.literal_eval(row['clf_params']).get('class_weight')
    if class_weight == 'balanced':
        return 'Balanced'
    return 'None'

# Per-row 'Balanced' / 'None' label; used as the x-tick labels of the plot below.
df_weight['class_weight'] = df_weight.apply(extract_class_weight, axis=1)

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Settings ranked best-first by mean validation accuracy.
df_plot = df_weight.sort_values('mean_val_acc', ascending=False)
x_pos = np.arange(len(df_plot))
width = 0.35

# Styling common to both bar series; only position, data, colour and label differ.
bar_style = dict(edgecolor='black', linewidth=1.5,
                 error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5})

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'], width,
               yerr=df_plot['std_train_acc'], color=TRAIN_COLOR,
               label='Training (±std)', **bar_style)
bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'], width,
               yerr=df_plot['std_val_acc'], color=VAL_COLOR,
               label='Validation (±std)', **bar_style)

# Write each bar's height (three decimals) just above it.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02, f'{height:.3f}',
            ha='center', va='bottom', fontsize=9)

# Annotate the effect of balancing as (Balanced - None) in percentage points.
# The rows are sorted best-first, so subtracting row 1 from row 0 would never
# be negative and could not show whether balancing helped or hurt; the two
# settings are therefore looked up by label instead of by position.
val_by_weight = df_plot.set_index('class_weight')['mean_val_acc']
if len(df_plot) == 2 and set(val_by_weight.index) == {'Balanced', 'None'}:
    diff = (val_by_weight['Balanced'] - val_by_weight['None']) * 100
    mid_x = 0.5
    y_level = max(df_plot['mean_val_acc']) + max(df_plot['std_val_acc']) + 0.10
    ax.annotate(f'{diff:+.2f} pp (Balanced - None)',
                xy=(0, df_plot.iloc[0]['mean_val_acc']),
                xytext=(mid_x, y_level),
                fontsize=11, fontweight='bold', color='green' if diff > 0 else 'red',
                ha='center')

ax.set_xticks(x_pos)
ax.set_xticklabels(df_plot['class_weight'], rotation=0, ha='center')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_title('Impact of Class Balancing', fontweight='bold', pad=15)
ax.legend(loc='upper right', frameon=True, shadow=True)
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.set_ylim([0, 1.0])

plt.tight_layout()
plt.savefig('Week1/results/svm_class_weight_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Summary: All Parameter Impacts

Combined view of all experiments

In [None]:
fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(2, 2, hspace=0.3, wspace=0.3)

# (A) Top row, full width: validation accuracy vs. codebook size with a mean ± std band.
ax1 = fig.add_subplot(gs[0, :])
df_cb = df_codebook.sort_values('codebook_size')
cb_sizes = df_cb['codebook_size']
cb_mean = df_cb['mean_val_acc']
cb_std = df_cb['std_val_acc']
ax1.plot(cb_sizes, cb_mean, marker='o', linewidth=2.5, markersize=7,
         color=VAL_COLOR, label='Validation', alpha=0.9)
ax1.fill_between(cb_sizes, cb_mean - cb_std, cb_mean + cb_std,
                 color=VAL_COLOR, alpha=0.2)
ax1.set_title('(A) Codebook Size Impact', fontweight='bold', fontsize=13)
ax1.set_xlabel('Codebook Size (k)', fontweight='bold')
ax1.set_ylabel('Validation Accuracy', fontweight='bold')
ax1.grid(True, alpha=0.3, linestyle=':')

# Bar geometry and styling shared by panels (B) and (C).
width = 0.35
summary_bar_style = dict(alpha=0.7, edgecolor='black', linewidth=1.2,
                         error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5})

# (B) Bottom left: train/val accuracy per classifier, best validation first.
ax2 = fig.add_subplot(gs[1, 0])
df_c = df_clf.sort_values('mean_val_acc', ascending=False)
x_pos = np.arange(len(df_c))
ax2.bar(x_pos - width/2, df_c['mean_train_acc'], width, yerr=df_c['std_train_acc'],
        color=TRAIN_COLOR, label='Train', **summary_bar_style)
ax2.bar(x_pos + width/2, df_c['mean_val_acc'], width, yerr=df_c['std_val_acc'],
        color=VAL_COLOR, label='Val', **summary_bar_style)
ax2.set_xticks(x_pos)
# Flatten the two-line labels onto one line for the tilted ticks.
ax2.set_xticklabels([' '.join(name.split('\n')) for name in df_c['clf_name']],
                    fontsize=9, rotation=15, ha='right')
ax2.set_title('(B) Classifier Comparison', fontweight='bold', fontsize=13)
ax2.set_ylabel('Accuracy', fontweight='bold')
ax2.set_ylim([0, 1.0])
ax2.grid(axis='y', alpha=0.3, linestyle=':')
ax2.legend(loc='upper right', fontsize=9)

# (C) Bottom right: train/val accuracy per SVM kernel, best validation first.
ax3 = fig.add_subplot(gs[1, 1])
df_k = df_kernel.sort_values('mean_val_acc', ascending=False)
x_pos = np.arange(len(df_k))
ax3.bar(x_pos - width/2, df_k['mean_train_acc'], width, yerr=df_k['std_train_acc'],
        color=TRAIN_COLOR, label='Train', **summary_bar_style)
ax3.bar(x_pos + width/2, df_k['mean_val_acc'], width, yerr=df_k['std_val_acc'],
        color=VAL_COLOR, label='Val', **summary_bar_style)
ax3.set_xticks(x_pos)
ax3.set_xticklabels([' '.join(k.split('\n')) for k in df_k['kernel']], fontsize=10)
ax3.set_title('(C) SVM Kernel Impact', fontweight='bold', fontsize=13)
ax3.set_ylabel('Accuracy', fontweight='bold')
ax3.set_ylim([0, 1.0])
ax3.grid(axis='y', alpha=0.3, linestyle=':')
ax3.legend(loc='upper right', fontsize=9)

plt.savefig('Week1/results/parameter_impact_summary.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Regularization Parameter C - RBF SVM

Impact of C parameter on RBF kernel SVM

In [None]:
def extract_C(row):
    """Return the 'C' entry of the dict literal in ``row['clf_params']``.

    Falls back to 1.0 when the parsed dict has no 'C' key.
    """
    parsed = ast.literal_eval(row['clf_params'])
    return parsed['C'] if 'C' in parsed else 1.0

# Load the SVM C-sweep results and pull each row's C out of its clf_params.
df_C_svm = pd.read_csv('./Week1/results/C_parameter_svm_results.csv')
df_C_svm['C_value'] = df_C_svm.apply(extract_C, axis=1)

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Average the mean/std accuracy columns over rows sharing a C value, ordered by C.
metric_cols = ('mean_train_acc', 'std_train_acc', 'mean_val_acc', 'std_val_acc')
df_plot = (
    df_C_svm.groupby('C_value')
    .agg({col: 'mean' for col in metric_cols})
    .reset_index()
    .sort_values('C_value')
)

x_pos = np.arange(len(df_plot))
width = 0.35

# Styling common to both bar series; only position, data, colour and label differ.
bar_style = dict(edgecolor='black', linewidth=1.5,
                 error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5})

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'], width,
               yerr=df_plot['std_train_acc'], color=TRAIN_COLOR,
               label='Training (±std)', **bar_style)
bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'], width,
               yerr=df_plot['std_val_acc'], color=VAL_COLOR,
               label='Validation (±std)', **bar_style)

# Write each bar's height (three decimals) just above it.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02, f'{height:.3f}',
            ha='center', va='bottom', fontsize=8)

# Star the bar group of the C with the highest mean validation accuracy.
best_idx = df_plot['mean_val_acc'].idxmax()
best_C = df_plot.loc[best_idx, 'C_value']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
best_x_pos = df_plot.index.get_loc(best_idx)
ax.scatter([best_x_pos], [best_acc], color='red', s=200, zorder=5, marker='*')

ax.set_title('Impact of C Parameter - RBF SVM', fontweight='bold', pad=15)
ax.set_xlabel('C Parameter', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_ylim([0, 1.0])
ax.set_xticks(x_pos)
ax.set_xticklabels([str(c) for c in df_plot['C_value']], rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.legend(loc='best', frameon=True, shadow=True)

plt.tight_layout()
plt.savefig('Week1/results/C_parameter_svm_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Regularization Parameter C - Logistic Regression

Impact of C parameter on Logistic Regression

In [None]:
# Load the Logistic Regression C-sweep results and pull each row's C out of its clf_params.
df_C_log = pd.read_csv('./Week1/results/C_parameter_logistic_results.csv')
df_C_log['C_value'] = df_C_log.apply(extract_C, axis=1)

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Average the mean/std accuracy columns over rows sharing a C value, ordered by C.
metric_cols = ('mean_train_acc', 'std_train_acc', 'mean_val_acc', 'std_val_acc')
df_plot = (
    df_C_log.groupby('C_value')
    .agg({col: 'mean' for col in metric_cols})
    .reset_index()
    .sort_values('C_value')
)

x_pos = np.arange(len(df_plot))
width = 0.35

# Styling common to both bar series; only position, data, colour and label differ.
bar_style = dict(edgecolor='black', linewidth=1.5,
                 error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5})

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'], width,
               yerr=df_plot['std_train_acc'], color=TRAIN_COLOR,
               label='Training (±std)', **bar_style)
bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'], width,
               yerr=df_plot['std_val_acc'], color=VAL_COLOR,
               label='Validation (±std)', **bar_style)

# Write each bar's height (three decimals) just above it.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02, f'{height:.3f}',
            ha='center', va='bottom', fontsize=8)

# Star the bar group of the C with the highest mean validation accuracy.
best_idx = df_plot['mean_val_acc'].idxmax()
best_C = df_plot.loc[best_idx, 'C_value']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
best_x_pos = df_plot.index.get_loc(best_idx)
ax.scatter([best_x_pos], [best_acc], color='red', s=200, zorder=5, marker='*')

ax.set_title('Impact of C Parameter - Logistic Regression', fontweight='bold', pad=15)
ax.set_xlabel('C Parameter', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_ylim([0, 1.0])
ax.set_xticks(x_pos)
ax.set_xticklabels([str(c) for c in df_plot['C_value']], rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.legend(loc='best', frameon=True, shadow=True)

plt.tight_layout()
plt.savefig('Week1/results/C_parameter_logistic_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Regularization Parameter C - Histogram Intersection SVM

Impact of C parameter on Histogram Intersection SVM

In [None]:
# Load the Histogram Intersection SVM C-sweep results and pull each row's C out of its clf_params.
df_C_hist = pd.read_csv('./Week1/results/C_parameter_histogram_results.csv')
df_C_hist['C_value'] = df_C_hist.apply(extract_C, axis=1)

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Average the mean/std accuracy columns over rows sharing a C value, ordered by C.
metric_cols = ('mean_train_acc', 'std_train_acc', 'mean_val_acc', 'std_val_acc')
df_plot = (
    df_C_hist.groupby('C_value')
    .agg({col: 'mean' for col in metric_cols})
    .reset_index()
    .sort_values('C_value')
)

x_pos = np.arange(len(df_plot))
width = 0.35

# Styling common to both bar series; only position, data, colour and label differ.
bar_style = dict(edgecolor='black', linewidth=1.5,
                 error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5})

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'], width,
               yerr=df_plot['std_train_acc'], color=TRAIN_COLOR,
               label='Training (±std)', **bar_style)
bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'], width,
               yerr=df_plot['std_val_acc'], color=VAL_COLOR,
               label='Validation (±std)', **bar_style)

# Write each bar's height (three decimals) just above it.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02, f'{height:.3f}',
            ha='center', va='bottom', fontsize=8)

# Star the bar group of the C with the highest mean validation accuracy.
best_idx = df_plot['mean_val_acc'].idxmax()
best_C = df_plot.loc[best_idx, 'C_value']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
best_x_pos = df_plot.index.get_loc(best_idx)
ax.scatter([best_x_pos], [best_acc], color='red', s=200, zorder=5, marker='*')

ax.set_title('Impact of C Parameter - Histogram Intersection SVM', fontweight='bold', pad=15)
ax.set_xlabel('C Parameter', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_ylim([0, 1.0])
ax.set_xticks(x_pos)
ax.set_xticklabels([str(c) for c in df_plot['C_value']], rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.legend(loc='best', frameon=True, shadow=True)

plt.tight_layout()
plt.savefig('Week1/results/C_parameter_histogram_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. SIFT nFeatures Impact

How does the number of SIFT features affect performance?

In [None]:
# SIFT nfeatures test results; the feature count is parsed from each row's 'detector' value below.
df_nfeatures = pd.read_csv('Week1/results/sift_nfeatures_test_results.csv')

def extract_nfeatures(row):
    """Return the integer that follows 'nf' in ``row['detector']``, or None.

    The detector value is converted to ``str`` before matching, so a
    missing (NaN) or otherwise non-string detector yields None instead of
    raising ``TypeError``. None is also returned when the text contains no
    'nf<digits>' token.
    """
    match = re.search(r'nf(\d+)', str(row['detector']))
    return int(match.group(1)) if match else None

# Parse the feature count from each detector name, then drop rows where none was found.
df_nfeatures['nfeatures'] = df_nfeatures.apply(extract_nfeatures, axis=1)
df_nfeatures = df_nfeatures.dropna(subset=['nfeatures'])

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Average the mean/std accuracy columns over rows sharing an nfeatures value.
metric_cols = ('mean_train_acc', 'std_train_acc', 'mean_val_acc', 'std_val_acc')
df_plot = (
    df_nfeatures.groupby('nfeatures')
    .agg({col: 'mean' for col in metric_cols})
    .reset_index()
    .sort_values('nfeatures')
)

x = df_plot['nfeatures']

# One accuracy curve plus a mean ± std band per split (training drawn first).
for split, marker, color, label in (
    ('train', 'o', TRAIN_COLOR, 'Training'),
    ('val', 's', VAL_COLOR, 'Validation'),
):
    mean = df_plot[f'mean_{split}_acc']
    std = df_plot[f'std_{split}_acc']
    ax.plot(x, mean, marker=marker, linewidth=2.5, markersize=6,
            color=color, label=label, alpha=0.9)
    ax.fill_between(x, mean - std, mean + std, color=color, alpha=0.2)

# Mark the nfeatures value with the highest mean validation accuracy.
best_idx = df_plot['mean_val_acc'].idxmax()
best_nf = df_plot.loc[best_idx, 'nfeatures']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
ax.axvline(best_nf, color='red', linestyle='--', linewidth=1.5, alpha=0.5,
           label=f'Optimal nf={int(best_nf)}')
ax.scatter([best_nf], [best_acc], color='red', s=150, zorder=5, marker='*')

ax.set_title('Impact of SIFT nFeatures Parameter', fontweight='bold', pad=15)
ax.set_xlabel('Number of SIFT Features', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.grid(True, alpha=0.3, linestyle=':')
ax.legend(loc='best', frameon=True, shadow=True)

plt.tight_layout()
plt.savefig('Week1/results/sift_nfeatures_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## 10. SVM Gamma Parameter Impact

How does the gamma parameter affect SVM RBF kernel performance?

In [None]:
# SVM gamma experiment results; each row's 'clf_params' is parsed by extract_gamma.
df_gamma = pd.read_csv('./Week1/results/svm_gamma_experiment_results.csv')

def extract_gamma(row):
    """Return the 'gamma' entry of the dict literal in ``row['clf_params']``.

    Falls back to the string 'scale' when the parsed dict has no 'gamma' key.
    """
    return ast.literal_eval(row['clf_params']).get('gamma', 'scale')

# Per-row gamma setting, as returned by extract_gamma.
df_gamma['gamma_value'] = df_gamma.apply(extract_gamma, axis=1)

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

def gamma_sort_key(gamma_val):
    """Return a tuple sort key that orders numeric gammas before named ones.

    Numbers map to ``(0, value)``. Anything else maps to ``(1, rank)`` with
    rank 1 for 'auto', 2 for 'scale' and 999 for any other value.
    """
    if not isinstance(gamma_val, (int, float)):
        named_rank = {'auto': 1, 'scale': 2}
        return (1, named_rank.get(gamma_val, 999))
    return (0, gamma_val)

# Order the bars by gamma_sort_key: numeric gammas ascending, then named settings.
df_plot = df_gamma.copy()
df_plot['sort_key'] = df_plot['gamma_value'].apply(gamma_sort_key)
df_plot = df_plot.sort_values('sort_key')

x_pos = np.arange(len(df_plot))
width = 0.35

# Styling common to both bar series; only position, data, colour and label differ.
bar_style = dict(edgecolor='black', linewidth=1.5,
                 error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5})

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'], width,
               yerr=df_plot['std_train_acc'], color=TRAIN_COLOR,
               label='Training (±std)', **bar_style)
bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'], width,
               yerr=df_plot['std_val_acc'], color=VAL_COLOR,
               label='Validation (±std)', **bar_style)

# Write each bar's height (three decimals) just above it.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02, f'{height:.3f}',
            ha='center', va='bottom', fontsize=7)

# Star the bar group of the gamma with the highest mean validation accuracy.
# get_loc translates the row label into its position in the sorted frame,
# the same way the C-parameter cells do.
best_idx = df_plot['mean_val_acc'].idxmax()
best_gamma = df_plot.loc[best_idx, 'gamma_value']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
best_x_pos = df_plot.index.get_loc(best_idx)
ax.scatter([best_x_pos], [best_acc], color='red', s=200, zorder=5, marker='*')

# Tick labels: named gammas as-is, numeric gammas as text.
x_labels = []
for val in df_plot['gamma_value']:
    if not isinstance(val, (int, float)):
        x_labels.append(val)
    elif 0 < val < 1e-4:
        # Four fixed decimals would print these as '0.0000'; 'g' keeps the digits.
        x_labels.append(f'{val:g}')
    elif val < 0.01:
        x_labels.append(f'{val:.4f}')
    else:
        x_labels.append(f'{val}')

ax.set_xticks(x_pos)
ax.set_xticklabels(x_labels, rotation=45, ha='right')
ax.set_xlabel('Gamma Parameter', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_title('Impact of SVM Gamma Parameter', fontweight='bold', pad=15)
ax.legend(loc='best', frameon=True, shadow=True)
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.set_ylim([0, 1.0])

plt.tight_layout()
plt.savefig('Week1/results/svm_gamma_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## 11. Spatial Pyramid Impact

How does spatial pyramid pooling affect performance?

In [None]:
# Spatial pyramid experiment results; 'spatial_pyramid' is mapped to a display name below.
df_pyramid = pd.read_csv('./Week1/results/spatial_pyramid_experiment_results.csv')

def get_pyramid_name(row):
    """Return the display name for ``row['spatial_pyramid']``.

    A missing value (None or NaN) and the literal text 'None' all map to
    'None'. A missing entry read back from CSV arrives as NaN, whose string
    form is 'nan'; without this check such rows would be named 'nan' and
    never match the 'None' baseline that the plotting cell looks for.

    'horizontal', 'vertical' and 'square' map to their capitalised form;
    any other value is returned as its string form unchanged.
    """
    raw = row['spatial_pyramid']
    pyramid_type = str(raw)
    if pd.isna(raw) or pyramid_type == 'None':
        return 'None'

    display_names = {
        'horizontal': 'Horizontal',
        'vertical': 'Vertical',
        'square': 'Square',
    }
    return display_names.get(pyramid_type, pyramid_type)

# Per-row display name; the plotting cell filters on this column.
df_pyramid['pyramid_name'] = df_pyramid.apply(get_pyramid_name, axis=1)

In [None]:
pyramid_types = ['horizontal', 'vertical', 'square']
pyramid_types_present = [pt for pt in pyramid_types if pt.capitalize() in df_pyramid['pyramid_name'].values]

# One panel per pyramid type found in the results; nothing is drawn if none is present.
if pyramid_types_present:
    n_panels = len(pyramid_types_present)
    fig, axes = plt.subplots(1, n_panels, figsize=(6*n_panels, 5))
    # A single panel comes back as a bare Axes; wrap it so it can be indexed.
    axes = [axes] if n_panels == 1 else axes

    for idx, ptype in enumerate(pyramid_types_present):
        ax = axes[idx]
        panel_name = ptype.capitalize()

        df_plot = df_pyramid[df_pyramid['pyramid_name'] == panel_name].copy()

        # Prepend the no-pyramid rows as level 0 so every panel shows the baseline.
        df_none = df_pyramid[df_pyramid['pyramid_name'] == 'None'].copy()
        if not df_none.empty:
            df_none['pyramid_levels'] = 0
            df_plot = pd.concat([df_none, df_plot])

        df_plot = df_plot.sort_values('pyramid_levels')
        x = df_plot['pyramid_levels']

        # One accuracy curve plus a mean ± std band per split (training drawn first).
        for split, marker, color, label in (
            ('train', 'o', TRAIN_COLOR, 'Training'),
            ('val', 's', VAL_COLOR, 'Validation'),
        ):
            mean = df_plot[f'mean_{split}_acc']
            std = df_plot[f'std_{split}_acc']
            ax.plot(x, mean, marker=marker, linewidth=2.5, markersize=6,
                    color=color, label=label, alpha=0.9)
            ax.fill_between(x, mean - std, mean + std, color=color, alpha=0.2)

        # Mark the best level among real pyramid rows (level 0 is excluded).
        df_plot_no_none = df_plot[df_plot['pyramid_levels'] > 0]
        if not df_plot_no_none.empty:
            best_idx = df_plot_no_none['mean_val_acc'].idxmax()
            best_levels = df_plot_no_none.loc[best_idx, 'pyramid_levels']
            best_acc = df_plot_no_none.loc[best_idx, 'mean_val_acc']
            ax.axvline(best_levels, color='red', linestyle='--', linewidth=1.5, alpha=0.5,
                       label=f'Optimal levels={int(best_levels)}')
            ax.scatter([best_levels], [best_acc], color='red', s=150, zorder=5, marker='*')

        ax.set_title(f'{ptype.capitalize()} Pyramid', fontweight='bold', pad=15)
        ax.set_xlabel('Pyramid Levels (0=None)', fontweight='bold')
        ax.set_ylabel('Accuracy', fontweight='bold')
        ax.set_ylim([0, 1.0])
        ax.grid(True, alpha=0.3, linestyle=':')
        ax.legend(loc='best', frameon=True, shadow=True, fontsize=9)

    plt.tight_layout()
    plt.savefig('Week1/results/spatial_pyramid_impact.png', dpi=300, bbox_inches='tight')
    plt.show()

## 12. PCA Dimensionality Impact

How does PCA dimensionality reduction affect performance?

In [None]:
# PCA dimensionality experiment results.
df_pca = pd.read_csv('./Week1/results/pca_dimensionality_results.csv')

# Missing pca_dim values become 0 and rows with dimension 0 are then dropped,
# so only rows with a positive pca_dim remain.
df_pca['pca_dimension'] = df_pca['pca_dim'].fillna(0).astype(int)
df_pca = df_pca[df_pca['pca_dimension'] > 0]

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Average the accuracy columns (and explained variance) over rows sharing a PCA dimension.
metric_cols = ('mean_train_acc', 'std_train_acc', 'mean_val_acc',
               'std_val_acc', 'pca_explained_variance')
df_plot = (
    df_pca.groupby('pca_dimension')
    .agg({col: 'mean' for col in metric_cols})
    .reset_index()
    .sort_values('pca_dimension')
)

x = df_plot['pca_dimension']

# One accuracy curve plus a mean ± std band per split (training drawn first).
for split, marker, color, label in (
    ('train', 'o', TRAIN_COLOR, 'Training'),
    ('val', 's', VAL_COLOR, 'Validation'),
):
    mean = df_plot[f'mean_{split}_acc']
    std = df_plot[f'std_{split}_acc']
    ax.plot(x, mean, marker=marker, linewidth=2.5, markersize=6,
            color=color, label=label, alpha=0.9)
    ax.fill_between(x, mean - std, mean + std, color=color, alpha=0.2)

# Mark the dimension with the highest mean validation accuracy.
best_idx = df_plot['mean_val_acc'].idxmax()
best_dim = df_plot.loc[best_idx, 'pca_dimension']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
ax.axvline(best_dim, color='red', linestyle='--', linewidth=1.5, alpha=0.5,
           label=f'Optimal dim={int(best_dim)}')
ax.scatter([best_dim], [best_acc], color='red', s=150, zorder=5, marker='*')

ax.set_title('Impact of PCA Dimensionality Reduction', fontweight='bold', pad=15)
ax.set_xlabel('PCA Dimensions', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.grid(True, alpha=0.3, linestyle=':')
ax.legend(loc='best', frameon=True, shadow=True)

plt.tight_layout()
plt.savefig('Week1/results/pca_dimensionality_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## 13. Detector Comparison

Comparing different feature detectors: SIFT, ORB, and AKAZE

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

df_plot = df_detector.sort_values('mean_val_acc', ascending=False)
x_pos = np.arange(len(df_plot))
width = 0.35

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'],
       width, yerr=df_plot['std_train_acc'],
       color=TRAIN_COLOR, edgecolor='black', linewidth=1.5,
       error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5},
       label='Training (±std)')

bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'],
       width, yerr=df_plot['std_val_acc'],
       color=VAL_COLOR, edgecolor='black', linewidth=1.5,
       error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5},
       label='Validation (±std)')

for bar in bars1:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
            f'{height:.3f}',
            ha='center', va='bottom', fontsize=9)

for bar in bars2:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
            f'{height:.3f}',
            ha='center', va='bottom', fontsize=9)

ax.set_xticks(x_pos)
ax.set_xticklabels(df_plot['detector'], rotation=0, ha='center')
ax.set_xlabel('Feature Detector', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_title('Feature Detector Comparison', fontweight='bold', pad=15)
ax.legend(loc='upper right', frameon=True, shadow=True)
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.set_ylim([0, 1.0])

plt.tight_layout()
plt.savefig('Week1/results/detector_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
df_detector = pd.read_csv('./Week1/results/detector_comparison_test_results.csv')

## 14. Dense SIFT Step Impact

How does the step parameter affect Dense SIFT performance?

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

df_plot = df_dense_step.sort_values('mean_val_acc', ascending=False)
x_pos = np.arange(len(df_plot))
width = 0.35

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'],
       width, yerr=df_plot['std_train_acc'],
       color=TRAIN_COLOR, edgecolor='black', linewidth=1.5,
       error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5},
       label='Training (±std)')

bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'],
       width, yerr=df_plot['std_val_acc'],
       color=VAL_COLOR, edgecolor='black', linewidth=1.5,
       error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5},
       label='Validation (±std)')

for bar in bars1:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
            f'{height:.3f}',
            ha='center', va='bottom', fontsize=9)

for bar in bars2:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
            f'{height:.3f}',
            ha='center', va='bottom', fontsize=9)

best_idx = df_plot['mean_val_acc'].idxmax()
best_step = df_plot.loc[best_idx, 'dense_step']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
best_x_pos = df_plot.reset_index(drop=True)[df_plot.index == best_idx].index[0]
ax.scatter([best_x_pos], [best_acc], color='red', s=200, zorder=5, marker='*')

ax.set_xticks(x_pos)
ax.set_xticklabels(df_plot['dense_step'], rotation=0, ha='center')
ax.set_xlabel('Dense SIFT Step Size', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_title('Impact of Dense SIFT Step Parameter', fontweight='bold', pad=15)
ax.legend(loc='best', frameon=True, shadow=True)
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.set_ylim([0, 1.0])

plt.tight_layout()
plt.savefig('Week1/results/dense_sift_step_impact.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
df_dense_step = pd.read_csv('./Week1/results/dense_sift_step_test_results.csv')

## 15. Dense SIFT Scale Impact

How does the scale parameter affect Dense SIFT performance?

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

df_plot = df_dense_scale.sort_values('mean_val_acc', ascending=False)
x_pos = np.arange(len(df_plot))
width = 0.35

bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'],
       width, yerr=df_plot['std_train_acc'],
       color=TRAIN_COLOR, edgecolor='black', linewidth=1.5,
       error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5},
       label='Training (±std)')

bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'],
       width, yerr=df_plot['std_val_acc'],
       color=VAL_COLOR, edgecolor='black', linewidth=1.5,
       error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5},
       label='Validation (±std)')

for bar in bars1:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
            f'{height:.3f}',
            ha='center', va='bottom', fontsize=8)

for bar in bars2:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
            f'{height:.3f}',
            ha='center', va='bottom', fontsize=8)

best_idx = df_plot['mean_val_acc'].idxmax()
best_scale = df_plot.loc[best_idx, 'scales_label']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
best_x_pos = df_plot.reset_index(drop=True)[df_plot.index == best_idx].index[0]
ax.scatter([best_x_pos], [best_acc], color='red', s=200, zorder=5, marker='*')

ax.set_xticks(x_pos)
ax.set_xticklabels(df_plot['scales_label'], rotation=45, ha='right', fontsize=9)
ax.set_xlabel('Dense SIFT Scales', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_title('Impact of Dense SIFT Scale Parameter', fontweight='bold', pad=15)
ax.legend(loc='best', frameon=True, shadow=True)
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.set_ylim([0, 1.0])

plt.tight_layout()
plt.savefig('Week1/results/dense_sift_scale_impact.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
df_dense_scale = pd.read_csv('./Week1/results/dense_sift_scale_test_results.csv')

def extract_dense_scales(row):
    """Return the row's ``dense_scales`` entry as a list-style label string.

    The entry is rendered with ``str`` and parsed with ``ast.literal_eval``.
    A parsed list or tuple is shown as a list (``'[4, 8]'``); any other parsed
    value is wrapped in a one-element list (``8`` -> ``'[8]'``).

    Args:
        row: Mapping-like object (for example a DataFrame row) that has a
            ``'dense_scales'`` entry.

    Returns:
        str: The label, always formatted like a Python list.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when the
            entry is not a valid Python literal.
    """
    scales = ast.literal_eval(str(row['dense_scales']))
    # Tuples are normalised to lists so "(4, 8)" is labelled "[4, 8]" instead
    # of being wrapped as the single element "[(4, 8)]".
    if isinstance(scales, (list, tuple)):
        return str(list(scales))
    return str([scales])

# Derive one printable label per row from its 'dense_scales' entry.
scale_labels = df_dense_scale.apply(extract_dense_scales, axis='columns')
df_dense_scale['scales_label'] = scale_labels

In [None]:
# Read the detector comparison results CSV into a DataFrame.
detector_results_path = './Week1/results/detector_comparison_test_results.csv'
df_detector = pd.read_csv(detector_results_path)

In [None]:
# Grouped bar chart of training vs. validation accuracy for every row of
# df_dense_step, ordered by descending validation accuracy. The row with the
# highest validation accuracy is starred and named in the legend.
fig, ax = plt.subplots(figsize=(10, 6))

df_plot = df_dense_step.sort_values('mean_val_acc', ascending=False)
x_pos = np.arange(len(df_plot))
width = 0.35  # width of a single bar; each tick holds a train/validation pair

# Training bars sit left of each tick, validation bars right of it; the error
# bars are drawn from the std_* columns.
bars1 = ax.bar(x_pos - width/2, df_plot['mean_train_acc'],
       width, yerr=df_plot['std_train_acc'],
       color=TRAIN_COLOR, edgecolor='black', linewidth=1.5,
       error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5},
       label='Training (±std)')

bars2 = ax.bar(x_pos + width/2, df_plot['mean_val_acc'],
       width, yerr=df_plot['std_val_acc'],
       color=VAL_COLOR, edgecolor='black', linewidth=1.5,
       error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5},
       label='Validation (±std)')

# Print every bar's height slightly above its top edge.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
            f'{height:.3f}',
            ha='center', va='bottom', fontsize=9)

# Highlight the configuration with the highest mean validation accuracy.
best_idx = df_plot['mean_val_acc'].idxmax()
best_step = df_plot.loc[best_idx, 'dense_step']
best_acc = df_plot.loc[best_idx, 'mean_val_acc']
# idxmax returns an index *label*; get_loc turns it into the row's position in
# the sorted frame, which is the tick it occupies on the x axis.
best_x_pos = df_plot.index.get_loc(best_idx)
# The label puts the star into the legend so the figure explains it.
ax.scatter([best_x_pos], [best_acc], color='red', s=200, zorder=5, marker='*',
           label=f'Best validation: step={best_step}')

ax.set_xticks(x_pos)
ax.set_xticklabels(df_plot['dense_step'], rotation=0, ha='center')
ax.set_xlabel('Dense SIFT Step Size', fontweight='bold')
ax.set_ylabel('Accuracy', fontweight='bold')
ax.set_title('Impact of Dense SIFT Step Parameter', fontweight='bold', pad=15)
ax.legend(loc='best', frameon=True, shadow=True)
ax.grid(axis='y', alpha=0.3, linestyle=':')
ax.set_ylim([0, 1.0])

plt.tight_layout()
plt.savefig('Week1/results/dense_sift_step_impact.png', dpi=300, bbox_inches='tight')
plt.show()

## Dense SIFT Step Impact

How does the step parameter affect Dense SIFT performance?

In [None]:
# Read the Dense SIFT scale results CSV into a DataFrame.
dense_scale_results_path = './Week1/results/dense_sift_scale_test_results.csv'
df_dense_scale = pd.read_csv(dense_scale_results_path)

def extract_dense_scales(row):
    """Return the row's ``dense_scales`` entry as a list-style label string.

    The entry is rendered with ``str`` and parsed with ``ast.literal_eval``.
    A parsed list or tuple is shown as a list (``'[4, 8]'``); any other parsed
    value is wrapped in a one-element list (``8`` -> ``'[8]'``).

    Args:
        row: Mapping-like object (for example a DataFrame row) that has a
            ``'dense_scales'`` entry.

    Returns:
        str: The label, always formatted like a Python list.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when the
            entry is not a valid Python literal.
    """
    scales = ast.literal_eval(str(row['dense_scales']))
    # Tuples are normalised to lists so "(4, 8)" is labelled "[4, 8]" instead
    # of being wrapped as the single element "[(4, 8)]".
    if isinstance(scales, (list, tuple)):
        return str(list(scales))
    return str([scales])

# Derive one printable label per row from its 'dense_scales' entry.
scale_labels = df_dense_scale.apply(extract_dense_scales, axis='columns')
df_dense_scale['scales_label'] = scale_labels

In [None]:
# Scaler comparison figure: the left panel shows training and validation
# accuracy per scaler, the right panel shows the train - validation gap.
# Rows whose scaler_type is missing after loading are labelled 'none'.
df_scaler = (
    pd.read_csv('Week1/results/encoding_scaler_test_results.csv')
    .assign(scaler_type=lambda frame: frame['scaler_type'].fillna('none'))
)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

df_plot = df_scaler.sort_values(by='test_acc', ascending=False)
x_pos = np.arange(df_plot.shape[0])
width = 0.35

# --- Left panel: one bar series per split, each annotated with its height ---
series_specs = [
    (-width / 2, 'mean_train_acc', 'std_train_acc', TRAIN_COLOR, 'Training (±std)'),
    (+width / 2, 'mean_val_acc', 'std_val_acc', VAL_COLOR, 'Validation (±std)'),
]
for offset, mean_col, std_col, colour, legend_text in series_specs:
    container = ax1.bar(
        x_pos + offset,
        df_plot[mean_col],
        width,
        yerr=df_plot[std_col],
        color=colour,
        edgecolor='black',
        linewidth=1.5,
        error_kw={'linewidth': 2, 'ecolor': 'black', 'capsize': 5},
        label=legend_text,
    )
    for rect in container:
        top = rect.get_height()
        ax1.text(
            rect.get_x() + rect.get_width() / 2.,
            top + 0.02,
            f'{top:.3f}',
            ha='center', va='bottom', fontsize=9,
        )

ax1.set_ylabel('Accuracy', fontweight='bold')
ax1.set_title('Train vs Validation Accuracy by Scaler', fontweight='bold', pad=15)
ax1.legend(loc='upper right', frameon=True, shadow=True)
ax1.set_ylim(0, 1.0)

# --- Right panel: train - validation gap, one bar per scaler ---
df_plot['train_val_gap'] = df_plot['mean_train_acc'].sub(df_plot['mean_val_acc'])
gap = df_plot['train_val_gap']

# Gaps above 0.4 are drawn in TEST_COLOR, all others in TRAIN_COLOR.
gap_colours = [TEST_COLOR if value > 0.4 else TRAIN_COLOR for value in gap]
bars = ax2.bar(x_pos, gap, color=gap_colours, edgecolor='black', linewidth=1.5)

for rect, gap_value in zip(bars, gap):
    top = rect.get_height()
    # Positive bars carry their label above the bar, others below it.
    anchor = 'bottom' if top > 0 else 'top'
    ax2.text(
        rect.get_x() + rect.get_width() / 2.,
        top,
        f'{gap_value:.3f}',
        ha='center', va=anchor, fontsize=10, fontweight='bold',
    )

ax2.set_ylabel('Train - Validation Gap', fontweight='bold')
ax2.set_title('Overfitting: Train-Validation Gap', fontweight='bold', pad=15)
ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.8)

# Tick labels and the dotted horizontal grid are identical on both panels.
for panel in (ax1, ax2):
    panel.set_xticks(x_pos)
    panel.set_xticklabels(df_plot['scaler_type'], rotation=0, ha='center')
    panel.grid(axis='y', alpha=0.3, linestyle=':')

plt.tight_layout()
plt.savefig('Week1/results/encoding_scaler_impact.png', dpi=300, bbox_inches='tight')
plt.show()