In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns

In [None]:
# Load the handwritten-digits dataset bundled with scikit-learn.
digits = load_digits()
print(digits.data.shape)

# Feature matrix (one row per sample) and the digit label of each row.
X, y = digits.data, digits.target

n_samples, n_features = X.shape
print(f"Number of samples: {n_samples}")
print(f"Number of features per sample: {n_features}")
print(f"Unique classes in the dataset: {np.unique(y)}")

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Preview the first ten digits in a 2x5 grid, each titled with its label.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(10, 4))
for position, ax in enumerate(axes.flat):
    ax.imshow(digits.images[position], cmap='gray_r')
    ax.set_title(f'Label: {digits.target[position]}')
    ax.set_axis_off()
plt.tight_layout()
plt.show()

In [None]:
# Learn the per-feature mean and standard deviation from the training split only,
# then standardise the training data with those statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Visualise the per-pixel mean of the training digits, as learned by the scaler.
fig, axarr = plt.subplots(1, 1, figsize=(15, 3))

# scaler.mean_ holds one mean per feature; reshape it to 8x8 to display it as an image.
axarr.imshow(scaler.mean_.reshape(8, 8), cmap='gray_r')
axarr.axis('off')
axarr.set_title('Mean Digit')

plt.tight_layout()
plt.show()

In [None]:
# Fit a 2-component PCA on the standardised training data and project that data onto
# the two components, giving a 2-D representation that can be plotted directly.
pca = PCA(n_components=2)
X_train_proj = pca.fit_transform(X_train_scaled)

In [None]:
# Push a single training sample through the same pipeline as the training data.
# Start from the raw X_train row (not X_train_scaled): the scaler must be applied
# exactly once, otherwise the sample is standardised twice and lands in the wrong place.
x_dummy_scaled = scaler.transform(X_train[:1])     # scale using the same scaler
x_dummy_pca = pca.transform(x_dummy_scaled)        # project into PCA space
print(f"A data point in the original space (shape {x_dummy_scaled.shape}):")
print(x_dummy_scaled)
print(f"A data point in the new space (shape {x_dummy_pca.shape}):")
print(x_dummy_pca)

In [None]:
# One hue per digit class.
colors = sns.color_palette('hsv', 10)

fig, ax = plt.subplots(figsize=(12, 8))

for digit in range(len(colors)):
    # Boolean mask selecting the training samples of the current class
    indices = (y_train == digit)
    class_points = X_train_proj[indices]
    ax.scatter(
        class_points[:, 0],  # first principal component
        class_points[:, 1],  # second principal component
        color=colors[digit],
        s=50,
        alpha=0.6,
        label=str(digit),
    )

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA Projection (2D) of Digits Data')
# Anchor the legend outside the axes so that it does not cover any points.
ax.legend(title='Digits', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()

# A Game: Is it 6?

First, let's relabel the data.

In [None]:
# Digit treated as the positive class in the binary "is it this digit?" task.
TARGET_VALUE = 6

In [None]:
# Binary relabelling: True where the digit equals TARGET_VALUE, False otherwise.
y6_train = (y_train == TARGET_VALUE)

colors = sns.color_palette('Set1', 2)

plt.figure(figsize=(8, 6))

# Derive the legend names from TARGET_VALUE so that they stay correct if the target changes.
class_names = [f'Not {TARGET_VALUE}', f'{TARGET_VALUE}']

for label, color, name in zip([False, True], colors, class_names):
    indices = (y6_train == label)
    plt.scatter(X_train_proj[indices, 0],
                X_train_proj[indices, 1],
                color=color,
                s=40, alpha=0.7, label=name)

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title(f'PCA Projection (2D): {TARGET_VALUE} vs. Not {TARGET_VALUE}')
plt.legend(title='Class', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle=':')
plt.tight_layout()
plt.show()

In [None]:
# Apply the scaler and PCA that were fitted on the training split to the held-out data
# (transform only; nothing is re-fitted on the test set).
X_test_scaled = scaler.transform(X_test)
X_test_proj = pca.transform(X_test_scaled)

# Binary test labels: True where the digit equals TARGET_VALUE.
y6_test = np.equal(y_test, TARGET_VALUE)

# AdaBoost with sklearn

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Weak learner: a decision tree limited to depth 1.
base_learner = DecisionTreeClassifier(max_depth=1)

# Boost 50 such trees on the 2-D PCA features; the seed keeps the run reproducible.
ada_params = dict(
    estimator=base_learner,
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
)
ada = AdaBoostClassifier(**ada_params)
ada.fit(X_train_proj, y6_train)

# Accuracy on the data the model was trained on.
y6_train_pred = ada.predict(X_train_proj)
ada_train_accuracy = accuracy_score(y6_train, y6_train_pred)
print("Training Accuracy:", ada_train_accuracy)

# Accuracy on the held-out data.
y6_test_pred = ada.predict(X_test_proj)
ada_test_accuracy = accuracy_score(y6_test, y6_test_pred)
print("Test Accuracy:", ada_test_accuracy)

In [None]:
# Element-wise agreement between the predictions and the binary training labels.
correct = y6_train_pred == y6_train
incorrect = ~correct

plt.figure(figsize=(8, 6))
colors = sns.color_palette('Set1', 2)
class_names = [f'Not {TARGET_VALUE}', f'{TARGET_VALUE}']

# Correct predictions are drawn first as outlined circles, mistakes afterwards as crosses.
outcome_styles = [
    (correct, 'correct', dict(marker='o', edgecolor='k', alpha=0.6)),
    (incorrect, 'wrong', dict(marker='x', s=100, linewidths=2)),
]
for outcome_mask, outcome, style in outcome_styles:
    for label, color, name in zip([False, True], colors, class_names):
        mask = (y6_train == label) & outcome_mask
        plt.scatter(X_train_proj[mask, 0], X_train_proj[mask, 1],
                    color=color, label=f'{name} ({outcome})', **style)

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title(f'AdaBoost Results on Training Set ({TARGET_VALUE} vs Not {TARGET_VALUE})')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle=':')
plt.tight_layout()
plt.show()

In [None]:
# Predict on test set
y6_test_pred = ada.predict(X_test_proj)

# Element-wise agreement between the predictions and the binary test labels.
correct = (y6_test_pred == y6_test)
incorrect = ~correct

plt.figure(figsize=(8, 6))

colors = sns.color_palette('Set1', 2)
# Legend names follow TARGET_VALUE so that they stay correct if the target changes.
class_names = [f'Not {TARGET_VALUE}', f'{TARGET_VALUE}']

# Plot correct predictions as circles
for label, color, name in zip([False, True], colors, class_names):
    mask = (y6_test == label) & correct
    plt.scatter(X_test_proj[mask, 0], X_test_proj[mask, 1],
                color=color, marker='o', edgecolor='k', alpha=0.6, label=f'{name} (correct)')

# Plot incorrect predictions as crosses
for label, color, name in zip([False, True], colors, class_names):
    mask = (y6_test == label) & incorrect
    plt.scatter(X_test_proj[mask, 0], X_test_proj[mask, 1],
                color=color, marker='x', s=100, linewidths=2, label=f'{name} (wrong)')

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
# Name the split and the target explicitly, mirroring the title of the training-set plot.
plt.title(f'AdaBoost Results on Test Set ({TARGET_VALUE} vs Not {TARGET_VALUE})')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle=':')
plt.tight_layout()
plt.show()

## Look at where there are issues

In [None]:
# Test set

# Flag every misclassified test sample, then split the mistakes by their true class.
incorrect = y6_test_pred != y6_test

wrong_6 = y6_test & incorrect        # true label 6, predicted not-6
wrong_not6 = ~y6_test & incorrect    # true label not-6, predicted 6

def show_incorrect(images, mask, true_label, pred_label, max_show=25,
                   labels=None, preds=None):
    """Plot the misclassified digits selected by a boolean mask, five per row.

    Parameters
    ----------
    images : array-like
        Image data aligned with ``mask``; each selected entry is reshaped to 8x8
        before being drawn.
    mask : array-like of bool
        True for every sample that should be shown.
    true_label, pred_label : str
        Text used in the figure title to describe the kind of error.
    max_show : int, default 25
        Upper bound on the number of images drawn.
    labels : array-like, optional
        True digit of each sample, aligned with ``mask``. Defaults to the
        notebook-level ``y_test``.
    preds : array-like, optional
        Binary predictions aligned with ``mask`` (truthy means "is TARGET_VALUE").
        Defaults to the notebook-level ``y6_test_pred``.
    """
    if labels is None:
        labels = y_test
    if preds is None:
        preds = y6_test_pred

    idxs = np.where(mask)[0][:max_show]
    if len(idxs) == 0:
        # plt.subplots cannot build a grid with zero rows, so report and stop instead.
        print(f'No incorrect {true_label}s (predicted {pred_label}) to show.')
        return

    ncols = 5
    nrows = int(np.ceil(len(idxs) / ncols))
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 2.2*nrows))
    axes = axes.ravel()

    for ax, idx in zip(axes, idxs):
        pred_text = f'{TARGET_VALUE}' if preds[idx] else f'¬{TARGET_VALUE}'
        ax.imshow(np.asarray(images[idx]).reshape(8, 8), cmap='gray_r')
        ax.set_title(f'True: {int(labels[idx])}\nPred: {pred_text}', fontsize=10)

    # Hide the frame of every cell, including unused trailing cells of the grid.
    for ax in axes:
        ax.axis('off')

    fig.suptitle(f'Incorrect {true_label}s (predicted {pred_label})', fontsize=14)
    plt.tight_layout()
    plt.show()

# The masks index the test split, so pass the test images, labels and predictions
# (digits.images is ordered like the full dataset and would show unrelated digits).
show_incorrect(X_test, wrong_6, true_label=f'{TARGET_VALUE}', pred_label=f'not-{TARGET_VALUE}',
               labels=y_test, preds=y6_test_pred)
show_incorrect(X_test, wrong_not6, true_label=f'not-{TARGET_VALUE}', pred_label=f'{TARGET_VALUE}',
               labels=y_test, preds=y6_test_pred)

In [None]:
# Train set

# Flag every misclassified training sample, then split the mistakes by their true class.
incorrect = y6_train_pred != y6_train

wrong_6 = y6_train & incorrect        # true label 6, predicted not-6
wrong_not6 = ~y6_train & incorrect    # true label not-6, predicted 6

def show_incorrect(images, mask, true_label, pred_label, max_show=25,
                   labels=None, preds=None):
    """Plot the misclassified digits selected by a boolean mask, five per row.

    Parameters
    ----------
    images : array-like
        Image data aligned with ``mask``; each selected entry is reshaped to 8x8
        before being drawn.
    mask : array-like of bool
        True for every sample that should be shown.
    true_label, pred_label : str
        Text used in the figure title to describe the kind of error.
    max_show : int, default 25
        Upper bound on the number of images drawn.
    labels : array-like, optional
        True digit of each sample, aligned with ``mask``. Defaults to the
        notebook-level ``y_train``.
    preds : array-like, optional
        Binary predictions aligned with ``mask`` (truthy means "is TARGET_VALUE").
        Defaults to the notebook-level ``y6_train_pred``.
    """
    if labels is None:
        labels = y_train
    if preds is None:
        preds = y6_train_pred

    idxs = np.where(mask)[0][:max_show]
    if len(idxs) == 0:
        # plt.subplots cannot build a grid with zero rows, so report and stop instead.
        print(f'No incorrect {true_label}s (predicted {pred_label}) to show.')
        return

    ncols = 5
    nrows = int(np.ceil(len(idxs) / ncols))
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 2.2*nrows))
    axes = axes.ravel()

    for ax, idx in zip(axes, idxs):
        pred_text = f'{TARGET_VALUE}' if preds[idx] else f'¬{TARGET_VALUE}'
        ax.imshow(np.asarray(images[idx]).reshape(8, 8), cmap='gray_r')
        ax.set_title(f'True: {int(labels[idx])}\nPred: {pred_text}', fontsize=10)

    # Hide the frame of every cell, including unused trailing cells of the grid.
    for ax in axes:
        ax.axis('off')

    fig.suptitle(f'Incorrect {true_label}s (predicted {pred_label})', fontsize=14)
    plt.tight_layout()
    plt.show()

# The masks index the training split, so pass the training images, labels and predictions
# (digits.images is ordered like the full dataset and would show unrelated digits).
show_incorrect(X_train, wrong_6, true_label=f'{TARGET_VALUE}', pred_label=f'not-{TARGET_VALUE}',
               labels=y_train, preds=y6_train_pred)
show_incorrect(X_train, wrong_not6, true_label=f'not-{TARGET_VALUE}', pred_label=f'{TARGET_VALUE}',
               labels=y_train, preds=y6_train_pred)

# Implementing AdaBoost

In [None]:
class Decision:
    """Empty placeholder class with no attributes or behaviour yet.

    NOTE(review): presumably meant to become the weak learner used by the
    from-scratch AdaBoost in this notebook; confirm before relying on it.
    """

In [None]:
class AdaBoost:
    """Skeleton of a hand-written AdaBoost classifier.

    Only the constructor is functional; ``fit`` and ``predict`` are placeholders
    that raise ``NotImplementedError``.
    """

    # Message shared by both unimplemented methods.
    _PENDING_MESSAGE = "After homework deadline; remind Jim to update this notebook."

    def __init__(self, n_estimators=10):
        """Store the number of estimators requested for the ensemble."""
        self.n_estimators = n_estimators

    def fit(self, X, y):
        """Placeholder for training; always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PENDING_MESSAGE)

    def predict(self, X):
        """Placeholder for prediction; always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PENDING_MESSAGE)

# Instantiate the from-scratch booster with n_estimators=50 and train it on the
# 2-D PCA features with the binary labels.
# NOTE(review): AdaBoost.fit currently raises NotImplementedError, so this cell and
# the later cells that use `ab` fail until the class is implemented.
ab = AdaBoost(n_estimators=50)
ab.fit(X_train_proj, y6_train)

In [None]:
# Run the from-scratch booster on both splits (training first, then test).
y_train_pred, y_test_pred = (
    ab.predict(split) for split in (X_train_proj, X_test_proj)
)

# Fraction of samples whose binary prediction matches the binary label.
train_acc = accuracy_score(y6_train, y_train_pred)
test_acc = accuracy_score(y6_test, y_test_pred)

print(f"Training accuracy: {train_acc:.3f}")
print(f"Test accuracy:     {test_acc:.3f}")

In [None]:
# Boolean arrays for correct / incorrect
correct = (y_test_pred == y6_test)
incorrect = ~correct

plt.figure(figsize=(8, 6))

colors = sns.color_palette('Set1', 2)  # same colors as before
# Legend names follow TARGET_VALUE so that they stay correct if the target changes.
class_names = [f'Not {TARGET_VALUE}', f'{TARGET_VALUE}']

# Plot correct predictions (circles)
for label, color, name in zip([False, True], colors, class_names):
    mask = (y6_test == label) & correct
    plt.scatter(X_test_proj[mask, 0], X_test_proj[mask, 1],
                color=color, marker='o', edgecolor='k', alpha=0.6, label=f'{name} (correct)')

# Plot incorrect predictions (crosses)
for label, color, name in zip([False, True], colors, class_names):
    mask = (y6_test == label) & incorrect
    plt.scatter(X_test_proj[mask, 0], X_test_proj[mask, 1],
                color=color, marker='x', s=100, linewidths=2, label=f'{name} (wrong)')

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
# Build the title from TARGET_VALUE instead of hard-coding the digit.
plt.title(f'AdaBoost Classification Results ({TARGET_VALUE} vs Not {TARGET_VALUE})')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle=':')
plt.tight_layout()
plt.show()