# Bonus

For the bonus, you will modify an MNIST-digit classifier to include an "I don't know" (IDK) option. This is useful in real-world applications where a model should defer to a human when it is not confident in its prediction.

You will modify the MNIST generation and classification notebook for this task. Create and publish a FiftyOne dataset with your experiments' results.


## Load the model

In [None]:
# MNIST model architecture
class MNISTClassifier(nn.Module):
    """Small convolutional network that maps digit images to 10 class logits.

    Two conv -> ReLU -> max-pool stages are followed by two fully connected
    layers. The first linear layer is sized for 64 x 7 x 7 features, i.e. it
    expects 28 x 28 single-channel inputs after the two 2x poolings.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 1 -> 32 channels; padding keeps the spatial size, pooling halves it.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Stage 2: 32 -> 64 channels, same layout as stage 1.
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Classification head: flatten, one hidden layer, 10 output logits.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=64 * 7 * 7, out_features=128)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return the raw (un-normalized) class logits for a batch of images."""
        features = self.conv1(x)
        features = self.relu1(features)
        features = self.pool1(features)

        features = self.conv2(features)
        features = self.relu2(features)
        features = self.pool2(features)

        hidden = self.flatten(features)
        hidden = self.fc1(hidden)
        hidden = self.relu3(hidden)
        return self.fc2(hidden)

In [None]:
## Load the model
# Build the classifier on the target device before restoring its weights
mnist_classifier = MNISTClassifier().to(device)

## TODO
model_path = STORAGE_PATH / "checkpoints/best_mnist_classifier.pth"

# Read the checkpoint first; only a missing file is handled here, any other
# error (e.g. mismatching keys when restoring the weights) still propagates.
try:
    trained_weights = torch.load(model_path, map_location=device)
except FileNotFoundError:
    print("Please run the training loop in the previous notebook to generate the model file.")
else:
    mnist_classifier.load_state_dict(trained_weights)
    print("Model loaded.")

# Switch to inference mode
mnist_classifier.eval()

## Create test dataset

In [None]:
BATCH_SIZE_CLASSIFIER = 64

# Preprocessing applied to every test image: convert to a tensor, then
# standardize with the mean / std constants given below.
transform_classifier = transforms.Compose([
    transforms.ToTensor(),  # TODO: change toImage() etc.
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# MNIST test split (downloaded into the current directory if missing)
test_dataset_classifier = datasets.MNIST(
    root='.',
    train=False,
    download=True,
    transform=transform_classifier,
)

# Fixed-order batches over the test split
test_loader_classifier = DataLoader(
    dataset=test_dataset_classifier,
    batch_size=BATCH_SIZE_CLASSIFIER,
    shuffle=False,
)


## Create the prediction function

In [None]:
# Implement the predict_with_idk logic
def predict_with_idk(image, model, threshold=0.5):
    """Classify a single image, answering "IDK" when the model is not confident.

    Args:
        image: A single image tensor, e.g. (1, 28, 28). A 2-D (H, W) tensor
            or an already batched (1, C, H, W) tensor is accepted as well.
        model: The trained PyTorch model; it must return one row of class
            logits for the image.
        threshold: Float between 0 and 1. Minimum softmax probability the top
            class needs for its label to be returned.

    Returns:
        Tuple ``(label, confidence)``. ``label`` is the predicted class index
        as a string (e.g. "0"-"9"), or "IDK" when ``confidence`` is below
        ``threshold``. ``confidence`` is the highest softmax probability as a
        Python float.
    """
    # Add the missing channel and/or batch dimension -> [1, C, H, W]
    if image.dim() == 2:
        image = image.unsqueeze(0)
    if image.dim() == 3:
        image = image.unsqueeze(0)

    # Run the image on the device that holds the model's weights instead of
    # relying on a notebook-level global; a model without parameters (or a
    # plain callable) receives the image on its current device.
    first_param = None
    if isinstance(model, torch.nn.Module):
        first_param = next(iter(model.parameters()), None)
    if first_param is not None:
        image = image.to(first_param.device)

    with torch.no_grad():
        logits = model(image)

        # Softmax over the class dimension turns logits into probabilities
        probs = torch.softmax(logits, dim=1)

        # Highest probability and the class index it belongs to
        confidence, predicted_idx = torch.max(probs, dim=1)

        # .item() requires exactly one image in the batch
        confidence = confidence.item()
        predicted_idx = predicted_idx.item()

    # Only commit to a class when the model is confident enough
    if confidence >= threshold:
        return str(predicted_idx), confidence
    return "IDK", confidence

## Create FiftyOne Dataset


TODO: reformulate for assignment
Next step for the bonus task
Now that you have a model with 99.24% accuracy, you can proceed with the code in the cells below.
Important tip: because your model is so accurate (99.24%), it is also very confident. When testing your "IDK" function:
On real MNIST: you might need a very high threshold (e.g., 0.99) to see any "IDK" results.
On generated (diffusion) images: this is where the IDK function is most useful. The generated digits are often imperfect, so the classifier will be less confident and your function will return "IDK" more often.

In [None]:
# Directory for temporary image files; created here if it does not exist yet
TEMP_IMG_DIR = "mnist_temp"
os.makedirs(TEMP_IMG_DIR, exist_ok=True)

In [None]:
# Create a new FiftyOne dataset with one sample per test image, holding the
# ground-truth label and the (possibly "IDK") prediction.

# Delete existing dataset if it exists, so re-running this cell starts clean
if FIFTYONE_BONUS_DATASET_NAME in fo.list_datasets():
    print(f"Deleting existing dataset: {FIFTYONE_BONUS_DATASET_NAME}")
    fo.delete_dataset(FIFTYONE_BONUS_DATASET_NAME)

dataset = fo.Dataset(name=FIFTYONE_BONUS_DATASET_NAME)

samples = []

print("Running Inference...")
for i in tqdm(range(len(test_dataset_classifier))):
    img_tensor, label = test_dataset_classifier[i]

    # Run our IDK function
    pred_label, conf = predict_with_idk(img_tensor, mnist_classifier, CONFIDENCE_THRESHOLD)

    # Save image to disk for FiftyOne as it usually needs file paths
    filepath = os.path.join(TEMP_IMG_DIR, f"img_{i}.png")
    # Un-normalize for saving: the dataset transform applies
    # Normalize((0.1307,), (0.3081,)), so invert it (x * std + mean) to get
    # pixel values back into [0, 1]. Saving the normalized tensor directly
    # would distort the image (assuming save_image is
    # torchvision.utils.save_image, which clips values outside [0, 1]).
    display_tensor = (img_tensor * 0.3081 + 0.1307).clamp(0.0, 1.0)
    save_image(display_tensor, filepath)

    # Create Sample
    sample = fo.Sample(filepath=filepath)
    sample["ground_truth"] = fo.Classification(label=str(label))
    sample["prediction_with_idk"] = fo.Classification(label=pred_label, confidence=conf)

    # Add a simple tag to easily find IDK cases
    if pred_label == "IDK":
        sample.tags.append("IDK")

    samples.append(sample)

    # Optimization: Add in batches to save memory
    if len(samples) >= 1000:
        dataset.add_samples(samples)
        samples = []

# Add remaining samples
if samples:
    dataset.add_samples(samples)

In [None]:
# Launch the FiftyOne App on the dataset; the returned session is kept for later use
session = fo.launch_app(dataset)
print(f"Dataset created with {len(dataset)} samples.")

In [None]:
# Create a View of only IDK samples (the samples tagged "IDK")
idk_view = dataset.match_tags("IDK")
print(f"Total IDK cases found: {len(idk_view)}")

# Show only the IDK view in the existing App session (no new App is launched)
session.view = idk_view

In [None]:
# Compute coverage and accuracy from the "IDK" tags stored on the dataset


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0


total_samples = len(dataset)
idk_count = len(idk_view)  # samples for which the model answered "IDK"

# Covered samples are those without the "IDK" tag:
# coverage = (total - IDK) / total
covered_view = dataset.match_tags("IDK", bool=False)
covered_count = len(covered_view)
coverage = _ratio(covered_count, total_samples)

# Among the covered samples, keep those whose prediction equals the ground truth
labels_agree = F("prediction_with_idk.label") == F("ground_truth.label")
correct_covered_view = covered_view.match(labels_agree)
correct_covered_count = len(correct_covered_view)

# Accuracy restricted to covered samples: correct / covered
accuracy_on_covered = _ratio(correct_covered_count, covered_count)

# Accuracy over all samples, counting every IDK answer as incorrect;
# shown next to the covered accuracy to make the trade-off visible
standard_accuracy = _ratio(correct_covered_count, total_samples)

# --- Print Report ---
print(f"Total Test Images:    {total_samples}")
print(f"IDK Responses:        {idk_count}")
print(f"Covered Responses:    {covered_count}")
print("-" * 30)
print(f"COVERAGE:             {coverage:.2%}  (Goal: Keep this high)")
print(f"ACCURACY (Covered):   {accuracy_on_covered:.2%}  (Goal: Higher than standard accuracy)")
print(f"ACCURACY (Standard):  {standard_accuracy:.2%}  (Baseline)")
print("=" * 50)

## Publish Dataset on Hugging Face

In [None]:
# Save FiftyOne dataset (images + metadata) to disk
# The export directory is created up front if it does not exist yet.
export_dir = "mnist_idk_export"
os.makedirs(export_dir, exist_ok=True)

print(f"Exporting dataset to {export_dir}...")

# Export in FiftyOne's own dataset format
dataset.export(
    export_dir=export_dir,
    dataset_type=fo.types.FiftyOneDataset,
    export_media=True, # This ensures the actual .png images are included
)

print("Export complete.")

In [None]:
# Publish the exported dataset folder to Hugging Face

# The access token is read from the environment
HF_TOKEN = os.environ.get("HF_TOKEN")

if not HF_TOKEN:
    # Without a token nothing is uploaded
    print("Error: HF_TOKEN not found. Cannot publish to Hugging Face.")
else:
    print("Uploading to Hugging Face...")
    api = HfApi(token=HF_TOKEN)
    repo_id = "mmarschn/mnist-idk-experiment"

    # Make sure the dataset repo exists; a failure is only reported here and
    # the upload below is still attempted.
    try:
        api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
    except Exception as e:
        print(f"Repo creation warning: {e}")

    # Upload the whole export folder, skipping notebook checkpoint files
    api.upload_large_folder(
        repo_id=repo_id,
        repo_type="dataset",
        folder_path=export_dir,
        ignore_patterns=["*.ipynb_checkpoints"],
    )
    print(f"Successfully published to: https://huggingface.co/datasets/{repo_id}")