# Booz Allen Spring 2025 Codefest: Challenge 1 
### Isaiah Byrd, Kyler Gelissen, David Ameh

In [None]:
import kagglehub

# Fetch the most recent published version of the garbage-classification
# dataset; `path` holds whatever location kagglehub reports for the download.
dataset_handle = "sumn2u/garbage-classification-v2"
path = kagglehub.dataset_download(dataset_handle)

In [None]:
import os

# Make sure a local working directory for the dataset exists, and report
# whether it had to be created on this run.
dataset_dir = "garbage_classification_dataset"
if os.path.exists(dataset_dir):
    print(f"Directory {dataset_dir} already exists.")
else:
    os.makedirs(dataset_dir)
    print(f"Directory {dataset_dir} created.")

In [None]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import torch as th
import cv2
import json
import random
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader, Dataset

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
# NOTE: the variable keeps its original spelling (`divice`) because other
# cells may refer to it by that name.
divice = th.device("cuda" if th.cuda.is_available() else "cpu")

# Fraction of the GPU's total memory this process is allowed to allocate.
# Lower it to leave room for other jobs sharing the same card.
memory_fraction = 0.6

# Cap GPU memory usage for this process (only meaningful on CUDA).
if divice.type == "cuda":
    print("GPU is available")
    # The PyTorch API for this is `set_per_process_memory_fraction`, and its
    # keyword is `device`. `divice.index` is None for a bare "cuda" device,
    # in which case PyTorch applies the limit to the current CUDA device.
    th.cuda.set_per_process_memory_fraction(memory_fraction, device=divice.index)

In [None]:
# Per-channel mean / standard deviation used to normalise inputs
# (the standard normalisation values for ResNet).
normalize_mean = [0.485, 0.456, 0.406]
normalize_std = [0.229, 0.224, 0.225]

# Preprocessing pipeline applied to every image as it is loaded:
#   1. resize to the 224x224 input size ResNet requires,
#   2. convert to a tensor,
#   3. normalise each channel with the statistics above.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(normalize_mean, normalize_std),
]
data_transforms = transforms.Compose(preprocessing_steps)

# Load the whole dataset from the download location and record how many
# samples it contains; the split sizes are derived from this count.
complete_dataset = datasets.ImageFolder(root=path, transform=data_transforms)
dataset_size = len(complete_dataset)

In [None]:
# Start with an 80-10-10 train / validation / test split of the dataset.
# The test share is computed as the remainder so the three sizes always add
# up to dataset_size even when the 80% / 10% values are truncated by int().
train_size = int(0.8 * dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - train_size - val_size
print(f"Dataset size: {dataset_size}")
print(f"Train size: {train_size}")
print(f"Validation size: {val_size}")
print(f"Test size: {test_size}")

# Use random_split with a dedicated, seeded generator so the same images land
# in the same subset on every run. Without a fixed seed each kernel restart
# produces a different partition, which makes results non-reproducible and
# lets held-out images drift into the training set between runs.
split_seed = 42
split_generator = th.Generator().manual_seed(split_seed)
train_dataset, val_dataset, test_dataset = th.utils.data.random_split(
    complete_dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

In [None]:
# Create the dataloaders.
batch_size = 16  # Increase this if memory permits

# Only the training data is shuffled: reshuffling every epoch matters for
# training, while a fixed order keeps validation/test passes repeatable.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Manually defined display names for the classes in the dataset.
# torchvision's ImageFolder assigns integer labels in sorted order of the
# class folder names, so the list is kept alphabetical to make
# class_names[label] line up with those labels.
# NOTE(review): assumes the dataset's folder names sort in the same order as
# these display names -- verify against complete_dataset.classes.
class_names = [
    'Battery',
    'Biological',
    'Cardboard',
    'Clothes',
    'Glass',
    'Metal',
    'Paper',
    'Plastic',
    'Shoes',
    'Trash',
]
num_classes = len(class_names)