### Data Preprocessing

In [11]:
import os
import zipfile

import boto3

# Dataset location: the zipped images live in S3 and are unpacked under /tmp.
s3_bucket = "id-classifier-images"
s3_key = "images.zip"
local_zip_path = "/tmp/images.zip"
dataset_dir = "/tmp/dataset"

# Create the S3 client used for the download below.
s3 = boto3.client("s3")
print(s3)

# Download and extract the dataset only when it is not already on disk.
# The guard keeps the cell cheap to re-run while still letting the notebook
# work from a fresh kernel / fresh instance ("Restart Kernel -> Run All"),
# which the previously commented-out download did not allow.
if not os.path.isdir(dataset_dir):
    # Reuse a previously downloaded archive instead of fetching it again.
    if not os.path.isfile(local_zip_path):
        s3.download_file(s3_bucket, s3_key, local_zip_path)

    with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
        zip_ref.extractall(dataset_dir)

<botocore.client.S3 object at 0x7f52cc1269b0>


### Checking extracted dataset

<!-- import os

print("Checking /tmp/ contents:")
print(os.listdir("/tmp/"))  # List files/folders in /tmp

if os.path.exists(dataset_dir):
    print(f"Dataset directory exists: {dataset_dir}")
    print("Extracted files:")
    print(os.listdir(dataset_dir))  # List extracted files
else:
    print("Dataset directory NOT found. Extraction may have failed.") -->

### Define Image Transformations

In [17]:
# %pip install -q torchvision  # uncomment only if torchvision is missing from the kernel environment

In [21]:
import torch
from torchvision import transforms

# Preprocessing pipeline: resize every image to 224x224, then convert it to a tensor.
resize_step = transforms.Resize((224, 224))
to_tensor_step = transforms.ToTensor()
transform = transforms.Compose([resize_step, to_tensor_step])


### Custom Dataset Class

In [22]:
import os
from PIL import Image
from torch.utils.data import DataLoader, Dataset
import shutil


class IdentityDataset(Dataset):
    """Image-folder dataset: one sub-directory of ``root_dir`` per class.

    The class names are the sorted sub-directory names, and the label of an
    image is the index of its directory in that sorted list.

    Args:
        root_dir: Directory containing one sub-directory per class.
        transform: Optional callable applied to each RGB ``PIL.Image`` before
            it is returned.
        extensions: File-name suffixes (case-insensitive) that count as
            images. Files with any other suffix are skipped.

    Attributes are set in ``__init__``:
        classes      -- sorted list of class (directory) names
        image_paths  -- path of every indexed image
        labels       -- class index of every indexed image, parallel to image_paths
    """

    # Default set of accepted image suffixes.
    IMG_EXTENSIONS = (
        ".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp",
    )

    def __init__(self, root_dir, transform=None, extensions=IMG_EXTENSIONS):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []

        suffixes = tuple(ext.lower() for ext in extensions)

        # Only real, non-hidden directories are classes. Stray files next to
        # the class folders and hidden folders such as ".ipynb_checkpoints"
        # would otherwise shift the label indices or crash os.listdir below.
        self.classes = sorted(
            name
            for name in os.listdir(root_dir)
            if not name.startswith(".")
            and os.path.isdir(os.path.join(root_dir, name))
        )

        for class_idx, class_name in enumerate(self.classes):
            class_dir = os.path.join(root_dir, class_name)
            # Sorted so the sample order does not depend on the filesystem.
            for img_name in sorted(os.listdir(class_dir)):
                img_path = os.path.join(class_dir, img_name)
                if img_name.startswith(".") or not os.path.isfile(img_path):
                    continue
                # Skip non-image files here instead of failing later in Image.open.
                if not img_name.lower().endswith(suffixes):
                    continue
                self.image_paths.append(img_path)
                self.labels.append(class_idx)

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``.

        The image is loaded from disk, converted to RGB, and passed through
        ``transform`` when one was given.
        """
        # The context manager closes the underlying file right away; convert()
        # returns an independent in-memory image, so it stays usable afterwards.
        with Image.open(self.image_paths[idx]) as img:
            image = img.convert("RGB")
        label = self.labels[idx]
        if self.transform is not None:
            image = self.transform(image)
        return image, label

# Index the extracted image folders and attach the preprocessing pipeline.
dataset = IdentityDataset(root_dir=dataset_dir, transform=transform)

### Model Training
- Use a simple CNN (Convolutional Neural Network).
- Train using SageMaker Training Jobs.

In [23]:
import torch.nn as nn
import torch.optim as optim

class IdentityModel(nn.Module):
    """Minimal CNN classifier: one conv/ReLU/max-pool stage and a linear head.

    Args:
        num_classes: Number of output logits.
        image_size: Spatial size of the input images, either an int (square
            images) or a ``(height, width)`` pair. The default of 224x224
            yields the same ``32 * 112 * 112`` linear-layer input as before.
    """

    def __init__(self, num_classes=10, image_size=(224, 224)):
        super().__init__()
        if isinstance(image_size, int):
            image_size = (image_size, image_size)
        height, width = image_size

        self.conv = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # The padded 3x3 conv keeps the spatial size; the 2x2/stride-2 pool
        # halves each dimension (rounding down), leaving 32 feature maps.
        self.fc = nn.Linear(32 * (height // 2) * (width // 2), num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        x = self.conv(x)
        # flatten (unlike view) also handles non-contiguous activations,
        # e.g. when the input batch uses the channels_last memory format.
        x = torch.flatten(x, start_dim=1)
        return self.fc(x)

# Model training setup
SEED = 42
torch.manual_seed(SEED)  # make the weight initialisation reproducible across runs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the output layer from the dataset instead of hard-coding 10: labels are
# indices into dataset.classes, so a mismatch makes CrossEntropyLoss fail on
# out-of-range targets (or leaves unused logits).
num_classes = len(dataset.classes)
assert num_classes > 0, "dataset contains no class directories"

model = IdentityModel(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
