In [2]:
import boto3
import sagemaker

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
# S3 bucket that holds the dataset (top-level prefixes: Annotation/ and Images/).
bucket_name = "sagemaker-team11-stanford-dogs"

# Build ONE boto3 session and share it everywhere, so the region reported
# below is guaranteed to be the one the SageMaker session and the S3 client
# actually use (previously a second, independent boto3.Session() was created
# just for the S3 client).
session = boto3.session.Session()
region = session.region_name

sagemaker_session = sagemaker.Session(
    boto_session=session,
    default_bucket=bucket_name,
)
# Ask the SageMaker session for its default bucket (configured above).
bucket = sagemaker_session.default_bucket()

# Low-level S3 client used by the listing cells below.
s3 = session.client(service_name="s3", region_name=region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [4]:
# List only the first level of the bucket: with a '/' delimiter S3 groups
# deeper keys under 'CommonPrefixes' instead of returning every object.
top_level_query = {'Bucket': bucket_name, 'Delimiter': '/'}
response = s3.list_objects_v2(**top_level_query)
response

{'ResponseMetadata': {'RequestId': 'DQC9WNJR1HTQY9PQ',
  'HostId': 'iQ2vEB23L6raZFB3mnNnmYFgab7kIdITJEd0n2Oih7NRhRrgUZsG0ZiixDWCrRIfgWKBAiKfs7M=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'iQ2vEB23L6raZFB3mnNnmYFgab7kIdITJEd0n2Oih7NRhRrgUZsG0ZiixDWCrRIfgWKBAiKfs7M=',
   'x-amz-request-id': 'DQC9WNJR1HTQY9PQ',
   'date': 'Sun, 10 Mar 2024 05:49:26 GMT',
   'x-amz-bucket-region': 'us-east-1',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'IsTruncated': False,
 'Name': 'sagemaker-team11-stanford-dogs',
 'Prefix': '',
 'Delimiter': '/',
 'MaxKeys': 1000,
 'CommonPrefixes': [{'Prefix': 'Annotation/'}, {'Prefix': 'Images/'}],
 'EncodingType': 'url',
 'KeyCount': 2}

In [5]:
# Collect the top-level "folder" names from the listing, then print one per line.
# A response without 'CommonPrefixes' yields an empty list and prints nothing.
top_level_prefixes = [entry.get('Prefix') for entry in response.get('CommonPrefixes', [])]
for prefix in top_level_prefixes:
    print(prefix)

Annotation/
Images/


In [6]:
# Page through the bucket to collect all image keys and annotation keys.
# list_objects_v2 returns a limited number of keys per call (MaxKeys), so a
# paginator is required to see the full dataset.
paginator = s3.get_paginator('list_objects_v2')


def list_object_keys(prefix):
    """Return every object key in ``bucket_name`` that starts with ``prefix``.

    Args:
        prefix: Key prefix to list. Pass it WITH a trailing '/' (e.g.
            'Images/') so sibling prefixes that merely share the same
            leading characters (e.g. 'Images_old/') are not included.

    Returns:
        A list of key strings in listing order. Keys ending in '/' are
        skipped: those are zero-byte "folder" placeholders, not data files,
        and would break the 'prefix/breed_dir/file' parsing done later.
    """
    keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without matching objects have no 'Contents' entry.
        for obj in page.get('Contents', []):
            key = obj['Key']
            if not key.endswith('/'):
                keys.append(key)
    return keys


Image_objects = list_object_keys('Images/')
Annotation_objects = list_object_keys('Annotation/')

In [7]:
# Preview only the first few keys; the full list holds one entry per image
# object and would flood the notebook output.
Image_objects[:20]

['Images/n02085620-Chihuahua/n02085620_10074.jpg',
 'Images/n02085620-Chihuahua/n02085620_10131.jpg',
 'Images/n02085620-Chihuahua/n02085620_10621.jpg',
 'Images/n02085620-Chihuahua/n02085620_1073.jpg',
 'Images/n02085620-Chihuahua/n02085620_10976.jpg',
 'Images/n02085620-Chihuahua/n02085620_11140.jpg',
 'Images/n02085620-Chihuahua/n02085620_11238.jpg',
 'Images/n02085620-Chihuahua/n02085620_11258.jpg',
 'Images/n02085620-Chihuahua/n02085620_11337.jpg',
 'Images/n02085620-Chihuahua/n02085620_11477.jpg',
 'Images/n02085620-Chihuahua/n02085620_1152.jpg',
 'Images/n02085620-Chihuahua/n02085620_11696.jpg',
 'Images/n02085620-Chihuahua/n02085620_11818.jpg',
 'Images/n02085620-Chihuahua/n02085620_11948.jpg',
 'Images/n02085620-Chihuahua/n02085620_1205.jpg',
 'Images/n02085620-Chihuahua/n02085620_12101.jpg',
 'Images/n02085620-Chihuahua/n02085620_12334.jpg',
 'Images/n02085620-Chihuahua/n02085620_1235.jpg',
 'Images/n02085620-Chihuahua/n02085620_1271.jpg',
 'Images/n02085620-Chihuahua/n020856

In [8]:
# Preview only the first few keys; the full list holds one entry per
# annotation object and would flood the notebook output.
Annotation_objects[:20]

['Annotation/n02085620-Chihuahua/n02085620_10074',
 'Annotation/n02085620-Chihuahua/n02085620_10131',
 'Annotation/n02085620-Chihuahua/n02085620_10621',
 'Annotation/n02085620-Chihuahua/n02085620_1073',
 'Annotation/n02085620-Chihuahua/n02085620_10976',
 'Annotation/n02085620-Chihuahua/n02085620_11140',
 'Annotation/n02085620-Chihuahua/n02085620_11238',
 'Annotation/n02085620-Chihuahua/n02085620_11258',
 'Annotation/n02085620-Chihuahua/n02085620_11337',
 'Annotation/n02085620-Chihuahua/n02085620_11477',
 'Annotation/n02085620-Chihuahua/n02085620_1152',
 'Annotation/n02085620-Chihuahua/n02085620_11696',
 'Annotation/n02085620-Chihuahua/n02085620_11818',
 'Annotation/n02085620-Chihuahua/n02085620_11948',
 'Annotation/n02085620-Chihuahua/n02085620_1205',
 'Annotation/n02085620-Chihuahua/n02085620_12101',
 'Annotation/n02085620-Chihuahua/n02085620_12334',
 'Annotation/n02085620-Chihuahua/n02085620_1235',
 'Annotation/n02085620-Chihuahua/n02085620_1271',
 'Annotation/n02085620-Chihuahua/n02

In [9]:
import pandas as pd

# Annotation keys look like 'Annotation/<class>-<breed>/<class>_<number>'.
# Derive one column per component of that key.

# Full S3 key of every annotation file.
annotation_file_paths = list(Annotation_objects)

# File name without any extension, e.g. 'n02085620_10074'.
annotation_ids = [path.split('/')[2].split('.')[0] for path in annotation_file_paths]

# Class identifier: the part of the id before the underscore, e.g. 'n02085620'.
annotation_classes = [annotation_id.split('_')[0] for annotation_id in annotation_ids]

# Breed name: everything after the FIRST hyphen of the directory name.
# maxsplit=1 matters: a plain split('-')[1] cuts off breed names that contain
# a hyphen themselves (a directory such as 'n02086240-Shih-Tzu' would yield
# 'Shih' instead of 'Shih-Tzu').
annotation_breeds = [path.split('/')[1].split('-', 1)[1] for path in annotation_file_paths]

ann_df = pd.DataFrame({'annotation_file_path': annotation_file_paths,
                       'annotation_id': annotation_ids,
                       'annotation_class': annotation_classes,
                       'annotation_breed': annotation_breeds})

ann_df




Unnamed: 0,annotation_file_path,annotation_id,annotation_class,annotation_breed
0,Annotation/n02085620-Chihuahua/n02085620_10074,n02085620_10074,n02085620,Chihuahua
1,Annotation/n02085620-Chihuahua/n02085620_10131,n02085620_10131,n02085620,Chihuahua
2,Annotation/n02085620-Chihuahua/n02085620_10621,n02085620_10621,n02085620,Chihuahua
3,Annotation/n02085620-Chihuahua/n02085620_1073,n02085620_1073,n02085620,Chihuahua
4,Annotation/n02085620-Chihuahua/n02085620_10976,n02085620_10976,n02085620,Chihuahua
...,...,...,...,...
20575,Annotation/n02116738-African_hunting_dog/n0211...,n02116738_9798,n02116738,African_hunting_dog
20576,Annotation/n02116738-African_hunting_dog/n0211...,n02116738_9818,n02116738,African_hunting_dog
20577,Annotation/n02116738-African_hunting_dog/n0211...,n02116738_9829,n02116738,African_hunting_dog
20578,Annotation/n02116738-African_hunting_dog/n0211...,n02116738_9844,n02116738,African_hunting_dog


In [10]:
# Image keys look like 'Images/<class>-<breed>/<class>_<number>.jpg'.

# Full S3 key of every image.
image_file_paths = [key for key in Image_objects]

# File name with the extension stripped, e.g. 'n02085620_10074'.
image_ids = [key.split('/')[2].split('.')[0] for key in image_file_paths]

# Class identifier: the part of the id before the underscore.
image_classes = [image_id.split('_')[0] for image_id in image_ids]

image_df = pd.DataFrame({
    'image_file_path': image_file_paths,
    'image_id': image_ids,
    'image_class': image_classes,
})

image_df

Unnamed: 0,image_file_path,image_id,image_class
0,Images/n02085620-Chihuahua/n02085620_10074.jpg,n02085620_10074,n02085620
1,Images/n02085620-Chihuahua/n02085620_10131.jpg,n02085620_10131,n02085620
2,Images/n02085620-Chihuahua/n02085620_10621.jpg,n02085620_10621,n02085620
3,Images/n02085620-Chihuahua/n02085620_1073.jpg,n02085620_1073,n02085620
4,Images/n02085620-Chihuahua/n02085620_10976.jpg,n02085620_10976,n02085620
...,...,...,...
20575,Images/n02116738-African_hunting_dog/n02116738...,n02116738_9798,n02116738
20576,Images/n02116738-African_hunting_dog/n02116738...,n02116738_9818,n02116738
20577,Images/n02116738-African_hunting_dog/n02116738...,n02116738_9829,n02116738
20578,Images/n02116738-African_hunting_dog/n02116738...,n02116738_9844,n02116738


In [11]:
# Align the images and annotations: keep only ids present on both sides.
merged = pd.merge(image_df, ann_df, how='inner', left_on='image_id', right_on='annotation_id')

# Map each breed name to an integer label with a dictionary.
# Labels are assigned in order of first appearance in `merged`. The label
# range is derived from the number of breeds actually found rather than a
# hard-coded 120: zip() silently stops at the shorter input, so a mismatch
# would leave some breeds unmapped and produce NaN labels.
breeds = list(merged['annotation_breed'].unique())
labels = list(range(len(breeds)))
reference_builder_dict = dict(zip(breeds, labels))

merged['label'] = merged['annotation_breed'].map(reference_builder_dict)
display(merged)

# Persist the aligned table; later cells read it back from this file.
merged.to_csv('labels.csv', index=False)

Unnamed: 0,image_file_path,image_id,image_class,annotation_file_path,annotation_id,annotation_class,annotation_breed,label
0,Images/n02085620-Chihuahua/n02085620_10074.jpg,n02085620_10074,n02085620,Annotation/n02085620-Chihuahua/n02085620_10074,n02085620_10074,n02085620,Chihuahua,0
1,Images/n02085620-Chihuahua/n02085620_10131.jpg,n02085620_10131,n02085620,Annotation/n02085620-Chihuahua/n02085620_10131,n02085620_10131,n02085620,Chihuahua,0
2,Images/n02085620-Chihuahua/n02085620_10621.jpg,n02085620_10621,n02085620,Annotation/n02085620-Chihuahua/n02085620_10621,n02085620_10621,n02085620,Chihuahua,0
3,Images/n02085620-Chihuahua/n02085620_1073.jpg,n02085620_1073,n02085620,Annotation/n02085620-Chihuahua/n02085620_1073,n02085620_1073,n02085620,Chihuahua,0
4,Images/n02085620-Chihuahua/n02085620_10976.jpg,n02085620_10976,n02085620,Annotation/n02085620-Chihuahua/n02085620_10976,n02085620_10976,n02085620,Chihuahua,0
...,...,...,...,...,...,...,...,...
20575,Images/n02116738-African_hunting_dog/n02116738...,n02116738_9798,n02116738,Annotation/n02116738-African_hunting_dog/n0211...,n02116738_9798,n02116738,African_hunting_dog,119
20576,Images/n02116738-African_hunting_dog/n02116738...,n02116738_9818,n02116738,Annotation/n02116738-African_hunting_dog/n0211...,n02116738_9818,n02116738,African_hunting_dog,119
20577,Images/n02116738-African_hunting_dog/n02116738...,n02116738_9829,n02116738,Annotation/n02116738-African_hunting_dog/n0211...,n02116738_9829,n02116738,African_hunting_dog,119
20578,Images/n02116738-African_hunting_dog/n02116738...,n02116738_9844,n02116738,Annotation/n02116738-African_hunting_dog/n0211...,n02116738_9844,n02116738,African_hunting_dog,119


In [13]:
import pandas as pd

# Read the aligned image/annotation/label table back from disk.
df = pd.read_csv('labels.csv')

# Training set: a fixed-seed random sample of 12,000 rows.
df_train = df.sample(n=12000, random_state=42)

# Test set: every row whose index was not drawn for training.
in_train = df.index.isin(df_train.index)
df_test = df.loc[~in_train]

df_train.to_csv('train_labels.csv', index=False)
df_test.to_csv('test_labels.csv', index=False)

In [12]:
import json

# Inverse lookup (integer label -> breed name), built in the same
# first-appearance order as the labels in `merged`. The label range follows
# the number of breeds found instead of a hard-coded 120, because zip()
# would silently drop entries on a mismatch.
breeds = list(merged['annotation_breed'].unique())
labels = list(range(len(breeds)))
dictionary = dict(zip(labels, breeds))

# NOTE: JSON object keys are always strings, so the integer labels are
# written as '0', '1', ... and must be converted back with int() on load.
with open('label_breed_map.json', 'w') as f:
    json.dump(dictionary, f)

In [10]:
# Build the dataset from the label table.
# NOTE(review): S3DogDataset is not defined in the cells shown here; it must
# already be defined or imported before this cell is run.
dataset = S3DogDataset(file_ref_csv='labels.csv')

# Fetch one sample and unpack it into its image and label.
sample = dataset[42]
image, label = sample

image

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


tensor([[[ 1.1015e+00,  1.1869e+00,  1.3236e+00,  ..., -1.6483e+00,
          -1.6825e+00, -1.6996e+00],
         [ 1.0845e+00,  1.1528e+00,  1.2723e+00,  ..., -1.7167e+00,
          -1.7167e+00, -1.7337e+00],
         [ 1.0674e+00,  1.1015e+00,  1.2211e+00,  ..., -1.7679e+00,
          -1.8021e+00, -1.8021e+00],
         ...,
         [ 5.2081e-01,  5.8913e-01,  6.2329e-01,  ...,  4.8665e-01,
           5.8913e-01,  6.9161e-01],
         [ 5.7205e-01,  6.0621e-01,  6.2329e-01,  ...,  5.7205e-01,
           6.4037e-01,  7.0869e-01],
         [ 5.3789e-01,  5.3789e-01,  5.3789e-01,  ...,  6.5745e-01,
           6.9161e-01,  7.0869e-01]],

        [[ 4.8539e-01,  5.7270e-01,  7.1238e-01,  ..., -1.8194e+00,
          -1.8368e+00, -1.8368e+00],
         [ 4.6793e-01,  5.3778e-01,  6.6000e-01,  ..., -1.8019e+00,
          -1.8019e+00, -1.8194e+00],
         [ 4.5047e-01,  4.6793e-01,  5.9016e-01,  ..., -1.8019e+00,
          -1.8019e+00, -1.8194e+00],
         ...,
         [-9.0793e-02, -3