## Dependencies

**Note:** This notebook itself runs on an inexpensive ml.t3.medium instance. The heavy training work runs separately on an ml.g4dn.xlarge (GPU) instance as a SageMaker training job.

In [None]:
!pip install -q kagglehub

In [None]:
import kagglehub
import pickle
import os
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sklearn.model_selection import train_test_split
from PIL import Image

## General Configurations

Prepare the data locally (this runs fast on ml.t3.medium), then upload it to S3 for the training job.

In [None]:
# SageMaker setup: key prefix, session, execution role and default bucket
# shared by the cells that follow.
prefix = 'fashion-bacnn'

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# Echo the resolved role and bucket, one per line.
print(
    f'SageMaker role: {role}',
    f'S3 bucket: {bucket}',
    sep='\n',
)

In [None]:
# Download dataset through kagglehub; `path` is the location the later cells
# build the styles.csv and image paths from.
path = kagglehub.dataset_download(
    "paramaggarwal/fashion-product-images-dataset"
)
print("Path to dataset files:", path)

In [None]:
# Load and filter dataset
styles_csv_path = f'{path}/fashion-dataset/styles.csv'
# on_bad_lines='skip' drops rows pandas cannot parse instead of aborting the load.
styles_df = pd.read_csv(styles_csv_path, on_bad_lines='skip')

# Filter by masterCategory: keep only the three most frequent master categories.
top_3_master = styles_df['masterCategory'].value_counts().nlargest(3).index.tolist()
filtered_df = styles_df[styles_df['masterCategory'].isin(top_3_master)]

# Filter by subCategory: for each master category, take its five most frequent
# sub-categories, drop "Watches", and keep the first two that remain.
# The slices are collected in a list and concatenated once instead of growing a
# DataFrame with pd.concat on every iteration (quadratic copying, and starting
# from an empty DataFrame relies on deprecated concat behaviour).
sub_frames = []
for master_cat in top_3_master:
    df_master = filtered_df[filtered_df['masterCategory'] == master_cat]
    top_sub = [sc for sc in df_master['subCategory'].value_counts().nlargest(5).index.tolist() if sc != "Watches"][:2]
    sub_frames.append(df_master[df_master['subCategory'].isin(top_sub)])
# With nothing to concatenate, fall back to an empty frame that still carries
# the source columns so the column lookups below do not raise KeyError.
result_df = pd.concat(sub_frames) if sub_frames else filtered_df.iloc[0:0]

# Filter by articleType: for each remaining sub-category (skipping 'Watches'
# and 'Flip Flops'), keep its three most frequent article types.
article_frames = []
for sub_cat in result_df['subCategory'].unique():
    if sub_cat in ['Watches', 'Flip Flops']:
        continue
    df_sub = result_df[result_df['subCategory'] == sub_cat]
    top_articles = df_sub['articleType'].value_counts().nlargest(3).index.tolist()
    article_frames.append(df_sub[df_sub['articleType'].isin(top_articles)])
dataset = pd.concat(article_frames) if article_frames else result_df.iloc[0:0]

print(f'Dataset size: {len(dataset)}')

In [None]:
# Prepare data if not already saved
# The pickle doubles as a cache marker: when it exists, the image pass is skipped.
if not os.path.exists('dados_32x32.pkl'):
    # Fixed label order: an articleType's position in this list is its class index.
    class_names = ["Tshirts", "Shirts", "Kurtas", "Jeans", "Shorts", "Trousers",
                   "Handbags", "Backpacks", "Clutches", "Earrings", "Pendant",
                   "Necklace and Chains", "Casual Shoes", "Sports Shoes", "Heels",
                   "Sandals", "Sports Sandals", "Flip Flops"]
    fine_to_index = {name: idx for idx, name in enumerate(class_names)}

    train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)
    train_path = f'{path}/fashion-dataset/images/'

    x_train, y_train, x_test, y_test = [], [], [], []

    # Build hash-based lookups once, instead of scanning the id column and doing
    # a positional row lookup for every single image.
    train_ids = set(train_df['id'])
    test_ids = set(test_df['id'])
    id_to_article = dict(zip(dataset['id'], dataset['articleType']))

    for image_id in dataset['id']:
        image_path = os.path.join(train_path, f'{image_id}.jpg')
        if not os.path.exists(image_path):
            # Rows whose image file is missing are skipped.
            continue

        # Context manager guarantees the underlying file handle is released.
        with Image.open(image_path) as img:
            img_array = np.array(img.convert("RGB").resize((32, 32)))
        # Raises KeyError if an articleType is not listed in class_names.
        label = fine_to_index[id_to_article[image_id]]

        if image_id in train_ids:
            x_train.append(img_array)
            y_train.append(label)
        elif image_id in test_ids:
            x_test.append(img_array)
            y_test.append(label)

    # Fail with a readable message instead of an opaque transpose error when a
    # split ended up without any images.
    if not x_train or not x_test:
        raise ValueError(
            f'No images collected for the train or test split under {train_path} '
            f'(train={len(x_train)}, test={len(x_test)})'
        )

    # Move the channel axis in front of the two spatial axes, then cast to float32.
    x_train = np.array(x_train).transpose(0, 3, 1, 2).astype("float32")
    x_test = np.array(x_test).transpose(0, 3, 1, 2).astype("float32")
    # NOTE(review): the test split is standardised with its own mean/std rather
    # than the training statistics — confirm this is intended.
    x_train = (x_train - np.mean(x_train)) / np.std(x_train)
    x_test = (x_test - np.mean(x_test)) / np.std(x_test)

    # Write to a temporary file and rename it into place, so an interrupted dump
    # cannot leave a truncated pickle that later runs would treat as complete.
    tmp_pickle = 'dados_32x32.pkl.tmp'
    with open(tmp_pickle, "wb") as f:
        pickle.dump({"x_train": x_train, "x_test": x_test, "y_train": np.array(y_train), "y_test": np.array(y_test)}, f)
    os.replace(tmp_pickle, 'dados_32x32.pkl')
    print('Data saved to dados_32x32.pkl')
else:
    print('Data file already exists')

In [None]:
# Upload data to S3: push the pickled arrays under `<prefix>/data` and keep the
# value returned by upload_data for the training cell.
s3_data = sess.upload_data(
    path='dados_32x32.pkl',
    bucket=bucket,
    key_prefix=f'{prefix}/data',
)
print(f'Training data uploaded to: {s3_data}')

## BA-CNN Training with SageMaker

Launch the training job on a GPU instance. Once the job has started you can close this notebook; the job runs independently.

In [None]:
# Configure SageMaker TensorFlow estimator
# S3 destinations for the job output and the uploaded training code.
output_s3_uri = f's3://{bucket}/{prefix}/output'
code_s3_uri = f's3://{bucket}/{prefix}/code'

# Values handed to the entry-point script as hyperparameters.
training_hyperparameters = {
    'batch_size': 128,
    'epochs': 100
}

estimator = TensorFlow(
    entry_point='train.py',
    role=role,
    base_job_name='fashion-bacnn',
    # Compute: a single GPU instance
    instance_type='ml.g4dn.xlarge',
    instance_count=1,
    # Container selection
    framework_version='2.13',
    py_version='py310',
    hyperparameters=training_hyperparameters,
    output_path=output_s3_uri,
    code_location=code_s3_uri,
)

print('Estimator configured')

In [None]:
# Start training job
# The job runs on the ml.g4dn.xlarge (GPU) instance configured on the estimator.
# wait=False: this cell does not wait for the job to finish, so the notebook can
# be closed afterwards.
training_inputs = {'training': s3_data}
estimator.fit(training_inputs, wait=False)

print(f'Training job started: {estimator.latest_training_job.name}')
print('Monitor in SageMaker Console > Training > Training jobs')

In [None]:
# Get model artifacts
# The training job is launched with wait=False, so it may still be running when
# this cell executes (e.g. under "Run All"). Only report the artifact location
# once the job status is 'Completed'; otherwise report the current status.
job_name = estimator.latest_training_job.name
job_status = sess.sagemaker_client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']

if job_status == 'Completed':
    print(f'Model artifacts: {estimator.model_data}')
else:
    print(f'Training job {job_name} is {job_status}; re-run this cell once it has completed.')