In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer
!pip install torch
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
!pip install torchvision
from torchvision import datasets, transforms
import torchvision.models as models
import time
from tqdm import trange
from tqdm import tqdm
import torch.nn as nn
import copy
import torch.optim as optim
import pandas as pd
import os
from pathlib import Path
import operator

# IAM execution role attached to this notebook instance
role = get_execution_role()

# Region reported by the default boto3 session for this instance
my_region = boto3.session.Session().region_name

# Confirmation banner (plain concatenation, so a missing region still raises)
print(
    "Success - the MySageMakerInstance is in the "
    + my_region
    + " region. You will use the container for your SageMaker endpoint."
)

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting torch
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m529.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cudnn-cu11==8.5.0.96
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m599.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cusolver-cu11==11.4.0.1
  Downloading nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.6/102.6 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-cupti-cu11==11.7.101
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━

In [None]:

# Name of the bucket that will hold trained model artifacts.
# NOTE(review): S3 bucket naming rules do not allow underscores, so creating a
# bucket called 'trained_models' is expected to be rejected by S3. The name is
# left unchanged here because the upload cell at the end of the notebook uses
# the same literal; rename both together (e.g. to a globally unique,
# hyphenated name).
bucket_name = 'trained_models'

s3 = boto3.resource('s3')

# check if the bucket exists among the buckets owned by this account
if any(bucket.name == bucket_name for bucket in s3.buckets.all()):
    print(f"{bucket_name} already exists.")
else:
    # create the bucket
    # Outside us-east-1, S3 requires the target region to be passed as a
    # LocationConstraint; inside us-east-1 the constraint must be omitted.
    region = s3.meta.client.meta.region_name
    if region is None or region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region},
        )
    print(f"{bucket_name} created successfully.")

# Building a dataloader for STREET LEVEL IMAGES

In [15]:
import boto3
import os
from io import BytesIO
from PIL import Image
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm import tqdm
import concurrent.futures

class S3ImageDataset(Dataset):
    """Image dataset that eagerly loads every image under an S3 prefix into memory.

    The immediate sub-"folders" of ``folder_path`` are treated as classes: each
    object found below ``<folder_path>/<name>/`` becomes one sample whose label
    is the folder name ``<name>``.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        folder_path: Key prefix whose direct sub-prefixes are the class folders.
            A trailing ``/`` is added when missing.
        class_folders: Stored on the instance but not used for filtering.
        transform: Optional callable applied to each opened PIL image.

    Attributes:
        images: Loaded (and, if ``transform`` is set, transformed) images.
        labels: Folder name for each entry of ``images``, in the same order.
    """

    def __init__(self, bucket_name, folder_path, class_folders=None, transform=None):
        # initialize attributes
        self.bucket_name = bucket_name
        self.folder_path = folder_path
        self.class_folders = class_folders
        self.transform = transform

        # create S3 resource (kept for callers that access .s3 / .bucket)
        self.s3 = boto3.resource('s3')
        self.bucket = self.s3.Bucket(bucket_name)

        # Low-level client used for listing and for the threaded downloads;
        # boto3 documents clients, unlike resources, as safe to share across threads.
        self._client = boto3.client('s3')

        # initialize lists to store images and labels
        self.images = []
        self.labels = []

        # Make sure the prefix ends with a forward slash
        if not folder_path.endswith('/'):
            folder_path = folder_path + '/'

        # A single list_objects_v2 call returns at most 1000 entries, so page
        # through the listings instead of reading only the first response.
        paginator = self._client.get_paginator('list_objects_v2')

        # Collect the class folders directly below folder_path
        class_prefixes = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_path, Delimiter='/'):
            for entry in page.get('CommonPrefixes', []):
                class_prefixes.append(entry.get('Prefix'))

        image_data = []

        # Loop through the class folders and collect (key, label) pairs
        for class_prefix in tqdm(class_prefixes):
            folder_name = class_prefix.replace(folder_path, '', 1).strip('/')
            # class_prefix keeps its trailing '/', so a folder "ab" does not
            # also pick up the objects of a sibling folder "abc".
            for page in paginator.paginate(Bucket=bucket_name, Prefix=class_prefix):
                # 'Contents' is absent when nothing matches the prefix
                for sub_obj in page.get('Contents', []):
                    key = sub_obj['Key']
                    if key.endswith('/'):
                        # zero-byte "folder" placeholder object, not an image
                        continue
                    image_data.append((key, folder_name))

        # Download and decode in parallel; executor.map preserves input order
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(tqdm(executor.map(self.load_image, image_data), total=len(image_data)))

        for img, label in results:
            self.images.append(img)
            self.labels.append(label)

    def load_image(self, img_data):
        """Download one object and return ``(image, label)``.

        Args:
            img_data: ``(key, label)`` tuple; ``key`` is the S3 object key.

        Returns:
            Tuple of the opened (and optionally transformed) image and the
            unchanged ``label``.
        """
        key, label = img_data
        # Keep the payload in its own variable: rebinding img_data to the raw
        # bytes would make img_data[1] a byte of the image, not the label.
        payload = self._client.get_object(Bucket=self.bucket_name, Key=key)['Body'].read()
        img = Image.open(BytesIO(payload))
        # apply transformation if specified
        if self.transform:
            img = self.transform(img)
        return (img, label)

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair stored at ``index``."""
        return self.images[index], self.labels[index]

    def __len__(self):
        # return the number of images in the dataset
        return len(self.images)

class S3DataLoader:
    """Build train/validation ``DataLoader`` objects over an ``S3ImageDataset``.

    Args:
        bucket_name: S3 bucket passed through to ``S3ImageDataset``.
        folder_path: Key prefix passed through to ``S3ImageDataset``.
        batch_size: Batch size used for both loaders.
        shuffle: Shuffle flag used for both loaders.
        num_workers: Worker count used for both loaders.
        transform: Optional transform passed through to ``S3ImageDataset``.

    Attributes:
        dataset: The full ``S3ImageDataset``.
        split_dataset: ``{'train': Subset, 'val': Subset}``.
        split_loader: ``{'train': DataLoader, 'val': DataLoader}``.
    """

    def __init__(self, bucket_name, folder_path, batch_size=32, shuffle=True, num_workers=4, transform=None):
        # initialize the dataset and dataloaders
        self.dataset = S3ImageDataset(bucket_name=bucket_name, folder_path=folder_path, class_folders=None, transform=transform)
        # train_val_dataset reads self.dataset itself; its only argument is the
        # validation fraction, so the dataset must not be passed here.
        self.split_dataset = self.train_val_dataset()
        self.split_loader = {x: DataLoader(self.split_dataset[x], batch_size=batch_size, shuffle=shuffle, num_workers=num_workers) for x in ['train', 'val']}

    def __iter__(self):
        # iterate over the split names ('train', 'val'), i.e. the keys of split_loader
        return iter(self.split_loader)

    def __len__(self):
        # number of splits held in split_loader (not the number of batches)
        return len(self.split_loader)

    def train_val_dataset(self, val_split=0.3):
        """Split ``self.dataset`` into train and validation subsets.

        Args:
            val_split: Passed to ``train_test_split`` as ``test_size``.

        Returns:
            Dict with ``'train'`` and ``'val'`` ``Subset`` objects over
            ``self.dataset``.
        """
        # Split the indices of the dataset into train and validation sets
        train_idx, val_idx = train_test_split(list(range(len(self.dataset))), test_size=val_split)

        # One Subset per split, both backed by the same underlying dataset
        splits = {
            'train': Subset(self.dataset, train_idx),
            'val': Subset(self.dataset, val_idx),
        }
        return splits



In [16]:
# S3 location of the street-level imagery and the batch size to load it with
bucket_name = "capgemini-cvguild"
folder_path = "multimodalrecognition/streetlevel_images/"
batch_size = 32

# Preprocessing pipeline: resize to 224, center-crop to 224x224, convert to tensor
transform = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]
)

# Build the S3-backed loaders and expose the {'train', 'val'} DataLoader dict
s3dl = S3DataLoader(
    bucket_name,
    folder_path,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    transform=transform,
)
street_dataloaders_dict = s3dl.split_loader

100%|██████████| 204/204 [00:08<00:00, 24.39it/s]
  1%|▏         | 254/19382 [00:05<06:53, 46.25it/s]


KeyboardInterrupt: 

In [None]:
import boto3
import pandas as pd

# Where the label table lives in S3
bucket_name = "capgemini-cvguild"
file_name = 'multimodalrecognition/population_indicators/dhs_final_labels.csv'

# Fetch the object with an S3 client and parse its body as CSV
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket_name, Key=file_name)
features_df = pd.read_csv(obj['Body'])
display(features_df)

# Build the lookup DHSID_EA -> sanitation_index
# (when a key repeats, the last row wins, as with index-based assignment)
test_keys = list(features_df['DHSID_EA'])
test_values = list(features_df['sanitation_index'])
target_dict = dict(zip(test_keys, test_values))

In [None]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

def train_model(model, dataloaders, locations, target_dict,  criterion, optimizer,num_epochs=20, is_inception=False, verbose=False):
    """Train ``model`` and keep the weights with the best validation accuracy.

    Every epoch runs a ``'train'`` phase followed by a ``'val'`` phase. For
    each batch the labels yielded by the loader are mapped through
    ``locations`` and then ``target_dict`` to obtain the target values handed
    to ``criterion``.

    Args:
        model: ``torch.nn.Module`` to optimise. Batches are moved to the device
            of its parameters (CPU when it has none).
        dataloaders: Mapping with ``'train'`` and ``'val'`` entries; each must
            be iterable over ``(inputs, labels)`` and expose ``.dataset``.
        locations: Indexable by each label yielded by the loader; returns the
            key to look up in ``target_dict``.
        target_dict: Maps those keys to numeric target values.
        criterion: Callable ``criterion(outputs, targets)`` returning the loss.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of epochs to run.
        is_inception: Accepted for interface compatibility; not used.
        verbose: When true, print the progress bar object plus the targets,
            outputs and loss of every batch (debugging aid).

    Returns:
        ``(model, val_acc_history, train_acc_history)`` where ``model`` has the
        best validation weights loaded and each history holds one accuracy
        value per epoch.

    NOTE(review): accuracy is computed as ``argmax(outputs, 1) == targets``,
    which is only meaningful if the target values are class indices - confirm
    this matches the values stored in ``target_dict``.
    """
    since = time.time()

    # Run on whatever device the model already lives on instead of depending
    # on a notebook-level global.
    try:
        run_device = next(model.parameters()).device
    except StopIteration:
        run_device = torch.device("cpu")

    val_acc_history = []
    train_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase. Everything below
        # must stay inside this loop; otherwise only the last phase ('val')
        # would ever be executed and the model would never be updated.
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            with tqdm(dataloaders[phase], unit='batch') as tepoch:
                if verbose:
                    print(tepoch)
                # Iterate over data.
                for inputs, labels in tepoch:
                    # get DHSID_EA id for all labels
                    lb_ids = [locations[label] for label in labels]
                    # get feature values
                    targets = torch.Tensor([target_dict[lb_id] for lb_id in lb_ids])

                    inputs = inputs.to(run_device)
                    targets = targets.to(run_device)
                    if verbose:
                        print('labels', targets)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        loss = criterion(outputs, targets)
                        if verbose:
                            print('outputs', outputs)
                            print('loss', loss)

                        _, preds = torch.max(outputs, 1)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics (loss is a batch mean, so weight it by batch size)
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == targets.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
            if phase == 'train':
                train_acc_history.append(epoch_acc)

        # blank line between epochs
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history, train_acc_history

In [None]:
# Save your PyTorch model as an HDF5 file
# h5py is not imported in any visible cell of this notebook; import it here so
# the cell does not raise NameError on a fresh kernel.
import h5py

# NOTE(review): `model` is not assigned in any visible cell - it must already
# hold the trained network (e.g. the first value returned by train_model)
# before this cell is run.
# One HDF5 dataset per state_dict entry, keyed by the parameter/buffer name.
with h5py.File('baselinemodel.h5', 'w') as f:
    for k, v in model.state_dict().items():
        f.create_dataset(k, data=v.cpu().numpy())

# Upload your HDF5 file to an S3 bucket
# NOTE(review): S3 bucket names may not contain underscores; confirm the real
# bucket name - it must match the bucket created earlier in the notebook.
s3 = boto3.client('s3')
bucket_name = 'trained_models'
s3.upload_file('baselinemodel.h5', bucket_name, 'baselinemodel.h5')