In [1]:
!pip install mlflow==2.13.2 sagemaker-mlflow==0.1.0



In [2]:
import time
import os
import json
import boto3
import numpy as np
import sagemaker
import requests
import torch
import tqdm
import mlflow

import torch.nn as nn
import torch.optim as optim

from PIL import Image
from torchvision import models
from torchvision import transforms
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sagemaker.feature_store.feature_group import FeatureGroup


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
boto_session = boto3.Session()
region = boto_session.region_name

sm_session = sagemaker.Session()
sm_client = boto_session.client("sagemaker")
sm_role = sagemaker.get_execution_role()

s3_client = boto3.client('s3')

# Define your feature group name and region
feature_group_name = 'fire-image-feature-group'

# Athena client
athena_client = boto3.client('athena', region_name=region)

# MLFLow
tracking_server_arn = 'arn:aws:sagemaker:eu-central-1:567821811420:mlflow-tracking-server/wildfire-mj'
experiment_name = 'wildfire-team2'

**Taking Data from Feature Store**

In [4]:
# Initialize the feature group
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sm_session)

In [5]:
# Query
query = """SELECT *
FROM "AwsDataCatalog"."sagemaker_featurestore"."fire_image_feature_group_1718694943";
"""

# Run query
response = athena_client.start_query_execution(
    QueryString=query,
    QueryExecutionContext={
        'Database': 'sagemaker_featurestore'  # Replace with your Athena database name
    },
    ResultConfiguration={
        'OutputLocation': 's3://wildfires/feature-store-output/'  # Replace with your S3 bucket
    }
)

# Get query execution ID
query_execution_id = response['QueryExecutionId']

# Wait for the query to complete
status = 'RUNNING'
while status != 'SUCCEEDED':
    response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
    status = response['QueryExecution']['Status']['State']

# Get the results
response = athena_client.get_query_results(QueryExecutionId=query_execution_id)

# Process the results into a DataFrame
rows = [row['Data'] for row in response['ResultSet']['Rows'][1:]]
columns = [col['VarCharValue'] for col in response['ResultSet']['Rows'][0]['Data']]

rows[:5]

[[{'VarCharValue': '3fcaee94-dd1c-4350-ace3-bbe7fb57b7aa'},
  {'VarCharValue': 's3://wildfires/fire_images/fire.1.png'},
  {'VarCharValue': '1'},
  {'VarCharValue': 'png'},
  {'VarCharValue': '2024-06-18T07:28:48.589045Z'},
  {'VarCharValue': '2024-06-18 07:33:47.967'},
  {'VarCharValue': '2024-06-18 07:28:53.000'},
  {'VarCharValue': 'false'}],
 [{'VarCharValue': 'ece53839-d424-4234-8dbd-cb9e7353fa16'},
  {'VarCharValue': 's3://wildfires/fire_images/fire.408.png'},
  {'VarCharValue': '1'},
  {'VarCharValue': 'png'},
  {'VarCharValue': '2024-06-18T07:28:48.592019Z'},
  {'VarCharValue': '2024-06-18 07:33:47.967'},
  {'VarCharValue': '2024-06-18 07:28:53.000'},
  {'VarCharValue': 'false'}],
 [{'VarCharValue': 'e367966d-3b98-4cd1-a78a-274f8d889f13'},
  {'VarCharValue': 's3://wildfires/fire_images/fire.107.png'},
  {'VarCharValue': '1'},
  {'VarCharValue': 'png'},
  {'VarCharValue': '2024-06-18T07:28:48.589121Z'},
  {'VarCharValue': '2024-06-18 07:33:47.967'},
  {'VarCharValue': '2024-06-1

In [6]:
def download_images(metadata, download_dir='images'):
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    for record in metadata:
        image_location = record['image_location']
        bucket, key = image_location.replace('s3://', '').split('/', 1)
        local_path = os.path.join(download_dir, os.path.basename(key))

        s3_client.download_file(bucket, key, local_path)

        record['local_path'] = local_path  # Add the local path to the record

    return metadata


metadata = [
    {
        'image_id': row[0]['VarCharValue'],
        'image_location': row[1]['VarCharValue'],
        'label': int(row[2]['VarCharValue']),
        'image_type': row[3]['VarCharValue'],
        'event_time': row[4]['VarCharValue'],
    } for row in rows
]

metadata = download_images(metadata)
print("Finished!")

Finished!


In [7]:
# Split the metadata into train, validation, and test sets
train_metadata, test_metadata = train_test_split(metadata, test_size=0.2, stratify=[m['label'] for m in metadata], random_state=42)
train_metadata, val_metadata = train_test_split(train_metadata, test_size=0.25, stratify=[m['label'] for m in train_metadata], random_state=42)

print(f"Training samples: {len(train_metadata)}")
print(f"Validation samples: {len(val_metadata)}")
print(f"Test samples: {len(test_metadata)}")

Training samples: 599
Validation samples: 200
Test samples: 200


In [8]:
class FireDataset(Dataset):
    def __init__(self, metadata, transform=None):
        self.metadata = metadata
        self.transform = transform

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        img_path = self.metadata[idx]['local_path']
        image = Image.open(img_path).convert('RGB')
        label = self.metadata[idx]['label']

        if self.transform:
            image = self.transform(image)

        return image, label


In [9]:
# Define transformations for training and validation/test datasets
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create datasets
train_dataset = FireDataset(train_metadata, transform=train_transform)
val_dataset = FireDataset(val_metadata, transform=val_test_transform)
test_dataset = FireDataset(test_metadata, transform=val_test_transform)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


**Training**

In [10]:
model = models.resnet18(weights='ResNet18_Weights.DEFAULT')

In [11]:
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [12]:
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

In [17]:
def train(model, train_loader, optimizer, loss_function, epoch, device, run):
    model = model.to(device)
    loss_function = loss_function.to(device)
    model.train()
    train_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_function(output, target)
        train_loss += loss.sum().item()
        loss.backward()
        optimizer.step()
        
        if batch_idx % 200 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    train_loss /= len(train_loader.dataset)

    mlflow.log_metric('training_loss', train_loss, step=epoch, run_id=run.info.run_id)


def test(model, test_loader, loss_function, epoch, device, run):
    model = model.to(device)
    loss_function = loss_function.to(device)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            test_loss += loss_function(output, target).sum().item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    mlflow.log_metric('test_loss', test_loss, step=epoch, run_id=run.info.run_id)
    mlflow.log_metric('test_accuracy',
                      (correct / len(test_loader.dataset)),
                      step=epoch, run_id=run.info.run_id)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [18]:
def train_test(model, optimizer, train_loader, test_loader, device, run, n_epochs=1):
    for epoch in range(0, n_epochs):
        train(model, train_loader, optimizer, criterion, epoch, device, run)
        test(model, test_loader, criterion, epoch, device, run)

In [19]:
EPOCHS = 5
model_filename = "model_resnet18_v2"
model_folder = "models"

%mkdir -p models

mlflow.set_tracking_uri(tracking_server_arn)
mlflow.set_experiment(experiment_name)


with mlflow.start_run(run_name=sagemaker.utils.name_from_base("classification-wildfire")) as run:
    mlflow.log_params({
        "Training samples": len(train_metadata),
        "Validation samples": len(val_metadata),
        "Test samples": len(test_metadata),
        "Epochs": EPOCHS,
    }, run_id=run.info.run_id)

    train_test(model=model,
               optimizer=optimizer,
               train_loader=train_loader,
               test_loader=test_loader,
               device=device,
               run=run,
               n_epochs=EPOCHS)

    torch.save(model.state_dict(), f'{model_folder}/{model_filename}.pth')
    mlflow.log_artifact(os.path.curdir, f'{model_folder}/{model_filename}.pth', run_id=run.info.run_id)

    mlflow.end_run(status='FINISHED')


Test set: Average loss: 617.8516, Accuracy: 151/200 (76%)


Test set: Average loss: 0.0167, Accuracy: 179/200 (90%)


Test set: Average loss: 0.0098, Accuracy: 175/200 (88%)


Test set: Average loss: 0.0028, Accuracy: 193/200 (96%)


Test set: Average loss: 0.0047, Accuracy: 187/200 (94%)



In [16]:
model_filename = "model_resnet18"
model_folder = "models"

%mkdir -p models

torch.save(model.state_dict(), f'{model_folder}/{model_filename}.pth')

In [17]:
import tarfile

# model.save_model(f"../{model_folder}/{model_filename}")

with tarfile.open(f"{model_folder}/{model_filename}.tar.gz", "w:gz") as tar:
    tar.add(f'{model_folder}/{model_filename}.pth', arcname=model_filename)

In [18]:
s3 = boto3.client('s3')

BUCKET_NAME = 'wildfires'

s3.upload_file(
    f'{model_folder}/{model_filename}.tar.gz',
    BUCKET_NAME,
    f"models/{model_filename}.tar.gz",
)

tensor([[-1.9718,  2.0293]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [44]:
torch.__version__

'2.0.0.post304'

In [78]:

model_filename = "resnet18_WeWillDeployCorrectlyThisTime"
# Define the S3 bucket and model file path
s3_bucket = 'wildfires'
model_key = f"models/{model_filename}.tar.gz"
local_model_path = 'resnet-model.tar.gz'

# Create an S3 client
s3 = boto3.client('s3')

# Download the model file from S3
s3.download_file(s3_bucket, model_key, local_model_path)

print(f'Model downloaded to {local_model_path}')


Model downloaded to resnet-model.tar.gz


In [79]:
with tarfile.open(local_model_path) as tar:
    tar.extractall()
    extracted_files = tar.getnames()
    print(f'Extracted files: {extracted_files}')

Extracted files: ['model_resnet18.pth', 'code', 'code/inference.py', 'code/.ipynb_checkpoints', 'code/.ipynb_checkpoints/inference-checkpoint.py']


In [80]:
extracted_model_path = 'model_resnet18.pth'  # Adjust this path based on the extracted files
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)
model.load_state_dict(torch.load(extracted_model_path, map_location=device))

<All keys matched successfully>

In [81]:
# model

In [82]:
val_test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


def preprocess_image(img_path, transform):
    image = Image.open(img_path).convert('RGB')
    image = transform(image)

    image = image.unsqueeze(0)
    return image


image_path = 'test_data_1.png'
preprocess_data = preprocess_image(image_path, val_test_transform)

input_list = input_data.tolist()
input_dict = {"inputs": input_list}

# Serialize the dictionary to a JSON string
input_json = json.dumps(input_dict)

data = json.loads(input_json)
input_data = torch.tensor(data['inputs'], dtype=torch.float32)

In [83]:
print(input_data == preprocess_data)

tensor([[[[True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True],
          ...,
          [True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True]],

         [[True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True],
          ...,
          [True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True]],

         [[True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True],
          ...,
          [True, True, True,  ..., True, True, True],
          [True, True, True,  ..., True, True, True],
          [True, True, True,  ...

In [84]:
input_data = input_data.to(device)

In [88]:
model = model.to(device)
with torch.no_grad():
    pred = model(input_data)
pred

tensor([[-1.9718,  2.0293]], device='cuda:0')