# Libraries and S3 connection

In [None]:
!pip install h5py rasterio torch

In [None]:
import numpy as np
import h5py
import pandas as pd
import s3fs
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import os

In [None]:
# S3 client built from the credentials exposed as environment variables.
# The variables are read in the order endpoint, key, secret, session token;
# a missing one raises KeyError.
endpoint_url = f'https://{os.environ["AWS_S3_ENDPOINT"]}'
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": endpoint_url},
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    token=os.environ["AWS_SESSION_TOKEN"],
)

# Load data

In [None]:
def download_s3_folder(fs, bucket_name, s3_folder, local_dir):
    """
    Download every entry listed under an S3 folder into a local directory.

    ``bucket_name`` and ``s3_folder`` may be given with or without leading or
    trailing slashes; both spellings resolve to the same remote folder and
    produce the same local layout.

    :param fs: File-system object exposing ``ls(path)`` (returning remote
        paths as strings) and ``get(remote_path, local_path)``.
    :param bucket_name: Name of the S3 bucket.
    :param s3_folder: Path of the folder inside the bucket to download.
    :param local_dir: Local directory where the files are written.
    """
    # Normalise both parts so that listing and prefix removal agree on a
    # single canonical "<bucket>/<folder>" form.
    bucket = bucket_name.strip("/")
    folder = s3_folder.strip("/")
    remote_root = f"{bucket}/{folder}" if folder else bucket
    prefix = remote_root + "/"

    # NOTE(review): ls() lists a single level; entries that are sub-folders
    # are handed to fs.get() unchanged -- confirm this is the wanted behaviour.
    for file in fs.ls(remote_root):
        remote_path = file.lstrip("/")

        if remote_path.startswith(prefix):
            relative_path = remote_path[len(prefix):]
        else:
            # Unexpected path shape: fall back to the bare file name rather
            # than recreating the whole remote path locally.
            relative_path = remote_path.rstrip("/").rsplit("/", 1)[-1]

        # A leading slash would make os.path.join discard local_dir.
        relative_path = relative_path.lstrip("/")
        if not relative_path:
            # Placeholder entry for the folder itself: nothing to download.
            continue

        local_file_path = os.path.join(local_dir, relative_path)

        # dirname is empty when local_dir is "" and the file is not nested;
        # os.makedirs("") would raise in that case.
        local_file_dir = os.path.dirname(local_file_path)
        if local_file_dir:
            os.makedirs(local_file_dir, exist_ok=True)

        print(f"Téléchargement de {file} vers {local_file_path}")
        fs.get(file, local_file_path)

# Download the challenge data from S3 into the local data directory
bucket_name, s3_folder, local_dir = (
    'projet-slums-detection/',
    'challenge_mexique/',
    'data/',
)

download_s3_folder(
    fs,
    bucket_name=bucket_name,
    s3_folder=s3_folder,
    local_dir=local_dir,
)


## Train data

In [None]:
# Location of the training HDF5 file
hdf5_file = "data/train_data.h5"

# Read both datasets fully into memory while the file is open:
# 'images' becomes X and 'labels' becomes y.
with h5py.File(hdf5_file, mode='r') as hdf:
    X, y = (np.array(hdf[dataset_name]) for dataset_name in ('images', 'labels'))

# Sanity check on the loaded shapes
print("Shape of X (images):", X.shape)
print("Shape of y (labels):", y.shape)


### Visualize a sample image (uint16)

In [None]:
# Display the training image at index 50
image_array = X[50]

# Channels 3, 4 and 5 are used as the three display channels
# (presumably the R, G, B bands -- TODO confirm the band order).
rgb_image = image_array[:, :, [3, 4, 5]]

# Scale by the largest value so the data fits the float range imshow expects
rgb_image_normalized = rgb_image / rgb_image.max()

# Render without axis ticks
plt.imshow(rgb_image_normalized)
plt.axis('off')
plt.show()

## Test data

In [None]:
# Location of the test HDF5 file
hdf5_file_test = "data/test_data.h5"

# Only the 'images' dataset is read from the test file; it is copied into
# memory before the file is closed.
with h5py.File(hdf5_file_test, mode='r') as hdf:
    X_test = np.array(hdf['images'])

# Sanity check on the loaded shape
print("Shape of X_test (images):", X_test.shape)

### Create y_test

In [None]:
# Load the id mapping table (joined with the sample submission on "id" below)
mapping = pd.read_csv("data/id_map.csv")

In [None]:
# Load the sample submission (merged with the id mapping on "id" below)
sample = pd.read_csv("data/SampleSubmission.csv")

In [None]:
# Join the sample submission with the id mapping on "id", order the rows by
# the "ID" column and renumber them from 0.
y_test = (
    sample.merge(mapping, on="id")
    .sort_values(by="ID", ascending=True)
    .reset_index(drop=True)
)
# Keep only the "class" column, as a NumPy array
y_test = np.array(y_test['class'])

### Balance data

In [None]:
def balance_data(X, y, prop_of_zeros=0.5):
    """
    Undersample the 0 class and return a shuffled subset of the data.

    Every sample labelled 1 is kept. Samples labelled 0 are drawn at random
    without replacement; ``int(num_ones * prop_of_zeros)`` of them are
    requested, capped at the number of zeros actually present.

    Note that ``prop_of_zeros`` is a zeros-per-one ratio, not the share of
    zeros in the result: the default 0.5 keeps one zero for every two ones,
    and 1.0 yields classes of equal size.

    The draw of the zeros uses NumPy's global random state and is therefore
    not reproducible unless the caller seeds it; the final shuffle uses a
    fixed ``random_state=1``.

    Args:
        X: NumPy array of samples, indexed along its first axis.
        y: NumPy array of labels aligned with ``X``; values 1 and 0 mark the
            two classes.
        prop_of_zeros: number of zeros to keep per one.

    Returns:
        Tuple ``(X_train, y_train)`` with the selected samples and labels,
        shuffled together.
    """
    # Step 1: Count the number of 1's in y
    num_ones = int(np.sum(y == 1))

    # Step 2: Get indices of 0's and 1's in y
    ones_indices = np.where(y == 1)[0]
    zeros_indices = np.where(y == 0)[0]

    # Step 3: Randomly sample prop_of_zeros zeros per one. Sampling without
    # replacement raises ValueError when more items are requested than exist,
    # so the request is capped at the number of available zeros.
    num_zeros = min(int(num_ones * prop_of_zeros), len(zeros_indices))
    balanced_zero_indices = np.random.choice(zeros_indices, num_zeros, replace=False)

    # Step 4: Combine indices of 0's and 1's
    balanced_indices = np.concatenate([ones_indices, balanced_zero_indices])

    # Step 5: Create balanced X and y
    X_balanced = X[balanced_indices]
    y_balanced = y[balanced_indices]

    # Display the number of 0's and 1's in the balanced y
    print(f"Number of 1's in balanced y: {np.sum(y_balanced == 1)}")
    print(f"Number of 0's in balanced y: {np.sum(y_balanced == 0)}")

    # Shuffle both X_balanced and y_balanced together
    X_train, y_train = shuffle(X_balanced, y_balanced, random_state=1)

    return X_train, y_train

In [None]:
# Build the training set with the default prop_of_zeros (0.5): all 1's are
# kept and the 0's are undersampled, then both arrays are shuffled together.
X_train, y_train = balance_data(X, y)

In [None]:
# Shapes of the training and test image arrays, one per line
print(X_train.shape, X_test.shape, sep="\n")

## Training

In [None]:
import torch
import torch.multiprocessing as multiprocessing
import torchvision
from torch import nn
from torchvision.models.resnet import ResNet50_Weights

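# Use the "file_system" sharing strategy for tensors passed between processes,
# so shared tensors do not run into the open file descriptor limit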
multiprocessing.set_sharing_strategy("file_system")


class ResNet50Module(nn.Module):
    """
    ResNet50 with a two-class head that outputs class probabilities.

    The network starts from torchvision's default pre-trained ResNet50
    weights. Its last fully connected layer is replaced by a 2-output linear
    layer. When ``nchannel`` is not 3, the first convolution is replaced by a
    newly initialised one accepting ``nchannel`` input channels, so the
    pre-trained weights of that layer are not reused.

    Args:
        nchannel (int): number of channels of the input image.
    """

    def __init__(self, nchannel=6):
        super().__init__()
        # Load the pre-trained ResNet50 model
        self.model = torchvision.models.resnet50(weights=ResNet50_Weights.DEFAULT)

        # Replace the last fully connected layer; the input width is read
        # from the layer being replaced instead of being hard-coded.
        self.model.fc = nn.Linear(self.model.fc.in_features, 2)
        # Applied in forward() to turn the two outputs into probabilities
        self.softmax = nn.Softmax(dim=1)

        if nchannel != 3:
            # Same geometry as the convolution it replaces, different number
            # of input channels.
            self.model.conv1 = nn.Conv2d(
                nchannel,
                64,
                kernel_size=(7, 7),
                stride=(2, 2),
                padding=(3, 3),
                bias=False,
            )

    def forward(self, input):
        """
        Performs the forward pass of the model.

        Args:
            input (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output probabilities after applying the
            softmax activation over dimension 1.
        """
        output = self.model(input)

        # NOTE(review): the result is already soft-maxed; a loss that applies
        # its own (log-)softmax, such as nn.CrossEntropyLoss, expects raw
        # outputs instead -- confirm against the training code.
        return self.softmax(output)

### Baseline models