# Multimodal siamese neural network for similarity detection between (image, text) pairs

This notebook features a multimodal siamese model built using PyTorch, designed to carry out deduplication for pairs of images and text. The initial dataset comprises an image folder and a data.txt file containing advertisement titles. Prior to running this notebook, it is necessary to execute the create dataset endpoint of the API to generate the data.csv file.

This notebook is intended for testing different model architectures and/or training parameters without having to interact with the API directly. Most of the .py files employed in the API (e.g., model.py) are also imported here. One can alter these files and test the results here in order to arrive at an optimal/custom solution.
Then if everything is de

## Simple example

### Data reading

In [2]:
import pandas as pd
from itertools import combinations
from torch.utils.data import DataLoader
import numpy as np
from utils.raw_preprocessing import *
from utils.training_utils import *
from dataset import *
from model import *
from sentence_transformers import SentenceTransformer, util, losses
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torchvision.models as models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Project-level locations of the data folder and the model checkpoint folder
from utils.constants import DATA_PATH, MODEL_PATH

# Imported explicitly: `torch` is used right below and in later cells, but the
# notebook otherwise only receives the name implicitly through wildcard imports.
import torch

# Load data.csv (generated beforehand by the API's create-dataset endpoint)
dataset = pd.read_csv(DATA_PATH + "data.csv")

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


### Creating data loaders for pytorch

In [4]:

# Names of the dataframe columns handed to CustomDataset: the two text columns,
# the two image-identifier columns and the label column of each pair.
text_column1_name = 'title'
text_column2_name = 'title_2'
image_id_column1_name = 'img_identifier'
image_id_column2_name = 'img_identifier_2'
label_name = 'label'
image_folder = DATA_PATH + 'images'
dataset_size = len(dataset)


def _build_split(split_name):
    """Create the CustomDataset for one split ('train', 'val' or 'test').

    All splits share the same dataframe, image folder and column names; only
    the split name passed as first argument differs.
    """
    return CustomDataset(
        split_name,
        dataset_size,
        dataset,
        image_folder,
        text_column1_name,
        text_column2_name,
        image_id_column1_name,
        image_id_column2_name,
        label_name,
    )


# Create custom datasets for training, validation and testing
train_dataset = _build_split('train')
val_dataset = _build_split('val')
test_dataset = _build_split('test')

batch_size = 32

# Only the training loader is shuffled. Validation and test loaders keep a
# fixed order so that evaluation runs are repeatable; shuffling them brings
# no benefit because no gradient updates are made on those splits.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### Model and Losses initialization

Here, we employ a pretrained CNN (ResNet-18) for the image comparison task and a sentence transformer for the text comparison task.

In [5]:
# Loss functions handed to the training/validation steps
# NOTE(review): BCEWithLogitsLoss expects raw logits, while the evaluation cell
# thresholds the model output at 0.5 — confirm whether MultimodalModel already
# applies a sigmoid to its first output.
bce_loss = nn.BCEWithLogitsLoss()
cos_loss = nn.CosineEmbeddingLoss()

# Pretrained backbones: ResNet-18 for the images, MiniLM sentence transformer
# for the titles; both are wrapped by the multimodal model.
resnet18 = models.resnet18(pretrained=True)
sentence_transformer = SentenceTransformer('all-MiniLM-L12-v2').to(device)
model = MultimodalModel(sentence_transformer, resnet18).to(device)

# Adam over every parameter exposed by the multimodal model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)




This is the training loop. The validation loss is also monitored in order to perform early stopping.

In [None]:

early_stopping = EarlyStopper()

# Maximum number of epochs. The "last epoch" check below is derived from this
# value so the model is always saved when training ends; previously the check
# was hard-coded to epoch 24 while the loop only ran 2 epochs, so the
# checkpoint was never written unless early stopping fired.
num_epochs = 2

# Training loop
for epoch in range(num_epochs):

    # Perform training step
    training_loss = training_step(model, train_dataloader, bce_loss, cos_loss, optimizer, device)

    # Perform validation step (no gradients are needed for monitoring)
    with torch.no_grad():
        validation_loss = validation_step(model, val_dataloader, bce_loss, cos_loss, device)

    # Log before the stopping check so the final epoch is reported as well
    print("Epoch {}, training loss: {:.4f}, validation loss: {:.4f}".format(epoch + 1, training_loss, validation_loss))

    # Save the weights and stop when the early-stopping criterion is met or
    # when the last scheduled epoch has finished
    if early_stopping.early_stop(validation_loss) or epoch == num_epochs - 1:
        torch.save(model.state_dict(), MODEL_PATH + 'model.pth')
        break

### Testing of the model

In [20]:



# Evaluation loop: collect the model's first output and the labels for every
# batch of the test loader, then score thresholded predictions.
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        text_data1, text_data2 = batch['text_data1'], batch['text_data2']
        image_data1, image_data2 = batch['image_data1'], batch['image_data2']
        labels = batch['labels']

        # Move image tensors to the device (text inputs are passed as-is)
        image_data1, image_data2 = image_data1.to(device), image_data2.to(device)

        # Only the first of the five model outputs is used for scoring
        outputs, _, _, _, _ = model(image_data1, image_data2, text_data1, text_data2)

        # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a
        # final batch of size 1 into a 0-d tensor, and extending a list with a
        # 0-d array raises "TypeError: iteration over a 0-d array".
        predictions.extend(outputs.reshape(-1).cpu().numpy())
        true_labels.extend(labels.reshape(-1).cpu().numpy())

# Convert predictions to binary (0 or 1) based on a threshold
# NOTE(review): training uses BCEWithLogitsLoss; if `outputs` are raw logits a
# sigmoid (or a threshold of 0) would be needed here — confirm against model.py.
threshold = 0.5
binary_predictions = (np.array(predictions) > threshold).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, binary_predictions)
precision = precision_score(true_labels, binary_predictions)
recall = recall_score(true_labels, binary_predictions)
f1 = f1_score(true_labels, binary_predictions)

print('accuracy :', accuracy)
print('precision :', precision)
print('recall :', recall)
print('f1-score :', f1)

tensor([0.2405, 0.2267, 0.2812, 0.2427, 0.6125, 0.2293, 0.2854, 0.3708, 0.2471,
        0.3162, 0.2806, 0.2455, 0.2544, 0.3136, 0.3573, 0.2699, 0.4194, 0.3360,
        0.2707, 0.4497, 0.2935, 0.2501, 0.3744, 0.3665, 0.2619, 0.4047, 0.4158,
        0.3473, 0.3250, 0.3562, 0.2485, 0.2708])
tensor([0.2242, 0.2625, 0.3409, 0.3651, 0.2658, 0.2801, 0.2289, 0.3217, 0.2851,
        0.3087, 0.3191, 0.4320, 0.2972, 0.3563, 0.3343, 0.2853, 0.2296, 0.2938,
        0.4083, 0.2601, 0.3213, 0.3969, 0.2897, 0.4425, 0.1655, 0.2699, 0.2318,
        0.3364, 0.3040, 0.2638, 0.2555, 0.3425])
tensor([0.3464, 0.2440, 0.3147, 0.2175, 0.4190, 0.3552, 0.2354, 0.2633, 0.3691,
        0.1871, 0.3152, 0.2830, 0.2712, 0.2890, 0.2570, 0.3109, 0.2420, 0.2477,
        0.3286, 0.3094, 0.3340, 0.2455, 0.2951, 0.3606, 0.3950, 0.2731, 0.2346,
        0.2142, 0.2349, 0.3028, 0.2480, 0.3102])
tensor([0.3323, 0.3109, 0.2464, 0.2187, 0.3518, 0.2833, 0.4232, 0.3239, 0.2731,
        0.3592, 0.3002, 0.2918, 0.3251, 0.3179, 0.313

KeyboardInterrupt: 