# 1. Overview
Siamese networks consist of two identical sub-networks that share weights and learn to compute the similarity between two input samples. The goal is to learn embeddings such that similar inputs are close in the embedding space, while dissimilar inputs are far apart. 
For the WikiDiverse dataset, where we have image-caption pairs, we can build a Siamese network that processes text and image data (or just one modality like text or image) and learns to compute similarity between two entities from the knowledge base. 

# 2. Required Libraries

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers

2024-12-18 10:48:03.224162: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-18 10:48:03.598089: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 3. Data Preprocessing
Assuming you have already collected the text and image data, both modalities must be encoded before they can be fed to the Siamese network. First tokenize the text and pad it to a fixed length; then load, resize, and normalize the images so that a pretrained ResNet50 model (for example) can extract features from them.

# Text Processing

In [None]:
# Text preprocessing: turn raw strings into fixed-length integer sequences.
# `text_data` must already hold the list of text samples.

# Every sequence is padded or truncated to this many tokens.
max_sequence_length = 100

# Fit the vocabulary on the corpus; num_words=10000 limits the word indices
# produced below to the most frequent words.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(text_data)

# Map each text to its list of word indices.
text_sequences = tokenizer.texts_to_sequences(text_data)

# Bring all sequences to a uniform length of `max_sequence_length`.
text_input = pad_sequences(sequences=text_sequences, maxlen=max_sequence_length)

# Image Processing

In [None]:
# Example of image preprocessing using ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input

def preprocess_image(img_path):
    """Load one image file and prepare it as a single-image ResNet50 batch.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The image resized to 224x224, converted to an array, given a leading
        batch axis of size 1, and passed through ResNet50's `preprocess_input`.
    """
    loaded = image.load_img(img_path, target_size=(224, 224))
    # Prepend the batch axis so the result can be fed to the model directly.
    batch = image.img_to_array(loaded)[np.newaxis, ...]
    return preprocess_input(batch)

# preprocess_image returns one (1, 224, 224, 3) batch per image, so join the
# batches along axis 0 to obtain (N, 224, 224, 3). Wrapping them in np.array
# would instead stack them into (N, 1, 224, 224, 3), which does not match the
# (224, 224, 3) per-sample input shape of the image model.
# `image_data` must contain at least one path.
image_input = np.concatenate([preprocess_image(img_path) for img_path in image_data], axis=0)

# 4. Building the Siamese Network

In [None]:
# Image sub-network: ResNet50 backbone followed by a 256-d projection
def create_image_model(input_shape=(224, 224, 3)):
    """Build the image encoder.

    Args:
        input_shape: Shape of one input image, without the batch axis.

    Returns:
        A Keras model mapping an image to a 256-dimensional ReLU embedding.
    """
    # ImageNet-pretrained backbone without its classification head.
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    # Collapse the spatial feature map to one vector per image.
    pooled = layers.GlobalAveragePooling2D()(backbone.output)
    embedding = layers.Dense(256, activation='relu')(pooled)
    return models.Model(inputs=backbone.input, outputs=embedding)

# Text sub-network (using LSTM for sequence processing)
def create_text_model(input_shape=(max_sequence_length,)):
    """Build the text encoder.

    Args:
        input_shape: Shape of one padded token-index sequence, without the
            batch axis.

    Returns:
        A Keras model mapping a token-index sequence to a 256-dimensional
        ReLU embedding.
    """
    input_text = layers.Input(shape=input_shape)
    # The sequence length is taken from `input_text`. Passing
    # `input_length=max_sequence_length` would ignore a caller-supplied
    # `input_shape`, and the argument is deprecated in newer Keras releases.
    # input_dim=10000 matches the tokenizer's num_words.
    x = layers.Embedding(input_dim=10000, output_dim=128)(input_text)
    x = layers.LSTM(256)(x)
    x = layers.Dense(256, activation='relu')(x)
    return models.Model(inputs=input_text, outputs=x)

# Siamese network: the same image and text encoders are applied to both pair members
def create_siamese_network(image_shape=(224, 224, 3), text_shape=(max_sequence_length,)):
    """Build the two-branch Siamese model over (image, text) pairs.

    Args:
        image_shape: Shape of one input image, without the batch axis.
        text_shape: Shape of one padded token sequence, without the batch axis.

    Returns:
        A Keras model taking [image_1, image_2, text_1, text_2] and producing
        a single sigmoid score per pair.
    """
    # One encoder per modality; reusing it for both pair members shares weights.
    image_encoder = create_image_model(input_shape=image_shape)
    text_encoder = create_text_model(input_shape=text_shape)

    # Model inputs: the two images first, then the two texts.
    image_inputs = [layers.Input(shape=image_shape) for _ in range(2)]
    text_inputs = [layers.Input(shape=text_shape) for _ in range(2)]

    # Per pair member, concatenate its image embedding with its text embedding.
    fused = [
        layers.concatenate([image_encoder(img), text_encoder(txt)])
        for img, txt in zip(image_inputs, text_inputs)
    ]

    # Element-wise absolute difference between the two fused embeddings.
    abs_difference = layers.Lambda(lambda pair: tf.abs(pair[0] - pair[1]))(fused)

    # Single sigmoid unit producing the similarity score.
    score = layers.Dense(1, activation='sigmoid')(abs_difference)

    return models.Model(inputs=image_inputs + text_inputs, outputs=score)

# 5. Loss Function
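A common loss function for Siamese networks is the contrastive loss, which pulls similar pairs together and pushes dissimilar pairs apart until their distance exceeds a margin $m$: $L = y\,d^2 + (1 - y)\,\max(0,\, m - d)^2$, where $d$ is the predicted distance between the two inputs and $y = 1$ marks a similar pair ($y = 0$ a dissimilar one).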

In [None]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    """Contrastive loss over predicted pair distances.

    Args:
        y_true: Pair labels, 1 for a similar pair and 0 for a dissimilar one.
        y_pred: Predicted distance for each pair (smaller means more similar).
        margin: Distance beyond which a dissimilar pair adds no loss.

    Returns:
        Scalar tensor holding the mean loss over all elements.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Labels may arrive as integers; the arithmetic below needs matching dtypes.
    y_true = tf.cast(y_true, y_pred.dtype)

    square_pred = tf.square(y_pred)
    # sqrt(d^2 + eps) equals |d| up to eps but keeps the gradient finite at d == 0.
    distance = tf.sqrt(square_pred + 1e-6)

    similar_term = y_true * square_pred
    dissimilar_term = (1 - y_true) * tf.maximum(0.0, margin - distance) ** 2
    return tf.reduce_mean(similar_term + dissimilar_term)

# 6. Compiling the Model
Compile the model with an optimizer (e.g., Adam) and the contrastive loss.

In [None]:
siamese_model = create_siamese_network()


def similarity_contrastive_loss(y_true, y_pred):
    """Apply contrastive_loss to the network's sigmoid similarity score.

    contrastive_loss treats its prediction as a distance (small for similar
    pairs), while the network outputs a similarity in [0, 1] and the labels
    use 1 for similar pairs. Feeding `1 - y_pred` as the distance keeps the
    loss, the labels and the accuracy metric pointing in the same direction;
    passing the raw score would train similar pairs toward 0.
    """
    return contrastive_loss(y_true, 1.0 - y_pred)


siamese_model.compile(
    # `learning_rate` replaces the deprecated `lr` argument.
    optimizer=optimizers.Adam(learning_rate=0.0001),
    loss=similarity_contrastive_loss,
    metrics=['accuracy'],
)

# 7. Training
Now that the model is built, we can train it using image-caption pairs from the WikiDiverse dataset.

In [None]:
# Train on labelled pairs.
#   X_train_image1 / X_train_image2: images of the first / second pair member
#   X_train_text1 / X_train_text2: texts of the first / second pair member
#   y_train: 1 for a similar pair, 0 for a dissimilar pair
# The inputs are passed as both images first, then both texts.
train_inputs = [X_train_image1, X_train_image2, X_train_text1, X_train_text2]
siamese_model.fit(x=train_inputs, y=y_train, epochs=10, batch_size=32)


# 8. Evaluation
After training, you can evaluate the model on a test set to check its performance in similarity detection tasks.

In [None]:
# Score the model on the held-out pairs; the inputs are passed as both images
# first, then both texts.
test_inputs = [X_test_image1, X_test_image2, X_test_text1, X_test_text2]
siamese_model.evaluate(x=test_inputs, y=y_test)