# 1. Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Load the WikiDiverse dataset
dataset_path = r"C:\Users\Min Dator\aics-project\wikidiverse_dataset.csv"
data = pd.read_csv(dataset_path)

# Extract columns
image_paths1 = data['image_path1'].values
image_paths2 = data['image_path2'].values
text_data1 = data['text1'].values
text_data2 = data['text2'].values
labels = data['label'].values

# Tokenize and pad text
max_sequence_length = 100
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(np.concatenate([text_data1, text_data2]))

text_sequences1 = tokenizer.texts_to_sequences(text_data1)
text_sequences2 = tokenizer.texts_to_sequences(text_data2)

text_input1 = pad_sequences(text_sequences1, maxlen=max_sequence_length)
text_input2 = pad_sequences(text_sequences2, maxlen=max_sequence_length)

# Image preprocessing function
def preprocess_image(img_path):
    try:
        img = load_img(img_path, target_size=(224, 224))
        img_array = img_to_array(img)
        return preprocess_input(img_array)
    except Exception as e:
        print(f"Error loading image {img_path}: {e}")
        return np.zeros((224, 224, 3))  # Fallback for missing images

image_input1 = np.array([preprocess_image(img) for img in image_paths1])
image_input2 = np.array([preprocess_image(img) for img in image_paths2])

# Split data into training and testing sets
X_train_image1, X_test_image1, X_train_image2, X_test_image2, \
X_train_text1, X_test_text1, X_train_text2, X_test_text2, \
y_train, y_test = train_test_split(
    image_input1, image_input2, text_input1, text_input2, labels, test_size=0.2, random_state=42
)

print("Data preprocessing completed.")

2025-01-02 08:07:00.289982: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-02 08:07:04.571938: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Error loading image path_to_image_1.jpg: [Errno 2] No such file or directory: 'path_to_image_1.jpg'
Error loading image path_to_image_2.jpg: [Errno 2] No such file or directory: 'path_to_image_2.jpg'
Error loading image path_to_image_3.jpg: [Errno 2] No such file or directory: 'path_to_image_3.jpg'
Error loading image path_to_image_4.jpg: [Errno 2] No such file or directory: 'path_to_image_4.jpg'
Error loading image path_to_image_5.jpg: [Errno 2] No such file or directory: 'path_to_image_5.jpg'
Error loading image path_to_image_6.jpg: [Errno 2] No such file or directory: 'path_to_image_6.jpg'
Error loading image path_to_image_7.jpg: [Errno 2] No such file or directory: 'path_to_image_7.jpg'
Error loading image path_to_image_8.jpg: [Errno 2] No such file or directory: 'path_to_image_8.jpg'
Error loading image path_to_image_9.jpg: [Errno 2] No such file or directory: 'path_to_image_9.jpg'
Error loading image path_to_image_10.jpg: [Errno 2] No such file or directory: 'path_to_image_10.jpg

# 2. Model Architecture

In [3]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50

# Image Sub-Network
def create_image_model(input_shape=(224, 224, 3)):
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    x = layers.GlobalAveragePooling2D()(base_model.output)
    x = layers.Dense(256, activation='relu')(x)
    return models.Model(inputs=base_model.input, outputs=x)

# Text Sub-Network
def create_text_model(input_shape=(max_sequence_length,)):
    input_text = layers.Input(shape=input_shape)
    x = layers.Embedding(input_dim=10000, output_dim=128, input_length=max_sequence_length)(input_text)
    x = layers.LSTM(256, return_sequences=True)(x)
    x = layers.GlobalMaxPooling1D()(x)  # Global max pooling for compact representation
    x = layers.Dense(256, activation='relu')(x)
    return models.Model(inputs=input_text, outputs=x)

# Cross-Attention Layer
def cross_attention_layer(image_embedding, text_embedding):
    """
    Cross-attention mechanism where text guides the focus on the image embedding.
    """
    text_expanded = tf.expand_dims(text_embedding, axis=1)  # Expand for broadcasting
    image_expanded = tf.expand_dims(image_embedding, axis=2)
    
    # Attention scores
    attention_scores = tf.matmul(image_expanded, text_expanded)  # Shape: (batch_size, image_dim, text_dim)
    attention_weights = tf.nn.softmax(attention_scores, axis=1)
    
    # Weighted sum
    attended_image = tf.matmul(attention_weights, text_embedding)
    return layers.Flatten()(attended_image)

# Siamese Network
def create_siamese_network(image_shape=(224, 224, 3), text_shape=(max_sequence_length,)):
    # Image and Text Models
    image_model = create_image_model(input_shape=image_shape)
    text_model = create_text_model(input_shape=text_shape)
    
    # Inputs
    input_image1 = layers.Input(shape=image_shape)
    input_image2 = layers.Input(shape=image_shape)
    input_text1 = layers.Input(shape=text_shape)
    input_text2 = layers.Input(shape=text_shape)
    
    # Get Embeddings
    image_embedding1 = image_model(input_image1)
    image_embedding2 = image_model(input_image2)
    text_embedding1 = text_model(input_text1)
    text_embedding2 = text_model(input_text2)
    
    # Apply Cross-Attention
    cross_embedding1 = cross_attention_layer(image_embedding1, text_embedding1)
    cross_embedding2 = cross_attention_layer(image_embedding2, text_embedding2)
    
    # Combine Embeddings
    combined_embedding1 = layers.concatenate([cross_embedding1, text_embedding1])
    combined_embedding2 = layers.concatenate([cross_embedding2, text_embedding2])
    
    # Compute Similarity
    distance = layers.Lambda(lambda x: tf.abs(x[0] - x[1]))([combined_embedding1, combined_embedding2])
    output = layers.Dense(1, activation='sigmoid')(distance)
    
    # Build Model
    model = models.Model(inputs=[input_image1, input_image2, input_text1, input_text2], outputs=output)
    return model

# 3. Training and Evaluation

In [5]:
from tensorflow.keras.optimizers import Adam

# Loss Function
def contrastive_loss(y_true, y_pred, margin=1.0):
    square_pred = tf.square(y_pred)
    square_true = tf.square(y_true)
    loss = (y_true * square_pred) + ((1 - y_true) * tf.maximum(0.0, margin - tf.sqrt(square_pred + 1e-6))**2)
    return tf.reduce_mean(loss)

# Create and Compile Model
siamese_model = create_siamese_network()
siamese_model.compile(optimizer=Adam(learning_rate=1e-4), loss=contrastive_loss, metrics=['accuracy'])

# Train Model
history = siamese_model.fit(
    [X_train_image1, X_train_image2, X_train_text1, X_train_text2], y_train,
    batch_size=16, epochs=10, validation_split=0.1
)

# Evaluate Model
test_loss, test_accuracy = siamese_model.evaluate(
    [X_test_image1, X_test_image2, X_test_text1, X_test_text2], y_test
)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

2025-01-02 08:09:38.239809: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2025-01-02 08:09:51.201969: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-01-02 08:09:51.208344: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-01-02 08:09:51.210439: I tensorflow/core/com

Epoch 1/10


2025-01-02 08:09:54.293358: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-01-02 08:09:54.296759: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-01-02 08:09:54.303655: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

TypeError: in user code:

    File "/home/dawitj/anaconda3/lib/python3.11/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/tmp/ipykernel_45302/2090342518.py", line 7, in contrastive_loss  *
        loss = (y_true * square_pred) + ((1 - y_true) * tf.maximum(0.0, margin - tf.sqrt(square_pred + 1e-6))**2)

    TypeError: Input 'y' of 'Mul' Op has type float32 that does not match type int64 of argument 'x'.
