The WikiDiverse dataset offers a comprehensive and rich set of multimodal information for entity linking and related tasks. Below are steps and strategies for effectively using the dataset in your project.
Dataset Structure
Passage Level:

Contains an image URL, a textual description (passage), and annotated entities with:
Entity mention text.
Entity type (e.g., organization, person).
Mention positions in the passage.
Wikipedia URL for the entity.
Mention Level:

Focuses on specific mentions within a sentence, with additional context:
Left and right contexts.
Mention type and candidates.
Topic category.
Entity Level:

Provides Wikipedia-based descriptions, images, and entity-level annotations.

# 1. Image Preprocessing
   * Images are retrieved from the URLs provided in the dataset.
   Fetching Images: Use the hashing function provided in the dataset documentation to locate the
   images in your local storage or download them.

In [31]:
import hashlib
import re
import os

def get_image_path(url, local_dir="path_to_wikinewsImgs"):
    img_name = url.split('/')[-1]
    prefix = hashlib.md5(img_name.encode()).hexdigest()
    suffix = re.sub(r'(\S+(?=\.(jpg|JPG|png|PNG|svg|SVG)))|(\S+(?=\.(jpeg|JPEG)))', '', img_name)
    file_name = prefix + suffix
    file_name = file_name.replace('.svg', '.png').replace('.SVG', '.png')
    return os.path.join(local_dir, file_name)

# Example usage
url = "https://upload.wikimedia.org/wikipedia/commons/0/06/DetroitLionsRunningPlay-2007.jpg"
local_path = get_image_path(url)
print(local_path)

path_to_wikinewsImgs/062ce5e341a566a4208d801e53557538.jpg


In [39]:
import os
import hashlib
import re
import requests
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Function to generate the file path
def get_image_path(url, local_dir="path_to_wikinewsImgs"):
    os.makedirs(local_dir, exist_ok=True)  # Create directory if it doesn't exist
    img_name = url.split('/')[-1]
    prefix = hashlib.md5(img_name.encode()).hexdigest()
    suffix = re.sub(r'(\S+(?=\.(jpg|JPG|png|PNG|svg|SVG)))|(\S+(?=\.(jpeg|JPEG)))', '', img_name)
    file_name = prefix + suffix
    file_name = file_name.replace('.svg', '.png').replace('.SVG', '.png')
    return os.path.join(local_dir, file_name)

# Function to preprocess image
def preprocess_image(img_path, target_size=(224, 224)):
    img = load_img(img_path, target_size=target_size)
    img_array = img_to_array(img)
    img_array = preprocess_input(img_array)
    return img_array

# Example usage
url = "https://upload.wikimedia.org/wikipedia/commons/0/06/DetroitLionsRunningPlay-2007.jpg"
local_path = get_image_path(url)

# Step 2: Download the image
if not os.path.exists(local_path):  # Check if the file already exists
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(local_path, 'wb') as f:
            f.write(response.content)
        print(f"Image saved at {local_path}")
    else:
        print(f"Failed to download image. Status code: {response.status_code}")
else:
    print(f"Image already exists at {local_path}")

# Step 3: Preprocess the image
try:
    img_array = preprocess_image(local_path)
    print(f"Preprocessed image shape: {img_array.shape}")
except FileNotFoundError as e:
    print(f"Error: {e}")

Image already exists at path_to_wikinewsImgs/062ce5e341a566a4208d801e53557538.jpg
Preprocessed image shape: (224, 224, 3)


# Preprocessing Images: Convert the images into tensors, resize them to a fixed size, and normalize them.

In [35]:
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def preprocess_image(img_path, target_size=(224, 224)):
    img = load_img(img_path, target_size=target_size)
    img_array = img_to_array(img)
    img_array = preprocess_input(img_array)
    return img_array

# 2. Text Preprocessing
   Textual information includes both the passage and mention-level details. Use tokenization and
   padding for consistency.

In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_text(texts, max_len=100, vocab_size=10000):
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<UNK>")
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
    return padded_sequences, tokenizer

# 3.  Dataset Parsing
    Parse the dataset and split it into inputs for training.

In [21]:
import json

def parse_passage_level(data_path):
    with open(data_path, 'r') as f:
        data = json.load(f)

    images, texts, entities = [], [], []
    for item in data:
        text = item[0]
        image_url = item[1]
        entity_annotations = item[3]

        images.append(get_image_path(image_url))
        texts.append(text)
        entities.append(entity_annotations)

    return images, texts, entities

# For mention-level data:

In [23]:
def parse_mention_level(data_path):
    with open(data_path, 'r') as f:
        data = json.load(f)

    mentions, contexts, topics = [], [], []
    for item in data:
        text = item[0]
        mention = item[2]
        left_context = item[4]
        right_context = item[5]
        topic = item[8]

        contexts.append((left_context, text, right_context))
        mentions.append(mention)
        topics.append(topic)

    return contexts, mentions, topics

# 4. Pair Generation for Training
   Generate positive and negative pairs of inputs for the Siamese network.

In [25]:
import random

def generate_pairs(images, texts, labels, num_pairs=1000):
    pairs_image1, pairs_image2 = [], []
    pairs_text1, pairs_text2 = [], []
    pair_labels = []

    for _ in range(num_pairs):
        # Positive pair
        idx = random.randint(0, len(images) - 1)
        pairs_image1.append(images[idx])
        pairs_text1.append(texts[idx])
        pairs_image2.append(images[idx])
        pairs_text2.append(texts[idx])
        pair_labels.append(1)

        # Negative pair
        idx1, idx2 = random.sample(range(len(images)), 2)
        pairs_image1.append(images[idx1])
        pairs_text1.append(texts[idx1])
        pairs_image2.append(images[idx2])
        pairs_text2.append(texts[idx2])
        pair_labels.append(0)

    return pairs_image1, pairs_image2, pairs_text1, pairs_text2, pair_labels

# Training the Siamese Network: Use the generated pairs to train the model.

In [None]:
# Assuming we have preprocessed image and text inputs
X_image1, X_image2 = preprocess_images(pairs_image1), preprocess_images(pairs_image2)
X_text1, X_text2 = preprocess_texts(pairs_text1), preprocess_texts(pairs_text2)

siamese_model.fit(
    [X_image1, X_image2, X_text1, X_text2], 
    pair_labels, 
    batch_size=32, 
    epochs=10
)

In [45]:
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Flatten, Dropout, Conv2D, MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Lambda
import tensorflow as tf

# Image Branch
def build_image_branch(input_shape=(224, 224, 3)):
    base_model = VGG16(include_top=False, weights='imagenet', input_shape=input_shape)
    base_model.trainable = False  # Freeze pre-trained layers

    x = Flatten()(base_model.output)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(128, activation='relu')(x)

    return Model(base_model.input, x, name="ImageBranch")

# Text Branch
def build_text_branch(vocab_size, max_len, embedding_dim=128):
    text_input = Input(shape=(max_len,), name="TextInput")
    x = Embedding(vocab_size, embedding_dim, input_length=max_len)(text_input)
    x = LSTM(128, return_sequences=False)(x)
    x = Dense(128, activation='relu')(x)
    return Model(text_input, x, name="TextBranch")

# Similarity Layer
def cosine_similarity(vectors):
    x, y = vectors
    x = tf.math.l2_normalize(x, axis=1)
    y = tf.math.l2_normalize(y, axis=1)
    return tf.reduce_sum(x * y, axis=1, keepdims=True)

def euclidean_distance(vectors):
    x, y = vectors
    return tf.sqrt(tf.reduce_sum(tf.square(x - y), axis=1, keepdims=True))

# Full Siamese Network
def build_siamese_network(vocab_size, max_len, input_shape=(224, 224, 3)):
    # Inputs
    img1_input = Input(shape=input_shape, name="Image1Input")
    img2_input = Input(shape=input_shape, name="Image2Input")
    text1_input = Input(shape=(max_len,), name="Text1Input")
    text2_input = Input(shape=(max_len,), name="Text2Input")

    # Branches
    image_branch = build_image_branch(input_shape)
    text_branch = build_text_branch(vocab_size, max_len)

    # Extract Features
    img1_features = image_branch(img1_input)
    img2_features = image_branch(img2_input)
    text1_features = text_branch(text1_input)
    text2_features = text_branch(text2_input)

    # Concatenate Features
    combined_features1 = tf.concat([img1_features, text1_features], axis=-1)
    combined_features2 = tf.concat([img2_features, text2_features], axis=-1)

    # Distance Calculation
    similarity = Lambda(cosine_similarity, name="CosineSimilarity")([combined_features1, combined_features2])

    # Final Model
    model = Model(inputs=[img1_input, img2_input, text1_input, text2_input], outputs=similarity, name="SiameseNetwork")
    return model

# The recised version

# 1. Dataset Preprocessing

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

# Create a sample dataset
data = {
    "image_path1": ["path_to_image1.jpg", "path_to_image2.jpg"],
    "image_path2": ["path_to_image3.jpg", "path_to_image4.jpg"],
    "text1": ["This is a sample text 1", "Another example text 1"],
    "text2": ["This is a sample text 2", "Another example text 2"],
    "label": [1, 0]
}

df = pd.DataFrame(data)

# Save the dataset
df.to_csv(r"C:\Users\Min Dator\aics-project\wikidiverse_dataset.csv", index=False)

print("Sample dataset created.")


# Load WikiDiverse Dataset
# Assuming the dataset has columns: ['image_path1', 'image_path2', 'text1', 'text2', 'label']
data = pd.read_csv(r"C:\Users\Min Dator\aics-project\wikidiverse_dataset.csv")


# Extract columns
image_paths1 = data['image_path1'].values
image_paths2 = data['image_path2'].values
text_data1 = data['text1'].values
text_data2 = data['text2'].values
labels = data['label'].values

# Tokenize and pad text
max_sequence_length = 100
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(np.concatenate([text_data1, text_data2]))

text_sequences1 = tokenizer.texts_to_sequences(text_data1)
text_sequences2 = tokenizer.texts_to_sequences(text_data2)

text_input1 = pad_sequences(text_sequences1, maxlen=max_sequence_length)
text_input2 = pad_sequences(text_sequences2, maxlen=max_sequence_length)

# Preprocess images
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def preprocess_image(img_path):
    img = load_img(img_path, target_size=(224, 224))
    img_array = img_to_array(img)
    return preprocess_input(img_array)

image_input1 = np.array([preprocess_image(img) for img in image_paths1])
image_input2 = np.array([preprocess_image(img) for img in image_paths2])

# Train-Test Split
X_train_image1, X_test_image1, X_train_image2, X_test_image2, \
X_train_text1, X_test_text1, X_train_text2, X_test_text2, \
y_train, y_test = train_test_split(
    image_input1, image_input2, text_input1, text_input2, labels, test_size=0.2, random_state=42
)

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications.resnet50 import preprocess_input

# Create a sample dataset
data = {
    "image_path1": ["path_to_image1.jpg", "path_to_image2.jpg"],
    "image_path2": ["path_to_image3.jpg", "path_to_image4.jpg"],
    "text1": ["This is a sample text 1", "Another example text 1"],
    "text2": ["This is a sample text 2", "Another example text 2"],
    "label": [1, 0]
}

df = pd.DataFrame(data)

# Save the dataset
dataset_path = r"C:\Users\Min Dator\aics-project\wikidiverse_dataset.csv"
df.to_csv(dataset_path, index=False)
print("Sample dataset created.")

# Load WikiDiverse Dataset
data = pd.read_csv(dataset_path)

# Extract columns
image_paths1 = data['image_path1'].values
image_paths2 = data['image_path2'].values
text_data1 = data['text1'].values
text_data2 = data['text2'].values
labels = data['label'].values

# Tokenize and pad text
max_sequence_length = 100
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(np.concatenate([text_data1, text_data2]))

text_sequences1 = tokenizer.texts_to_sequences(text_data1)
text_sequences2 = tokenizer.texts_to_sequences(text_data2)

text_input1 = pad_sequences(text_sequences1, maxlen=max_sequence_length)
text_input2 = pad_sequences(text_sequences2, maxlen=max_sequence_length)

# Generate random image arrays for testing
def generate_random_image_array():
    return preprocess_input(np.random.rand(224, 224, 3).astype(np.float32) * 255)

image_input1 = np.array([generate_random_image_array() for _ in image_paths1])
image_input2 = np.array([generate_random_image_array() for _ in image_paths2])

# Train-Test Split
X_train_image1, X_test_image1, X_train_image2, X_test_image2, \
X_train_text1, X_test_text1, X_train_text2, X_test_text2, \
y_train, y_test = train_test_split(
    image_input1, image_input2, text_input1, text_input2, labels, test_size=0.5, random_state=42
)

print("Preprocessing completed. Dataset is ready for training.")

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Expanded dataset creation
data = {
    "image_path1": [f"path_to_image_{i}.jpg" for i in range(1, 21)],
    "image_path2": [f"path_to_image_{i+20}.jpg" for i in range(1, 21)],
    "text1": [f"This is a sample text {i}" for i in range(1, 21)],
    "text2": [f"This is another sample text {i}" for i in range(1, 21)],
    "label": [1 if i % 2 == 0 else 0 for i in range(1, 21)]  # Alternating labels
}

df = pd.DataFrame(data)
dataset_path = r"C:\Users\Min Dator\aics-project\wikidiverse_dataset.csv"
df.to_csv(dataset_path, index=False)
print("Dataset saved to:", dataset_path)

# Load the dataset
data = pd.read_csv(dataset_path)

# Extract columns
image_paths1 = data['image_path1'].values
image_paths2 = data['image_path2'].values
text_data1 = data['text1'].values
text_data2 = data['text2'].values
labels = data['label'].values

# Tokenize and pad text
max_sequence_length = 100
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(np.concatenate([text_data1, text_data2]))

text_sequences1 = tokenizer.texts_to_sequences(text_data1)
text_sequences2 = tokenizer.texts_to_sequences(text_data2)

text_input1 = pad_sequences(text_sequences1, maxlen=max_sequence_length)
text_input2 = pad_sequences(text_sequences2, maxlen=max_sequence_length)

# Image preprocessing function
def preprocess_image(img_path):
    # Replace this placeholder logic with real image file handling if available
    img = np.random.rand(224, 224, 3)  # Simulate random image
    img_array = img_to_array(img)
    return preprocess_input(img_array)

image_input1 = np.array([preprocess_image(img) for img in image_paths1])
image_input2 = np.array([preprocess_image(img) for img in image_paths2])

# Split data into training and testing sets
X_train_image1, X_test_image1, X_train_image2, X_test_image2, \
X_train_text1, X_test_text1, X_train_text2, X_test_text2, \
y_train, y_test = train_test_split(
    image_input1, image_input2, text_input1, text_input2, labels, test_size=0.2, random_state=42
)

print("Preprocessing complete. Training and test data prepared.")

# 2. Model Architecture

In [53]:
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50

# Image Sub-Network
def create_image_model(input_shape=(224, 224, 3)):
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    x = layers.GlobalAveragePooling2D()(base_model.output)
    x = layers.Dense(256, activation='relu')(x)
    return models.Model(inputs=base_model.input, outputs=x)

# Text Sub-Network
def create_text_model(input_shape=(max_sequence_length,)):
    input_text = layers.Input(shape=input_shape)
    x = layers.Embedding(input_dim=10000, output_dim=128, input_length=max_sequence_length)(input_text)
    x = layers.LSTM(256, return_sequences=True)(x)
    x = layers.GlobalMaxPooling1D()(x)  # Global max pooling for compact representation
    x = layers.Dense(256, activation='relu')(x)
    return models.Model(inputs=input_text, outputs=x)

# Cross-Attention Layer
def cross_attention_layer(image_embedding, text_embedding):
    """
    Cross-attention mechanism where text guides the focus on the image embedding.
    """
    text_expanded = tf.expand_dims(text_embedding, axis=1)  # Expand for broadcasting
    image_expanded = tf.expand_dims(image_embedding, axis=2)
    
    # Attention scores
    attention_scores = tf.matmul(image_expanded, text_expanded)  # Shape: (batch_size, image_dim, text_dim)
    attention_weights = tf.nn.softmax(attention_scores, axis=1)
    
    # Weighted sum
    attended_image = tf.matmul(attention_weights, text_embedding)
    return layers.Flatten()(attended_image)

# Siamese Network
def create_siamese_network(image_shape=(224, 224, 3), text_shape=(max_sequence_length,)):
    # Image and Text Models
    image_model = create_image_model(input_shape=image_shape)
    text_model = create_text_model(input_shape=text_shape)
    
    # Inputs
    input_image1 = layers.Input(shape=image_shape)
    input_image2 = layers.Input(shape=image_shape)
    input_text1 = layers.Input(shape=text_shape)
    input_text2 = layers.Input(shape=text_shape)
    
    # Get Embeddings
    image_embedding1 = image_model(input_image1)
    image_embedding2 = image_model(input_image2)
    text_embedding1 = text_model(input_text1)
    text_embedding2 = text_model(input_text2)
    
    # Apply Cross-Attention
    cross_embedding1 = cross_attention_layer(image_embedding1, text_embedding1)
    cross_embedding2 = cross_attention_layer(image_embedding2, text_embedding2)
    
    # Combine Embeddings
    combined_embedding1 = layers.concatenate([cross_embedding1, text_embedding1])
    combined_embedding2 = layers.concatenate([cross_embedding2, text_embedding2])
    
    # Compute Similarity
    distance = layers.Lambda(lambda x: tf.abs(x[0] - x[1]))([combined_embedding1, combined_embedding2])
    output = layers.Dense(1, activation='sigmoid')(distance)
    
    # Build Model
    model = models.Model(inputs=[input_image1, input_image2, input_text1, input_text2], outputs=output)
    return model

# 3. Training and Evaluation

In [55]:
from tensorflow.keras.optimizers import Adam

# Loss Function
def contrastive_loss(y_true, y_pred, margin=1.0):
    square_pred = tf.square(y_pred)
    square_true = tf.square(y_true)
    loss = (y_true * square_pred) + ((1 - y_true) * tf.maximum(0.0, margin - tf.sqrt(square_pred + 1e-6))**2)
    return tf.reduce_mean(loss)

# Create and Compile Model
siamese_model = create_siamese_network()
siamese_model.compile(optimizer=Adam(learning_rate=1e-4), loss=contrastive_loss, metrics=['accuracy'])

# Train Model
history = siamese_model.fit(
    [X_train_image1, X_train_image2, X_train_text1, X_train_text2], y_train,
    batch_size=32, epochs=10, validation_split=0.1
)

# Evaluate Model
test_loss, test_accuracy = siamese_model.evaluate(
    [X_test_image1, X_test_image2, X_test_text1, X_test_text2], y_test
)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

2025-01-02 07:28:08.023787: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


2025-01-02 07:28:16.122640: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-01-02 07:28:16.126308: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-01-02 07:28:16.128411: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

ValueError: Training data contains 1 samples, which is not sufficient to split it into a validation and training set as specified by `validation_split=0.1`. Either provide more data, or a different value for the `validation_split` argument.