# An overview
Siamese networks consist of two identical sub-networks that share weights and learn to compute the similarity between two input samples. The goal is to learn embeddings such that similar inputs are close in the embedding space, while dissimilar inputs are far apart. For the WikiDiverse dataset, where we have image-caption pairs, we can build a Siamese network that processes text and image data (or just one modality like text or image) and learns to compute similarity between two entities from the knowledge base.
* Siamese Network Structure: Two identical sub-networks that compute embeddings for input pairs and learn their similarity
* Application: For WikiDiverse, compute similarity between image-caption pairs to link knowledge-base entities.

In [27]:
# Required Libraries
import os
import json
import hashlib
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import BertTokenizer
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, default_collate
from torchvision.transforms import Compose
from torchvision.transforms import Compose, ColorJitter

In [28]:
#-----------------------------------
# 1. Dataset Path and Hyperparameter
#-----------------------------------
DATASET_PATH = r'C:\Users\Min Dator\aics-project\wikidiverse_w_cands\wikidiverse_w_cands'
IMAGE_DIR = os.path.join(DATASET_PATH, "wikinewsImgs")
JSON_PATH = os.path.join(DATASET_PATH, "train_w_10cands.json")

BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 0.0001
EMBED_DIM = 256
VOCAB_SIZE = 10000

In [29]:
#-------------------------------
# 2. Helper Functions
#-------------------------------
# Helper Functions to generate the local image path:
def get_image_path(url, img_dir):
    filename = url.split('/')[-1]
    prefix = hashlib.md5(filename.encode()).hexdigest()
    suffix = re.sub(r'(\S+(?=\.(jpg|jpeg|png|svg)))', '', filename, flags=re.IGNORECASE)
    #suffix = filename.replace('.svg', '.png') if filename.lower().endswith('.svg') else filename
    local_path = os.path.join(img_dir, f"{prefix}{suffix}".replace('.svg', '.png'))
    return local_path

# Load and preprocess the dataset
def load_dataset(json_path, img_dir):
    with open(json_path, 'r') as f:
        data = json.load(f)

    processed_data = []
    for item in data:
        # Extract fields from the dataset
        sentence = item[0]
        img_url = item[1]
        mention = item[2]
        mention_type = item[3]
        left_context = item[4]
        right_context = item[5]
        entity_url = item[6]

        # Generate the local image path
        img_path = get_image_path(img_url, img_dir)

        # Add processed data
        processed_data.append({
            'sentence': sentence,
            'mention': mention,
            'mention_type': mention_type,
            'left_context': left_context,
            'right_context': right_context,
            'entity_url': entity_url,
            'img_path': img_path
        })
    return processed_data

#img_dir = r"C:\Users\Min Dator\aics-project\wikinewsImgs"
#json_path = r"C:\Users\Min Dator\aics-project\wikidiverse_w_cands\wikidiverse_w_cands\train_w_10cands.json"
#dataset = load_dataset(json_path, img_dir)
# Print a sample
#print(dataset[5])

In [30]:
#-----------------------------
# 3. WikiDiverse Dataset Class 
#-----------------------------
class WikiDiverseDataset(Dataset):
    def __init__(self, data, img_dir, transform=None, text_tokenizer=None):
        """
        Initialize the WikiDiverseDataset.

        Args:
            data (list): List of processed data entries.
            img_dir (str): Directory containing the images.
            transform (callable, optional): Transformations to apply to images.
            text_tokenizer (callable, optional): Tokenizer to apply to text data.
        """
        self.data = data
        self.img_dir = img_dir
        self.transform = transform
        self.text_tokenizer = text_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        try:
            img1_path = os.path.join(self.img_dir, entry['img1'])
            img2_path = os.path.join(self.img_dir, entry['img2'])
            img1 = Image.open(img1_path).convert('RGB')
            img2 = Image.open(img2_path).convert('RGB')
        except KeyError as e:
            raise ValueError(f"Missing key {e} in dataset entry at index {idx}: {entry}")

        if self.transform:
            img1 = self.transform(img1)
            img2 = self.transform(img2)

        text1 = entry['text1']
        text2 = entry['text2']
        if self.text_tokenizer:
            text1 = self.text_tokenizer(
                text1, return_tensors="pt", padding=True, truncation=True, max_length=128
            )
            text2 = self.text_tokenizer(
                text2, return_tensors="pt", padding=True, truncation=True, max_length=128
            )

        label = torch.tensor(entry['label', 0], dtype=torch.float32)

        return img1, text1, img2, text2, label


#Revision 2
# class WikiDiverseDataset:
#     def __init__(self, data, img_dir, transform=None, text_tokenizer=None):
#         self.data = data
#         self.img_dir = img_dir
#         self.transform = transform
#         self.text_tokenizer = text_tokenizer

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         entry1 = self.data[idx]
#         entry2 = self.data[np.random.randint(len(self.data))]  # Randomly pick a second entry
        
#         # Extract paths and text
#         img1_path = os.path.join(self.img_dir, entry1['img_path'])
#         img2_path = os.path.join(self.img_dir, entry2['img_path'])
#         text1 = entry1['mention']
#         text2 = entry2['mention']
        
#         # Apply transformations
#         from PIL import Image
#         img1 = self.transform(Image.open(img1_path).convert('RGB')) if self.transform else img1_path
#         img2 = self.transform(Image.open(img2_path).convert('RGB')) if self.transform else img2_path

#         # Tokenize text
#         if self.text_tokenizer:
#             text1 = self.text_tokenizer(text1, return_tensors='pt', padding=True, truncation=True, max_length=128)
#             text2 = self.text_tokenizer(text2, return_tensors='pt', padding=True, truncation=True, max_length=128)

#         # Define label (1: similar, 0: dissimilar)
#         label = 1 if entry1['mention'] == entry2['mention'] else 0

#         return img1, text1, img2, text2, torch.tensor(label, dtype=torch.float)

In [31]:
#----------------------------
# 4. Data Transformations 
#----------------------------
transform = Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(degrees=30),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [32]:
#----------------------------
# 5. Cross-Attention Mechanism
#----------------------------
class CrossAttention(nn.Module):
    def __init__(self, embed_dim, num_heads=4, dropout=0.1):
        super(CrossAttention, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value):
        attn_output, _ = self.multihead_attn(query, key, value)
        attn_output = self.dropout(attn_output)
        return self.layer_norm(query + attn_output)

In [33]:
#-----------------------------------
# 6. Sub-Networks for Image and Text
#-----------------------------------
class ImageSubNetworkWithAttention(nn.Module):
    def __init__(self, embed_dim=256, num_heads=4):
        super(ImageSubNetworkWithAttention, self).__init__()
        base_model = models.resnet50(pretrained=True)
        self.features = nn.Sequential(*list(base_model.children())[:-1])
        self.fc = nn.Sequential(
            nn.Linear(2048, embed_dim),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
        self.cross_attention = CrossAttention(embed_dim, num_heads)

    def forward(self, x, text_features):
        x = self.features(x).view(x.size(0), -1)
        x = self.fc(x)
        if text_features is not None:
            x = self.cross_attention(x.unsqueeze(1), text_features, text_features).squeeze(1)
        return x

class TextSubNetworkWithAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_heads=4):
        super(TextSubNetworkWithAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=0.5)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
        self.cross_attention = CrossAttention(hidden_dim, num_heads)

    def forward(self, x, image_features):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        hidden = hidden.squeeze(0)
        if image_features is not None:
            hidden = self.cross_attention(hidden.unsqueeze(1), image_features, image_features).squeeze(1)
        return self.fc(hidden)

In [34]:
#------------------------------
# 7. Siamese Network
#------------------------------
class SiameseNetworkWithCrossAttention(nn.Module):
    def __init__(self, vocab_size):
        super(SiameseNetworkWithCrossAttention, self).__init__()
        self.image_net = ImageSubNetworkWithAttention()
        self.text_net = TextSubNetworkWithAttention(vocab_size)

    def forward(self, img1, img2, text1, text2):
        img_embedding1 = self.image_net(img1, None)
        img_embedding2 = self.image_net(img2, None)

        text_embedding1 = self.text_net(text1, img_embedding1)
        text_embedding2 = self.text_net(text2, img_embedding2)

        combined_embedding1 = torch.cat([img_embedding1, text_embedding1], dim=1)
        combined_embedding2 = torch.cat([img_embedding2, text_embedding2], dim=1)

        return combined_embedding1, combined_embedding2

In [35]:
#-------------------------------
# 8. Contrastive Loss
#-------------------------------
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = torch.nn.functional.pairwise_distance(output1, output2)
        loss = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) +
                          label * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss

In [36]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score
import numpy as np

def train(model, device, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for img1, text1, img2, text2, labels in loader:
        # Move to device
        img1, img2, labels = img1.to(device), img2.to(device), labels.to(device)
        text1 = {key: val.to(device) for key, val in text1.items()}
        text2 = {key: val.to(device) for key, val in text2.items()}

        optimizer.zero_grad()
        output1, output2 = model(img1, img2, text1, text2)
        loss = criterion(output1, output2, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, device, loader):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for img1, text1, img2, text2, labels in loader:
            img1, img2, labels = img1.to(device), img2.to(device), labels.to(device)
            text1 = {key: val.to(device) for key, val in text1.items()}
            text2 = {key: val.to(device) for key, val in text2.items()}
            
            output1, output2 = model(img1, img2, text1, text2)
            similarity = F.cosine_similarity(output1, output2)
            predictions.extend(similarity.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, (np.array(predictions) > 0.5).astype(int)) * 100
    precision = precision_score(true_labels, (np.array(predictions) > 0.5).astype(int))
    
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Precision: {precision:.2f}")
    return accuracy
    print(accuracy) 

In [41]:
# Main Execution
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CosineEmbeddingLoss()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# Load dataset
dataset = load_dataset(JSON_PATH, IMAGE_DIR)
train_data, val_test_data = train_test_split(dataset, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

# Create DataLoaders
train_loader = DataLoader(WikiDiverseDataset(train_data, IMAGE_DIR, transform, text_tokenizer=tokenizer), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(WikiDiverseDataset(val_data, IMAGE_DIR, transform, text_tokenizer=tokenizer), batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(WikiDiverseDataset(test_data, IMAGE_DIR, transform, text_tokenizer=tokenizer), batch_size=BATCH_SIZE, shuffle=False)

# Initialize model, optimizer, and loss function
model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CosineEmbeddingLoss()  # or ContrastiveLoss()
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# Training and evaluation loop
# best_val_accuracy = 0
# for epoch in range(EPOCHS):
#     train_loss = train(model, device, train_loader, optimizer, criterion)
#     val_accuracy = evaluate(model, device, val_loader)
#     scheduler.step(train_loss)  

#     print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

#     # Save the best model
#     if val_accuracy > best_val_accuracy:
#         best_val_accuracy = val_accuracy
#         torch.save(model.state_dict(), "best_model.pth")

# # Test the model
# model.load_state_dict(torch.load("best_model.pth"))
# test_accuracy = evaluate(model, device, test_loader)
# print(f"Test Accuracy: {test_accuracy:.4f}")

best_val_accuracy = 0
for epoch in range(EPOCHS):
    # Train
    train_loss = train(model, device, train_loader, optimizer, criterion)
    
    # Evaluate
    val_accuracy = evaluate(model, device, val_loader)
    scheduler.step(train_loss)  # Adjust learning rate based on training loss

    # Log results
    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    # Save the best model
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), "best_model.pth")
        print(f"New best model saved with Val Accuracy: {best_val_accuracy:.4f}")

# Test the best model
model.load_state_dict(torch.load("best_model.pth"))
test_accuracy = evaluate(model, device, test_loader)
print(f"Test Accuracy: {test_accuracy:.4f}")

ValueError: Missing key 'img1' in dataset entry at index 267: {'sentence': "(From left) Harry Potter, Star Lord (from Marvel's Guardians of the Galaxy, Elsa (from Disney's Frozen) and Harley Quinn from DC/Warner Bros. Suicide Squad.", 'mention': "Marvel's", 'mention_type': 'Organization', 'left_context': ['from', 'left', 'harry', 'potter', 'star', 'lord', 'from'], 'right_context': [' ', 'guardian', 'of', 'the', 'galaxy', 'elsa', 'from', 'disney', 'frozen', 'and', 'harley', 'quinn', 'from', 'dcwarner', 'bros', 'suicide', 'squad'], 'entity_url': 'https://en.wikipedia.org/wiki/Marvel_Comics', 'img_path': 'C:\\Users\\Min Dator\\aics-project\\wikidiverse_w_cands\\wikidiverse_w_cands\\wikinewsImgs\\a29bbfb533d09c5165b3dea9ea11d881.JPG'}

In [40]:
#-----------------------------
# 9. Training and Evaluation
#-----------------------------
from torch.optim.lr_scheduler import ReduceLROnPlateau
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CosineEmbeddingLoss()

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# Define transformations for images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


def train(model, device, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for img1, text1, img2, text2, labels in loader:
        img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
        optimizer.zero_grad()
        output1, output2 = model(img1, img2, text1, text2)
        loss = criterion(output1, output2, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

#def train(model, device, loader, optimizer, criterion, scheduler):
#     model.train()
#     total_loss = 0
#     for img1, text1, img2, text2, labels in loader:
#         img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#         optimizer.zero_grad()
#         output1, output2 = model(img1, img2, text1, text2)
#         loss = criterion(output1, output2, labels)
#         loss.backward()
#         optimizer.step()
#         scheduler.step(loss)
#         total_loss += loss.item()
#     return total_loss / len(loader)

def evaluate(model, device, loader):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for img1, text1, img2, text2, labels in loader:
            img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
            output1, output2 = model(img1, img2, text1, text2)
            similarity = F.cosine_similarity(output1, output2)
            predictions.extend(similarity.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return accuracy_score(true_labels, (np.array(predictions) > 0.5).astype(int)) * 100

# Main Execution
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = load_dataset(JSON_PATH, IMAGE_DIR)
train_data, val_test_data = train_test_split(dataset, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

train_loader = DataLoader(WikiDiverseDataset(train_data, IMAGE_DIR, transform, text_tokenizer=tokenizer), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(WikiDiverseDataset(val_data, IMAGE_DIR, transform, text_tokenizer=tokenizer), batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(WikiDiverseDataset(test_data, IMAGE_DIR, transform, text_tokenizer=tokenizer), batch_size=BATCH_SIZE, shuffle=False)

model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = ContrastiveLoss()

for epoch in range(EPOCHS):
    train_loss = train(model, device, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}")
    val_accuracy = evaluate(model, device, val_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Val Accuracy: {val_accuracy:.4f}")

test_accuracy = evaluate(model, device, test_loader)
print(f"Test Accuracy: {test_accuracy:.4f}")

ValueError: Missing key 'img1' in dataset entry at index 2855: {'sentence': 'Location of Helmand Province within Afghanistan where Daniele Mastrogiacomo was taken hostage.', 'mention': 'Helmand Province', 'mention_type': 'Location', 'left_context': ['location', 'of'], 'right_context': [' ', 'within', 'afghanistan', 'where', 'daniele', 'mastrogiacomo', 'be', 'take', 'hostage'], 'entity_url': 'https://en.wikipedia.org/wiki/Helmand_Province', 'img_path': 'C:\\Users\\Min Dator\\aics-project\\wikidiverse_w_cands\\wikidiverse_w_cands\\wikinewsImgs\\6a9d4a5a25582015c8d9a65c8afe365d.png'}

In [None]:
# #--------------------
# # evaluation metrics
# #--------------------

# import numpy as np
# from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
# metrics['accuracy'] = accuracy_score(true_labels, (np.array(predictions) > 0.5).astype(int)) * 100

# def evaluate(model, device, loader):
#     model.eval()
#     predictions = []
#     true_labels = []
#     with torch.no_grad():
#         for img1, text1, img2, text2, labels in loader:
#             img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#             output1, output2 = model(img1, img2, text1, text2)
#             similarity = F.cosine_similarity(output1, output2)
#             predictions.extend(similarity.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
    
#     return np.array(predictions), np.array(true_labels)

# # After running the evaluation
# test_predictions, test_true_labels = evaluate(model, device, test_loader)

# # Print results
# print("Test Metrics:")
# print(f"Accuracy: {accuracy_score(test_true_labels, (test_predictions > 0.5).astype(int)) * 100:.4f}")
# print(f"AUC: {roc_auc_score(test_true_labels, test_predictions):.4f}")
# print(f"Precision: {precision_score(test_true_labels, (test_predictions > 0.5).astype(int)):.4f}")
# print(f"Recall: {recall_score(test_true_labels, (test_predictions > 0.5).astype(int)):.4f}")
# print(f"F1 Score: {f1_score(test_true_labels, (test_predictions > 0.5).astype(int)):.4f}")

# # Plot confusion matrix
# plot_confusion_matrix(test_true_labels, (test_predictions > 0.5).astype(int))

# # Plot ROC curve
# plot_roc_curve(test_true_labels, test_predictions)

### END ####

In [None]:
# def train(model, device, loader, optimizer, criterion):
#     model.train()
#     total_loss = 0
#     for img1, text1, img2, text2, labels in loader:
#         # Move to device
#         img1, img2, labels = img1.to(device), img2.to(device), labels.to(device)
#         text1 = {key: val.to(device) for key, val in text1.items()}
#         text2 = {key: val.to(device) for key, val in text2.items()}

#         optimizer.zero_grad()
#         output1, output2 = model(img1, img2, text1, text2)
#         loss = criterion(output1, output2, labels)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     return total_loss / len(loader)
# import numpy as np
# def evaluate(model, device, loader):
#     model.eval()
#     predictions = []
#     true_labels = []
#     with torch.no_grad():
#         for img1, text1, img2, text2, labels in loader:
#             img1, img2, labels = img1.to(device), img2.to(device), labels.to(device)
#             text1 = {key: val.to(device) for key, val in text1.items()}
#             text2 = {key: val.to(device) for key, val in text2.items()}
            
#             output1, output2 = model(img1, img2, text1, text2)
#             similarity = F.cosine_similarity(output1, output2)
#             predictions.extend(similarity.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
#     return accuracy_score(true_labels, (np.array(predictions) > 0.5).astype(int)) * 100
# print(precision_score)

In [62]:
# #-----------------------------
# # 9. Training and Evaluation
# #-----------------------------
# from torch.optim.lr_scheduler import ReduceLROnPlateau
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = nn.CosineEmbeddingLoss()

# # Initialize tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# #scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# # Define transformations for images
# transform = transforms.Compose([
#     transforms.Resize((224, 224)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])


# def train(model, device, loader, optimizer, criterion):
#     model.train()
#     total_loss = 0
#     for img1, text1, img2, text2, labels in loader:
#         img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#         optimizer.zero_grad()
#         output1, output2 = model(img1, img2, text1, text2)
#         loss = criterion(output1, output2, labels)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     return total_loss / len(loader)

# #def train(model, device, loader, optimizer, criterion, scheduler):
# #     model.train()
# #     total_loss = 0
# #     for img1, text1, img2, text2, labels in loader:
# #         img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
# #         optimizer.zero_grad()
# #         output1, output2 = model(img1, img2, text1, text2)
# #         loss = criterion(output1, output2, labels)
# #         loss.backward()
# #         optimizer.step()
# #         scheduler.step(loss)
# #         total_loss += loss.item()
# #     return total_loss / len(loader)

# def evaluate(model, device, loader):
#     model.eval()
#     predictions = []
#     true_labels = []
#     with torch.no_grad():
#         for img1, text1, img2, text2, labels in loader:
#             img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#             output1, output2 = model(img1, img2, text1, text2)
#             similarity = F.cosine_similarity(output1, output2)
#             predictions.extend(similarity.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
#     return accuracy_score(true_labels, (np.array(predictions) > 0.5).astype(int)) * 100

# # Main Execution
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# dataset = load_dataset(JSON_PATH, IMAGE_DIR)
# train_data, val_test_data = train_test_split(dataset, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

# train_loader = DataLoader(WikiDiverseDataset(train_data, IMAGE_DIR, transform, text_tokenizer=tokenizer), batch_size=BATCH_SIZE, shuffle=True)
# val_loader = DataLoader(WikiDiverseDataset(val_data, IMAGE_DIR, transform, text_tokenizer=tokenizer), batch_size=BATCH_SIZE, shuffle=False)
# test_loader = DataLoader(WikiDiverseDataset(test_data, IMAGE_DIR, transform, text_tokenizer=tokenizer), batch_size=BATCH_SIZE, shuffle=False)

# #--------
# # train_loader = DataLoader(
# #     WikiDiverseDataset(train_data, IMAGE_DIR, transform=transform, text_tokenizer=tokenizer),
# #     batch_size=BATCH_SIZE,
# #     shuffle=True,
# # )
# # val_loader = DataLoader(
# #     WikiDiverseDataset(val_data, IMAGE_DIR, transform=transform, text_tokenizer=tokenizer),
# #     batch_size=BATCH_SIZE,
# #     shuffle=False,
# # )
# # test_loader = DataLoader(
# #     WikiDiverseDataset(test_data, IMAGE_DIR, transform=transform, text_tokenizer=tokenizer),
# #     batch_size=BATCH_SIZE,
# #     shuffle=False,
# # )

# #-----------

# # # Create datasets
# # train_dataset = WikiDiverseDataset(
# #     json_path="C:/Users/Min Dator/aics-project/wikidiverse_w_cands/wikidiverse_w_cands/train_w_10cands.json",
# #     image_dir="C:/Users/Min Dator/aics-project/wikinewsImgs",
# #     tokenizer=tokenizer,
# #     transform=transform
# # )

# # valid_dataset = WikiDiverseDataset(
# #     json_path="C:/Users/Min Dator/aics-project/wikidiverse_w_cands/wikidiverse_w_cands/valid_w_10cands.json",
# #     image_dir="C:/Users/Min Dator/aics-project/wikinewsImgs",
# #     tokenizer=tokenizer,
# #     transform=transform
# # )

# # test_dataset = WikiDiverseDataset(
# #     json_path="C:/Users/Min Dator/aics-project/wikidiverse_w_cands/wikidiverse_w_cands/test_w_10cands.json",
# #     image_dir="C:/Users/Min Dator/aics-project/wikinewsImgs",
# #     tokenizer=tokenizer,
# #     transform=transform
# # )

# # # Create DataLoaders
# # train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# # valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
# # test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


# model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = ContrastiveLoss()

# for epoch in range(EPOCHS):
#     train_loss = train(model, device, train_loader, optimizer, criterion)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}")
#     val_accuracy = evaluate(model, device, valid_loader)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Val Accuracy: {val_accuracy:.4f}")

# test_accuracy = evaluate(model, device, test_loader)
# print(f"Test Accuracy: {test_accuracy:.4f}")

In [63]:
# # import numpy as np
# from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
# # metrics['accuracy'] = accuracy_score(true_labels, (np.array(predictions) > 0.5).astype(int)) * 100


# def evaluate(model, device, loader):
#     model.eval()
#     predictions = []
#     true_labels = []
#     with torch.no_grad():
#         for img1, text1, img2, text2, labels in loader:
#             img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#             output1, output2 = model(img1, img2, text1, text2)
#             similarity = F.cosine_similarity(output1, output2)
#             predictions.extend(similarity.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
    
#     return np.array(predictions), np.array(true_labels)

# # After running the evaluation
# test_predictions, test_true_labels = evaluate(model, device, test_loader)

# # Print results
# print("Test Metrics:")
# print(f"Accuracy: {accuracy_score(test_true_labels, (test_predictions > 0.5).astype(int)) * 100:.4f}")
# #print(f"AUC: {roc_auc_score(test_true_labels, test_predictions):.4f}")
# print(f"Precision: {precision_score(test_true_labels, (test_predictions > 0.5).astype(int)):.4f}")
# print(f"Recall: {recall_score(test_true_labels, (test_predictions > 0.5).astype(int)):.4f}")
# print(f"F1 Score: {f1_score(test_true_labels, (test_predictions > 0.5).astype(int)):.4f}")

# # Plot confusion matrix
# #plot_confusion_matrix(test_true_labels, (test_predictions > 0.5).astype(int))

# # Plot ROC curve
# #plot_roc_curve(test_true_labels, test_predictions)

In [64]:

# import numpy as np
# from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
# metrics['accuracy'] = accuracy_score(true_labels, (np.array(predictions) > 0.5).astype(int)) * 100


# def evaluate(model, device, loader):
#     model.eval()
#     predictions = []
#     true_labels = []
#     with torch.no_grad():
#         for img1, text1, img2, text2, labels in loader:
#             img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#             output1, output2 = model(img1, img2, text1, text2)
#             similarity = F.cosine_similarity(output1, output2)
#             predictions.extend(similarity.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
    
#     return np.array(predictions), np.array(true_labels)

# # After running the evaluation
# test_predictions, test_true_labels = evaluate(model, device, test_loader)

# # Now you can create the confusion matrix
# plot_confusion_matrix(test_true_labels, (test_predictions > 0.5).astype(int))

In [65]:
# #--------------------
# # Implement confusion matrix calculation
# #--------------------
# import numpy as np
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt

# def plot_confusion_matrix(y_true, y_pred):
#     cm = confusion_matrix(y_true, y_pred)
#     plt.figure(figsize=(10,8))
#     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
#     plt.xlabel('Predicted labels')
#     plt.ylabel('True labels')
#     plt.title('Confusion Matrix')
#     plt.show()

# # Calculate confusion matrix
# y_pred = (np.array(test_predictions) > 0.5).astype(int)
# plot_confusion_matrix(test_true_labels, y_pred)

In [625]:
# # Evaluation
# test_metrics, test_predictions, test_true_labels = evaluate(model, device, test_loader)
# print("Test Metrics:")
# for metric, value in test_metrics.items():
#     print(f"{metric}: {value:.4f}")

# # Confusion Matrix
# from sklearn.metrics import confusion_matrix
# import seaborn as sns

# def plot_confusion_matrix(y_true, y_pred):
#     cm = confusion_matrix(y_true, y_pred)
#     plt.figure(figsize=(10, 8))
#     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
#     plt.xlabel('Predicted labels')
#     plt.ylabel('True labels')
#     plt.title('Confusion Matrix')
#     plt.show()

# # Calculate and plot confusion matrix
# y_pred = (np.array(test_predictions) > 0.5).astype(int)
# plot_confusion_matrix(test_true_labels, y_pred)

In [41]:
# # -----------------------------
# # Main Execution
# # -----------------------------
# data = load_dataset(JSON_PATH, IMAGE_DIR)
# print(f"Loaded {len(data)} entries from the dataset.")

# # Initialize tokenizer (if needed)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Create dataset and dataloader
# dataset = WikiDiverseDataset(data, transform=transform, text_tokenizer=tokenizer.encode)
# dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# # Create model
# model = SiameseNetworkWithCrossAttention(VOCAB_SIZE)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# # Set up optimizer and loss function
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = ContrastiveLoss(margin=1.0)

# # Training loop
# for epoch in range(EPOCHS):
#     loss = train(model, device, dataloader, optimizer, criterion)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss:.4f}")

In [304]:
# class WikiDiverseDataset(Dataset):
#     def __init__(self, data, img_dir, transform=None, text_tokenizer=None):
#         self.data = data  # Directly set the data passed to the constructor
#         self.img_dir = img_dir
#         self.transform = transform
#         self.text_tokenizer = text_tokenizer

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         entry = self.data[idx]

#         # Load images
#         img1_path = os.path.join(self.img_dir, entry['img1'])
#         img2_path = os.path.join(self.img_dir, entry['img2'])
#         img1 = Image.open(img1_path).convert('RGB')
#         img2 = Image.open(img2_path).convert('RGB')

#         # Apply image transformations
#         if self.transform:
#             img1 = self.transform(img1)
#             img2 = self.transform(img2)

#         # Tokenize text
#         text1 = entry['text1']
#         text2 = entry['text2']
#         if self.text_tokenizer:
#             text1 = self.text_tokenizer(text1, return_tensors="pt", padding=True, truncation=True, max_length=128)
#             text2 = self.text_tokenizer(text2, return_tensors="pt", padding=True, truncation=True, max_length=128)

#         label = torch.tensor(entry['label'], dtype=torch.float32)

#         return img1, text1, img2, text2, label

In [314]:
# from torchvision import transforms

# class WikiDiverseDataset(Dataset):
#     def __init__(self, json_path, img_dir, transform=None, text_tokenizer=None):
#         self.data = []
#         self.img_dir = img_dir
#         self.transform = transform
#         self.text_tokenizer = text_tokenizer
        
#         try:
#             with open(json_path, 'r') as f:
#                 for entry in json.load(f):
#                     try:
#                         img1_path = os.path.join(img_dir, entry['image1'])
#                         img2_path = os.path.join(img_dir, entry['image2'])
                        
#                         if os.path.exists(img1_path) and os.path.exists(img2_path):
#                             self.data.append(entry)
#                     except Exception as e:
#                         print(f"Error processing entry: {e}")
#         except PermissionError:
#             print(f"Permission denied for file: {json_path}. Please ensure you have read permissions.")
        
#         print(f"Total valid entries: {len(self.data)}")

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         entry = self.data[idx]

#         # Load images
#         img1_path = os.path.join(self.img_dir, entry['image1'])
#         img2_path = os.path.join(self.img_dir, entry['image2'])
#         img1 = Image.open(img1_path).convert('RGB')
#         img2 = Image.open(img2_path).convert('RGB')

#         # Apply image transformations
#         if self.transform:
#             img1 = self.transform(img1)
#             img2 = self.transform(img2)

#         # Tokenize text
#         text1 = entry['text1']
#         text2 = entry['text2']
#         if self.text_tokenizer:
#             text1 = self.text_tokenizer(text1, return_tensors="pt", padding=True, truncation=True, max_length=128)
#             text2 = self.text_tokenizer(text2, return_tensors="pt", padding=True, truncation=True, max_length=128)

#         label = torch.tensor(entry['label'], dtype=torch.float32)

#         return img1, text1, img2, text2, label

# # Example: Define the transformation pipeline
# transform = transforms.Compose([
#     transforms.Resize((224, 224)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])

# # Now you can pass this transform to your dataset
# dataset = WikiDiverseDataset(json_path='path_to_json', img_dir='path_to_images', transform=transform)

In [360]:
# def preprocess_data(raw_data):
#     """
#     Convert list entries in the dataset to dictionaries with expected keys.
#     """
#     processed_data = []
#     for idx, entry in enumerate(raw_data):
#         if isinstance(entry, list):
#             try:
#                 # Ensure the list follows the expected structure
#                 processed_entry = {
#                     'text1': entry[0],  # Descriptive text
#                     'img1': entry[1],   # Path or URL for the first image
#                     'text2': entry[2],  # Second text (if applicable)
#                     'img2': entry[3],   # Path or URL for the second image
#                     'label': entry[-1], # Label or class
#                 }
#                 processed_data.append(processed_entry)
#             except IndexError:
#                 print(f"Skipping malformed list entry at index {idx}: {entry}")
#         elif isinstance(entry, dict):
#             # Keep valid dictionary entries
#             processed_data.append(entry)
#         else:
#             print(f"Skipping unsupported entry type at index {idx}: {entry}")
#     return processed_data


In [338]:
# class WikiDiverseDataset(Dataset):
#     def __init__(self, data, img_dir, transform=None, text_tokenizer=None):
#         """
#         Initialize the dataset with preloaded data.

#         :param data: List of dictionary entries containing dataset information.
#         :param img_dir: Path to the directory containing images.
#         :param transform: Image transformations to apply (optional).
#         :param text_tokenizer: Tokenizer function for text (optional).
#         """
#         self.data = data  # Use the preloaded data directly
#         self.img_dir = img_dir
#         self.transform = transform
#         self.text_tokenizer = text_tokenizer

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         entry = self.data[idx]

#         # Load images
#         img1_path = os.path.join(self.img_dir, entry['image1'])
#         img2_path = os.path.join(self.img_dir, entry['image2'])
#         img1 = Image.open(img1_path).convert('RGB')
#         img2 = Image.open(img2_path).convert('RGB')

#         # Apply image transformations
#         if self.transform:
#             img1 = self.transform(img1)
#             img2 = self.transform(img2)

#         # Tokenize text
#         text1 = entry['text1']
#         text2 = entry['text2']
#         if self.text_tokenizer:
#             text1 = self.text_tokenizer(
#                 text1, return_tensors="pt", padding=True, truncation=True, max_length=128
#             )
#             text2 = self.text_tokenizer(
#                 text2, return_tensors="pt", padding=True, truncation=True, max_length=128
#             )

#         label = torch.tensor(entry['label'], dtype=torch.float32)

#         return img1, text1, img2, text2, label   

In [362]:
# def load_data(json_path, img_dir, batch_size):
#     dataset = WikiDiverseDataset(json_path, img_dir)
    
#     # Split the dataset into train, validation, and test
#     train_data, val_test_data = train_test_split(dataset.data, test_size=0.2, random_state=42)
#     val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)
    
#     # Ensure split data retains correct structure (list of dictionaries)
#     def ensure_data_structure(data):
#         if isinstance(data[0], str):  # If data entries are strings, parse them as JSON
#             import json
#             return [json.loads(entry) for entry in data]
#         return data

#     train_data = ensure_data_structure(train_data)
#     val_data = ensure_data_structure(val_data)
#     test_data = ensure_data_structure(test_data)
    
#     # Create dataset objects for each set with corresponding splits
#     train_dataset = WikiDiverseDataset(json_path, img_dir)
#     train_dataset.data = train_data  # Assign parsed data to dataset
#     val_dataset = WikiDiverseDataset(json_path, img_dir)
#     val_dataset.data = val_data
#     test_dataset = WikiDiverseDataset(json_path, img_dir)
#     test_dataset.data = test_data
    
#     # Create DataLoader for each set
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
#     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
#     return train_loader, val_loader, test_loader

In [363]:
# #------------------------
# # 4. Data Transformations 
# #------------------------
# transform = Compose([
#     transforms.Resize((224, 224)),
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomVerticalFlip(),
#     transforms.RandomRotation(degrees=30),
#     ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])

In [364]:
# #---------------------------
# # 5. Cross-Attention Mechanism 
# #---------------------------
# class CrossAttention(nn.Module):
#     def __init__(self, embed_dim, num_heads=4, dropout=0.1):
#         super(CrossAttention, self).__init__()
#         self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
#         self.layer_norm = nn.LayerNorm(embed_dim)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, query, key, value):
#         attn_output, _ = self.multihead_attn(query, key, value)
#         attn_output = self.dropout(attn_output)
#         return self.layer_norm(query + attn_output)

In [365]:
# #-----------------------------------
# # 6. Sub-Networks for Image and Text
# #-----------------------------------
# class ImageSubNetworkWithAttention(nn.Module):
#     def __init__(self, embed_dim=256, num_heads=4):
#         super(ImageSubNetworkWithAttention, self).__init__()
#         base_model = models.resnet50(pretrained=True)
#         self.features = nn.Sequential(*list(base_model.children())[:-1])  # Remove FC layer
#         self.fc = nn.Sequential(
#             nn.Linear(2048, embed_dim),
#             nn.ReLU(),
#             nn.Dropout(0.5)
#         )
#         self.cross_attention = CrossAttention(embed_dim, num_heads)

#     def forward(self, x, text_features):
#         x = self.features(x).view(x.size(0), -1)  # Flatten
#         x = self.fc(x)
#         if text_features is not None:
#             x = self.cross_attention(x.unsqueeze(1), text_features, text_features).squeeze(1)
#         return x

# class TextSubNetworkWithAttention(nn.Module):
#     def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_heads=4):
#         super(TextSubNetworkWithAttention, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
#         self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=0.5)
#         self.fc = nn.Sequential(
#             nn.Linear(hidden_dim, 256),
#             nn.ReLU(),
#             nn.Dropout(0.5)
#         )
#         self.cross_attention = CrossAttention(hidden_dim, num_heads)

#     def forward(self, x, image_features):
#         x = self.embedding(x)
#         _, (hidden, _) = self.lstm(x)
#         hidden = hidden.squeeze(0)
#         if image_features is not None:
#             hidden = self.cross_attention(hidden.unsqueeze(1), image_features, image_features).squeeze(1)
#         return self.fc(hidden)

In [366]:
# #-----------------
# # 7. Siamese Network
# #----------------
# class SiameseNetworkWithCrossAttention(nn.Module):
#     def __init__(self, vocab_size):
#         super(SiameseNetworkWithCrossAttention, self).__init__()
#         self.image_net = ImageSubNetworkWithAttention()
#         self.text_net = TextSubNetworkWithAttention(vocab_size)

#     def forward(self, img1, img2, text1, text2):
#         img_embedding1 = self.image_net(img1, None)
#         img_embedding2 = self.image_net(img2, None)

#         text_embedding1 = self.text_net(text1, img_embedding1)
#         text_embedding2 = self.text_net(text2, img_embedding2)

#         combined_embedding1 = torch.cat([img_embedding1, text_embedding1], dim=1)
#         combined_embedding2 = torch.cat([img_embedding2, text_embedding2], dim=1)

#         return combined_embedding1, combined_embedding2

In [367]:
# #---------------------
# # 8. Contrastive Loss
# #---------------------
# class ContrastiveLoss(nn.Module):
#     def __init__(self, margin=1.0):
#         super(ContrastiveLoss, self).__init__()
#         self.margin = margin

#     def forward(self, output1, output2, label):
#         euclidean_distance = torch.nn.functional.pairwise_distance(output1, output2)
#         loss = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) +
#                           label * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
#         return loss

In [368]:
# #-------------------------
# # 9. Training and Evaluation
# #------------------------
# def train(model, device, loader, optimizer, criterion):
#     model.train()
#     total_loss = 0
#     total_correct = 0
#     for img1, text1, img2, text2, labels in loader:
#         img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#         optimizer.zero_grad()
#         output1, output2 = model(img1, img2, text1, text2)
#         loss = criterion(output1, output2, labels)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     return total_loss / len(loader)

In [502]:
# from sklearn.metrics import accuracy_score, roc_auc_score
# import torch.nn.functional as F

# # Training loop
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = nn.CosineEmbeddingLoss()

# def train(model, device, loader, optimizer, criterion):
#     model.train()
#     total_loss = 0
#     for batch in loader:
#         try:
#             img1, text1, img2, text2, labels = batch
#             img1, text1, img2, text2, labels = (
#                 img1.to(device),
#                 text1.to(device),
#                 img2.to(device),
#                 text2.to(device),
#                 labels.to(device),
#             )
            
#             # Forward pass
#             combined_embedding1, combined_embedding2 = model(img1, img2, text1, text2)
#             labels = labels.unsqueeze(1).float()  # Ensure labels have correct shape
            
#             loss = criterion(combined_embedding1, combined_embedding2, labels)
#             total_loss += loss.item()
            
#             # Backward pass
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#         except Exception as e:
#             print(f"Error in batch processing: {e}")
    
#     return total_loss / len(loader)

# def evaluate(model, device, loader):
#     model.eval()
#     predictions = []
#     true_labels = []
#     with torch.no_grad():
#         for batch in loader:
#             try:
#                 img1, text1, img2, text2, labels = batch
#                 img1, text1, img2, text2, labels = (
#                     img1.to(device),
#                     text1.to(device),
#                     img2.to(device),
#                     text2.to(device),
#                     labels.to(device),
#                 )
                
#                 combined_embedding1, combined_embedding2 = model(img1, img2, text1, text2)
#                 similarity = F.cosine_similarity(combined_embedding1, combined_embedding2)
                
#                 predictions.extend(similarity.cpu().numpy())
#                 true_labels.extend(labels.cpu().numpy())
#             except Exception as e:
#                 print(f"Error during evaluation: {e}")
    
#     # Compute metrics
#     accuracy = accuracy_score(true_labels, (np.array(predictions) > 0.5).astype(int)) * 100
#     auc = roc_auc_score(true_labels, predictions) if len(set(true_labels)) > 1 else None
    
#     return accuracy, auc

# # Training loop
# for epoch in range(EPOCHS):
#     train_loss = train(model, device, train_loader, optimizer, criterion)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}")
    
#     # Evaluate on validation set
#     val_accuracy, val_auc = evaluate(model, device, val_loader)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Val Accuracy: {val_accuracy:.4f}, Val AUC: {val_auc:.4f}")

# # Final evaluation on test set
# test_accuracy, test_auc = evaluate(model, device, test_loader)
# print(f"Test Accuracy: {test_accuracy:.4f}, Test AUC: {test_auc:.4f}")

In [492]:
# #9. Training and Evaluation:
# #Train the model and track performance using metrics like ROC AUC.
# # Training and Evaluation
# from sklearn.metrics import accuracy_score
# import torch.nn.functional as F
# import torch.nn.functional as F
# from torchvision.transforms import Compose, ColorJitter
# from sklearn.model_selection import train_test_split
# # # Load data

# train_loader, val_loader, test_loader = load_data(json_path, img_dir, BATCH_SIZE)

# # Training loop
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = nn.CosineEmbeddingLoss()
# #labels = labels.unsqueeze(1).float()

# def train(model, device, loader, optimizer, criterion):
#     model.train()
#     total_loss = 0
#     for batch in loader:
#         valid_batch = [sample for sample in batch if sample is not None]
#         if len(valid_batch) == 5:  # Check if all elements are valid
#             img1, text1, img2, text2, labels = valid_batch
#             img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
            
#             combined_embedding1, combined_embedding2 = model(img1, img2, text1, text2)
#             labels = labels.unsqueeze(1).float()
            
#             loss = criterion(combined_embedding1, combined_embedding2, labels)
#             total_loss += loss.item()
            
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#         else:
#             print(f"Warning: Invalid batch encountered. Skipping this iteration.")
    
#     return total_loss / len(loader)  

# def evaluate(model, device, loader):
#     model.eval()
#     predictions = []
#     true_labels = []
#     with torch.no_grad():
#         for batch in loader:
#             img1, text1, img2, text2 = batch
#             img1, text1, img2, text2 = img1.to(device), text1.to(device), img2.to(device), text2.to(device)
            
#             combined_embedding1, combined_embedding2 = model(img1, img2, text1, text2)
#             similarity = F.cosine_similarity(combined_embedding1, combined_embedding2)
#             predictions.extend(similarity.cpu().numpy())
#             true_labels.extend(batch[-1].cpu().numpy())
    
#     return accuracy_score(true_labels, (predictions > 0.5).astype(int)) * 100  

# # Training loop
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = nn.CosineEmbeddingLoss()

# for epoch in range(EPOCHS):
#     train_loss = train(model, device, train_loader, optimizer, criterion)    
#     print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}")
    
#     # Evaluate on validation set
#     val_accuracy = evaluate(model, device, val_loader)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Val Accuracy: {val_accuracy:.4f}")

# # Final evaluation on test set
# test_accuracy = evaluate(model, device, test_loader)
# print(f"Test Accuracy: {test_accuracy:.4f}")

In [493]:
# #---------------
# # Main Excution
# #---------------
# data = load_dataset(json_path, img_dir)
# print(f"Loaded {len(data)} entries from the dataset.")

# # Initialize tokenizer (if needed)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Create dataset and dataloader
# dataset = WikiDiverseDataset(data, img_dir, transform=transform, text_tokenizer=tokenizer.encode)
# dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# # Create model
# model = SiameseNetworkWithCrossAttention(VOCAB_SIZE)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# # Set up optimizer and loss function
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = ContrastiveLoss(margin=1.0)

# # Training loop
# for epoch in range(EPOCHS):
#     loss = train(model, device, dataloader, optimizer, criterion)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss:.4f}")

In [494]:
# # Main Execution
# data = load_dataset(json_path, img_dir)  # This function will load and process your data
# print(f"Loaded {len(data)} entries from the dataset.")

# # Initialize tokenizer (if needed)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Create dataset and dataloader
# dataset = WikiDiverseDataset(data, img_dir, transform=transform, text_tokenizer=tokenizer.encode)
# dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# # Create model
# model = SiameseNetworkWithCrossAttention(VOCAB_SIZE)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# # Set up optimizer and loss function
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = ContrastiveLoss(margin=1.0)

# # Training loop
# for epoch in range(EPOCHS):
#     loss = train(model, device, dataloader, optimizer, criterion)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss:.4f}")


In [None]:
#-----
# Revised 
#-------
# Overview
# Siamese networks consist of two identical sub-networks that share weights and learn to compute similarity between two inputs.
# For the WikiDiverse dataset, the goal is to learn embeddings such that similar inputs are close in embedding space, while dissimilar ones are far apart.


In [401]:
# # Imports and Required Libraries
# import os
# import json
# import hashlib
# import re
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# from torchvision import models, transforms
# from transformers import BertTokenizer
# import torch.nn.functional as F
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from PIL import Image
# from torchvision.transforms import Compose, ColorJitter

In [402]:
# # 1. Dataset Path and Hyperparameters
# 1 cccccc1

In [403]:
# 2 CCC

In [404]:
# 3 CCC

In [405]:
# # 4. Data Transformations
# transform = Compose([
#     transforms.Resize((224, 224)),
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomVerticalFlip(),
#     transforms.RandomRotation(degrees=30),
#     ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])

In [406]:
# # 5. Cross-Attention Mechanism
# class CrossAttention(nn.Module):
#     def __init__(self, embed_dim, num_heads=4, dropout=0.1):
#         super(CrossAttention, self).__init__()
#         self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
#         self.layer_norm = nn.LayerNorm(embed_dim)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, query, key, value):
#         attn_output, _ = self.multihead_attn(query, key, value)
#         attn_output = self.dropout(attn_output)
#         return self.layer_norm(query + attn_output)

In [407]:
# # 6. Sub-Networks for Image and Text
# class ImageSubNetworkWithAttention(nn.Module):
#     def __init__(self, embed_dim=256, num_heads=4):
#         super(ImageSubNetworkWithAttention, self).__init__()
#         base_model = models.resnet50(pretrained=True)
#         self.features = nn.Sequential(*list(base_model.children())[:-1])
#         self.fc = nn.Sequential(
#             nn.Linear(2048, embed_dim),
#             nn.ReLU(),
#             nn.Dropout(0.5)
#         )
#         self.cross_attention = CrossAttention(embed_dim, num_heads)

#     def forward(self, x, text_features):
#         x = self.features(x).view(x.size(0), -1)
#         x = self.fc(x)
#         if text_features is not None:
#             x = self.cross_attention(x.unsqueeze(1), text_features, text_features).squeeze(1)
#         return x

# class TextSubNetworkWithAttention(nn.Module):
#     def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_heads=4):
#         super(TextSubNetworkWithAttention, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
#         self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=0.5)
#         self.fc = nn.Sequential(
#             nn.Linear(hidden_dim, 256),
#             nn.ReLU(),
#             nn.Dropout(0.5)
#         )
#         self.cross_attention = CrossAttention(hidden_dim, num_heads)

#     def forward(self, x, image_features):
#         x = self.embedding(x)
#         _, (hidden, _) = self.lstm(x)
#         hidden = hidden.squeeze(0)
#         if image_features is not None:
#             hidden = self.cross_attention(hidden.unsqueeze(1), image_features, image_features).squeeze(1)
#         return self.fc(hidden)

In [408]:
# # 7. Siamese Network
# class SiameseNetworkWithCrossAttention(nn.Module):
#     def __init__(self, vocab_size):
#         super(SiameseNetworkWithCrossAttention, self).__init__()
#         self.image_net = ImageSubNetworkWithAttention()
#         self.text_net = TextSubNetworkWithAttention(vocab_size)

#     def forward(self, img1, img2, text1, text2):
#         img_embedding1 = self.image_net(img1, None)
#         img_embedding2 = self.image_net(img2, None)

#         text_embedding1 = self.text_net(text1, img_embedding1)
#         text_embedding2 = self.text_net(text2, img_embedding2)

#         combined_embedding1 = torch.cat([img_embedding1, text_embedding1], dim=1)
#         combined_embedding2 = torch.cat([img_embedding2, text_embedding2], dim=1)

#         return combined_embedding1, combined_embedding2


In [409]:
# # 8. Contrastive Loss
# class ContrastiveLoss(nn.Module):
#     def __init__(self, margin=1.0):
#         super(ContrastiveLoss, self).__init__()
#         self.margin = margin

#     def forward(self, output1, output2, label):
#         euclidean_distance = torch.nn.functional.pairwise_distance(output1, output2)
#         loss = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) +
#                           label * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
#         return loss

In [495]:
# # 9. Training and Evaluation
# def train(model, device, loader, optimizer, criterion):
#     model.train()
#     total_loss = 0
#     for img1, text1, img2, text2, labels in loader:
#         img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#         optimizer.zero_grad()
#         output1, output2 = model(img1, img2, text1, text2)
#         loss = criterion(output1, output2, labels)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     return total_loss / len(loader)

# def evaluate(model, device, loader):
#     model.eval()
#     predictions = []
#     true_labels = []
#     with torch.no_grad():
#         for img1, text1, img2, text2, labels in loader:
#             img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#             output1, output2 = model(img1, img2, text1, text2)
#             similarity = F.cosine_similarity(output1, output2)
#             predictions.extend(similarity.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
#     return accuracy_score(true_labels, (np.array(predictions) > 0.5).astype(int)) * 100

# # Main Execution
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# dataset = load_dataset(JSON_PATH, IMAGE_DIR)
# train_data, val_test_data = train_test_split(dataset, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

# # train_loader = DataLoader(WikiDiverseDataset(train_data, IMAGE_DIR, transform), batch_size=BATCH_SIZE, shuffle=True)
# # val_loader = DataLoader(WikiDiverseDataset(val_data, IMAGE_DIR, transform), batch_size=BATCH_SIZE, shuffle=False)
# # test_loader = DataLoader(WikiDiverseDataset(test_data, IMAGE_DIR, transform), batch_size=BATCH_SIZE, shuffle=False)
# train_loader = DataLoader(
#     WikiDiverseDataset(train_data, IMAGE_DIR, transform=transform, text_tokenizer=tokenizer),
#     batch_size=BATCH_SIZE,
#     shuffle=True,
# )
# val_loader = DataLoader(
#     WikiDiverseDataset(val_data, IMAGE_DIR, transform=transform, text_tokenizer=tokenizer),
#     batch_size=BATCH_SIZE,
#     shuffle=False,
# )
# test_loader = DataLoader(
#     WikiDiverseDataset(test_data, IMAGE_DIR, transform=transform, text_tokenizer=tokenizer),
#     batch_size=BATCH_SIZE,
#     shuffle=False,
# )

# model = SiameseNetworkWithCrossAttention(VOCAB_SIZE).to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = ContrastiveLoss()

# for epoch in range(EPOCHS):
#     train_loss = train(model, device, train_loader, optimizer, criterion)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}")
#     val_accuracy = evaluate(model, device, val_loader)
#     print(f"Epoch {epoch+1}/{EPOCHS}, Val Accuracy: {val_accuracy:.4f}")

# test_accuracy = evaluate(model, device, test_loader)
# print(f"Test Accuracy: {test_accuracy:.4f}")

In [40]:
# #------------------------
# # ???. WikiDiverse Dataset Class 
# #------------------------
# class WikiDiverseDataset(Dataset):
#     def __init__(self, json_path, img_dir, transform=None, text_tokenizer=None):
#         self.data = []
#         self.img_dir = img_dir
#         self.transform = transform
#         self.text_tokenizer = text_tokenizer
        
#         try:
#             with open(json_path, 'r') as f:
#                 for entry in json.load(f):
#                     try:
#                         img1_path = os.path.join(img_dir, entry['image1'])
#                         img2_path = os.path.join(img_dir, entry['image2'])
                        
#                         if os.path.exists(img1_path) and os.path.exists(img2_path):
#                             self.data.append(entry)
#                     except Exception as e:
#                         print(f"Error processing entry: {e}")
#         except PermissionError:
#             print(f"Permission denied for file: {json_path}. Please ensure you have read permissions.")
        
#         print(f"Total valid entries: {len(self.data)}")

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         entry = self.data[idx]

#         # Load images
#         img1_path = os.path.join(self.img_dir, entry['image1'])
#         img2_path = os.path.join(self.img_dir, entry['image2'])
#         img1 = Image.open(img1_path).convert('RGB')
#         img2 = Image.open(img2_path).convert('RGB')

#         # Apply image transformations
#         if self.transform:
#             img1 = self.transform(img1)
#             img2 = self.transform(img2)

#         # Tokenize text
#         text1 = entry['text1']
#         text2 = entry['text2']
#         if self.text_tokenizer:
#             text1 = self.text_tokenizer(text1, return_tensors="pt", padding=True, truncation=True, max_length=128)
#             text2 = self.text_tokenizer(text2, return_tensors="pt", padding=True, truncation=True, max_length=128)

#         label = torch.tensor(entry['label'], dtype=torch.float32)

#         return img1, text1, img2, text2, label

# def collate_fn(batch):
#     batch = [sample for sample in batch if sample is not None]
#     return default_collate(batch)

In [501]:
# import os

# img_dir = r"C:\Users\Min Dator\aics-project\wikinewsImgs"

# if not os.path.exists(img_dir):
#     print(f"Image directory does not exist: {img_dir}")
# else:
#     print(f"Image directory exists: {img_dir}")

In [496]:
# import glob

# img_files = glob.glob(os.path.join(img_dir, "*"))
# if len(img_files) == 0:
#     print("No image files found in the directory.")
# else:
#     print(f"Found {len(img_files)} image files.")
#     print(f"Example files: {img_files[:5]}")

In [14]:
# from PIL import Image

# class WikiDiverseDataset(Dataset):
#     # Existing __init__ method here...

#     def __getitem__(self, idx):
#         entry = self.data[idx]

#         img1_path = os.path.join(self.img_dir, entry['image1'])
#         img2_path = os.path.join(self.img_dir, entry['image2'])

#         try:
#             img1 = Image.open(img1_path).convert('RGB')
#             img2 = Image.open(img2_path).convert('RGB')
#         except Exception as e:
#             print(f"Error loading image: {e} for {img1_path} or {img2_path}")
#             return None

#         # Rest of the __getitem__ code...


In [16]:
# train_json_path = r"C:\Users\Min Dator\aics-project\wikidiverse_w_cands\wikidiverse_w_cands\train_w_10cands.json"
# valid_json_path = r"C:\Users\Min Dator\aics-project\wikidiverse_w_cands\wikidiverse_w_cands\valid_w_10cands.json"
# test_json_path = r"C:\Users\Min Dator\aics-project\wikidiverse_w_cands\wikidiverse_w_cands\test_w_10cands.json"

In [497]:
# train_loader = WikiDiverseDataset(train_json_path, img_dir, transform, text_tokenizer)
# val_loader = WikiDiverseDataset(valid_json_path, img_dir, transform, text_tokenizer)
# test_loader = WikiDiverseDataset(test_json_path, img_dir, transform, text_tokenizer)

# # Wrap with DataLoader
# train_loader = DataLoader(train_loader, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
# val_loader = DataLoader(val_loader, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
# test_loader = DataLoader(test_loader, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


In [20]:
# train_json_path = r"C:\Users\Min Dator\aics-project\wikidiverse_w_cands\wikidiverse_w_cands\train_w_10cands.json"
# valid_json_path = r"C:\Users\Min Dator\aics-project\wikidiverse_w_cands\wikidiverse_w_cands\valid_w_10cands.json"
# test_json_path = r"C:\Users\Min Dator\aics-project\wikidiverse_w_cands\wikidiverse_w_cands\test_w_10cands.json"

In [500]:
# import json

# def load_json(json_file):
#     try:
#         with open(json_file, 'r') as f:
#             data = json.load(f)
#             print(f"Loaded {len(data)} entries from {json_file}")
#             return data
#     except json.JSONDecodeError as e:
#         print(f"JSON Decode Error: {e}")
#         raise
#     except FileNotFoundError as e:
#         print(f"File not found: {e}")
#         raise

# # Load JSON files
# train_data = load_json(train_json_path)
# test_data = load_json(test_json_path)
# valid_data = load_json(valid_json_path)

In [498]:
# def process_entry(entry):
#     if not isinstance(entry, dict):
#         print("Invalid entry format, skipping:", entry)
#         return None
    
#     # Check required keys
#     required_keys = ['img_path', 'mention']
#     for key in required_keys:
#         if key not in entry:
#             print(f"Missing key {key} in entry:", entry)
#             return None
    
#     # Process image path
#     img_path = entry['img_path']
#     print(f"Processing image: {img_path}")
#     return entry

# # Validate dataset
# processed_data = [process_entry(entry) for entry in train_data if process_entry(entry)]

In [499]:
# from sklearn.model_selection import train_test_split

# def split_dataset(data, test_size=0.2):
#     if len(data) == 0:
#         raise ValueError("Cannot split an empty dataset!")

#     train, test = train_test_split(data, test_size=test_size, random_state=42)
#     print(f"Split into {len(train)} train and {len(test)} test entries.")
#     return train, test

# train_split, val_split = split_dataset(train_data, test_size=0.2)
# val_split, test_split = split_dataset(val_split, test_size=0.5)


In [29]:
# from torch.utils.data import DataLoader, Dataset


# # Placeholder for missing variables
# text_tokenizer = lambda x: x  # Replace with a real tokenizer if needed
# transform = None  # Define image transformations if necessary


# class WikiDiverseDataset(Dataset):
#     def __init__(self, data, transform=None, tokenizer=None):
#         self.data = data
#         self.transform = transform
#         self.tokenizer = tokenizer

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         item = self.data[idx]
#         img_path = item['img_path']
#         text = item['mention']
        
#         # Apply tokenizer
#         if self.tokenizer:
#             text = self.tokenizer(text)
        
#         # Apply transform
#         if self.transform:
#             # Load and transform the image (placeholder)
#             pass
        
#         return img_path, text

# # Create datasets
# train_dataset = WikiDiverseDataset(train_split, transform, text_tokenizer)
# val_dataset = WikiDiverseDataset(val_split, transform, text_tokenizer)
# test_dataset = WikiDiverseDataset(test_split, transform, text_tokenizer)

# # Wrap datasets with DataLoader
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [9]:
# from sklearn.model_selection import train_test_split
# from torch.utils.data import DataLoader
# import os

# # Assume WikiDiverseDataset is defined with proper constructor
# class WikiDiverseDataset:
#     def __init__(self, json_path, img_dir, transform=None):
#         # Load JSON file and process
#         self.json_path = json_path
#         self.img_dir = img_dir
#         self.transform = transform
#         self.data = self.load_json_data()
    
#     def load_json_data(self):
#         # Load the dataset from json_path
#         import json
#         with open(self.json_path, 'r') as file:
#             data = json.load(file)
#         return data

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         # Get data from JSON
#         entry = self.data[idx]
#         img_path = os.path.join(self.img_dir, entry['img_path'])
#         mention = entry['mention']
        
#         # Apply transform if any
#         if self.transform:
#             img = self.transform(img_path)
        
#         return img_path, mention

# # Define collate_fn if needed
# def collate_fn(batch):
#     # Custom function to handle batching
#     return batch

# # Load data function
# def load_data(json_path, img_dir, batch_size):
#     dataset = WikiDiverseDataset(json_path, img_dir)
    
#     # Split the dataset into train, validation, and test
#     train_data, val_test_data = train_test_split(dataset.data, test_size=0.2, random_state=42)
#     val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)
    
#     # Create dataset objects for each set with corresponding splits
#     train_dataset = WikiDiverseDataset(json_path, img_dir)
#     train_dataset.data = train_data  # Assign the split data to the dataset
#     val_dataset = WikiDiverseDataset(json_path, img_dir)
#     val_dataset.data = val_data
#     test_dataset = WikiDiverseDataset(json_path, img_dir)
#     test_dataset.data = test_data
    
#     # Create DataLoader for each set
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
#     val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
#     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    
#     return train_loader, val_loader, test_loader

# # Example usage
# #json_path = os.path.join(os.getcwd(), 'wikidiverse_w_cands', 'wikidiverse_w_cands.json')
# json_path = r"C:\Users\Min Dator\aics-project\wikidiverse_w_cands\wikidiverse_w_cands\train_w_10cands.json"
# img_dir = r'C:\Users\Min Dator\aics-project\wikinewsImgs'
# BATCH_SIZE = 16

# train_loader, val_loader, test_loader = load_data(json_path, img_dir, BATCH_SIZE)

In [10]:

#---------------
# 5. Load Data 
#---------------

In [None]:
#-------------------------------
# 6. Cross-Attention Mechanism 
#-------------------------------


In [4]:
# # 1. Dataset Path and Hyperparameters:
# # -----------------------------
# # Dataset Path and Hyperparameters
# # -----------------------------
# # Update paths based on your directory structure
# DATASET_PATH = '\\home\\gusjembda@GU.GU.SE\\aics-project\\wikidiverse_w_cands\\wikidiverse_w_cands'
# IMAGE_DIR = os.path.join(DATASET_PATH, "wikinewsImgs")
# JSON_PATH = os.path.join(DATASET_PATH, "train_w_10cands.json")

# #DATASET_PATH = '/home/gusjembda@GU.GU.SE/aics-project'
# #IMAGE_DIR = os.path.join(DATASET_PATH, "wikinewsImgs")  # Make sure this directory exists
# #JSON_PATH = os.path.join(DATASET_PATH, "train_w_10cands.json")

# BATCH_SIZE = 16
# EPOCHS = 10
# LEARNING_RATE = 0.0001
# EMBED_DIM = 256
# VOCAB_SIZE = 10000

# # Paths
# #DATASET_PATH = '/home/gusjembda/aics-project/wikidiverse_w_cands/wikidiverse_w_cands'
# #JSON_PATH = '/home/gusjembda@GU.GU.SE/aics-project/wikidiverse_w_cands/wikidiverse_w_cands/train_w_10cands.json'

# #IMAGE_DIR = '/home/gusjembda/aics-project/wikinewsImgs'
# #ENTITY_DESC_PATH = '/home/gusjembda/aics-project/wikipedia_entity2desc_filtered.tsv'
# #ENTITY_IMGS_PATH = '/home/gusjembda/aics-project/wikipedia_entity2imgs.tsv'
# #PROJECT_DIR = '/home/gusjembda/aics-project/WikiDiverse'

# # usage:
# #json_path = os.path.join(DATASET_PATH, 'train_w_10cands.json')
# # entity_desc_df = pd.read_csv(ENTITY_DESC_PATH, sep='\t', header=None)
# # Sentity_imgs_df = pd.read_csv(ENTITY_IMGS_PATH, sep='\t', header=None)

In [5]:
# import hashlib
# import re

# for item in data:
#     m_img = item[1].split('/')[-1]
#     prefix = hashlib.md5(m_img.encode()).hexdigest()
#     suffix = re.sub(r'(\S+(?=\.(jpg|JPG|png|PNG|svg|SVG)))|(\S+(?=\.(jpeg|JPEG)))', '', m_img)
#     m_img = 'path/to/wikinewsImgs/' + prefix + suffix
#     m_img = m_img.replace('.svg', '.png').replace('.SVG', '.png')

NameError: name 'data' is not defined

In [6]:
# import os

# # Correct path for the directory you're working in
# DATASET_PATH = '\\home\\gusjembda@GU.GU.SE\\aics-project\\wikidiverse_w_cands\\wikidiverse_w_cands'

# # Define the path to the 'train_w_10cands.json' file
# JSON_PATH = os.path.join(DATASET_PATH, 'train_w_10cands.json')

# # Verify the path
# print(f"Path to the train_w_10cands.json: {JSON_PATH}")

Path to the train_w_10cands.json: \home\gusjembda@GU.GU.SE\aics-project\wikidiverse_w_cands\wikidiverse_w_cands/train_w_10cands.json


import pandas as pd

data = pd.read_json(JSON_PATH)

# Print the first few rows of the dataset
print(data.head())

In [7]:
# import os
# os.getcwd()
# print(f"Current working directory: {os.getcwd()}")

Current working directory: /mnt/c/Users/Min Dator/aics-project


In [8]:
# import os

# JSON_PATH = "/home/gusjembda@GU.GU.SE/aics-project/wikidiverse_w_cands/train_w_10cands.json"

# if os.path.exists(JSON_PATH):
#     print("File exists!")
#     try:
#         with open(JSON_PATH, 'r') as f:
#             data = f.read()
#             print(f"File content length: {len(data)}")
#     except Exception as e:
#         print(f"Error reading the file: {e}")
# else:
#     print("File does not exist!")

File does not exist!


In [9]:
# import os
# import json

# # Define the path to the train JSON file
# train_json = '/home/gusjembda@GU.GU.SE/aics-project/WikiDiverse/train.json'

# # Check if the file exists
# if os.path.exists(train_json):
#     print(f"Train JSON found: {train_json}")
#     # Open and load the JSON file
#     with open(train_json, 'r') as f:
#         data = json.load(f)
#         print("First 5 records from the train JSON:")
#         print(data[:5])  # Display the first 5 records
# else:
#     print(f"Train JSON not found: {train_json}")

Train JSON not found: /home/gusjembda@GU.GU.SE/aics-project/WikiDiverse/train.json


In [10]:
# import os

# train_json = r'/home/gusjembda@GU.GU.SE/aics-project/WikiDiverse/train.json'
# print("File exists:", os.path.exists(train_json))

File exists: False


In [11]:
# import json

# # Path to your JSON file
# #JSON_PATH = "/home/gusjembda@GU.GU.SE/aics-project/wikidiverse_w_cands/wikidiverse_w_cands/train_w_10cands.json"
# JSON_PATH = "/home/gusjembda\\@GU.GU.SE/aics-project/wikidiverse_w_cands/wikidiverse_w_cands/train_w_10cands.json"
# try:
#     with open(JSON_PATH, 'r') as f:
#         data = json.load(f)
#     print("Data loaded successfully!")
# except FileNotFoundError:
#     print(f"Error: File not found at {JSON_PATH}")
# except Exception as e:
#     print(f"Error: {e}")

Error: File not found at /home/gusjembda\@GU.GU.SE/aics-project/wikidiverse_w_cands/wikidiverse_w_cands/train_w_10cands.json


In [42]:
# import hashlib
# import re
# import os

# # Example: process each data entry to get the local path for images
# def get_image_path(item, img_dir):
#     m_img = item[1].split('/')[-1]  # Extract image filename
#     prefix = hashlib.md5(m_img.encode()).hexdigest()  # Generate a unique hash
#     suffix = re.sub(r'(\S+(?=\.(jpg|JPG|png|PNG|svg|SVG)))|(\S+(?=\.(jpeg|JPEG)))', '', m_img)  # Remove image extension
#     m_img = os.path.join(img_dir, prefix + suffix)  # Create the local path
#     m_img = m_img.replace('.svg', '.png').replace('.SVG', '.png')  # Replace .svg with .png
#     return m_img

# # Example usage
# img_dir = "/path/to/wikinewsImgs"
# data = [...]  # Your dataset here
# for item in data:
#     image_path = get_image_path(item, img_dir)
#     print(f"Processed image path: {image_path}")

In [43]:
# import json

# json_file = "/home/gusjembda@GU.GU.SE/aics-project/wikidiverse_w_cands/wikidiverse_w_cands/train_w_10cands.json"

# try:
#     with open(json_file, 'r') as f:
#         data = json.load(f)
#         print("Valid JSON")
# except json.JSONDecodeError as e:
#     print(f"Invalid JSON: {e}")

In [14]:
# import os

# path = "/home/gusjembda@GU.GU.SE/aics-project/wikidiverse_w_cands/wikidiverse_w_cands/train_w_10cands.json"

# if os.path.exists(path):
#     print("File exists and is accessible!")
# else:
#     print("File not found!")

File not found!


In [44]:
# import os

# # Path to the parent directory
# DATASET_PATH = '/home/gusjembda@GU.GU.SE/aics-project/wikidiverse_w_cands/wikidiverse_w_cands'

# # File paths
# TRAIN_JSON = os.path.join(DATASET_PATH, 'train_w_10cands.json')
# TEST_JSON = os.path.join(DATASET_PATH, 'test_w_10cands.json')
# VALID_JSON = os.path.join(DATASET_PATH, 'valid_w_10cands.json')

# # Verify paths
# print(f"Train JSON: {TRAIN_JSON}")
# print(f"Test JSON: {TEST_JSON}")
# print(f"Valid JSON: {VALID_JSON}")

In [45]:
# import os

# base_dir = '\\home\\gusjembda@GU.GU.SE\\aics-project\\wikidiverse_w_cands\\wikidiverse_w_cands'
# json_file = 'train_w_10cands.json'

# JSON_PATH = os.path.join(base_dir, json_file)

# with open(JSON_PATH, 'r') as f:
#     data = json.load(f)
#     print(data[:5])  

In [46]:
# import os
# import json

# # Use forward slashes for Linux paths
# base_dir = '/home/gusjembda@GU.GU.SE/aics-project/wikidiverse_w_cands/wikidiverse_w_cands'
# json_file = 'train_w_10cands.json'

# JSON_PATH = os.path.join(base_dir, json_file)

# # Check if the file exists
# if not os.path.exists(JSON_PATH):
#     print(f"File not found: {JSON_PATH}")
# else:
#     with open(JSON_PATH, 'r') as f:
#         data = json.load(f)
#         # Print the first 5 items (assuming the JSON is a list)
#         print(data[:5])

In [47]:
# import os
# import json

# # Use forward slashes for Linux
# base_dir = '/home/gusjembda@GU.GU.SE/aics-project/wikidiverse_w_cands/wikidiverse_w_cands'
# json_file = 'train_w_10cands.json'

# # Construct the full path
# JSON_PATH = os.path.join(base_dir, json_file)

# # Check if the file exists before trying to open it
# if not os.path.exists(JSON_PATH):
#     print(f"File not found: {JSON_PATH}")
# else:
#     # Safely open and load the JSON file
#     with open(JSON_PATH, 'r') as f:
#         data = json.load(f)
#         print(data[:5])  # Print the first 5 items

In [19]:
# # -----------------------------
# # Helper Functions
# # -----------------------------
# # Generate the local image path
# def get_image_path(url, img_dir):
#     try:
#         filename = url.split('/')[-1]
#         prefix = hashlib.md5(filename.encode()).hexdigest()
#         suffix = re.sub(r'(\S+(?=\.(jpg|jpeg|png|svg)))', '', filename, flags=re.IGNORECASE)
#         local_path = os.path.join(img_dir, f"{prefix}{suffix}".replace('.svg', '.png'))
#         return local_path
#     except Exception as e:
#         print(f"Error generating image path: {e}")
#         return None

# # Load and preprocess the dataset
# def load_dataset(json_path, img_dir):
#     try:
#         with open(json_path, 'r') as f:
#             data = json.load(f)
#         processed_data = []
#         for item in data:
#             img_path = get_image_path(item[1], img_dir)
#             if img_path and os.path.exists(img_path):
#                 processed_data.append({
#                     'sentence': item[0],
#                     'mention': item[2],
#                     'mention_type': item[3],
#                     'left_context': item[4],
#                     'right_context': item[5],
#                     'entity_url': item[6],
#                     'img_path': img_path
#                 })
        
#         if not processed_data:
#             print("No valid data entries found in the dataset.")
        
#         return processed_data
#     except Exception as e:
#         print(f"Error loading dataset: {e}")
#         return []

In [20]:
# # -----------------------------
# # Data Transformations
# # -----------------------------
# transform = Compose([
#     transforms.Resize((224, 224)),
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomVerticalFlip(),
#     transforms.RandomRotation(degrees=30),
#     ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])

In [21]:
# # -----------------------------
# # Dataset Class
# # -----------------------------
# class WikiDiverseDataset(Dataset):
#     def __init__(self, data, transform=None, text_tokenizer=None):
#         self.data = data
#         self.transform = transform
#         self.text_tokenizer = text_tokenizer

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         entry = self.data[idx]
#         try:
#             img = Image.open(entry['img_path']).convert('RGB')
#             if self.transform:
#                 img = self.transform(img)

#             text = entry['sentence']
#             if self.text_tokenizer:
#                 text = self.text_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
#             return img, text
#         except Exception as e:
#             print(f"Error processing data at index {idx}: {e}")
#             return None

In [22]:
# # -----------------------------
# # Cross-Attention Mechanism
# # -----------------------------
# class CrossAttention(nn.Module):
#     def __init__(self, embed_dim, num_heads=4, dropout=0.1):
#         super(CrossAttention, self).__init__()
#         self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
#         self.layer_norm = nn.LayerNorm(embed_dim)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, query, key, value):
#         attn_output, _ = self.multihead_attn(query, key, value)
#         attn_output = self.dropout(attn_output)
#         return self.layer_norm(query + attn_output)

In [23]:
# # -----------------------------
# # Sub-Networks for Image and Text
# # -----------------------------
# class ImageSubNetworkWithAttention(nn.Module):
#     def __init__(self, embed_dim=256, num_heads=4):
#         super(ImageSubNetworkWithAttention, self).__init__()
#         base_model = models.resnet50(pretrained=True)
#         self.features = nn.Sequential(*list(base_model.children())[:-1])  # Remove FC layer
#         self.fc = nn.Sequential(
#             nn.Linear(2048, embed_dim),
#             nn.ReLU(),
#             nn.Dropout(0.5)
#         )
#         self.cross_attention = CrossAttention(embed_dim, num_heads)

#     def forward(self, x, text_features):
#         x = self.features(x).view(x.size(0), -1)  # Flatten
#         x = self.fc(x)
#         if text_features is not None:
#             x = self.cross_attention(x.unsqueeze(1), text_features, text_features).squeeze(1)
#         return x

# class TextSubNetworkWithAttention(nn.Module):
#     def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_heads=4):
#         super(TextSubNetworkWithAttention, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
#         self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=0.5)
#         self.fc = nn.Sequential(
#             nn.Linear(hidden_dim, 256),
#             nn.ReLU(),
#             nn.Dropout(0.5)
#         )
#         self.cross_attention = CrossAttention(hidden_dim, num_heads)

#     def forward(self, x, image_features):
#         x = self.embedding(x)
#         _, (hidden, _) = self.lstm(x)
#         hidden = hidden.squeeze(0)
#         if image_features is not None:
#             hidden = self.cross_attention(hidden.unsqueeze(1), image_features, image_features).squeeze(1)
#         return self.fc(hidden)

In [24]:
# # -----------------------------
# # Siamese Network
# # -----------------------------
# class SiameseNetworkWithCrossAttention(nn.Module):
#     def __init__(self, vocab_size):
#         super(SiameseNetworkWithCrossAttention, self).__init__()
#         self.image_net = ImageSubNetworkWithAttention()
#         self.text_net = TextSubNetworkWithAttention(vocab_size)

#     def forward(self, img1, img2, text1, text2):
#         img_embedding1 = self.image_net(img1, None)
#         img_embedding2 = self.image_net(img2, None)

#         text_embedding1 = self.text_net(text1, img_embedding1)
#         text_embedding2 = self.text_net(text2, img_embedding2)

#         combined_embedding1 = torch.cat([img_embedding1, text_embedding1], dim=1)
#         combined_embedding2 = torch.cat([img_embedding2, text_embedding2], dim=1)

#         return combined_embedding1, combined_embedding2

In [25]:
# # -----------------------------
# # Contrastive Loss
# # -----------------------------
# class ContrastiveLoss(nn.Module):
#     def __init__(self, margin=1.0):
#         super(ContrastiveLoss, self).__init__()
#         self.margin = margin

#     def forward(self, output1, output2, label):
#         euclidean_distance = torch.nn.functional.pairwise_distance(output1, output2)
#         loss = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) +
#                           label * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
#         return loss

In [26]:
# # -----------------------------
# # Training and Evaluation
# # -----------------------------
# def train(model, device, loader, optimizer, criterion):
#     model.train()
#     total_loss = 0
#     total_correct = 0
#     for img1, text1, img2, text2, labels in loader:
#         img1, text1, img2, text2, labels = img1.to(device), text1.to(device), img2.to(device), text2.to(device), labels.to(device)
#         optimizer.zero_grad()
#         output1, output2 = model(img1, img2, text1, text2)
#         loss = criterion(output1, output2, labels)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     return total_loss / len(loader)