In [25]:
import os
import json
import requests
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm

# Paths to dataset and image folder
#DATASET_PATH = "path_to_wikidiverse.json"  # Replace with your dataset file
#DATASET_PATH = r"C:\Users\Min Dator\aics-project\wikidiverse.json"
DATASET_PATH = r"C:\Users\Min Dator\aics-project\wikidiverse_data\images/062ce5e341a566a4208d801e53557538.jpg"

DATASET_PATH = r""
IMAGES_FOLDER = "downloaded_images/"
os.makedirs(IMAGES_FOLDER, exist_ok=True)

# Initialize BERT tokenizer and model for textual encoding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Initialize ResNet model for image encoding (using ResNet-50 here)
resnet_model = models.resnet50(weights='ResNet50_Weights.DEFAULT')
resnet_model = nn.Sequential(*list(resnet_model.children())[:-1])  # Remove final classification layer

# Image preprocessing pipeline
image_preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Cross-Attention Layer
class CrossAttentionLayer(nn.Module):
    def __init__(self, hidden_size, num_attention_heads):
        super(CrossAttentionLayer, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_attention_heads)
        self.fc = nn.Linear(hidden_size, hidden_size)

    def forward(self, text_features, image_features):
        text_features = text_features.unsqueeze(0)  # Add batch dimension
        image_features = image_features.unsqueeze(0)  # Add batch dimension

        attn_output_text, _ = self.attention(text_features, image_features, image_features)
        attn_output_image, _ = self.attention(image_features, text_features, text_features)

        combined_output = attn_output_text + attn_output_image
        combined_output = self.fc(combined_output)
        return combined_output

# Entity Disambiguation Head
class EntityDisambiguationHead(nn.Module):
    def __init__(self, hidden_size, num_candidates):
        super(EntityDisambiguationHead, self).__init__()
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, num_candidates)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, features):
        x = F.relu(self.fc1(features))
        x = self.fc2(x)
        return self.softmax(x)

# Model combining Text and Image features
class MultimodalEntityLinkingModel(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, num_candidates):
        super(MultimodalEntityLinkingModel, self).__init__()
        self.text_encoder = bert_model
        self.image_encoder = resnet_model
        self.cross_attention_layer = CrossAttentionLayer(hidden_size, num_attention_heads)
        self.disambiguation_head = EntityDisambiguationHead(hidden_size, num_candidates)

    def forward(self, text_input, image_input):
        encoded_input = tokenizer(text_input, return_tensors='pt', padding=True, truncation=True)
        text_output = self.text_encoder(**encoded_input).last_hidden_state

        image_input = image_input.unsqueeze(0)  # Add batch dimension
        image_features = self.image_encoder(image_input)
        image_features = image_features.view(image_features.size(0), -1)

        combined_features = self.cross_attention_layer(text_output, image_features)
        entity_scores = self.disambiguation_head(combined_features.squeeze(0))
        return entity_scores

# Helper Functions for Preprocessing
def download_image(image_url):
    image_name = image_url.split("/")[-1]
    local_path = os.path.join(IMAGES_FOLDER, image_name)

    if not os.path.exists(local_path):
        response = requests.get(image_url, stream=True)
        if response.status_code == 200:
            with open(local_path, "wb") as f:
                f.write(response.content)
    return local_path

def preprocess_dataset(dataset_path):
    preprocessed_data = []

    with open(dataset_path, "r") as f:
        data = json.load(f)

    for item in tqdm(data, desc="Preprocessing Dataset"):
        text = item.get("caption")
        image_url = item.get("image_url")
        entities = item.get("entities")

        tokenized_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        try:
            local_image_path = download_image(image_url)
            image = Image.open(local_image_path).convert("RGB")
            image_tensor = image_preprocess(image)
        except Exception as e:
            print(f"Error processing image {image_url}: {e}")
            continue

        preprocessed_data.append({
            "text": tokenized_text,
            "image": image_tensor,
            "entities": entities,
        })

    return preprocessed_data

# Example Usage
hidden_size = 768
num_attention_heads = 8
num_candidates = 10

model = MultimodalEntityLinkingModel(hidden_size, num_attention_heads, num_candidates)

# Preprocess the dataset
preprocessed_data = preprocess_dataset(DATASET_PATH)

# Training Example
sample = preprocessed_data[0]
text_input = sample["text"]
image_input = sample["image"]
correct_entity_index = 0  # Example correct index

# Calculate loss and accuracy
def calculate_loss_and_accuracy(model, text_input, image_input, correct_entity_index):
    entity_scores = model(text_input, image_input)
    labels = torch.tensor([correct_entity_index])
    criterion = nn.CrossEntropyLoss()
    loss = criterion(entity_scores, labels)
    predicted_entity = torch.argmax(entity_scores, dim=1)
    accuracy = (predicted_entity == labels).float().mean()
    return loss, accuracy

loss, accuracy = calculate_loss_and_accuracy(model, text_input, image_input, correct_entity_index)
print(f"Loss: {loss.item()}, Accuracy: {accuracy.item()}") 


FileNotFoundError: [Errno 2] No such file or directory: ''

In [None]:
from datasets import load_dataset

# Download and load
dataset = load_dataset("wikidiverse")
print(dataset)

# Preprocessing (example: tokenizing text)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_dataset = dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)

# Save preprocessed data locally
encoded_dataset.save_to_disk('./preprocessed_wikidiverse')

In [None]:
import pandas as pd
from sklearn import datasets

df = datasets.load_boston()
type(df)

In [None]:
sklearn.utils.Bunch

In [None]:
boston = pd.DataFrame(data=df.data, columns=df.feature_names)
boston['target'] = df.target

botson.head()

In [None]:
from datasets import load_dataset

# Example to load a dataset from Hugging Face's Datasets library
dataset = load_dataset("wikidiverse")  # Replace "wikidiverse" with the actual dataset ID
print(dataset)

In [12]:
import requests
from PIL import Image
from io import BytesIO
from transformers import BertTokenizer, CLIPProcessor, CLIPModel
import torch

# Sample data
text = "The Lions versus the Packers (2007)."
image_url = "https://upload.wikimedia.org/wikipedia/commons/0/06/DetroitLionsRunningPlay-2007.jpg"
entities = [
    ("Lions", "Organization", 4, 9, "https://en.wikipedia.org/wiki/Detroit_Lions"),
    ("Packers", "Organization", 21, 28, "https://en.wikipedia.org/wiki/Green_Bay_Packers")
]

# Download and process the image
response = requests.get(image_url)
image = Image.open(BytesIO(response.content))

# Load the BERT tokenizer and process text
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoded_text = tokenizer(text, truncation=True, padding="max_length", return_tensors="pt")

# Load the CLIP processor and model for multimodal processing
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")

# Process image using CLIP
inputs = clip_processor(text=[text], images=image, return_tensors="pt", padding=True)
outputs = clip_model(**inputs)

# Extract image features
image_features = outputs.image_embeds

# Example to combine text and image features (via concatenation or other techniques)
# For simplicity, we concatenate the text and image features
text_features = outputs.text_embeds

# Combine text and image features (a simple concatenation for example)
combined_features = torch.cat((text_features, image_features), dim=1)

# Example entity prediction or further processing could go here
print("Combined Text and Image Features:", combined_features)


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7f0369abd090>

In [13]:
import requests

image_url = "https://upload.wikimedia.org/wikipedia/commons/0/06/DetroitLionsRunningPlay-2007.jpg"

# Download the image
response = requests.get(image_url)

# Check the response status code
if response.status_code == 200:
    print("Image successfully retrieved!")
else:
    print(f"Failed to retrieve image. Status code: {response.status_code}")

# Optionally, check the first few bytes of the response content
print(response.content[:100])  # Print first 100 bytes to inspect the content

Failed to retrieve image. Status code: 403
b'<!DOCTYPE html>\n<html lang="en">\n<meta charset="utf-8">\n<title>Wikimedia Error</title>\n<style>\n* { m'


In [14]:
# Check the content type of the response
print(response.headers['Content-Type'])

# If it's an image, the content type should be something like "image/jpeg" or "image/png"


text/html; charset=utf-8


In [20]:
import requests
from PIL import Image
from io import BytesIO

image_url = "https://upload.wikimedia.org/wikipedia/commons/0/06/DetroitLionsRunningPlay-2007.jpg"

# Add headers to mimic a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Make the request with headers
response = requests.get(image_url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    try:
        image = Image.open(BytesIO(response.content))
        image.show()  # Display the image
    except Exception as e:
        print(f"Error opening image: {e}")
else:
    print(f"Failed to retrieve image. Status code: {response.status_code}")




In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('wikidiverse_data.csv')  # Change the filename if needed

# Example of inspecting the dataset
print(df.head())

# Preprocess text data (e.g., tokenization)
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_texts = tokenizer(df['text_column_name'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Save the preprocessed data locally
torch.save(encoded_texts, 'encoded_texts.pt')

In [21]:
import os
import hashlib
import re
import requests
from PIL import Image
from io import BytesIO

# Your data (This should be a list of tuples containing the image URL and other data)
data = [
    ["The Lions versus the Packers (2007).", "https://upload.wikimedia.org/wikipedia/commons/0/06/DetroitLionsRunningPlay-2007.jpg", "sports"],
    # Add more items here
]

# Define the base directory to save images
base_img_dir = 'path_to_wikinewsImgs/'  # Define your image storage path

# Ensure the base directory exists
os.makedirs(base_img_dir, exist_ok=True)

# Function to download and save the image
def download_and_save_image(image_url, save_path):
    try:
        response = requests.get(image_url)
        if response.status_code == 200:
            img = Image.open(BytesIO(response.content))
            img.save(save_path)
            print(f"Image saved: {save_path}")
        else:
            print(f"Failed to retrieve image from: {image_url}")
    except Exception as e:
        print(f"Error downloading image from {image_url}: {e}")

# Loop through each item in the data and process images
for item in data:
    image_url = item[1]  # Extract image URL
    
    # Extract image name and process it
    m_img = image_url.split('/')[-1]
    prefix = hashlib.md5(m_img.encode()).hexdigest()  # Generate unique hash prefix
    suffix = re.sub(r'(\S+(?=\.(jpg|JPG|png|PNG|svg|SVG)))|(\S+(?=\.(jpeg|JPEG)))', '', m_img)
    m_img = os.path.join(base_img_dir, prefix + suffix)
    
    # Ensure .svg or .SVG images are converted to .png
    m_img = m_img.replace('.svg', '.png').replace('.SVG', '.png')
    
    # Download and save the image
    download_and_save_image(image_url, m_img)

Image saved: path_to_wikinewsImgs/062ce5e341a566a4208d801e53557538.jpg


In [24]:
import hashlib
import re
import os
import requests
from PIL import Image
from io import BytesIO

# Path to store images
DATASET_PATH = r"C:\Users\Min Dator\aics-project\wikidiverse_data\images"

# Ensure the directory exists
if not os.path.exists(DATASET_PATH):
    os.makedirs(DATASET_PATH)

# Function to download and process the image
def download_image(url):
    try:
        # Get the image content
        response = requests.get(url)
        
        if response.status_code == 200:
            m_img = url.split('/')[-1]
            
            # Create a unique file name using MD5 hash
            prefix = hashlib.md5(m_img.encode()).hexdigest()
            suffix = re.sub(r'(\S+(?=\.(jpg|JPG|png|PNG|svg|SVG)))|(\S+(?=\.(jpeg|JPEG)))', '', m_img)
            
            # Construct the file path for the image
            file_path = os.path.join(DATASET_PATH, prefix + suffix)
            file_path = file_path.replace('.svg', '.png').replace('.SVG', '.png')  # Replace .svg with .png

            # Open the image and save it
            image = Image.open(BytesIO(response.content))
            image.save(file_path)

            print(f"Image saved at {file_path}")
        else:
            print(f"Failed to retrieve image. Status code: {response.status_code}")

    except Exception as e:
        print(f"Error downloading image: {e}")

# Example usage with data (replace 'data' with the actual dataset)
data = [
    ["The Lions versus the Packers (2007).", "https://upload.wikimedia.org/wikipedia/commons/0/06/DetroitLionsRunningPlay-2007.jpg", "sports", [
        ["Lions", "Organization", 4, 9, "https://en.wikipedia.org/wiki/Detroit_Lions"],
        ["Packers", "Organization", 21, 28, "https://en.wikipedia.org/wiki/Green_Bay_Packers"]
    ]]
]

# Iterate over the data and download images
for item in data:
    image_url = item[1]  # Get the image URL (second element in the data)
    download_image(image_url)


Image saved at C:\Users\Min Dator\aics-project\wikidiverse_data\images/062ce5e341a566a4208d801e53557538.jpg
