# Image processing pipeline

### Install required dependencies

In [1]:
# Use the %pip magic so the packages are installed into the environment of
# the kernel running this notebook (a bare `pip install` relies on automagic).
%pip install transformers torch pillow

### Import necessary libraries

In [2]:
import os
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
import numpy as np
from transformers import BlipProcessor, BlipForConditionalGeneration
from tqdm import tqdm

### Download Images
Use the Image IDs and image links extracted from articles to download images

In [None]:
def download_image(image_url, filename, timeout=30):
    '''
    Download a single image and store it on disk as a JPEG.

    Args:
        image_url: Full URL of the image (it already contains the image ID).
        filename: Destination path of the JPEG file. The parent (article)
            folder is expected to exist already.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Failures (non-200 status, network error, undecodable image)
        are reported with print() and never raised, so a single bad image
        does not abort a bulk download.
    '''
    try:
        # Without a timeout one stalled connection would block the whole run.
        response = requests.get(image_url, timeout=timeout)
        if response.status_code == 200:
            image = Image.open(BytesIO(response.content))
            # JPEG cannot store alpha, palette or high-bit-depth data, so any
            # mode other than the plain RGB / grayscale / CMYK ones is
            # converted to RGB before saving.
            if image.mode not in ('RGB', 'L', 'CMYK'):
                image = image.convert("RGB")
            image.save(filename, format='JPEG')
            print(f"Image saved as {filename}")
        else:
            print(f"Failed to retrieve the image from {image_url}. HTTP Status code: {response.status_code}")
    except Exception as e:
        # Best effort by design: log the problem and let the caller continue.
        print(f"Error downloading image from {image_url}: {e}")

#### Get the articles parquet for which we process images

In [None]:
# path to the articles (using the small dataset here)
parquet_file_path = '/home/ioana/Desktop/UvA/Once/ONCE/ebnerd-benchmark/data/ebnerd_small/articles.parquet'

# Try the parquet engines in order of preference and keep the first that works.
df = None
for engine in ('pyarrow', 'fastparquet'):
    try:
        df = pd.read_parquet(parquet_file_path, engine=engine)
        break
    except Exception as e:
        # Updating/installing the failing engine usually resolves this.
        print(f"{engine} engine failed: {e}")

# Fail here with a clear message instead of leaving `df` undefined, which
# would only surface later as an unrelated NameError.
if df is None:
    raise RuntimeError(f"Could not read {parquet_file_path} with pyarrow or fastparquet")

### Run the image download
This uses a hardcoded image URL pattern; it can be changed, or the links can be retrieved from the article HTML instead.

In [None]:
# Walk over every article and fetch all images referenced by it.
for _, article_row in df.iterrows():
    article_id = article_row['article_id']
    raw_image_ids = article_row['image_ids']

    # Nothing to download when the article has no image ids at all
    is_sequence = isinstance(raw_image_ids, (list, np.ndarray))
    if raw_image_ids is None or (is_sequence and len(raw_image_ids) == 0):
        continue

    # Normalise to a plain list (the ids arrive as a numpy array for us,
    # a lone value is wrapped into a one-element list)
    if isinstance(raw_image_ids, np.ndarray):
        ids_to_fetch = raw_image_ids.tolist()
    elif isinstance(raw_image_ids, list):
        ids_to_fetch = raw_image_ids
    else:
        ids_to_fetch = [raw_image_ids]

    # One folder per article, named after the article_id, so that the
    # captions can later be mapped back to the right article
    target_dir = f'downloaded_images_small/article_{article_id}'
    os.makedirs(target_dir, exist_ok=True)

    # the downloading step
    for image_id in ids_to_fetch:
        download_image(
            f"https://img-cdn-p.ekstrabladet.dk/image/ekstrabladet/{image_id}/relationBig_910",
            f"{target_dir}/image_{image_id}.jpg",
        )

## Image Captioning step

In [None]:
# loading BLIP: processor and captioning model are both loaded from one checkpoint
blip_checkpoint = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

In [None]:
def generate_caption(image_path):
    '''
    Generate a caption for one image file with the BLIP model.

    Relies on the module-level `processor` and `model` objects.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The generated caption with the token "arafed" removed and the
        whitespace normalised, or an empty string if anything goes wrong
        (the error is printed, not raised).
    '''
    try:
        # Close the file handle as soon as the pixels are loaded; convert()
        # returns an independent in-memory copy. Leaving handles open adds
        # up when captioning thousands of images.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        outputs = model.generate(**inputs)
        caption = processor.decode(outputs[0], skip_special_tokens=True)
        # Filter out this term as it doesn't bring much value into the caption
        # and is frequently used due to the model pretraining vocab.
        # split()/join also collapses the double space left behind when the
        # term is removed from the middle of a caption.
        filtered_caption = " ".join(caption.replace("arafed", "").split())
        return filtered_caption
    except Exception as e:
        print(f"Error generating caption for {image_path}: {e}")
        return ""

In [2]:
# store all captions in a dict: article_id (str) -> all captions of that article
captions_dict = {}
base_dir = 'downloaded_images_small'

# os.listdir() returns entries in arbitrary order; sorting keeps the run
# (and the order of the concatenated captions) reproducible.
for article_dir in tqdm(sorted(os.listdir(base_dir)), desc="Processing articles"):
    article_path = os.path.join(base_dir, article_dir)

    # Only folders named article_<id> hold downloaded images. Anything else
    # (stray files, e.g. a .ipynb_checkpoints folder) would otherwise yield a
    # bogus article id or an IndexError.
    if not (os.path.isdir(article_path) and article_dir.startswith('article_')):
        continue

    article_id = article_dir.split('_', 1)[1]

    # iterate over each image in the article_id directory
    captions = []
    for image_file in sorted(os.listdir(article_path)):
        image_path = os.path.join(article_path, image_file)
        caption = generate_caption(image_path)
        # generate_caption returns "" on failure; skip those so they do not
        # leave stray spaces in the concatenation
        if caption:
            captions.append(caption)

    # we concatenate all captions into one so it's easier to merge before tokenization
    concatenated_captions = " ".join(captions)
    print(concatenated_captions)

    captions_dict[article_id] = concatenated_captions

captions_df = pd.DataFrame(list(captions_dict.items()), columns=['article_id', 'image_caption_text'])

# safety measure save to csv
captions_df.to_csv('article_captions.csv', index=False)

# also save as parquet as the rest of the flow
parquet_file_path = 'article_captions.parquet'
captions_df.to_parquet(parquet_file_path, index=False)



race car with a broken wing sitting on the ground 
woman in a black lingerie posing on a gold chair 
man in a suit signing a document in front of a military man a view of a city with a fountain in the middle of it military man in uniform sitting at a desk writing boxes of pink and white plastic wrapped in plastic wrap on a street 
aerial view of a house with a yellow circle in the middle 
woman with a sword in her hand in front of a bookcase there is a woman sitting on a chair in a room there is a woman with a purple bracelet on her wrist there is a woman sitting on the stairs of a building there is a woman that is getting her arm tattooed smiling woman with tattoos standing in front of a tree woman sitting at a table with a bunch of ties and a pair of shoes 
a close up of a satellite image of a black object 
aerial view of a parking lot with a yellow circle in the middle 
man on a stage waving to a crowd of people smiling man with a blue shirt and black shirt on people are waving flag

#### Some articles might have no image; ensure their article ID is still in the dataframe
For those articles we concatenate an empty string to the body

In [None]:
# Load the full article table; ids are cast to str so they can be matched
# against the article_id column of captions_df
original_df_path = 'articles.parquet'
original_df = pd.read_parquet(original_df_path)
original_df['article_id'] = original_df['article_id'].astype(str)

# One row per known article id, left-joined with the captions we generated,
# so every article is present in the result
all_article_ids = original_df[['article_id']].drop_duplicates()
captions_df_full = pd.merge(all_article_ids, captions_df, how='left', on='article_id')

# Articles without a caption get an empty string instead of NaN
captions_df_full = captions_df_full.fillna({'image_caption_text': ''})

# save as parquet
captions_df_full.to_parquet('article_captions_complete.parquet', index=False)

In [None]:
# Quick sanity check: show summary statistics of the completed captions table
captions_df_full.describe()