In [1]:
import pandas as pd
import numpy as np
import json
from fuzzywuzzy import process
import requests
from io import BytesIO
from PIL import Image
import imagehash
from concurrent.futures import ThreadPoolExecutor, as_completed



In [2]:
# Cache of successfully opened images, keyed by URL. Entries are never
# evicted, so the dict grows with the number of distinct URLs fetched.
image_cache = {}

def download_image(image_url, timeout=10):
    """Fetch an image over HTTP and open it with PIL, or return None on failure.

    Successful results are memoised in ``image_cache`` so a repeated URL is
    served without another request. Failures are not cached, so a later call
    with the same URL retries the download.

    Args:
        image_url: URL of the image to fetch.
        timeout: Timeout in seconds forwarded to ``requests.get``. Without a
            timeout a stalled server can block the calling thread forever.

    Returns:
        The object returned by ``Image.open`` for the response body, or None
        when the request fails (including a timeout or a non-2xx status) or
        the payload cannot be opened as an image.
    """
    if image_url in image_cache:
        return image_cache[image_url]

    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
    except (requests.RequestException, IOError):
        # Best effort: callers treat None as "no image available".
        return None

    image_cache[image_url] = img
    return img

def compute_image_hash(image_url):
    """Return the average hash of the image at ``image_url``.

    The image is obtained through ``download_image``; when that yields None
    (no image available), None is returned instead of a hash.
    """
    img = download_image(image_url)
    return None if img is None else imagehash.average_hash(img)

def preprocess_images(df, batch_size=1000, max_workers=20):
    """Fill ``df['image_hash']`` in place by hashing each URL in ``df['image']``.

    Rows are processed in consecutive batches of ``batch_size``. Within a
    batch, ``compute_image_hash`` is run for every row on a thread pool and
    each result is written back to the row's index label as it completes.

    Args:
        df: DataFrame with an ``image`` column. It is modified in place; the
            ``image_hash`` column is created if missing and converted to
            object dtype otherwise.
        batch_size: Number of rows submitted to the thread pool at a time.
        max_workers: Maximum number of worker threads per batch.

    Returns:
        None. For each row, ``image_hash`` receives whatever
        ``compute_image_hash`` returned (which may be None), or the string
        ``'-'`` if the call raised.
    """
    # The column holds hash objects and the '-' sentinel, so it must be object
    # dtype. Writing those into a float64 (NaN-initialised) column triggers
    # pandas' "incompatible dtype" upcast, which recent versions warn about.
    if 'image_hash' in df.columns:
        df['image_hash'] = df['image_hash'].astype(object)
    else:
        df['image_hash'] = None

    total = len(df)
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        # Only the URL column is needed; iterrows() would build a full Series
        # per row just to read one field.
        urls = df['image'].iloc[start:end]

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_index = {
                executor.submit(compute_image_hash, url): index
                for index, url in urls.items()
            }
            # Results are written from this (the calling) thread only.
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    image_hash = future.result()
                except Exception:
                    # Best effort: mark the row and keep processing the batch.
                    image_hash = '-'
                df.loc[index, 'image_hash'] = image_hash

        print(f"Processed batch {start} to {end}")

In [3]:
# Load the dataset
df_mangadex = pd.read_csv("./data/mangadex.csv")

# Add an 'image_hash' column defaulting to NaN. It is created with object
# dtype because preprocess_images stores hash objects (and the '-' failure
# sentinel) in it; a float64 NaN column would have to be upcast on first write.
df_mangadex['image_hash'] = pd.Series(np.nan, index=df_mangadex.index, dtype=object)

# Compute a hash for every URL in the 'image' column (performs network requests)
preprocess_images(df_mangadex)

# Save the processed DataFrame
df_mangadex.to_csv("./data/mangadex_processed.csv", index=False)