In [1]:
import pandas as pd
import numpy as np
import json
import requests
from io import BytesIO
from PIL import Image
import imagehash
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import time

In [2]:
# Cache for downloaded images
image_cache = {}

def download_image(image_url):
    if image_url in image_cache:
        return image_cache[image_url]
    
    try:
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        image_cache[image_url] = img
        return img
    except (requests.RequestException, IOError) as e:
        return None

def compute_image_hash(image_url):
    image = download_image(image_url)
    if image is None:
        return None
    # Resize image to reduce processing time
    image = image.resize((128, 128))
    return imagehash.average_hash(image)

def preprocess_images(df, max_workers=20):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(compute_image_hash, row['image']): index
            for index, row in df.iterrows()
        }
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                image_hash = future.result()
                df.loc[index, 'image_hash'] = image_hash
            except Exception as e:
                df.loc[index, 'image_hash'] = '-'

def process_in_batches(df, batch_size=1000, max_workers=20):
    for start in range(0, len(df), batch_size):
        end = min(start + batch_size, len(df))
        batch_df = df.iloc[start:end]
        preprocess_images(batch_df, max_workers)
        # Save intermediate results
        
        image_cache_hashes = {url: str(imagehash.average_hash(img)) for url, img in image_cache.items()}
        json_object = json.dumps(image_cache_hashes, indent=4)
        with open("./mangadex/image_cache.json", "w") as outfile:
            outfile.write(json_object)
    
        batch_df.to_csv(f"./mangadex/processed_batch_{start//batch_size}.csv", index=False)

In [3]:
# Load your dataset
df_mangadex = pd.read_csv("./data/mangadex.csv")

In [4]:
# Process in batches
start_time = time.time()
process_in_batches(df_mangadex, batch_size=1000, max_workers=20)
end_time = time.time()

print(f"Processing time: {end_time - start_time} seconds")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[index, 'image_hash'] = image_hash


OSError: image file is truncated (121 bytes not processed)