In [1]:
import pandas as pd
import numpy as np
import json

# Image Similarity System
from fuzzywuzzy import process
import requests
from io import BytesIO
from PIL import Image
import imagehash
from concurrent.futures import ThreadPoolExecutor, as_completed



In [2]:
# In-memory cache of downloaded images, keyed by URL. It is shared by every
# call to download_image() for as long as the kernel lives.
image_cache = {}


def download_image(image_url, timeout=10):
    """Fetch an image over HTTP and return it as a decoded PIL image.

    Successful downloads are memoised in the module-level ``image_cache`` so
    each URL is requested at most once per cached entry. Failures are never
    cached, so a later call retries the download.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The ``PIL.Image.Image`` for ``image_url``, or ``None`` when the
        request fails, the server answers with an error status, or the
        payload cannot be decoded as an image.
    """
    if image_url in image_cache:
        return image_cache[image_url]

    try:
        # Without a timeout, requests waits indefinitely on a stalled server,
        # which would pin one of the pool's worker threads forever.
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        # Image.open() only reads the header; pixel data is decoded lazily.
        # Decode now so a truncated/corrupt payload is reported here (as a
        # failed download) instead of being cached and failing later.
        img.load()
    except (requests.RequestException, IOError):
        return None

    image_cache[image_url] = img
    return img

def compute_image_hash(image_url):
    """Return the average hash of the image found at ``image_url``.

    The image is obtained through ``download_image``. When that yields
    ``None`` (the image could not be fetched), ``None`` is returned here as
    well; otherwise the result of ``imagehash.average_hash`` is returned.
    """
    img = download_image(image_url)
    return None if img is None else imagehash.average_hash(img)

def preprocess_images(df, max_workers=10):
    """Hash the image of every row concurrently and store the result in ``df``.

    ``compute_image_hash`` is run on a thread pool for each value of the
    ``image`` column, and the outcome is written to the ``image_hash`` column
    of ``df`` in place. The column is created when it does not exist yet,
    including for an empty frame.

    After the call each ``image_hash`` cell holds either the value returned
    by ``compute_image_hash`` (a missing value when it returned ``None``) or
    the string ``'-'`` when computing or storing the hash raised an exception.

    Args:
        df: DataFrame with an ``image`` column. Modified in place.
        max_workers: Maximum number of worker threads in the pool.

    Returns:
        None.
    """
    # Make sure the target column exists and has object dtype before any
    # result arrives. Otherwise the column's dtype depends on whichever value
    # happens to be written first, and an existing non-object column (e.g.
    # all-NaN floats reloaded from CSV) would have to be upcast on write.
    if 'image_hash' not in df.columns:
        df['image_hash'] = None
    elif df['image_hash'].dtype != object:
        df['image_hash'] = df['image_hash'].astype(object)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Only the 'image' column is needed, so iterate that Series directly
        # rather than materialising every row with iterrows().
        future_to_index = {
            executor.submit(compute_image_hash, image_url): index
            for index, image_url in df['image'].items()
        }
        # Results are written from this (the calling) thread as they finish;
        # the workers themselves never touch the DataFrame.
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                image_hash = future.result()
                df.loc[index, 'image_hash'] = image_hash
            except Exception:
                # Best effort: flag the row and keep processing the rest.
                df.loc[index, 'image_hash'] = '-'

In [3]:
# Read the Komikcast and Westmanga tables from CSV (Mangadex is disabled).
df_komikcast = pd.read_csv("./data/komikcast.csv")
df_westmanga = pd.read_csv("./data/westmanga.csv")
# df_mangadex = pd.read_csv("./data/mangadex.csv")

# Report the (rows, columns) shape of each loaded table.
print('Komikcast: ', df_komikcast.shape)
print('Westmanga: ', df_westmanga.shape)
# print('Mangadex : ', df_mangadex.shape)

Mangadex :  (23769, 10)


In [4]:
# Hash the image of every Komikcast row. This writes the 'image_hash' column
# of df_komikcast in place and downloads each image URL not yet in image_cache.
preprocess_images(df_komikcast)

In [None]:
# Hash the image of every Westmanga row. This writes the 'image_hash' column
# of df_westmanga in place and downloads each image URL not yet in image_cache.
preprocess_images(df_westmanga)

In [None]:
# preprocess_images(df_mangadex)

In [None]:
# Write both frames back to the same CSV files they were loaded from,
# overwriting them. The index is not written as a column.
df_komikcast.to_csv("./data/komikcast.csv", index=False)
df_westmanga.to_csv("./data/westmanga.csv", index=False)
# df_mangadex.to_csv("./data/mangadex.csv", index=False)

In [None]:
# Map every URL currently held in image_cache to the string form of its
# average hash.
image_cache_hashes = dict(
    (url, str(imagehash.average_hash(img)))
    for url, img in image_cache.items()
)

# Serialise before opening the file, so a serialisation error cannot leave a
# truncated image_cache.json behind.
json_object = json.dumps(image_cache_hashes, indent=4)

with open("./data/image_cache.json", "w") as outfile:
    outfile.write(json_object)

