In [1]:
import pandas as pd
import numpy as np
import json

# Image Similarity System
from fuzzywuzzy import process
import requests
from io import BytesIO
from PIL import Image
import imagehash
from concurrent.futures import ThreadPoolExecutor, as_completed



In [2]:
# Maps image URL -> PIL image for every URL that was downloaded successfully.
# Shared by all worker threads; failed downloads are never stored here.
image_cache = {}

def download_image(image_url, timeout=10):
    """Download an image and return it as a PIL image, using ``image_cache``.

    Parameters
    ----------
    image_url : str
        URL of the image to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without it a stalled server
        would block the calling worker thread indefinitely.

    Returns
    -------
    PIL.Image.Image or None
        The opened image, or ``None`` when the request fails, the server
        answers with an error status, or the payload cannot be opened as an
        image. Failures are not cached, so a later call retries the URL.
    """
    if image_url in image_cache:
        return image_cache[image_url]

    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
    except (requests.RequestException, IOError):
        return None

    image_cache[image_url] = img
    return img

def compute_image_hash(image_url):
    """Return the average hash of the image at ``image_url``.

    The image is obtained through ``download_image``; when that yields
    ``None`` (the image could not be fetched or opened) this function
    returns ``None`` as well instead of hashing.
    """
    fetched = download_image(image_url)
    return None if fetched is None else imagehash.average_hash(fetched)

def preprocess_images(df, max_workers=10):
    """Hash every image referenced in ``df['image']`` and store it in ``df``.

    The frame is modified in place: an ``image_hash`` column is filled row by
    row as the downloads finish. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``image`` column holding the image URLs.
    max_workers : int, optional
        Size of the thread pool used for the downloads.

    Notes
    -----
    Per row, ``image_hash`` ends up as one of:

    * the value returned by ``compute_image_hash`` (``None`` when the image
      could not be downloaded or opened),
    * the string ``'-'`` when hashing or storing the result raised.
    """
    # Create the column up front so it exists even for an empty frame and
    # already has object dtype before the first hash is written.
    if 'image_hash' not in df.columns:
        df['image_hash'] = None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Only the URL column is needed, so iterate over it directly instead
        # of materialising a full row Series with iterrows().
        future_to_index = {
            executor.submit(compute_image_hash, image_url): index
            for index, image_url in df['image'].items()
        }
        # Results are written from this (the calling) thread as they arrive.
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                df.loc[index, 'image_hash'] = future.result()
            except Exception:
                # Best effort: mark the row and keep processing the others.
                df.loc[index, 'image_hash'] = '-'

In [3]:
# Load the scraped catalogue. Only the MangaDex export is read here; the
# Komikcast and Westmanga loads below are left disabled.
# df_komikcast=pd.read_csv("./data/komikcast.csv")
# df_westmanga=pd.read_csv("./data/westmanga.csv")
df_mangadex=pd.read_csv("./data/mangadex.csv")

# Print the (rows, columns) shape as a quick sanity check of the load.
# print('Komikcast: ', df_komikcast.shape)
# print('Westmanga: ', df_westmanga.shape)
print('Mangadex : ', df_mangadex.shape)

Mangadex :  (23769, 10)


In [4]:
# preprocess_images(df_komikcast)
# preprocess_images(df_westmanga)

In [8]:
# Split MangaDex into chunks so the image hashing can be run piece by piece.
num_split = 24
# np.array_split() is applied to the row positions rather than to the
# DataFrame itself: splitting a DataFrame directly goes through
# DataFrame.swapaxes, which pandas has deprecated. The chunk sizes are the
# same as np.array_split(df_mangadex, num_split) would produce.
# .copy() gives every chunk its own data, so adding a column to a chunk later
# does not trigger SettingWithCopyWarning.
dfs_mangadex = [
    df_mangadex.iloc[positions].copy()
    for positions in np.array_split(np.arange(len(df_mangadex)), num_split)
]
# The numbered names are kept because later cells refer to them individually.
(
    dfs_mangadex_01, dfs_mangadex_02, dfs_mangadex_03, dfs_mangadex_04,
    dfs_mangadex_05, dfs_mangadex_06, dfs_mangadex_07, dfs_mangadex_08,
    dfs_mangadex_09, dfs_mangadex_10, dfs_mangadex_11, dfs_mangadex_12,
    dfs_mangadex_13, dfs_mangadex_14, dfs_mangadex_15, dfs_mangadex_16,
    dfs_mangadex_17, dfs_mangadex_18, dfs_mangadex_19, dfs_mangadex_20,
    dfs_mangadex_21, dfs_mangadex_22, dfs_mangadex_23, dfs_mangadex_24,
) = dfs_mangadex

In [9]:
# Hash the images of every chunk in turn. preprocess_images() fills the
# `image_hash` column of the chunk in place, and the entries of `dfs_mangadex`
# are the same objects as the numbered `dfs_mangadex_NN` names, so those are
# updated as well. One loop replaces the former one-cell-per-chunk layout; the
# progress line shows how far a long run has got.
for chunk_number, chunk in enumerate(dfs_mangadex, start=1):
    preprocess_images(chunk)
    print('Processed chunk', chunk_number, 'of', len(dfs_mangadex))

In [None]:
# Persist the hash of every cached image as {url: hex string} so the hashes
# can be reloaded later without downloading the images again.
image_cache_hashes = {}
unhashable_urls = []
for url, img in image_cache.items():
    try:
        image_cache_hashes[url] = str(imagehash.average_hash(img))
    except Exception:
        # Image.open() only identifies the file; pixel data is decoded on
        # first use, so a cached image can still fail here. Skip it rather
        # than abort the whole dump (preprocess_images treats such failures
        # as non-fatal as well).
        unhashable_urls.append(url)

if unhashable_urls:
    print('Skipped', len(unhashable_urls), 'cached image(s) that could not be hashed')

json_object = json.dumps(image_cache_hashes, indent=4)

with open("./data/image_cache2.json", "w") as outfile:
    outfile.write(json_object)



In [None]:
# Stitch the processed chunks back together in their original order.
# `dfs_mangadex` holds the same frames as dfs_mangadex_01 ... dfs_mangadex_24.
df_mangadex_concat = pd.concat(dfs_mangadex)

In [None]:
# Write the recombined frame (including the image_hash column) to disk;
# index=False leaves the DataFrame index out of the CSV.
df_mangadex_concat.to_csv("./data/mangadex_process.csv", index=False)

In [None]:
# # Load the JSON file
# with open("./data/image_cache2.json", "r") as infile:
#     loaded_image_hashes = json.load(infile)
    
# # Reconstruct the hashes
# reconstructed_hashes = {url: imagehash.hex_to_hash(hash_str) for url, hash_str in loaded_image_hashes.items()}

# reconstructed_hashes