In [9]:
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image
from keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

# Image Similarity System

# Load VGG16 pre-trained on ImageNet, then build a second model that shares its
# input but stops at the 'fc1' layer, so `model.predict` returns that layer's
# activations (used below as the image feature vector).
base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc1').output)

def download_and_preprocess_image(url, timeout=10):
    """Download an image and convert it into a single-image VGG16 input batch.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so an unresponsive server cannot block the
        caller forever. Defaults to 10 seconds.

    Returns
    -------
    numpy.ndarray or None
        The image converted to RGB, resized to 224x224, turned into an array,
        given a leading batch axis and passed through ``preprocess_input``.
        ``None`` when the download or decoding fails for any reason.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of trying to decode an
        # error page as if it were an image.
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as raw_img:
            img = raw_img.convert('RGB')  # Convert the image to RGB
            img = img.resize((224, 224))  # VGG16 expects 224x224 images
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)  # add the batch axis
        img_array = preprocess_input(img_array)
        return img_array
    except Exception:
        # Deliberately best-effort: any download/decoding problem yields None
        # so the caller can substitute a placeholder. Catching ``Exception``
        # (not a bare ``except``) lets KeyboardInterrupt/SystemExit propagate,
        # so a long-running loop over many URLs can still be interrupted.
        return None


def extract_features(image_array):
    """Run ``image_array`` through the module-level ``model`` and return the
    prediction collapsed to a one-dimensional vector.
    """
    return model.predict(image_array).flatten()

def prepare_image_similarity(df):
    """Build the pairwise cosine-similarity matrix of the images in ``df``.

    Every URL in ``df['image']`` is downloaded with
    ``download_and_preprocess_image`` and turned into a feature vector with
    ``extract_features``. Rows whose image cannot be obtained are represented
    by an all-zero vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``image`` column of image URLs.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row/column per row of ``df`` (in the same
        order). An empty ``df`` yields an empty ``(0, 0)`` array.
    """
    # Width used for the zero placeholder when *no* image could be processed.
    # 4096 is the value this function has always used (presumably the size of
    # the 'fc1' output selected for `model` -- verify if the layer changes).
    fallback_dim = 4096

    features_list = []
    for img_url in df['image']:
        img_array = download_and_preprocess_image(img_url)
        # Keep a None placeholder for failures; it is replaced by zeros once
        # the real feature width is known.
        features_list.append(
            extract_features(img_array) if img_array is not None else None
        )

    if not features_list:
        # cosine_similarity rejects an empty 1-D array, so short-circuit.
        return np.empty((0, 0))

    # Size the zero vectors after the features actually produced, so the matrix
    # stays rectangular even if the feature extractor's output width changes.
    feature_dim = next(
        (len(f) for f in features_list if f is not None), fallback_dim
    )
    features_matrix = np.array([
        f if f is not None else np.zeros((feature_dim,))
        for f in features_list
    ])
    image_cosine_sim = cosine_similarity(features_matrix, features_matrix)
    return image_cosine_sim

def get_image_similarity(title, df, image_cosine_sim):
    """Rank the rows of ``df`` by image similarity to the row titled ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``df['title']``. If no row matches exactly, the
        closest title according to ``fuzzywuzzy.process`` is used instead and
        a notice is printed.
    df : pandas.DataFrame
        Must contain a ``title`` column. Its row order must correspond to the
        rows of ``image_cosine_sim``; its index labels are irrelevant.
    image_cosine_sim : numpy.ndarray
        Pairwise similarity matrix; row ``i`` holds the similarities of the
        ``i``-th row of ``df`` to every row.

    Returns
    -------
    pandas.DataFrame
        All other rows of ``df`` ordered from most to least similar, with
        columns ``title``, ``image_similarity`` and then the remaining columns
        of ``df``. The query row itself is excluded. The index is a fresh
        ``RangeIndex``.

    Raises
    ------
    ValueError
        If ``df`` has no rows, so there is nothing to match ``title`` against.
    """
    titles = df['title']
    is_query = (titles == title).to_numpy()

    if not is_query.any():
        if titles.empty:
            raise ValueError("Cannot look up a title in an empty DataFrame.")
        # Only the best candidate is used, so ask for a single match.
        matches = process.extract(title, titles, limit=1)
        best_match = matches[0][0]
        print(f"Exact title not found. Using closest match: '{best_match}'")
        title = best_match
        is_query = (titles == title).to_numpy()

    # Positional row number (not the index label): the similarity matrix is
    # addressed by position, so this stays correct for any DataFrame index.
    query_pos = int(np.flatnonzero(is_query)[0])

    sims = np.asarray(image_cosine_sim[query_pos])
    # Stable sort on the negated scores = descending order, ties kept in
    # their original row order.
    order = np.argsort(-sims, kind='stable')
    # Drop the query row by position rather than by "score < 1": the
    # self-similarity is not exactly 1 under floating-point error, and other
    # rows with an identical image should still be reported. Positions beyond
    # the end of `df` are discarded together with their scores.
    order = order[(order != query_pos) & (order < len(df))]

    # Select rows by position instead of merging on 'title', which would
    # multiply rows and mix up scores whenever titles are duplicated.
    other_cols = [c for c in df.columns if c not in ('title', 'image_similarity')]
    result_df = (
        df.iloc[order]
        .reset_index(drop=True)
        .assign(image_similarity=sims[order])
    )
    return result_df[['title', 'image_similarity'] + other_cols]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Load the dataset from a CSV file. The functions defined above read its
# `title` and `image` columns.
df_mangadex = pd.read_csv("./data/mangadex.csv")

In [None]:
# Example usage
# Downloads every image listed in the `image` column and runs each one through
# the model, so this cell is likely slow on a large dataset.
image_cosine_sim = prepare_image_similarity(df_mangadex)

In [None]:
# Rank the dataset by image similarity to 'Fairy Tail'; if that exact title is
# not present, the closest fuzzy title match is used instead.
similar_image_df = get_image_similarity('Fairy Tail', df_mangadex, image_cosine_sim)