# Import Library

In [11]:
import pandas as pd

# Description Similarity System
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from fuzzywuzzy import process

# Image Similarity System
import requests
from io import BytesIO
from PIL import Image
import imagehash
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function

In [12]:
# Download the NLTK stopwords corpus and the punkt tokenizer data if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Load the stop-word lists for Indonesian and English
stop_words_indonesian = stopwords.words('indonesian')
stop_words_english = stopwords.words('english')

# Initialise the Sastrawi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Normalise raw text before vectorisation.

    Each document is lower-cased, tokenised with ``nltk.word_tokenize``,
    stripped of non-alphabetic tokens and stop words, and (for Indonesian
    only) stemmed. The surviving tokens are re-joined with single spaces.

    Parameters
    ----------
    stop_words : iterable of str
        Tokens to discard.
    stemmer : object, default ''
        Object exposing ``stem(token)``; only used when ``lang`` is
        ``'indonesian'``.
    lang : str, default 'english'
        Language switch; ``'indonesian'`` enables stemming.
    """

    def __init__(self, stop_words, stemmer='', lang='english'):
        # Parameters are stored exactly as given; derived structures are
        # built in transform() so the constructor arguments stay readable
        # as plain attributes.
        self.stop_words = stop_words
        self.stemmer = stemmer
        self.lang = lang

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with every element replaced by its cleaned text.

        ``X`` must provide ``.apply`` (e.g. a pandas Series of strings).
        """
        # Build the lookup set once per call instead of scanning the
        # stop-word list for every token.
        stop_set = frozenset(self.stop_words)
        return X.apply(lambda text: self._preprocess_text(text, stop_set))

    def _preprocess_text(self, text, stop_words=None):
        """Clean a single document and return it as a space-joined string.

        ``stop_words`` is an optional pre-built collection used for the
        membership test; when omitted it is derived from ``self.stop_words``.
        """
        if stop_words is None:
            stop_words = frozenset(self.stop_words)

        tokens = nltk.word_tokenize(text.lower())
        kept = [
            token for token in tokens
            if token.isalpha() and token not in stop_words
        ]

        if self.lang == 'indonesian':
            kept = [self.stemmer.stem(token) for token in kept]

        return ' '.join(kept)

# Build the preprocessing + TF-IDF pipeline and return the cosine similarity matrix
def prepare_similarity_system(df, lang='english'):
    """Return the pairwise cosine similarity of the rows' descriptions.

    ``df['description']`` is cast to ``str``, cleaned by ``TextPreprocessor``
    and vectorised with ``TfidfVectorizer``. ``lang='indonesian'`` selects the
    Indonesian stop words together with the module-level stemmer; any other
    value selects the English stop words and no stemmer.
    """
    if lang == 'indonesian':
        chosen_stop_words, chosen_stemmer = stop_words_indonesian, stemmer
    else:
        chosen_stop_words, chosen_stemmer = stop_words_english, ''

    steps = [
        ('preprocessor', TextPreprocessor(chosen_stop_words, chosen_stemmer, lang)),
        ('tfidf', TfidfVectorizer()),
    ]

    # Run the descriptions through the pipeline
    descriptions = df['description'].astype(str)
    vectors = Pipeline(steps).fit_transform(descriptions)

    # Compute the cosine similarity of every document against every other
    return cosine_similarity(vectors, vectors)

# Rank the rows of df by description similarity to the given title
def get_similarity_description(title, df, cosine_sim):
    """Return the other rows of ``df`` ordered by description similarity.

    Parameters
    ----------
    title : str
        Title to look up. When it is not present in ``df['title']`` the
        closest fuzzy match is used instead and a notice is printed.
    df : pandas.DataFrame
        Must contain a ``'title'`` column. Row *i* (by position) is assumed
        to correspond to row *i* of ``cosine_sim``.
    cosine_sim : 2-D indexable
        Pairwise similarity scores, ``cosine_sim[i][j]``.

    Returns
    -------
    pandas.DataFrame
        Every row except the query row, most similar first, with columns
        ``'title'``, ``'description_similarity'`` and then the remaining
        columns of ``df``. The index is a fresh ``RangeIndex``.

    Notes
    -----
    The query row is excluded by position rather than by ``score < 1``:
    floating-point self-similarity is not guaranteed to be exactly 1, and
    other rows with an identical description are legitimate results.
    """
    titles = df['title']
    if title not in titles.values:
        matches = process.extract(title, titles, limit=5)
        best_match = matches[0][0]
        print(f"Exact title not found. Using closest match: '{best_match}'")
        title = best_match

    # Position (not index label) of the first row carrying this title, so the
    # lookup into cosine_sim is correct for any DataFrame index.
    target_pos = int((titles == title).to_numpy().argmax())

    # Ignore any trailing scores that have no matching row in df.
    row_scores = cosine_sim[target_pos][:len(df)]
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(row_scores) if pos != target_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Take the rows directly instead of merging on 'title', which would
    # multiply rows whenever a title occurs more than once.
    result = df.iloc[[pos for pos, _ in ranked]].reset_index(drop=True)
    result['description_similarity'] = pd.Series(
        [score for _, score in ranked], dtype='float64'
    )

    leading = ['title', 'description_similarity']
    remaining = [column for column in result.columns if column not in leading]
    return result[leading + remaining]

# Score every row by how many of the query title's genres it shares
def matching_genres(df, title):
    """Return a copy of ``df`` with a ``'genre_similarity'`` column added.

    Parameters
    ----------
    title : str
        Title whose genres are the reference. When it is not present in
        ``df['title']`` the closest fuzzy match is used and a notice is
        printed.
    df : pandas.DataFrame
        Must contain ``'title'`` and ``'genre'`` columns; genres are a
        ``', '``-separated string. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where ``'genre_similarity'`` is the number of the
        row's genres that appear among the reference genres, divided by the
        number of reference genres, times 100. Rows whose genre is not a
        string (e.g. NaN) score 0, as does every row when the reference row
        itself has no genre string.
    """
    if title not in df['title'].values:
        matches = process.extract(title, df['title'], limit=5)
        best_match = matches[0][0]
        print(f"Exact title not found. Using closest match: '{best_match}'")
        title = best_match

    def split_genres(value):
        # Missing genres arrive as NaN (a float); treat anything that is not
        # a string as "no genres".
        return value.split(', ') if isinstance(value, str) else []

    main_genres = split_genres(df.loc[df['title'] == title, 'genre'].iloc[0])
    main_genre_set = set(main_genres)
    total = len(main_genres)

    scores = []
    for value in df['genre']:
        if not total:
            # No reference genres: nothing can match, and avoid dividing by 0.
            scores.append(0.0)
            continue
        match_count = sum(1 for genre in split_genres(value) if genre in main_genre_set)
        scores.append((match_count / total) * 100)

    # Assign by position on a copy so the result is correct for any index and
    # the caller's frame is left untouched.
    filtered_df = df.copy()
    filtered_df['genre_similarity'] = scores
    return filtered_df

# image_cache = {}

# def download_image(image_url):
#     if image_url in image_cache:
#         return image_cache[image_url]
    
#     try:
#         response = requests.get(image_url)
#         response.raise_for_status()
#         img = Image.open(BytesIO(response.content))
#         image_cache[image_url] = img
#         return img
#     except (requests.RequestException, IOError) as e:
#         print(f"Error downloading or opening image from URL {image_url}: {e}")
#         return None

# def compute_image_hash(image_url):
#     image = download_image(image_url)
#     if image is None:
#         return None
#     return imagehash.average_hash(image)

# def preprocess_images(df, max_workers=10):
#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
#         future_to_index = {
#             executor.submit(compute_image_hash, row['image']): index
#             for index, row in df.iterrows()
#         }
#         for future in as_completed(future_to_index):
#             index = future_to_index[future]
#             try:
#                 image_hash = future.result()
#                 df.at[index, 'image_hash'] = image_hash
#             except Exception as e:
#                 print(f"Error processing row at index {index}: {e}")

#     return df

# def compute_image_similarity(hash1, hash2):
#     return 1 - (hash1 - hash2) / len(hash1.hash) ** 2

# def get_similarity_image(title, df):
#     if title not in df['title'].values:
#         matches = process.extract(title, df['title'], limit=5)
#         best_match = matches[0][0]
#         print(f"Exact title not found. Using closest match: '{best_match}'")
#         title = best_match
        
#     target_row = df[df['title'] == title].iloc[0]
#     target_hash = target_row['image_hash']
    
#     if target_hash is None:
#         print("Target image could not be processed.")
#         return []
    
#     similarities = []
#     for index, row in df.iterrows():
#         if row['title'] != title:
#             other_hash = row['image_hash']
#             if other_hash is None:
#                 continue
#             similarity = compute_image_similarity(target_hash, other_hash)
#             similarities.append((row['title'], similarity))
    
#     similarities.sort(key=lambda x: x[1], reverse=True)
#     similarity_df = pd.DataFrame(similarities, columns=['title', 'image_similarity'])
#     result_df = pd.merge(similarity_df, df, on='title')
    
#     return result_df


# def all_similarity(title, df, cosine_sim):
#     columns_to_drop = ['image_similarity','genre_similarity', 'description_similarity']
#     existing_columns = [col for col in columns_to_drop if col in df.columns]

#     if existing_columns:
#         df = df.drop(existing_columns, axis=1)

#     df = matching_genres(df, title)
#     df = get_similarity_description(title, df, cosine_sim)
#     df = get_similarity_image(title, df)
    
#     df = df.sort_values(by=['description_similarity', 'genre_similarity','image_similarity'], ascending=False)

#     return df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Images already fetched during this session, keyed by URL
image_cache = {}

def download_image(image_url, timeout=10):
    """Fetch ``image_url`` and return it as a PIL image, or None on failure.

    Parameters
    ----------
    image_url : str
        Address of the image.
    timeout : float or tuple, default 10
        Passed to ``requests.get``. Without a timeout a stalled server would
        block the calling thread indefinitely.

    Returns
    -------
    PIL.Image.Image or None
        The opened image (also stored in ``image_cache``), or None when the
        request fails or the payload cannot be opened as an image. Failures
        are reported on stdout and are not cached.
    """
    if image_url in image_cache:
        return image_cache[image_url]

    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        image_cache[image_url] = img
        return img
    except (requests.RequestException, IOError) as e:
        print(f"Error downloading or opening image from URL {image_url}: {e}")
        return None

def compute_image_hash(image_url):
    """Return ``imagehash.average_hash`` of the image at ``image_url``.

    Returns None when ``download_image`` yields None for that URL.
    """
    picture = download_image(image_url)
    return None if picture is None else imagehash.average_hash(picture)

def preprocess_images(df, max_workers=10):
    """Compute an image hash for every row of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'image'`` column holding image URLs. An
        ``'image_hash'`` column is (re)created on this frame.
    max_workers : int, default 10
        Size of the thread pool used to run ``compute_image_hash``.

    Returns
    -------
    None
        ``df['image_hash']`` holds the hash for each row, or None when the
        image could not be hashed.
    """
    # Create the column up front with object dtype so that both None and hash
    # objects are stored unchanged, and so it exists even for an empty frame.
    df['image_hash'] = None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(compute_image_hash, image_url): index
            for index, image_url in df['image'].items()
        }
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                image_hash = future.result()
            except Exception as e:
                # Best effort: report the failure and mark the row as missing
                # with None, the same marker used for failed downloads.
                print(f"Error processing row at index {index}: {e}")
                image_hash = None
            df.at[index, 'image_hash'] = image_hash

def compute_image_similarity(hash1, hash2):
    """Turn the difference between two hashes into a similarity score.

    ``hash1 - hash2`` is divided by the square of ``len(hash1.hash)`` and the
    quotient is subtracted from 1, so identical hashes score 1.
    NOTE(review): assumes ``hash1.hash`` is a square bit matrix, making the
    squared length its total bit count - confirm for non-default hash sizes.
    """
    difference = hash1 - hash2
    side = len(hash1.hash)
    return 1 - difference / side ** 2

def get_similarity_image(title, df):
    """Return the other rows of ``df`` ordered by image-hash similarity.

    Parameters
    ----------
    title : str
        Title to look up. When it is not present in ``df['title']`` the
        closest fuzzy match is used and a notice is printed.
    df : pandas.DataFrame
        Must contain ``'title'`` and ``'image_hash'`` columns.

    Returns
    -------
    pandas.DataFrame or list
        Rows whose title differs from the query and whose hash is usable,
        most similar first, with columns ``'title'``, ``'image_similarity'``
        and then the remaining columns of ``df`` (fresh ``RangeIndex``).
        An empty list is returned when the query row has no usable hash.
    """
    def has_hash(value):
        # Hash objects expose their bits as `.hash`; None, NaN and
        # placeholder strings do not and must be skipped.
        return hasattr(value, 'hash')

    titles = df['title']
    if title not in titles.values:
        matches = process.extract(title, titles, limit=5)
        best_match = matches[0][0]
        print(f"Exact title not found. Using closest match: '{best_match}'")
        title = best_match

    target_hash = df.loc[titles == title, 'image_hash'].iloc[0]
    if not has_hash(target_hash):
        print("Target image could not be processed.")
        return []

    positions = []
    scores = []
    for pos, (row_title, other_hash) in enumerate(zip(titles, df['image_hash'])):
        if row_title == title or not has_hash(other_hash):
            continue
        positions.append(pos)
        scores.append(compute_image_similarity(target_hash, other_hash))

    # Stable descending sort: ties keep their original row order.
    order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # Take the rows by position instead of merging on 'title', which would
    # multiply rows whenever a title occurs more than once.
    result = df.iloc[[positions[i] for i in order]].reset_index(drop=True)
    result['image_similarity'] = pd.Series(
        [scores[i] for i in order], dtype='float64'
    )

    leading = ['title', 'image_similarity']
    remaining = [column for column in result.columns if column not in leading]
    return result[leading + remaining]

def all_similarity(title, df, cosine_sim):
    """Combine genre, description and image similarity for ``title``.

    Parameters
    ----------
    title : str
        Title to look up. When it is not present in ``df['title']`` the
        closest fuzzy match is resolved once here and used for all three
        measures.
    df : pandas.DataFrame
        Source rows; must line up positionally with ``cosine_sim``. Any
        similarity columns left over from a previous call are discarded.
        The caller's frame is not modified.
    cosine_sim : 2-D indexable
        Description similarity matrix for ``df``.

    Returns
    -------
    pandas.DataFrame
        Rows ranked by description, then genre, then image similarity, all
        descending. When image similarity is unavailable for the query row,
        the description/genre ranking is returned with ``'image_similarity'``
        set to NaN.
    """
    score_columns = ['image_similarity', 'genre_similarity', 'description_similarity']
    row_key = '_all_similarity_row'  # temporary id used to join the rankings

    df = df.drop(columns=score_columns + [row_key], errors='ignore')

    # Resolve the title once so every measure compares against the same row.
    if title not in df['title'].values:
        matches = process.extract(title, df['title'], limit=5)
        best_match = matches[0][0]
        print(f"Exact title not found. Using closest match: '{best_match}'")
        title = best_match

    df = matching_genres(df, title)
    df[row_key] = range(len(df))

    # Both rankings are computed from the full frame. The description ranking
    # normally no longer contains the query row, so feeding it to the image
    # step would make that step fall back to a different, fuzzy-matched title.
    by_description = get_similarity_description(title, df, cosine_sim)
    by_image = get_similarity_image(title, df)

    if isinstance(by_image, pd.DataFrame):
        combined = by_description.merge(
            by_image[[row_key, 'image_similarity']], on=row_key, how='inner'
        )
    else:
        # get_similarity_image returns a plain list when the query image has
        # no usable hash; keep the other two measures in that case.
        combined = by_description.copy()
        combined['image_similarity'] = float('nan')

    combined = combined.drop(columns=row_key)
    leading = ['title', 'image_similarity', 'description_similarity']
    remaining = [column for column in combined.columns if column not in leading]
    combined = combined[leading + remaining]

    return combined.sort_values(
        by=['description_similarity', 'genre_similarity', 'image_similarity'],
        ascending=False,
    )

# Get Data

In [14]:
# Load the Komikcast, Westmanga and Mangadex datasets from the local ./data directory
df_komikcast=pd.read_csv("./data/komikcast.csv")
df_westmanga=pd.read_csv("./data/westmanga.csv")
df_mangadex=pd.read_csv("./data/mangadex.csv")

# Report the (rows, columns) shape of each frame
print('Komikcast: ', df_komikcast.shape)
print('Westmanga: ', df_westmanga.shape)
print('Mangadex : ', df_mangadex.shape)

Komikcast:  (8410, 10)
Westmanga:  (4810, 10)
Mangadex :  (24002, 10)


# Cosine Similarity

In [15]:
# Build one description-similarity matrix per dataset: Komikcast and Westmanga use the
# Indonesian stop words and stemmer, Mangadex uses the default English settings.
# NOTE(review): each result is an n x n matrix, so the 24002-row Mangadex frame is costly;
# the recorded run of this cell ended in KeyboardInterrupt - confirm time/memory budget.
consine_sim_indo_komikcast = prepare_similarity_system(df_komikcast, 'indonesian')
consine_sim_indo_westmanga = prepare_similarity_system(df_westmanga, 'indonesian')
consine_sim_eng_mangadex = prepare_similarity_system(df_mangadex)

KeyboardInterrupt: 

In [None]:
# Hash every Komikcast cover; writes the 'image_hash' column on df_komikcast in place
preprocess_images(df_komikcast)

In [None]:
# Hash every Westmanga cover; writes the 'image_hash' column on df_westmanga in place
preprocess_images(df_westmanga)

In [None]:
# Hash every Mangadex cover; writes the 'image_hash' column on df_mangadex in place
preprocess_images(df_mangadex)

# Get Function

In [None]:
# Title to look up in each dataset (the closest fuzzy match is used when it is absent)
title = 'Fairy Tail'

In [None]:
# Rank the Komikcast entries against `title` and preview the top rows.
# NOTE(review): this rebinds df_komikcast to the ranked result, so re-running the cell
# passes rows that no longer line up with consine_sim_indo_komikcast - consider a separate name.
df_komikcast = all_similarity(title, df_komikcast,consine_sim_indo_komikcast)
df_komikcast.head()

Exact title not found. Using closest match: 'Fairy Tail City Hero'


In [None]:
# Rank the Westmanga entries against `title` and preview the top rows.
# NOTE(review): this rebinds df_westmanga to the ranked result, so re-running the cell
# passes rows that no longer line up with consine_sim_indo_westmanga - consider a separate name.
df_westmanga = all_similarity(title, df_westmanga,consine_sim_indo_westmanga)
df_westmanga.head()

In [None]:
# Rank the Mangadex entries against `title` and preview the top rows.
# NOTE(review): this rebinds df_mangadex to the ranked result, so re-running the cell
# passes rows that no longer line up with consine_sim_eng_mangadex - consider a separate name.
df_mangadex  = all_similarity(title, df_mangadex ,consine_sim_eng_mangadex)
df_mangadex.head()