# Import Library

In [12]:
import pandas as pd
import numpy as np
import json

# Description Similarity System
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from fuzzywuzzy import process
import imagehash

# Function

## Description Similarity

In [53]:
# Function to get similarity
def get_similarity_description(title, df, cosine_sim):
    if title not in df['title'].values:
        matches = process.extract(title, df['title'], limit=5)
        best_match = matches[0][0]
        print(f"Exact title not found. Using closest match: '{best_match}'")
        title = best_match
        
    idx = df[df['title'] == title].index[0]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = [score for score in sim_scores if score[1] < 1]

    # sim_scores = sim_scores[:100]

    article_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]
    
    valid_indices = [i for i in article_indices if i < len(df)]
    
    # Handle cases where valid_indices and scores lengths do not match
    if len(valid_indices) != len(scores):
        min_length = min(len(valid_indices), len(scores))
        valid_indices = valid_indices[:min_length]
        scores = scores[:min_length]
    
    similarity_df = pd.DataFrame({'id': df.iloc[valid_indices]['id'], 'description_similarity': scores})
    result_df = pd.merge(similarity_df, df, on='id')
    
    return result_df

## Genre Similarity

In [14]:
# Matching of genres
def matching_genres(df, title):
    if title not in df['title'].values:
        matches = process.extract(title, df['title'], limit=5)
        best_match = matches[0][0]
        print(f"Exact title not found. Using closest match: '{best_match}'")
        title = best_match
        
    data = df[df['title'] == title]
    main_genres = (list(data.head(1)['genre'])[0]).split(', ')
    genre_similarity = df['genre']
    
    for i, rec in enumerate(genre_similarity):
        match_count = 0
        if isinstance(rec, float): 
            rec = str(rec) 
        genres = rec.split(', ')
        for j, genre in enumerate(genres):
            if genre in main_genres:
                match_count += 1
        
        genre_similarity = (match_count / len(main_genres)) * 100
        df.at[i, 'genre_similarity'] = genre_similarity
    filtered_df = df.copy()
        
    return filtered_df

## Image Similarity

In [15]:
def compute_image_similarity(hash1, hash2):
    return 1 - (hash1 - hash2) / len(hash1.hash) ** 2

def get_similarity_image(title, df):
    if title not in df['title'].values:
        matches = process.extract(title, df['title'], limit=5)
        best_match = matches[0][0]
        print(f"Exact title not found. Using closest match: '{best_match}'")
        title = best_match
        
    target_row = df[df['title'] == title].iloc[0]
    target_hash_str = target_row['image_hash']
    
    if not isinstance(target_hash_str, str):
        print("Target image hash is not valid.")
        return df
    
    target_hash = imagehash.hex_to_hash(target_hash_str)
    
    similarities = []
    for index, row in df.iterrows():
        if row['title'] != title:
            other_hash_str = row['image_hash']
            if not isinstance(other_hash_str, str):
                continue
            try:
                other_hash = imagehash.hex_to_hash(other_hash_str)
                similarity = compute_image_similarity(target_hash, other_hash)
                similarities.append((row['id'], similarity))
            except Exception as e:
                print(f"Error processing image hash for title '{row['title']}': {e}")
                continue
    
    similarities.sort(key=lambda x: x[1], reverse=True)
    similarity_df = pd.DataFrame(similarities, columns=['id', 'image_similarity'])
    result_df = pd.merge(similarity_df, df, on='id')
    
    return result_df

In [16]:
# def get_similarity_image(title, df):
#     if title not in df['title'].values:
#         matches = process.extract(title, df['title'], limit=5)
#         best_match = matches[0][0]
#         print(f"Exact title not found. Using closest match: '{best_match}'")
#         title = best_match
        
#     target_row = df[df['title'] == title].iloc[0]
#     target_hash = imagehash.hex_to_hash(target_row['image_hash'])
    
#     if target_hash is None:
#         print("Target image could not be processed.")
#         return []
    
#     similarities = []
#     for index, row in df.iterrows():
#         if row['title'] != title:
#             other_hash = imagehash.hex_to_hash(row['image_hash'])
#             if other_hash is None:
#                 continue
#             similarity = compute_image_similarity(target_hash, other_hash)
#             similarities.append((row['title'], similarity))
    
#     similarities.sort(key=lambda x: x[1], reverse=True)
#     similarity_df = pd.DataFrame(similarities, columns=['title', 'image_similarity'])
#     result_df = pd.merge(similarity_df, df, on='title')
    
#     return result_df

## Call Function

In [50]:
def all_similarity(title, df, cosine_sim):
    columns_to_drop = ['image_similarity','genre_similarity', 'description_similarity']
    existing_columns = [col for col in columns_to_drop if col in df.columns]

    if existing_columns:
        df = df.drop(existing_columns, axis=1)

    df = matching_genres(df, title) # Genre Similarity Function
    df = get_similarity_description(title, df, cosine_sim) # Description Similarity Function
    df = get_similarity_image(title, df) # Image Similarity Function
    
    df = df.sort_values(by=existing_columns, ascending=False)

    return df

# Get Data

In [18]:
df_komikcast=pd.read_csv("./data/komikcast.csv")
df_westmanga=pd.read_csv("./data/westmanga.csv")
# df_mangadex=pd.read_csv("./data-processed/mangadex.csv")

print('Komikcast: ', df_komikcast.shape)
print('Westmanga: ', df_westmanga.shape)
# print('Mangadex : ', df_mangadex.shape)

Komikcast:  (8410, 12)
Westmanga:  (4810, 12)


In [19]:
# Load the JSON file
with open("./data/consine_decription_similarity.json", "r") as infile:
    consine_decription_similarity = json.load(infile)

In [20]:
# Inverse: Convert lists back to NumPy arrays
consine_decription_similarity = {
    key: np.array(value) for key, value in consine_decription_similarity.items()
}

consine_sim_indo_komikcast = consine_decription_similarity['komikcast']
consine_sim_indo_westmanga = consine_decription_similarity['westmanga']

In [55]:
title = 'Fairy Tail'

In [56]:
df_komikcast = all_similarity(title, df_komikcast,consine_sim_indo_komikcast)
df_komikcast

Exact title not found. Using closest match: 'Fairy Tail Gaiden – Raigo Issen'
Exact title not found. Using closest match: 'Fairy Tail Gaiden – Raigo Issen'
Exact title not found. Using closest match: 'Fairy Tail: Blue Mistral'


Unnamed: 0,id,image_similarity,description_similarity,title,alt_title,type,description,genre,author,artist,rate,image,released,image_hash,genre_similarity
0,5310,0.781250,0.046687,Zettai ni Hatarakitakunai Dungeon Master ga Da...,I Absolutely Don't Want to Work as the Dungeon...,Manga,Masuda Keima anak lelaki sekolah menengah yang...,"Action, Adventure, Comedy, Drama, Ecchi, Fanta...",ONIKAGE Supana,-,7.0,https://komikcast.cz/wp-content/uploads/2020/0...,2020,3f031b97b980c0fe,75.0
1,3128,0.781250,0.013238,Miyori no Nai Onnanoko (A Girl With No Relatives),身寄りのない女の子,Manga,Ngomongngomong aku akan mengencani nya\n\n Pos...,Slice of Life,Endo,-,7.0,https://komikcast.cz/wp-content/uploads/2022/0...,2021,e3c181bd9999ddff,25.0
2,7150,0.765625,0.007019,Mirai no Fu Fu Desu Kedo?,"Mirai no Fuufu Desu Kedo?, みらいのふうふですけど？",Manga,Ya kehidupan yuri tau sendiri lah\n\n Post Vie...,"Ecchi, School Life, Slice of Life, Yuri",NONAKA Yuu,-,7.0,https://komikcast.cz/wp-content/uploads/2019/0...,2018,f0c080af8f80f9ff,0.0
3,5333,0.750000,0.041490,A Story About a Creepy Girl’s Smile,ゾクッとする女の子が笑ってくれる漫画,Manga,Ekspresi menyeramkan itu yang terbaik\n\n Post...,"Comedy, Romance, School Life, Slice of Life",チェックメイト (geroro44),-,7.0,https://komikcast.cz/wp-content/uploads/2020/0...,2020,ffc38999890072f6,75.0
4,5725,0.750000,0.032878,Kimetsu no Yaiba Chibi Collection,Kimetsu no Yaiba Chibi Butterfly,Manga,komik 4koma dari KnY hasil ilustrasi dari arti...,"4-Koma, Comedy, Fantasy, One-Shot, Slice of Life",m_2i_n,-,7.5,https://komikcast.cz/wp-content/uploads/2021/0...,2021,ffc39d9d111181ff,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7692,3166,0.265625,0.001643,How To Kill A God,신을 죽이는 방법,Manhwa,Mitos Terakhir\n\n Post Views 9,"Action, Fantasy, Mystery",Narak,-,7.0,https://komikcast.cz/wp-content/uploads/2022/0...,?,7e7e46307e5e7f00,50.0
7690,926,0.265625,0.008650,Emperor Qin Returns I Am The Eternal Immortal ...,Emperor Qin returns! I am the Immortal Emperor...,Manhua,Kaisar Qin Ying Zheng tidak dapat mengatasi ke...,"Action, Drama, Fantasy, Harem, Romance",博易动漫,-,7.0,https://komikcast.cz/wp-content/uploads/2024/0...,2024,0efef3c0c7277e00,25.0
7688,518,0.265625,0.031209,It Starts With A Mountain,"It Starts with a Mountain, Kāijú Yī Zuò Shān, ...",Manhua,Dimulai dengan satu pondok dan dua populasi ah...,"Action, Adventure, Drama, Historical",VV,-,7.0,https://komikcast.cz/wp-content/uploads/2021/0...,?,00186664fe7fff00,0.0
7691,776,0.265625,0.004034,Winning Pass,ウイニング パス,Manga,Haruto Ichinose punya kakak yaitu Shori Ichino...,"Comedy, Shounen, Sports",SAKAMOTO Tatsuya,-,7.0,https://komikcast.cz/wp-content/uploads/2024/0...,2023,1f1e7f6e7e7f0703,0.0


In [57]:
df_westmanga = all_similarity(title, df_westmanga,consine_sim_indo_westmanga)
df_westmanga.head(5)

Exact title not found. Using closest match: 'Fairy Tail 100 Years Quest'
Exact title not found. Using closest match: 'Fairy Tail 100 Years Quest'
Exact title not found. Using closest match: 'Fairy Tail Happys Grand'


Unnamed: 0,id,image_similarity,description_similarity,title,alt_title,type,description,genre,author,artist,rate,image,released,image_hash,genre_similarity
0,4501,0.828125,0.018807,Until Your Sword Breaks,Kimi no Katana ga Oreru Made - Tsukimiya Matsu...,Manga,Di Era Edo Tsukimiya Matsuri seorang gadis can...,"Comedy, Historical, Romance",Inoue Koharu,-,7.0,https://westmanga.fun/wp-content/uploads/2023/...,-,ffffd3b101c181c1,14.285714
1,3044,0.8125,0.023802,Otaku The Loser,Have Achieved a Lot My Achievements Are a Litt...,Manhua,Untuk menghargai perbuatan baik Meng Fan sebel...,"Comedy, Fantasy, School life, Shounen, Slice o...","夏宝, 虫2, 阅文集团起点中文网",-,7.0,https://westmanga.fun/wp-content/uploads/2021/...,-,ff89f381818181b1,42.857143
7,2803,0.78125,0.003438,Nankoufuraku no Maoujou e Youkoso,"Welcome to the Impregnable Demon King Castle, ...",Manga,Remme seorang black mage dikeluarkan dari part...,"Action, Adventure, Drama, Fantasy, Magic","Mitaka Hozumi, Kaidou Jeiichi",-,6.0,https://westmanga.fun/wp-content/uploads/2021/...,-,fbf3f3f1a0c1c1e3,42.857143
2,137,0.78125,0.0613,Akanabesensei wa Tereshirazu Akanabesensei Doe...,"Akanabe-sensei wa Tereshirazu, Akanabe-sensei ...",Manga,Koki Sakieda adalah asisten Akira Akanabe mang...,"Comedy, Romance, Shounen, Slice of Life",TAJIMI Naoya,-,7.0,https://westmanga.fun/wp-content/uploads/2024/...,-,ffe7f3e1c18000f1,28.571429
4,2177,0.78125,0.025459,Koudou ni Hattatsu Shita Igaku wa Mahou to Kub...,I Used High-Level Medicine to Counter Magic,Manga,Pandangan realistis tentang dokter modern di d...,"Adventure, Demons, Fantasy, Isekai, Magic, Med...",TSUDA Houkou,-,7.0,https://westmanga.fun/wp-content/uploads/2023/...,-,fff7e381a1e1c1e3,28.571429


In [58]:
# df_mangadex  = all_similarity(title, df_mangadex ,consine_sim_eng_mangadex)
# df_mangadex.head()