# Import Library

In [28]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Get Data

In [29]:
df=pd.read_csv("./data/komikcast.csv")
df.shape

(8410, 10)

In [30]:
df.head()

Unnamed: 0,title,alt_title,type,description,genre,author,artist,rate,image,released
0,Perfect Surgeon,"퍼펙트 써전, PFS",Manhwa,Saat dia masih kecil ayahnya dikabarkan mening...,"Drama, Fantasy, Medical, Sci-Fi, Shounen","MoiDal,뫼달",-,8.0,https://komikcast.cz/wp-content/uploads/2022/0...,2022
1,Otherworldly Sword King’s Survival Records,Survival Story of a Sword King in a Fantasy Wo...,Manhwa,Kau terpilih sebagai yang paling cocok Dengan ...,"Action, Adventure, Comedy, Demons, Fantasy, Is...","Kwon sun kyu, studio khit",-,7.0,https://komikcast.cz/wp-content/uploads/2023/0...,2020
2,Ghost Fixers,ゴーストフィクサーズ,Manga,Setelah peristiwa tertentu Kota Baru Mikurigao...,"Action, Shounen",TANAKA Yasuki,-,7.0,https://komikcast.cz/wp-content/uploads/2024/0...,2024
3,Might Through Death,"Become Stronger as You Die, Deadly Strong, Shi...",Manhwa,Mati hidup kembali dan tumbuh lebih kuat Kisah...,"Action, Sci-Fi, Seinen",SENOO Shippo,-,7.2,https://komikcast.cz/wp-content/uploads/2024/0...,2024
4,Ghoul ga Sekai wo Sukutta Koto wo Watashi dake...,"Only I Know the Ghoul Saved the World, グールが世界を...",Manga,Aku bukan pahlawan Aku monster yang memangsa m...,"Action, Adventure, Drama, Ecchi, Fantasy, Magi...",Katou Myoujin,-,7.0,https://komikcast.cz/wp-content/uploads/2023/0...,2023


In [31]:
df.columns

Index(['title', 'alt_title', 'type', 'description', 'genre', 'author',
       'artist', 'rate', 'image', 'released'],
      dtype='object')

In [32]:
df.isna().sum()

title             0
alt_title      1009
type             34
description       0
genre             6
author          323
artist            0
rate              0
image             0
released         55
dtype: int64

In [33]:
df.dropna(inplace=True)
df.head()

Unnamed: 0,title,alt_title,type,description,genre,author,artist,rate,image,released
0,Perfect Surgeon,"퍼펙트 써전, PFS",Manhwa,Saat dia masih kecil ayahnya dikabarkan mening...,"Drama, Fantasy, Medical, Sci-Fi, Shounen","MoiDal,뫼달",-,8.0,https://komikcast.cz/wp-content/uploads/2022/0...,2022
1,Otherworldly Sword King’s Survival Records,Survival Story of a Sword King in a Fantasy Wo...,Manhwa,Kau terpilih sebagai yang paling cocok Dengan ...,"Action, Adventure, Comedy, Demons, Fantasy, Is...","Kwon sun kyu, studio khit",-,7.0,https://komikcast.cz/wp-content/uploads/2023/0...,2020
2,Ghost Fixers,ゴーストフィクサーズ,Manga,Setelah peristiwa tertentu Kota Baru Mikurigao...,"Action, Shounen",TANAKA Yasuki,-,7.0,https://komikcast.cz/wp-content/uploads/2024/0...,2024
3,Might Through Death,"Become Stronger as You Die, Deadly Strong, Shi...",Manhwa,Mati hidup kembali dan tumbuh lebih kuat Kisah...,"Action, Sci-Fi, Seinen",SENOO Shippo,-,7.2,https://komikcast.cz/wp-content/uploads/2024/0...,2024
4,Ghoul ga Sekai wo Sukutta Koto wo Watashi dake...,"Only I Know the Ghoul Saved the World, グールが世界を...",Manga,Aku bukan pahlawan Aku monster yang memangsa m...,"Action, Adventure, Drama, Ecchi, Fantasy, Magi...",Katou Myoujin,-,7.0,https://komikcast.cz/wp-content/uploads/2023/0...,2023


In [34]:
df['genre'][0]

'Drama, Fantasy, Medical, Sci-Fi, Shounen'

In [35]:
def extract_genres(input_string):
    genres_data = input_string.split(',')
    extracted_genres = set()
    for genre_entry in genres_data:
        # extracted_genres.add(genre_entry)
        # return extracted_genres
        if len(genre_entry) >= 2:
            genre_name = genre_entry.strip()
            extracted_genres.add(genre_name)
    return extracted_genres

df['genre'] = df["genre"].apply(extract_genres)

In [36]:
# Download stopwords NLTK jika belum diunduh
nltk.download('stopwords')
nltk.download('punkt')

# Mendapatkan daftar stop words dalam bahasa Indonesia
stop_words = stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [37]:
# Inisialisasi stemmer Sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Membuat DataFrame
df = df.copy()

# Custom Transformer for Text Preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words, stemmer):
        self.stop_words = stop_words
        self.stemmer = stemmer
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.apply(self._preprocess_text)
    
    def _preprocess_text(self, text):
        tokens = nltk.word_tokenize(text.lower())
        filtered_tokens = [token for token in tokens if token.isalpha() and token not in self.stop_words]
        stemmed_tokens = [self.stemmer.stem(token) for token in filtered_tokens]
        return ' '.join(stemmed_tokens)

# Pipeline for TF-IDF Vectorization with Preprocessing
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor(stop_words, stemmer)),
    ('tfidf', TfidfVectorizer())
])

# Apply pipeline to descriptions
tfidf_matrix = pipeline.fit_transform(df['description'])

# Menghitung kemiripan cosine
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [38]:
# Fungsi untuk mendapatkan rekomendasi berdasarkan judul
def get_recommendations(title, cosine_sim=cosine_sim):
    # Mendapatkan indeks dari judul yang diberikan
    idx = df[df['title'] == title].index[0]
    
    # Mengurutkan skor kemiripan
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Mendapatkan skor dari 5 artikel yang paling mirip
    sim_scores = sim_scores[1:6]
    
    # Mendapatkan indeks artikel-artikel tersebut dan skornya
    article_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]
    
    # Mengembalikan judul artikel-artikel yang direkomendasikan dan skornya
    recommendations = [(df['title'].iloc[i], scores[idx]) for idx, i in enumerate(article_indices)]
    return recommendations

In [40]:
# Contoh penggunaan
print("Rekomendasi untuk 'Night Bookstore':")
recommendations = get_recommendations('Night Bookstore')
for title, score in recommendations:
    print(f"Title: {title}, Similarity Score: {score:.4f}")

Rekomendasi untuk 'Night Bookstore':
Title: Spy Room, Similarity Score: 0.2476
Title: NEET de Otaku na Kunoichi to naze ka Dousei hajimemashita (Somehow, I Started Living With a NEET Otaku Kunoichi), Similarity Score: 0.2000
Title: Distancia ~ The Untouchable One, Similarity Score: 0.1729
Title: A Story About My Girlfriend Whose Personality Changes Everyday, Similarity Score: 0.1726
Title: A Quirky Girl Is Inviting Me to Bed, Similarity Score: 0.1562
