# Import Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Get Data

In [2]:
# Load the dataset from a CSV file located relative to the notebook.
df=pd.read_csv("./data/westmanga.csv")
# Display (n_rows, n_columns) of the loaded frame.
df.shape

(4810, 10)

In [3]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,title,alt_title,type,description,genre,author,artist,rate,image,released
0,Sabitsuita Ken wo Tameshi ni Kyoukashite Mitar...,"Saat Aku Mencoba Memperkuat ""Pedang Berkarat"",...",Manga,Di dunia ini ketika Kamu berusia dua belas tah...,"Action, Adventure, Fantasy, Shounen","HASEGAWA Shunya, MANNO Mizuki",Yaeichi,8.2,https://westmanga.fun/wp-content/uploads/2021/...,2021
1,Cold,"""Cold"" College Student x College Student.",Manga,wansut,"Oneshot, Romance",Tony Hashimen,-,7.12,https://westmanga.fun/wp-content/uploads/2023/...,-
2,Honyaku no Sainou de Ore Dake ga Sekai wo Kaih...,[Honyaku] no Sainou de Ore Dake ga Sekai wo Ka...,Manga,Noah lahir dalam keluarga bangsawan sihir berg...,"Action, Adventure, Comedy, Ecchi, Fantasy, Har...",AONO Hakuto,MARU Tomoyuki,7.0,https://westmanga.fun/wp-content/uploads/2022/...,-
3,Its Too Precious and Hard to Read 4P Short Sto...,"""It's Too Precious and Hard to Read !!"" 4P Sho...",Manga,Kumpulan cerita pendek lucu dan manis untuk di...,"Anthology, Comedy, Drama, Oneshot, Romance, Sc...","9ºC, Ameno, Anthology, ARAOKA Aoi, ASAI Okuta,...","9ºC, Ameno, ARAOKA Aoi, ASAI Okuta, Azuma Fuyu...",7.6,https://westmanga.fun/wp-content/uploads/2021/...,2019
4,Jako ni wa Kaji ga Oniaida www to Iwareta Kaji...,「雑魚には鍛冶がお似合いだwww」と言われた鍛冶レベル9999の俺、追放されたので冒険者に転...,Manga,Makina seorang pandai besi yang bekerja di gui...,"Action, Adventure, Comedy, Fantasy, Harem, Seinen",HARIYA Keita,MITSUKOSHI Haruha,7.2,https://westmanga.fun/wp-content/uploads/2023/...,-


In [4]:
# List the column labels available in the frame.
df.columns

Index(['title', 'alt_title', 'type', 'description', 'genre', 'author',
       'artist', 'rate', 'image', 'released'],
      dtype='object')

In [5]:
# Count the missing values in each column.
df.isna().sum()

title            0
alt_title      642
type             0
description     52
genre           12
author           1
artist           0
rate             0
image            1
released        34
dtype: int64

In [6]:
# Drop every row that has a missing value in any column, then renumber the
# remaining rows 0..n-1. Resetting the index keeps index labels equal to row
# positions, which matters because the similarity matrix computed later in
# this notebook is addressed by row position.
# NOTE(review): this also discards rows whose only gap is in a column the
# recommender never reads (alt_title alone has 642 missing values); consider
# dropna(subset=['description', 'genre']) if those rows should be kept.
df = df.dropna().reset_index(drop=True)
df.head()

Unnamed: 0,title,alt_title,type,description,genre,author,artist,rate,image,released
0,Sabitsuita Ken wo Tameshi ni Kyoukashite Mitar...,"Saat Aku Mencoba Memperkuat ""Pedang Berkarat"",...",Manga,Di dunia ini ketika Kamu berusia dua belas tah...,"Action, Adventure, Fantasy, Shounen","HASEGAWA Shunya, MANNO Mizuki",Yaeichi,8.2,https://westmanga.fun/wp-content/uploads/2021/...,2021
1,Cold,"""Cold"" College Student x College Student.",Manga,wansut,"Oneshot, Romance",Tony Hashimen,-,7.12,https://westmanga.fun/wp-content/uploads/2023/...,-
2,Honyaku no Sainou de Ore Dake ga Sekai wo Kaih...,[Honyaku] no Sainou de Ore Dake ga Sekai wo Ka...,Manga,Noah lahir dalam keluarga bangsawan sihir berg...,"Action, Adventure, Comedy, Ecchi, Fantasy, Har...",AONO Hakuto,MARU Tomoyuki,7.0,https://westmanga.fun/wp-content/uploads/2022/...,-
3,Its Too Precious and Hard to Read 4P Short Sto...,"""It's Too Precious and Hard to Read !!"" 4P Sho...",Manga,Kumpulan cerita pendek lucu dan manis untuk di...,"Anthology, Comedy, Drama, Oneshot, Romance, Sc...","9ºC, Ameno, Anthology, ARAOKA Aoi, ASAI Okuta,...","9ºC, Ameno, ARAOKA Aoi, ASAI Okuta, Azuma Fuyu...",7.6,https://westmanga.fun/wp-content/uploads/2021/...,2019
4,Jako ni wa Kaji ga Oniaida www to Iwareta Kaji...,「雑魚には鍛冶がお似合いだwww」と言われた鍛冶レベル9999の俺、追放されたので冒険者に転...,Manga,Makina seorang pandai besi yang bekerja di gui...,"Action, Adventure, Comedy, Fantasy, Harem, Seinen",HARIYA Keita,MITSUKOSHI Haruha,7.2,https://westmanga.fun/wp-content/uploads/2023/...,-


In [7]:
# Inspect the raw genre string of the first row. `.iloc` is positional, so
# this works even if the row labelled 0 was removed by the earlier dropna().
df['genre'].iloc[0]

'Action, Adventure, Fantasy, Shounen'

In [8]:
def extract_genres(input_string):
    """Split a comma-separated genre string into a set of genre names.

    Every entry is stripped of surrounding whitespace *before* the length
    filter is applied, so the filter behaves the same for every position in
    the string and whitespace-only entries never end up as an empty genre.

    Args:
        input_string: Comma-separated genres, e.g. "Action, Adventure".

    Returns:
        set[str]: The distinct genre names that are at least two characters
        long after stripping (empty pieces and stray single characters are
        discarded).
    """
    stripped_entries = (entry.strip() for entry in input_string.split(','))
    return {name for name in stripped_entries if len(name) >= 2}

# Replace each raw genre string with its parsed set of genre names.
# Only string entries are parsed, so re-running this cell leaves the sets
# produced by a previous run untouched instead of failing on `.split`.
df['genre'] = df["genre"].apply(
    lambda genres: extract_genres(genres) if isinstance(genres, str) else genres
)

In [9]:
# Download the NLTK resources if they are not already present
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load its models from the
# 'punkt_tab' resource rather than 'punkt'; fetch it too so tokenization
# works on a fresh environment.
nltk.download('punkt_tab')

# Get the list of Indonesian stop words
stop_words = stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Initialise the Sastrawi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Rebind df to a copy of itself
df = df.copy()

# Custom Transformer for Text Preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that normalises raw text before vectorisation.

    Each text is lower-cased, tokenised with ``nltk.word_tokenize``, stripped
    of non-alphabetic tokens and stop words, stemmed token by token, and
    re-joined into a single space-separated string.

    Args:
        stop_words: Collection of lower-case words to remove.
        stemmer: Object exposing ``stem(token) -> str``.
    """

    def __init__(self, stop_words, stemmer):
        # Parameters are stored unmodified; the set used for fast lookups is
        # derived at transform time instead of replacing the attribute.
        self.stop_words = stop_words
        self.stemmer = stemmer

    def fit(self, X, y=None):
        """No-op: the transformer learns nothing from the data. Returns self."""
        return self

    def transform(self, X, y=None):
        """Preprocess every text in ``X``.

        Args:
            X: Object exposing ``.apply(func)`` (e.g. a pandas Series) whose
                elements are strings.
            y: Ignored.

        Returns:
            The result of ``X.apply`` with each element replaced by its
            preprocessed string.
        """
        # Build the lookup set once per call rather than scanning the
        # stop-word list for every single token.
        stop_set = frozenset(self.stop_words)
        return X.apply(lambda text: self._preprocess_text(text, stop_set))

    def _preprocess_text(self, text, stop_set=None):
        """Return the cleaned, stemmed form of a single text.

        Args:
            text: The raw string to process.
            stop_set: Optional pre-built set of stop words; derived from
                ``self.stop_words`` when omitted.
        """
        if stop_set is None:
            stop_set = frozenset(self.stop_words)
        tokens = nltk.word_tokenize(text.lower())
        # Keep purely alphabetic tokens that are not stop words.
        filtered_tokens = [
            token for token in tokens
            if token.isalpha() and token not in stop_set
        ]
        stemmed_tokens = [self.stemmer.stem(token) for token in filtered_tokens]
        return ' '.join(stemmed_tokens)

# Pipeline for TF-IDF Vectorization with Preprocessing
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor(stop_words, stemmer)),
    ('tfidf', TfidfVectorizer())
])

# Apply pipeline to descriptions
tfidf_matrix = pipeline.fit_transform(df['description'])

# Compute the pairwise cosine similarity between all descriptions.
# Row/column i of the result corresponds to the i-th row of df by position.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [11]:
# Function that returns recommendations for a given title
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles whose descriptions are most similar to ``title``.

    Args:
        title: Exact title to look up in ``df['title']``. If several rows
            share the title, the first one is used.
        cosine_sim: Square similarity matrix whose row/column ``i``
            corresponds to the ``i``-th row of ``df`` by position.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        list[tuple[str, float]]: ``(title, similarity_score)`` pairs sorted
        by descending score. The queried row itself is never included.

    Raises:
        IndexError: If ``title`` does not occur in ``df['title']``.
    """
    # Find the row by *position*: `cosine_sim` is addressed positionally,
    # whereas df's index labels can have gaps after rows were dropped.
    matches = (df['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Title not found in df['title']: {title!r}")
    position = int(matches[0])

    # Exclude the queried row explicitly. Relying on it sorting first breaks
    # when other rows tie with it (e.g. identical descriptions score 1.0).
    sim_scores = [
        (row, score)
        for row, score in enumerate(cosine_sim[position])
        if row != position
    ]
    # Stable sort: rows with equal scores keep their original order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Pair each of the best-scoring rows with its title.
    return [
        (df['title'].iloc[row], score)
        for row, score in sim_scores[:max(top_n, 0)]
    ]

In [14]:
# Example usage
# NOTE(review): scores of 1.0000 mean the preprocessed descriptions are
# identical; presumably these rows share the same placeholder description
# as 'Cold' -- verify against the data.
print("Rekomendasi untuk 'Cold':")
recommendations = get_recommendations('Cold')
for title, score in recommendations:
    print(f"Title: {title}, Similarity Score: {score:.4f}")

Rekomendasi untuk 'Cold':
Title: A Bug in the World System, Similarity Score: 1.0000
Title: A Nyakuza Manga, Similarity Score: 1.0000
Title: A Romcom About a Dark Witch and a Zombie, Similarity Score: 1.0000
Title: Au Tabi ni Hikuku Natte iku Onnanoko, Similarity Score: 1.0000
Title: Ill Keep Making It For You, Similarity Score: 1.0000
