In [1]:
import pandas as pd
import numpy as np
import json

# Description Similarity System
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from fuzzywuzzy import process



In [2]:
# Download the NLTK resources (no-op when they are already up to date)
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.9+ loads the tokenizer models used by nltk.word_tokenize from
# 'punkt_tab' instead of 'punkt'; without it word_tokenize raises LookupError
# on a fresh environment. 'punkt' is still fetched for older NLTK releases.
nltk.download('punkt_tab')

# Stop-word lists for Indonesian and English
stop_words_indonesian = stopwords.words('indonesian')
stop_words_english = stopwords.words('english')

# Initialise the Sastrawi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Custom Transformer for Text Preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans raw text before vectorisation.

    Each document is lower-cased, tokenised with ``nltk.word_tokenize``,
    reduced to alphabetic tokens that are not stop words, optionally stemmed,
    and joined back into one space-separated string.

    Parameters
    ----------
    stop_words : iterable of str or None
        Tokens to drop; they are compared against the lower-cased tokens.
    stemmer : object, default=''
        Object exposing ``stem(token)``. It is only used when ``lang`` is
        ``'indonesian'``; a value without a ``stem`` attribute (such as the
        default ``''``) disables stemming.
    lang : str, default='english'
        Stemming is applied only when this equals ``'indonesian'``.
    """

    def __init__(self, stop_words, stemmer='', lang='english'):
        # Parameters are stored unmodified so that get_params/clone keep working.
        self.stop_words = stop_words
        self.stemmer = stemmer
        self.lang = lang

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the transformer API."""
        return self

    def transform(self, X, y=None):
        """Clean every document in ``X``.

        A ``pandas.Series`` is returned for ``pandas.Series`` input (index
        preserved); any other iterable of strings yields a list.
        """
        # Build the lookup set once per call instead of scanning the stop-word
        # list for every single token.
        stop_set = self._stop_word_set()

        def clean(text):
            return self._preprocess_text(text, stop_set)

        if isinstance(X, pd.Series):
            return X.apply(clean)
        return [clean(text) for text in X]

    def _stop_word_set(self):
        """Return the configured stop words as a frozenset (empty if None)."""
        return frozenset(() if self.stop_words is None else self.stop_words)

    def _preprocess_text(self, text, stop_words=None):
        """Return the cleaned, space-joined form of one document.

        ``stop_words`` lets ``transform`` pass a prebuilt set; when omitted the
        set is derived from ``self.stop_words``.
        """
        if stop_words is None:
            stop_words = self._stop_word_set()

        tokens = nltk.word_tokenize(text.lower())
        filtered_tokens = [
            token for token in tokens
            if token.isalpha() and token not in stop_words
        ]

        # Only stem when a usable stemmer was supplied; the default '' has no
        # .stem and previously raised AttributeError here.
        if self.lang == 'indonesian' and hasattr(self.stemmer, 'stem'):
            filtered_tokens = [self.stemmer.stem(token) for token in filtered_tokens]

        return ' '.join(filtered_tokens)

# Function to prepare the pipeline and cosine similarity matrix
def prepare_similarity_system(df, lang='english'):
    """Compute pairwise cosine similarity between the rows' descriptions.

    The ``description`` column is cleaned with ``TextPreprocessor``,
    vectorised with TF-IDF, and compared row against row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``description`` column.
    lang : str, default='english'
        ``'indonesian'`` selects the Indonesian stop words and the Sastrawi
        stemmer; any other value selects the English stop words and no
        stemming.

    Returns
    -------
    The result of ``cosine_similarity(tfidf_matrix, tfidf_matrix)``: a square
    matrix with one row and one column per row of ``df``.
    """
    is_indonesian = lang == 'indonesian'
    stop_words = stop_words_indonesian if is_indonesian else stop_words_english
    stemmer_to_use = stemmer if is_indonesian else ''

    # Pipeline for TF-IDF vectorisation with preprocessing
    pipeline = Pipeline([
        ('preprocessor', TextPreprocessor(stop_words, stemmer_to_use, lang)),
        ('tfidf', TfidfVectorizer())
    ])

    # Missing descriptions must become empty documents. A bare astype(str)
    # turns NaN into the literal word "nan", which survives preprocessing and
    # makes every row without a description look identical to the others.
    descriptions = df['description'].fillna('').astype(str)
    tfidf_matrix = pipeline.fit_transform(descriptions)

    # Cosine similarity of every description against every other
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    return cosine_sim

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Load the three source CSV files, one DataFrame per site.
df_komikcast = pd.read_csv("./data/komikcast.csv")
df_westmanga = pd.read_csv("./data/westmanga.csv")
df_mangadex = pd.read_csv("./data/mangadex.csv")

# Report the (rows, columns) shape of each frame, one line per site.
print(
    *(
        f"{label} {frame.shape}"
        for label, frame in (
            ('Komikcast: ', df_komikcast),
            ('Westmanga: ', df_westmanga),
            ('Mangadex : ', df_mangadex),
        )
    ),
    sep="\n",
)

Komikcast:  (8410, 10)
Westmanga:  (4810, 10)


In [4]:
# Description-similarity matrices for the two sources processed as Indonesian text.
consine_sim_indo_komikcast = prepare_similarity_system(
    df_komikcast, lang='indonesian'
)
consine_sim_indo_westmanga = prepare_similarity_system(
    df_westmanga, lang='indonesian'
)
# consine_sim_eng_mangadex = prepare_similarity_system(df_mangadex)

In [6]:
# Collect the per-source similarity matrices under their source name.
consine_decription_similarity = dict(
    komikcast=consine_sim_indo_komikcast,
    westmanga=consine_sim_indo_westmanga,
    # mangadex=consine_sim_eng_mangadex,
)

In [10]:
# JSON cannot encode NumPy arrays, so convert each matrix to nested Python lists.
consine_decription_similarity_serializable = {
    source: matrix.tolist()
    for source, matrix in consine_decription_similarity.items()
}

In [12]:
# Write the dictionary to a JSON file.
# json.dump produces the same text as json.dumps(..., indent=4) but writes it
# to the file piece by piece, so the whole document (one number per line for
# every matrix cell) is never held in memory as a single string.
with open("./data/consine_decription_similarity.json", "w") as outfile:
    json.dump(consine_decription_similarity_serializable, outfile, indent=4)

In [13]:
# Inverse of the serialisation step: turn each nested list back into a NumPy array.
consine_decription_similarity_inverse = {
    source: np.array(rows)
    for source, rows in consine_decription_similarity_serializable.items()
}