In [None]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import HdpModel
import matplotlib.pyplot as plt
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# Read the review export into a DataFrame (path is relative to the notebook's working directory)
df4 = pd.read_csv(filepath_or_buffer='../data/review_gopay_newest_sort.csv')

In [None]:
# Build the stopword set from NLTK's Indonesian list. On a fresh environment
# the corpus is not installed and `stopwords.words` raises LookupError, so
# fetch it on demand instead of relying on a manually un-commented download.
try:
    stop_words = set(stopwords.words('indonesian'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('indonesian'))

# Informal filler words that are not in the NLTK list
custom_stopwords = {'yang', 'aja', 'yg', 'nya', 'sih', 'oh', 'e', 'deh', 'ya', 'kan', 'nih'}
stop_words.update(custom_stopwords)

# Keep the negation 'tidak' as a token: it is taken out of the stopword set
stop_words = stop_words - {'tidak'}

# Map slang / abbreviations onto one canonical token. tokenize_text applies
# this mapping before stopword filtering.
synonym_dict = {
    'apk': 'aplikasi', 'app': 'aplikasi',
    'oke': 'ok', 'gak': 'tidak', 'ga': 'tidak', 'gk': 'tidak',
    'g': 'tidak', 'tf': 'transfer'
}

def normalize_repeated_chars(word):
    """Collapse every run of three or more identical characters to one.

    Example: 'sukaaaa' -> 'suka'. Runs of exactly two characters are kept.
    """
    def _keep_single(run):
        # group(1) is the character that was repeated
        return run.group(1)

    return re.sub(r'(.)\1{2,}', _keep_single, word)

def tokenize_text(text):
    """Turn one raw review into a list of normalized tokens.

    Steps: lowercase, replace URLs with '__url__', replace digit runs with
    '__num__', turn punctuation into spaces, split on whitespace, collapse
    elongated words, map synonyms, drop stopwords.

    Args:
        text: The raw review. Anything that is not a str (e.g. NaN from a
            missing cell) is treated as an empty document.

    Returns:
        list[str]: The remaining tokens, possibly empty.
    """
    if not isinstance(text, str):
        return []

    text = text.lower()
    # URLs must be replaced first: once punctuation has become spaces a URL is
    # split into fragments ('https', host, path) that the pattern cannot match.
    text = re.sub(r'http\S+|www\S+', '__url__', text)
    text = re.sub(r'\d+', '__num__', text)       # digit runs -> placeholder
    # '_' counts as a word character, so both placeholders survive this step
    text = re.sub(r'\W+', ' ', text)

    tokens = text.split()
    tokens = [normalize_repeated_chars(word) for word in tokens]    # e.g. sukaaaa -> suka
    tokens = [synonym_dict.get(word, word) for word in tokens]      # canonical form if one exists
    # Filter after synonym mapping so the canonical form is what gets checked
    return [word for word in tokens if word not in stop_words]

# Tokenize every review; each cell of 'tokens' holds that row's token list
df4['tokens'] = df4['content'].map(tokenize_text)

In [None]:
# Spot-check the preprocessing: show full (untruncated) text next to its tokens
pd.set_option('display.max_colwidth', None)
df4.loc[:, ['content', 'tokens']].sample(n=10, random_state=42)

In [None]:
# Build the token <-> id mapping from every tokenized review
dictionary = corpora.Dictionary(df4['tokens'])

# Bag-of-words corpus for HDP. Reviews with no tokens are skipped, so a
# position in `corpus` is NOT the same as a row position in df4.
corpus = []
for doc_tokens in df4['tokens']:
    if doc_tokens:
        corpus.append(dictionary.doc2bow(doc_tokens))

In [None]:
# Show the first few (token, id) pairs of the dictionary
print(f'The content of dictionary: \n{list(dictionary.token2id.items())[:5]}\n')

# Show up to five documents as (token, frequency) pairs. Slicing instead of
# indexing range(5) avoids an IndexError when the corpus is shorter than that.
print('The content of corpus:')
for doc_number, doc in enumerate(corpus[:5], start=1):
    # `token_id` rather than `id`, which would shadow the builtin
    decoded_doc = [(dictionary[token_id], freq) for token_id, freq in doc]
    print(f"Document {doc_number}: {decoded_doc}")

In [None]:
# Train the HDP model. HDP inference is stochastic, so the seed is fixed to
# keep topics (and topic ids) stable across Restart & Run All.
hdp_model = HdpModel(corpus=corpus, id2word=dictionary, random_state=42)

In [None]:
# Optional quality check: c_v coherence of the HDP topics against the tokenized reviews
coherence_model_hdp = CoherenceModel(
    model=hdp_model,
    texts=df4['tokens'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score_hdp = coherence_model_hdp.get_coherence()
print(f'HDP Coherence Score: {coherence_score_hdp}')

In [None]:
# List every topic the model holds (-1 means no limit) with its top terms
print("Topics found by HDP:")
hdp_topics = hdp_model.print_topics(-1)
for idx, topic in hdp_topics:
    print(f"Topic {idx}: {topic}")

In [None]:
# Assign each review its dominant topic; -1 marks reviews with no tokens or
# no inferred topic.
#
# Topics are inferred row by row from df4['tokens'] rather than by walking
# `corpus`: the corpus drops empty documents, so its positions stop matching
# df4's rows after the first empty review and labels would land on the wrong
# rows.
dominant_topics = []
for doc_tokens in df4['tokens']:
    topic_id = -1
    if doc_tokens:
        # Indexing the model with a bag-of-words yields (topic_id, weight) pairs
        topic_info = hdp_model[dictionary.doc2bow(doc_tokens)]
        if topic_info:
            topic_id = max(topic_info, key=lambda pair: pair[1])[0]
    dominant_topics.append(topic_id)

# A plain list is assigned by position, so this works for any df4 index
df4['topic'] = dominant_topics