In [None]:
import pandas as pd
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer
from konlpy.tag import Okt
from transformers import AutoTokenizer, AutoModel


In [None]:
# Locations of the source metadata table and of the copy that will carry topic ids
# (Windows paths, written with escaped backslashes)
input_file = "C:\\Users\\WINDOWS11\\Desktop\\kpop_agenda\\Step1\\metadata_top300_filtered.tsv"
output_file = "C:\\Users\\WINDOWS11\\Desktop\\kpop_agenda\\Step1\\metadata_top300_filtered_with_topics.tsv"

# Parse the tab-separated metadata into a DataFrame (read_table defaults to sep="\t")
df = pd.read_table(input_file)

In [None]:
# Initialize the Korean tokenizer (KoNLPy's Okt tagger).
# Created once at module level; preprocess_text() below reuses this instance
# for every document instead of constructing a new tagger per call.
okt = Okt()

# Load one article's text from disk
def read_article(file_path):
    """Return the entire UTF-8 decoded contents of the file at ``file_path``.

    This is a best-effort reader: if opening or reading fails for any reason,
    the error is printed to stdout and an empty string is returned instead of
    raising.
    """
    try:
        handle = open(file_path, mode='r', encoding='utf-8')
        with handle:
            contents = handle.read()
    except Exception as err:
        # Report the failure but keep going so one bad path does not abort the run
        print(f"Error reading {file_path}: {err}")
        contents = ""
    return contents

# Korean text preprocessing built on the module-level Okt tagger
def preprocess_text(text):
    """Split ``text`` into morphemes with ``okt`` and return them space-separated.

    The result is a single string in which consecutive morphemes are joined by
    one space character.
    """
    return " ".join(okt.morphs(text))

In [None]:
# Build the corpus: for each entry of the file_path column, load the article
# and tokenise it. The maps are lazy, so each file is read and then
# preprocessed before the next one is opened.
documents = list(map(preprocess_text, map(read_article, df['file_path'])))

In [None]:
# Dimensionality-reduction stage with hand-picked UMAP settings (option 3)
umap_model = UMAP(
    n_components=5,
    n_neighbors=9,
    min_dist=0.1,
    random_state=119,
)

In [None]:
# Clustering stage with hand-picked HDBSCAN settings (option 1)
hdbscan_model = HDBSCAN(
    min_samples=2,
    min_cluster_size=17,
    cluster_selection_method='eom',
)

In [None]:
# Assemble the BERTopic pipeline around the UMAP and HDBSCAN models configured above
topic_model = BERTopic(
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    verbose=True,
)

In [None]:
# Load the sentence-embedding checkpoint used to vectorise the articles
embedding_model = SentenceTransformer("jhgan/ko-sbert-sts")

# Embed every document up front, four documents per batch
embeddings = embedding_model.encode(
    documents,
    batch_size=4,
    show_progress_bar=True,
)

# Fit BERTopic on the documents, reusing the embeddings computed above
topics, probs = topic_model.fit_transform(documents, embeddings=embeddings)

# Store each document's topic id next to its metadata row
df['topic_type'] = topics

# Write the augmented table out as tab-separated text, without the index column
df.to_csv(output_file, index=False, sep="\t")
print(f"Topic modeling complete. Output saved to {output_file}")

In [None]:
# The previous cell already fitted the model, stored the assignments in
# df['topic_type'] and wrote output_file. Repeating fit_transform and to_csv
# here only redid that expensive work on the same inputs, so this cell now
# just summarises the existing `topics`.

# Count distinct topic ids, leaving out -1 (the outlier/noise label) if present
num_topics = len(set(topics) - {-1})

print(f"Number of topics: {num_topics}")  # Print the number of topics

