In [None]:
import pandas as pd
import os
from konlpy.tag import Okt
from bertopic import BERTopic  # Or use a different CTM library if preferred

In [None]:
# 1. Read the article metadata table (tab-separated) into a DataFrame.
metadata_file = "C:\\Users\\WINDOWS11\\Desktop\\kpop_agenda\\Step1\\metadata_top300_filtered.tsv"
# read_table uses a tab delimiter by default, which matches the .tsv input.
metadata_df = pd.read_table(metadata_file)

In [None]:
# 2. Read every article referenced by the metadata and tokenize it.
# `texts` ends up with exactly one entry per row of `file_paths`, in order.
file_paths = metadata_df['file_path'].tolist()
tokenizer = Okt()  # konlpy tokenizer, created once and reused for all files
texts = []

for file_path in file_paths:
    # Placeholder that is kept if reading or tokenizing fails, so that
    # `texts` stays aligned with `file_paths`.
    document = ""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        # Split into morphemes, then re-join with spaces because the topic
        # model is fed plain strings.
        tokens = tokenizer.morphs(text)
        document = " ".join(tokens)
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
    texts.append(document)

In [None]:
# 3. Train the BERTopic model
# Builds a BERTopic instance configured with language="korean" and fits it on
# the tokenized documents in `texts`. fit_transform returns two values, which
# are unpacked here as `topics` and `probabilities`.
# NOTE(review): `texts` holds "" for every file that failed to load or
# tokenize -- confirm that empty documents are acceptable input here.
# NOTE(review): no random seed is visible in this notebook, so topic
# assignments may differ between runs -- confirm.
topic_model = BERTopic(language="korean")  # language is already set to Korean
topics, probabilities = topic_model.fit_transform(texts)

In [None]:
# 4. Get the dominant topic name for each document
# get_topic_info() has one row per topic, identified by its 'Topic' column.
# Row position is NOT the topic id: when the outlier topic -1 is present it
# occupies the first row, so positional lookup (.iloc[topic]) returns the
# wrong name for every document. Look names up by topic id instead, and
# build the table once rather than once per document.
topic_info = topic_model.get_topic_info()
topic_name_by_id = dict(zip(topic_info['Topic'], topic_info['Name']))
dominant_topics = [topic_name_by_id[topic] for topic in topics]


In [None]:
# 5. Attach each document's topic name to the metadata as a 'topic_type' column.
# `dominant_topics` is in the same row order as `metadata_df`, so the values
# are labelled with the frame's own index before being stored.
metadata_df['topic_type'] = pd.Series(dominant_topics, index=metadata_df.index)

In [None]:
# 6. Write the metadata, now including 'topic_type', back out as a TSV file.
output_file = "C:/Users/WINDOWS11/Desktop/kpop_agenda/Step1/metadata_top300_with_topics.tsv"
metadata_df.to_csv(
    output_file,
    index=False,  # do not write the DataFrame index as an extra column
    sep='\t',
)

In [None]:
# 7. (Optional) Explore topics
# A notebook only renders a bare expression when it is the last statement of
# the cell, so the figure is shown explicitly instead of being discarded.
topic_model.visualize_topics().show()
# Or print topic information
# Pair titles and topic names positionally; this does not depend on the
# DataFrame's index labels being 0..n-1.
for title, topic_name in zip(metadata_df['title'], dominant_topics):
    print(f"Article: {title}\nTopic: {topic_name}\n")