In [None]:
import pandas as pd
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Absolute Windows paths (backslashes escaped): the metadata table to read
# and the TSV that the topic-annotated copy is written to.
input_file = (
    "C:\\Users\\WINDOWS11\\Desktop\\kpop_agenda\\Step1\\"
    "metadata_top300_filtered.tsv"
)
output_file = (
    "C:\\Users\\WINDOWS11\\Desktop\\kpop_agenda\\Step1\\"
    "metadata_top300_filtered_with_topics.tsv"
)

In [None]:


# Read the tab-delimited metadata table into a DataFrame.
df = pd.read_csv(input_file, delimiter="\t")

# One shared Okt tokenizer instance, reused by the preprocessing step.
okt = Okt()

def read_article(file_path):
    """Return the text stored in the file at ``file_path``.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    stripped so that it does not end up glued to the first token of the text.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file's contents as a string, or ``""`` when the file cannot be
        read. In that case the error is printed instead of being raised.
    """
    try:
        # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading BOM.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()
    except Exception as e:
        # Deliberately best-effort: report the failure and fall back to an
        # empty document so one bad path does not abort the whole run.
        print(f"Error reading {file_path}: {e}")
        return ""

In [None]:
def preprocess_text(text):
    """Tokenize Korean text with KoNLPy's Okt and re-join the tokens.

    Args:
        text: Raw article text.

    Returns:
        The morphemes produced by ``okt.morphs`` joined by single spaces.
        Empty or whitespace-only input yields ``""`` without calling the
        tokenizer.
    """
    # Articles that failed to load arrive here as ""; there is nothing to
    # tokenize, so skip the tokenizer call entirely.
    if not text or not text.strip():
        return ""
    # Split into morphemes, then flatten back to a space-separated string.
    return " ".join(okt.morphs(text))

# Build the corpus: one preprocessed string per metadata row, in row order.
# Each article's location is taken from the 'file_path' column; change the
# column name here if the metadata uses a different one.
documents = [
    preprocess_text(read_article(article_path))
    for article_path in df['file_path']
]

In [None]:
# Create a document-term matrix using CountVectorizer.
# Each document is already a string of space-separated tokens, so the
# tokenizer is a plain whitespace split. str.split is used instead of a
# lambda so the fitted vectorizer can be pickled, and token_pattern=None
# states that the default regex is intentionally unused (scikit-learn warns
# when a custom tokenizer is combined with a non-None token_pattern).
# min_df=5 ignores terms that appear in fewer than 5 documents.
vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=5)
doc_term_matrix = vectorizer.fit_transform(documents)

# Define number of topics (choose a value between 5 and 10)
num_topics = 7  # Adjust as needed
# A fixed random_state keeps the fitted topics reproducible across runs.
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(doc_term_matrix)

# Assign each document its dominant topic: the topic index with the highest
# value in that document's row of the topic distribution.
topic_distributions = lda.transform(doc_term_matrix)
dominant_topics = topic_distributions.argmax(axis=1)
df['dominant_topic'] = dominant_topics

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-valued words of each topic in ``model``.

    Args:
        model: Object whose ``components_`` attribute is iterated row by
            row, one row per topic.
        feature_names: Sequence mapping a column index of ``components_``
            to its word.
        n_top_words: How many words to list per topic.

    For every row, one line ``Topic #<index>: <words>`` is printed, with the
    words ordered from the largest value in that row downwards.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the leading n_top_words.
        best_first = weights.argsort()[::-1][:n_top_words]
        top_words = " ".join(feature_names[col] for col in best_first)
        print(f"Topic #{topic_idx}: {top_words}")

# List the 10 highest-valued vocabulary terms of each fitted topic.
n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
print_top_words(model=lda, feature_names=feature_names, n_top_words=n_top_words)

# Write the metadata with its topic assignments to the output TSV
# (tab-delimited, DataFrame index omitted).
df.to_csv(output_file, index=False, sep="\t")