In [None]:
# Install a compatible version of scikit-learn
#!pip install "scikit-learn<1.6"

In [None]:
import os
import pandas as pd
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer
from konlpy.tag import Okt
from transformers import AutoTokenizer, AutoModel
import sklearn.utils.validation


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
real_check_array = sklearn.utils.validation.check_array

def patched_check_array(array, **kwargs):
    if 'ensure_all_finite' in kwargs and 'force_all_finite' not in kwargs:
        val = kwargs.pop('ensure_all_finite')
        kwargs['force_all_finite'] = val
        
    elif 'force_all_finite' in kwargs and 'ensure_all_finite' not in kwargs:
        val = kwargs.pop('force_all_finite')
        kwargs['ensure_all_finite'] = val

    return real_check_array(array, **kwargs)

sklearn.utils.validation.check_array = patched_check_array
print("Patch activated.")



# --- STEP 3: DOUBLE CHECK UMAP ---
# Just to be 100% sure, we overwrite UMAP's internal reference too
import umap.umap_
umap.umap_.check_array = patched_check_array
print("UMAP patched successfully.")

Patch activated.
UMAP patched successfully.


In [3]:
input_file = "C:\\Users\\WINDOWS 11\\Desktop\\kpop_agenda\\Step1\\top300_filtered.csv"
output_file = "C:\\Users\\WINDOWS 11\\Desktop\\kpop_agenda\\Step1\\top300_filtered_with_topics.csv"

# Load metadata
df = pd.read_csv(input_file)

In [4]:
# Initialize the Korean tokenizer
okt = Okt()

# Function to read text content from a file path
def read_article(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

# Preprocessing function for Korean text using KoNLPy's Okt
def preprocess_text(text):
    # Tokenize text into morphemes
    tokens = okt.morphs(text)
    # Join tokens back into a space-separated string
    return " ".join(tokens)

In [5]:
articles_dir = "C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1/Articles"

# Updated function to handle paths correctly
def get_correct_path(file_path_from_csv):
    # 1. Strip away any old directory info (e.g. remove "C:/Users/.../kpop_agenda/")
    filename_only = os.path.basename(file_path_from_csv)
    
    # 2. Join it with the REAL folder
    return os.path.join(articles_dir, filename_only)

# Read and preprocess (Apply the fix inside the loop)
documents = []
for fp in df['file_path']:
    correct_path = get_correct_path(fp)
    documents.append(preprocess_text(read_article(correct_path)))

In [6]:
# Set up UMAP with custom parameters (option 3)
umap_model = UMAP(n_neighbors=9, n_components=5, min_dist=0.1, random_state=119)

In [7]:
# Set up HDBSCAN with custom parameters (option 1)
hdbscan_model = HDBSCAN(min_cluster_size=17, min_samples=2, cluster_selection_method='eom')

In [8]:
# Initialize BERTopic with the custom UMAP and HDBSCAN models
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, verbose=True)

In [11]:
#import sklearn.utils.validation
import inspect

# 1. Capture the original function
# (We check if we already patched it to avoid double-patching)
if not hasattr(sklearn.utils.validation, "_original_check_array"):
    sklearn.utils.validation._original_check_array = sklearn.utils.validation.check_array

# 2. Detect what the installed sklearn wants
# Some versions want 'force_all_finite', others want 'ensure_all_finite'
sig = inspect.signature(sklearn.utils.validation._original_check_array)
wants_force = 'force_all_finite' in sig.parameters
wants_ensure = 'ensure_all_finite' in sig.parameters

print(f"System Check: Sklearn expects '{'force_all_finite' if wants_force else 'ensure_all_finite'}'")

# 3. Define the Universal Wrapper
def _universal_check_array(*args, **kwargs):
    # If the system wants 'force_all_finite' but gets 'ensure_all_finite' (from UMAP)
    if wants_force and 'ensure_all_finite' in kwargs:
        val = kwargs.pop('ensure_all_finite')
        if 'force_all_finite' not in kwargs:
            kwargs['force_all_finite'] = val
            
    # If the system wants 'ensure_all_finite' but gets 'force_all_finite' (from HDBSCAN)
    elif wants_ensure and 'force_all_finite' in kwargs:
        val = kwargs.pop('force_all_finite')
        if 'ensure_all_finite' not in kwargs:
            kwargs['ensure_all_finite'] = val

    return sklearn.utils.validation._original_check_array(*args, **kwargs)

# 4. Apply the patch
sklearn.utils.validation.check_array = _universal_check_array
print("Universal Patch Applied. Version conflicts resolved.")

System Check: Sklearn expects 'ensure_all_finite'
Universal Patch Applied. Version conflicts resolved.


In [None]:
# Initialize the SentenceTransformer model 
embedding_model = SentenceTransformer("jhgan/ko-sbert-sts") 

# Compute embeddings with a specified batch size to control the number of batches
embeddings = embedding_model.encode(documents, show_progress_bar=True, batch_size=4)

# Use the precomputed embeddings in BERTopic
topics, probs = topic_model.fit_transform(documents, embeddings)

# Add the topic assignments as a new column to the DataFrame
df['topic_type'] = topics

# Save the updated DataFrame to a new TSV file
df.to_csv(output_file, index=False)
print(f"Topic modeling complete. Output saved to {output_file}")

Batches: 100%|██████████| 75/75 [00:29<00:00,  2.56it/s]
2025-12-28 16:53:02,053 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


TypeError: check_array() got an unexpected keyword argument 'ensure_all_finite'. Did you mean 'force_all_finite'?

In [None]:
topics, probs = topic_model.fit_transform(documents, embeddings)

df['topic_type'] = topics
df.to_csv(output_file, index=False)

# Get the number of unique topics (excluding -1, which represents outliers/noise)
num_topics = len(set(topics)) - 1 if -1 in topics else len(set(topics)) # handles if there is -1 or not

print(f"Number of topics: {num_topics}")  # Print the number of topics



In [12]:
pip install "scikit-learn<1.6"

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\WINDOWS 11\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Loading metadata from C:\Users\WINDOWS 11\Desktop\kpop_agenda\Step1\top300_filtered.csv...
Preprocessing articles...
Generating Embeddings...


Batches: 100%|██████████| 75/75 [00:28<00:00,  2.67it/s]
2025-12-28 17:05:14,970 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Fitting BERTopic model...


RecursionError: maximum recursion depth exceeded