In [3]:
from konlpy.tag import Mecab

# Module-level MeCab tagger instance; tokenize_ko and bertopic_tokenizer in this cell call it.
mecab = Mecab()

def tokenize_ko(text):
    """Return the nouns found in ``text``, or all of its morphemes if it has none.

    A token counts as a noun when ``mecab.pos`` tags it NNG (common noun)
    or NNP (proper noun).
    """
    noun_tags = ('NNG', 'NNP')
    nouns = []
    for surface, tag in mecab.pos(text):
        if tag in noun_tags:
            nouns.append(surface)
    if nouns:
        return nouns
    # No noun was found: fall back to the full morpheme sequence.
    return list(mecab.morphs(text))


def bertopic_tokenizer(text):
    """Tokenize ``text`` for the BERTopic vectorizer.

    A token is kept when all of the following hold:
      * it is among the tokens returned by ``tokenize_ko(text)``;
      * ``mecab.pos`` tags it NNG, NNP, VV or VA (noun, verb, adjective);
      * it is longer than one character;
      * it is not in ``korean_stopwords``.

    Each kept token appears once, in the order of its first occurrence in
    ``text``. The result used to be built from a set intersection, whose
    iteration order is arbitrary and can differ between interpreter runs, so
    any n-grams built from it were scrambled and not reproducible.

    Returns:
        list[str]: the kept tokens, de-duplicated, in text order.
    """
    keep_tags = {'NNG', 'NNP', 'VV', 'VA'}  # nouns, verbs, adjectives
    candidates = set(tokenize_ko(text))

    # A dict is used as an ordered set: keys keep their insertion order.
    ordered = {}
    for word, pos in mecab.pos(text):
        if (
            pos in keep_tags
            and len(word) > 1
            and word not in korean_stopwords
            and word in candidates
        ):
            ordered.setdefault(word, None)
    return list(ordered)


In [2]:
# Stopword set; bertopic_tokenizer drops every token that is a member of it.
korean_stopwords = {
    # Nouns that carry little meaning on their own
    '것', '수', '때', '곳', '중', '안', '밖', '위', '아래', '앞', '뒤', '옆',
    '이것', '그것', '저것', '여기', '거기', '저기', '이곳', '그곳', '저곳',
    '등', '및', '통해', '위해', '대해', '관해',
    '오늘', '어제', '내일', '지금', '현재', '과거', '미래',
    '사람', '사람들', '모든', '각각', '전체', '부분',
    
    # Expressions that come up often in presentations and speeches (proper nouns included)
    '이번', '이번에', '우리', '여러분', '자유', '정신', '대한민국',
    '대통령', '후보', '대표', '의원', '정부',
}

In [4]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Okt
from collections import Counter

In [4]:
from chromadb import PersistentClient
import numpy as np
from bertopic import BERTopic


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import os

# Base directory, given relative to the current working directory.
# NOTE(review): "./" has no leading "~", so os.path.expanduser presumably returns it unchanged — confirm the call is needed.
base_dir = os.path.expanduser("./")

In [7]:
from chromadb import PersistentClient

# Try to open every Chroma DB directory listed in db_paths and print its collections.
# db_paths is not defined in this cell; it must already exist in the kernel
# (the recorded run of this cell ended with "NameError: name 'db_paths' is not defined").
for db_path in db_paths:
    print(f"📁 시도 중: {db_path}")
    try:
        client = PersistentClient(path=db_path)
        print("✅ 컬렉션 목록:", client.list_collections())
    except Exception as e:
        # Any failure is printed and the loop moves on to the next path.
        print("❌ 에러:", e)


NameError: name 'db_paths' is not defined

In [5]:
import os
from fastapi import APIRouter, Depends, HTTPException
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Okt
from openai import OpenAI

# --- 1. Stopword and tokenizer definitions ---
# Module-level Okt tagger instance used by the tokenizer functions in this cell.
okt = Okt()
# Stopword set: used by bertopic_tokenizer's filter and also passed to CountVectorizer as stop_words.
korean_stopwords = {
    '것', '수', '때', '곳', '중', '안', '밖', '위', '아래', '앞', '뒤', '옆',
    '이것', '그것', '저것', '여기', '거기', '저기', '이곳', '그곳', '저곳',
    '등', '및', '통해', '위해', '대해', '관해',
    '오늘', '어제', '내일', '지금', '현재', '과거', '미래',
    '사람', '사람들', '모든', '각각', '전체', '부분',
    '더', '같은', '이번', '이번에', '우리', '여러분', '자유', '정신',
    # NOTE(review): the last entry on the next line, '했다 시장', contains a space — confirm that a single token can ever equal it.
    '대한민국', '대통령', '후보', '대표', '의원', '정부','합니다','열리는', '했다 시장',
}

def tokenize_ko(text):
    """Return the nouns Okt finds in ``text``, or all morphemes if it finds none."""
    nouns = okt.nouns(text)
    if nouns:
        return nouns
    # No noun was found: fall back to the full morpheme sequence.
    return okt.morphs(text)

def bertopic_tokenizer(text):
    """Tokenize ``text`` for the BERTopic vectorizer.

    A token is kept when all of the following hold:
      * it is among the tokens returned by ``tokenize_ko(text)``;
      * ``okt.pos`` tags it Noun, Adjective or Verb;
      * it is longer than one character;
      * it is not in ``korean_stopwords``.

    Each kept token appears once, in the order of its first occurrence in
    ``text``. The result used to be built from a set intersection, whose
    iteration order is arbitrary and can differ between interpreter runs, so
    the n-grams the vectorizer built from it were scrambled and not
    reproducible.

    Returns:
        list[str]: the kept tokens, de-duplicated, in text order.
    """
    keep_tags = {'Noun', 'Adjective', 'Verb'}
    candidates = set(tokenize_ko(text))

    # A dict is used as an ordered set: keys keep their insertion order.
    ordered = {}
    for word, pos in okt.pos(text):
        if (
            pos in keep_tags
            and len(word) > 1
            and word not in korean_stopwords
            and word in candidates
        ):
            ordered.setdefault(word, None)
    return list(ordered)

# Bag-of-words model that is later passed to BERTopic as vectorizer_model.
vectorizer = CountVectorizer(
    # Tokenization is delegated to the custom Korean tokenizer, so no regex
    # token pattern is supplied and the text is not lower-cased.
    tokenizer=bertopic_tokenizer,
    token_pattern=None,
    lowercase=False,
    stop_words=list(korean_stopwords),
    # Vocabulary: unigrams up to trigrams, pruned by document frequency
    # (at least 5 documents, at most 85% of them) and capped at 5000 features.
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.85,
    max_features=5000,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

# Base directory that contains the Chroma DB folders. It can be overridden with
# the SSAMI_BASE_DIR environment variable so the notebook also runs on other
# machines; when the variable is unset the original server path is used.
base_dir = os.environ.get("SSAMI_BASE_DIR", "/home/ubuntu/ssami/ssami-back")

# Chroma DB directories, in the order they are loaded.
db_paths = [
    os.path.join(base_dir, db_name)
    for db_name in ("chroma_db_news", "chroma_db_editorial", "chroma_db_opinion")
]

In [4]:
from chromadb import PersistentClient
import numpy as np
import pandas as pd

def load_full_from_chroma(db_path: str, collection_name: str = "langchain"):
    """Read every record of one Chroma collection.

    Args:
        db_path: Directory handed to ``PersistentClient(path=...)``.
        collection_name: Collection to read; defaults to ``"langchain"``.

    Returns:
        A ``(documents, embeddings, metadatas)`` tuple taken from the
        matching keys of the ``collection.get`` result.
    """
    wanted_fields = ["documents", "embeddings", "metadatas"]
    payload = (
        PersistentClient(path=db_path)
        .get_collection(collection_name)
        .get(include=wanted_fields)
    )
    documents, embeddings, metadatas = (payload[field] for field in wanted_fields)
    return documents, embeddings, metadatas

# Lists that accumulate the merged results from all DBs
all_documents = []
all_embeddings = []
all_metadatas = []

# Load the data from each DB and merge it
for path in db_paths:
    try:
        docs, embeds, metas = load_full_from_chroma(path)
        all_documents.extend(docs)
        all_embeddings.extend(embeds)
        all_metadatas.extend(metas)
    except Exception as e:
        # A DB that fails to load is reported and skipped; the remaining ones are still merged.
        print(f"⚠️ DB 불러오기 실패: {path} - {e}")

# Convert to a numpy array
title_embeddings = np.array(all_embeddings)
titles = all_documents  # assumes the documents hold the titles
metadatas = all_metadatas

# Summary: document count, embedding array shape, and the first metadata entry (if any)
print(f"📄 총 문서 수: {len(titles)}")
print(f"📌 임베딩 shape: {title_embeddings.shape}")
print(f"📝 예시 메타데이터: {metadatas[0] if metadatas else '없음'}")


📄 총 문서 수: 112267
📌 임베딩 shape: (112267, 1536)
📝 예시 메타데이터: {'_id': '6864c8c353fca1c65b9781f2', 'date_int': 20240330, 'title': '내로남불부동산?양문석 편법 대출이었다', 'datatype': 'article', 'url': 'https://n.news.naver.com/mnews/article/022/0003919485?sid=100'}


In [6]:
from bertopic import BERTopic
import numpy as np

# Fit BERTopic on the precomputed title embeddings, one model per batch.
# batch_size equals the document count printed by the loading cell (112267),
# so the recorded run processed a single batch.
batch_size = 112267
n_total = len(titles)

# Per-batch models, plus the topic assignments and probabilities of all batches flattened together.
topic_models = []
all_topics = []
all_probs = []

for i in range(0, n_total, batch_size):
    print(f"▶️ Batch {i} ~ {i+batch_size}")
    # The slice step of 3 keeps every third title/embedding inside the batch window.
    batch_titles = titles[i:i+batch_size:3]
    batch_embeds = title_embeddings[i:i+batch_size:3]
    # embedding_model=None: the embeddings are passed explicitly to fit_transform below.
    topic_model = BERTopic(
        embedding_model=None,
        vectorizer_model=vectorizer,
        min_topic_size=10,
        verbose=True,
        nr_topics="auto",
        language="multilingual"
    )
    topics, probs = topic_model.fit_transform(batch_titles, embeddings=batch_embeds)

    # NOTE(review): each batch is fitted by its own model, so topic ids in all_topics are presumably only comparable within one batch — confirm before using more than one batch.
    topic_models.append(topic_model)
    all_topics.extend(topics)
    all_probs.extend(probs)

2025-07-22 10:40:34,902 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


▶️ Batch 0 ~ 112267


2025-07-22 10:41:08,827 - BERTopic - Dimensionality - Completed ✓
2025-07-22 10:41:08,828 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-22 10:41:12,505 - BERTopic - Cluster - Completed ✓
2025-07-22 10:41:12,506 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-07-22 10:49:50,403 - BERTopic - Representation - Completed ✓
2025-07-22 10:49:50,404 - BERTopic - Topic reduction - Reducing number of topics
2025-07-22 10:49:50,775 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-22 10:58:17,619 - BERTopic - Representation - Completed ✓
2025-07-22 10:58:17,623 - BERTopic - Topic reduction - Reduced number of topics from 282 to 119


In [7]:
# Persist topic_model (the model bound by the last iteration of the fitting loop) under this name.
topic_model.save("my_bertopic_model_fin")



In [8]:
from bertopic import BERTopic
from app.utils.tokenizer import bertopic_tokenizer  # ✅ 새로운 경로에서 불러오기

# 1. Load the existing model (the tokenizer name is registered first so that loading does not fail)
# NOTE(review): pickle looks a function up as an attribute of its defining module, not through
# builtins — confirm whether this builtins registration (rather than the import above, which binds
# the name in this namespace) is what lets the load succeed.
import builtins
builtins.bertopic_tokenizer = bertopic_tokenizer
topic_model = BERTopic.load("my_bertopic_model_fin")

# 2. Bind the new tokenizer to the vectorizer
vectorizer = topic_model.vectorizer_model
vectorizer.set_params(tokenizer=bertopic_tokenizer)

# 3. Save again (so that the function's import path is reflected at pickle time)
topic_model.vectorizer_model = vectorizer
topic_model.save("my_bertopic_model_fixed")


