In [None]:
#bertopic을 통한 토픽모델링

In [7]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.10.0-py2.py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 2.8 MB/s 
[?25hCollecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 4.1 MB/s 
Collecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 23.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 7.0 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 29.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014

In [8]:
!pip install bertopic[visualization]



In [5]:
# Install Mecab on Colab
# Clone the repository that contains the install script used below.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# %cd changes the notebook's working directory; the change persists for every
# later cell, so relative paths after this point resolve inside the clone.
%cd Mecab-ko-for-Google-Colab
# Run the install script from inside the cloned directory.
!bash install_mecab-ko_on_colab190912.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 115, done.[K
remote: Counting objects:   4% (1/24)[Kremote: Counting objects:   8% (2/24)[Kremote: Counting objects:  12% (3/24)[Kremote: Counting objects:  16% (4/24)[Kremote: Counting objects:  20% (5/24)[Kremote: Counting objects:  25% (6/24)[Kremote: Counting objects:  29% (7/24)[Kremote: Counting objects:  33% (8/24)[Kremote: Counting objects:  37% (9/24)[Kremote: Counting objects:  41% (10/24)[Kremote: Counting objects:  45% (11/24)[Kremote: Counting objects:  50% (12/24)[Kremote: Counting objects:  54% (13/24)[Kremote: Counting objects:  58% (14/24)[Kremote: Counting objects:  62% (15/24)[Kremote: Counting objects:  66% (16/24)[Kremote: Counting objects:  70% (17/24)[Kremote: Counting objects:  75% (18/24)[Kremote: Counting objects:  79% (19/24)[Kremote: Counting objects:  83% (20/24)[Kremote: Counting objects:  87% (21/24)[Kremote: Counting objects:  91% (22/24)[

In [9]:
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Mecab
from bertopic import BERTopic

In [10]:
# Absolute path of the UTF-8 text file that holds the input documents, one per line.
text_file = "/content/review.txt"

In [12]:
# Read the corpus, one document per line, stripping surrounding whitespace.
# The context manager closes the file handle as soon as reading is done
# instead of leaving an open handle behind in the kernel.
with open(text_file, encoding="utf-8") as corpus_file:
    documents = [line.strip() for line in corpus_file]

In [13]:
# Keep only lines that carry text: drop empty strings and lines that consist
# solely of digits once spaces are removed.
preprocessed_documents = [
    doc
    for doc in tqdm(documents)
    if doc and not doc.replace(' ', '').isdecimal()
]

100%|██████████| 810/810 [00:00<00:00, 304125.53it/s]


In [14]:
class CustomTokenizer:
    """Callable tokenizer that keeps the nouns reported by a tagger.

    Instances can be passed wherever a callable taking a string and returning
    a list of tokens is expected.

    Args:
        tagger: Object exposing ``nouns(text)`` that returns an iterable of
            noun strings.
        max_chars: Only the first ``max_chars`` characters of each input are
            handed to the tagger. Defaults to 1000000.
        min_token_len: Nouns shorter than this many characters are dropped.
            Defaults to 2, i.e. single-character nouns are discarded.
    """

    def __init__(self, tagger, max_chars=1000000, min_token_len=2):
        self.tagger = tagger
        self.max_chars = max_chars
        self.min_token_len = min_token_len

    def __call__(self, sent):
        """Return the nouns of ``sent`` that meet the minimum length.

        Args:
            sent: Input text to tokenize.

        Returns:
            A list of noun strings, in the order the tagger produced them.
        """
        # Cap the input length before tagging.
        nouns = self.tagger.nouns(sent[:self.max_chars])
        return [noun for noun in nouns if len(noun) >= self.min_token_len]

In [15]:
# Noun tokenizer backed by a Mecab tagger instance.
custom_tokenizer = CustomTokenizer(tagger=Mecab())

In [16]:
# Count vectorizer that delegates tokenization to custom_tokenizer and is
# capped at 3000 features.
vectorizer = CountVectorizer(
    max_features=3000,
    tokenizer=custom_tokenizer,
)

In [17]:
# Topic model configuration: an explicit sentence-transformers embedding model,
# the count vectorizer defined earlier for topic words, a requested topic count
# of 50, 10 words per topic, and per-topic probabilities enabled.
model = BERTopic(
    embedding_model="sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens",
    vectorizer_model=vectorizer,
    top_n_words=10,
    nr_topics=50,
    calculate_probabilities=True,
    # language="multilingual" stays disabled; embedding_model is given explicitly.
)

In [None]:
# Fit the topic model on the filtered documents. `topics` and `probs` are the
# two values returned by fit_transform; `probs` is indexed per document further down.
topics, probs = model.fit_transform(preprocessed_documents)

In [19]:
# Show the summary table of the fitted topics (Topic, Count, Name columns).
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,317,-1_카페_커피_친절_분위기
1,0,235,0_커피_음식_가격_사람
2,2,56,2_주차장_조양_여자_화장실
3,8,41,8_가격_음료_카페_대기
4,5,32,5_남자_식겁_컨셉_정서방
5,10,30,10_이걸_맛집_옹이_배탈
6,7,16,7_다음_갈게_부모_친절
7,11,16,11_친절_안락_오브제_바다
8,9,15,9_드라이브_힐링_맛집_분위기
9,4,14,4_조개찜_딸기_허버_수칙


In [20]:
# Render the model's built-in topic visualization as the cell output.
model.visualize_topics()

In [21]:
# Visualize the probabilities stored at index 0 of `probs`
# (presumably the first document's topic distribution — confirm against fit_transform).
model.visualize_distribution(probs[0])

In [22]:
# Print the top words of every topic the fitted model actually contains.
# A fixed range(0, 50) repeats the nr_topics setting and also queries ids the
# model may not hold when fewer topics were found, which prints entries that
# carry no word list.
for i in sorted(model.get_topics()):
  # Negative ids are skipped, matching the original loop, which started at 0.
  if i < 0:
    continue
  print(i,'번째 토픽 :', model.get_topic(i))

0 번째 토픽 : [('커피', 0.03194977520636129), ('음식', 0.03190447760169624), ('가격', 0.028396501242247484), ('사람', 0.026590346320477895), ('강된장', 0.026106830164768287), ('반찬', 0.025926212697245453), ('꽃게', 0.024848069692592945), ('비빔밥', 0.023778016645783235), ('직원', 0.022241339676703702), ('정도', 0.022099887570116078)]
1 번째 토픽 : [('자리', 0.14537311178278509), ('정도', 0.09452732878637017), ('주차장', 0.08465593746277728), ('방송', 0.07738607603756538), ('주변', 0.07273329418090939), ('사람', 0.07266355645438771), ('이상', 0.0674380480338452), ('분위기', 0.059031434720629855), ('탄지', 0.05691590775281888), ('가운데', 0.05691590775281888)]
2 번째 토픽 : [('주차장', 0.03794304173807354), ('조양', 0.03749002047567257), ('여자', 0.03696096788019535), ('화장실', 0.03689709382534073), ('키즈', 0.0359682523873823), ('방직', 0.0359682523873823), ('남자', 0.03483971317129312), ('강된장', 0.033574597863062466), ('맛집', 0.03303411924076639), ('매장', 0.031327221762710536)]
3 번째 토픽 : [('여긴', 0.18466474016521497), ('지인', 0.17450484898955), ('이상', 0.166242