In [1]:
import os
os.getcwd()

'C:\\Users\\hyukj'

In [2]:
import pandas as pd
df = pd.read_csv('Pubmed_2021_2023.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,pmid,abstract
0,0,37196221,Messenger RNA (mRNA) has received great attent...
1,1,37150326,Lipid nanoparticles (LNPs) have revolutionized...
2,2,37741463,RNA therapies have recently taken a giant leap...
3,3,37162501,Lipid nanoparticles (LNPs) have been recognize...
4,4,36719091,Based on the clinical success of an in vitro t...


In [3]:
docc=df["abstract"].astype(str).values.tolist()

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
my_stop_words = [
    "to","and","liposomes","of","was","lipid","is","be","publication","publishers",
    "bentham","copyright","doi","drug","we","delivery","as","cells","nanoparticles",
    "article","journal","for","are","by","can","retracts","submitting","authors","manuscripts",
    "editorial","plagiarism","permission","published","disclaimer","legal","forbidden","corrects",
    "apologizes","inconvenience","withdrawn","illustration","science","policy","table",
    "submitted","have","cell","if","readers","httpsbenthamsciencecomeditorialpoliciesmainphp","withdrawal",
    "yechezkel","barenholzs","using","results","elsewhere","strictly","study","treatment","on"
]

# 위 규칙과 똑같은 CountVectorizer를 만듭니다 (학습 X, 도구 생성 O)
vectorizer = CountVectorizer(stop_words=my_stop_words)

# 이 vectorizer에서 '문서를 쪼개고 불용어를 버리는 기능(analyzer)'만 뽑아냅니다.
analyzer = vectorizer.build_analyzer()

# -----------------------------------------------------------------------
# 2. 문서 토큰화 수행
# -----------------------------------------------------------------------
# 원본 문서를 analyzer에 통과시켜 토큰 리스트로 변환합니다.
# 이제 'drug', 'delivery' 같은 단어는 리스트에서 사라집니다.
tokenized_docs = [analyzer(doc) for doc in docc]

### 아래는 간단한 토큰화 예시 코드(BERTopic의 재현을 위해 위 방식 사용했음)

In [5]:
import re
tokenized_docs = []
for doc in docc:
    # 1) 소문자 변환
    # 2) 길이가 2 이상인 영문 단어만 추출 (특수문자, 숫자 제거)
    tokens = re.findall(r'\b[a-z]{2,}\b', doc.lower())
    tokenized_docs.append(tokens)

# 결과 예시: "This is a study." -> ['this', 'is', 'study']
print(f"문서 토큰화 완료: 총 {len(tokenized_docs)}개")

문서 토큰화 완료: 총 13100개


In [15]:
topic_1 = ['cancer', 'tumor', 'potential', 'membrane', 'release']
topic_2 = ['amphotericin', 'mucormycosis', 'fungal', 'antifungal', 'infections']
topic_3 = ['chemotherapy', 'pld', 'pfs', 'doxorubicin', 'naliri']
topic_4 = ['bupivacaine', 'opiod', 'postoperative', 'block', 'analgesia']
topic_5 = ['ocular', 'retinal', 'corneal', 'glaucoma', 'intraocular']
topic_6 = ['leishmaniasis', 'leishmania', 'amphotericin', 'cutaneous', 'diagnosis']

topics = [topic_1, topic_2, topic_3,topic_4, topic_5, topic_6]

In [16]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
# 3. Coherence / NPMI 점수 계산
# -------------------------------------------------------------
# 사전(Dictionary) 생성
dictionary = Dictionary(tokenized_docs)

# (1) NPMI 점수 계산 (엄격한 기준, 추천)
coherence_model_npmi = CoherenceModel(topics=topics, 
                                      texts=tokenized_docs, 
                                      dictionary=dictionary, 
                                      coherence='c_npmi')
npmi_score = coherence_model_npmi.get_coherence()

# (2) C_v 점수 계산 (보편적 기준)
coherence_model_cv = CoherenceModel(topics=topics, 
                                    texts=tokenized_docs, 
                                    dictionary=dictionary, 
                                    coherence='c_v')
cv_score = coherence_model_cv.get_coherence()

# -------------------------------------------------------------
# 4. 결과 출력
# -------------------------------------------------------------
print(f"\n===== 평가 결과 =====")
print(f"평가 토픽 개수: {len(topics)}")
print(f"NPMI Score   : {npmi_score:.4f} (범위: -1 ~ 1, 높을수록 좋음)")
print(f"C_v Score    : {cv_score:.4f}  (범위: 0 ~ 1, 높을수록 좋음)")
print(f"=====================")


===== 평가 결과 =====
평가 토픽 개수: 6
NPMI Score   : 0.1944 (범위: -1 ~ 1, 높을수록 좋음)
C_v Score    : 0.7984  (범위: 0 ~ 1, 높을수록 좋음)
