In [1]:
import gensim
import pickle
import pandas as pd

from gensim.models.coherencemodel import CoherenceModel

In [2]:
df = pd.read_csv("../files/df_lemmatize.csv")

In [4]:
df.columns

Index(['Unnamed: 0', 'journal_name', 'year', 'title', 'author', 'author_info',
       'abstract', 'lemmatize_abstract'],
      dtype='object')

In [5]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [7]:
import ast
def to_list(x):
    return ast.literal_eval(x)

df['lemmatize_abstract'] = df.lemmatize_abstract.map(to_list)

In [9]:
from gensim import corpora

dictionary = corpora.Dictionary(df.lemmatize_abstract)
corpus = [dictionary.doc2bow(text) for text in df.lemmatize_abstract]

In [12]:
import gensim

k = 4
model = gensim.models.ldamodel.LdaModel(corpus, # input data
                                        num_topics=k, # 토픽 모델링을 하고싶은 갯수
                                        id2word=dictionary, 
                                        passes=5,
                                        eval_every=None)

In [18]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

vis = gensimvis.prepare(model, corpus, dictionary)

In [19]:
pyLDAvis.display(vis)

In [21]:
from tqdm.notebook import tqdm

# 토픽1의 비율이 50%가 넘는 문서의 index
topic_1_list = []
check_topic = model[corpus]

for i, topic_dist in tqdm(enumerate(check_topic)):
    for idx, dist in topic_dist:
        if idx == 1 and dist >= 0.5:
            topic_1_list.append(i)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [25]:

from operator import itemgetter

# 가장 dist가 높은 토픽 & 분포 저장
main_topic = []
for topic_list in tqdm(model[corpus]):
    main_topic.append(max(topic_list, key=itemgetter(1)))
    

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30726.0), HTML(value='')))




In [27]:
main_topic[0]

(1, 0.56093156)

In [28]:
# 특정 토픽이 60% 넘을 경우 해당 토픽으로 indexing

def get_over_60(x):
    if x[1] >= 0.6:
        return x[0]
    else:
        return -1
    # 참고) 같은 표현
    # return x[0] if x[1] >= 0.6 else -1


topics = list(map(get_over_60, main_topic))

In [29]:
df['topic'] = topics

In [31]:
df[['abstract', 'topic']]

Unnamed: 0,abstract,topic
0,Sialoperoxidase and myeloperoxidase are the tw...,-1
1,To determine whether periodontal disease is po...,2
2,Polymeric material plays an important role as ...,1
3,Treating periodontitis through non-surgical pe...,-1
4,Recent studies have reported a relationship be...,-1
...,...,...
30721,PMID: 7000745 Indexed for MEDLINE,2
30722,Periodontal surgery is (too) often utilized bu...,0
30723,Numerous patients with Papillon-Lefvre syndrom...,2
30724,The authors investigated in 60 cases the heali...,-1


In [35]:
test_df = df[df.topic != -1]

In [41]:
test_df_1980 = test_df[(test_df.year >= 1980) & (test_df.year < 1990)]
def topic_percent(x):
    return x/sum(test_df_1980.topic.value_counts())

result_df_1980 = test_df_1980.topic.value_counts().map(topic_percent)
result_df_1980 = result_df_1980.sort_index()
result_df_1980

0    0.460227
1    0.164773
2    0.202652
3    0.172348
Name: topic, dtype: float64

In [39]:
test_df_1990 = test_df[(test_df.year >= 1990) & (test_df.year < 2000)]
result_df_1990 = test_df_1990.topic.value_counts() / sum(test_df_1990.topic.value_counts())
result_df_1990 = result_df_1990.sort_index()
result_df_1990




0    0.385229
1    0.207090
2    0.191433
3    0.216248
Name: topic, dtype: float64

In [42]:
test_df_2000 = test_df[(test_df.year >= 2000) & (test_df.year < 2010)]
result_df_2000 = test_df_2000.topic.value_counts() / sum(test_df_2000.topic.value_counts())
result_df_2000 = result_df_2000.sort_index()
result_df_2000

0    0.337291
1    0.224746
2    0.227327
3    0.210635
Name: topic, dtype: float64

In [43]:
test_df_2010 = test_df[(test_df.year >= 2010) & (test_df.year < 2022)]
result_df_2010 = test_df_2010.topic.value_counts() / sum(test_df_2010.topic.value_counts())
result_df_2010 = result_df_2010.sort_index()
result_df_2010

0    0.253036
1    0.279040
2    0.291185
3    0.176738
Name: topic, dtype: float64

In [44]:
result_df = pd.concat([result_df_1980, result_df_1990, result_df_2000, result_df_2010], axis=1)
result_df.columns = ['topic_1980', 'topic_1990', 'topic_2000', 'topic_2010']
result_df

Unnamed: 0,topic_1980,topic_1990,topic_2000,topic_2010
0,0.460227,0.385229,0.337291,0.253036
1,0.164773,0.20709,0.224746,0.27904
2,0.202652,0.191433,0.227327,0.291185
3,0.172348,0.216248,0.210635,0.176738
