# LDA 기반 Author Topic Model

- LDA에서 저자정보가 추가된 토픽모델.
- LDA가 문서별 토픽분포를 추정하는 것과 달리, 문서-저자 대응 정보를 추가로 받아 저자별 토픽분포를 추정하는 형태.

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

import pickle

In [13]:
# Load the cleaned data, which is stored as a pickled pandas DataFrame.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted preprocessing step.
with open("./data/cleaned_data.pk", "rb") as f:
    data = pickle.load(f)

# Renumber the rows 0..N-1 and drop the old index.
data.reset_index(inplace=True, drop=True)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
data.info()
print(data.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2223 entries, 0 to 2222
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     2223 non-null   object
 1   User     2223 non-null   object
 2   Message  2223 non-null   object
dtypes: object(3)
memory usage: 52.2+ KB
None
                     Date          User                   Message
2218  2021-03-30 10:19:34    Rt회계학과 양윤철  내친구 댕기는 회사는 전액 회사 부담이라서 
2219  2021-03-30 10:19:41  Rt글로벌비즈니스최성진                         굿
2220  2021-03-30 10:19:42  Rt글로벌비즈니스최성진                       복지굿
2221  2021-03-30 10:19:54  Rt글로벌비즈니스최성진                  너가 건의해봐 
2222  2021-03-30 10:20:29  Rt글로벌비즈니스최성진                   장우용 핼맷컷


### ATM 사용을 위한 데이터 처리

In [14]:
# Collect the distinct values of the "User" column.
users = {*data["User"]}
users

{'Rt건축공학조수영',
 'Rt경영조현준',
 'Rt글로벌비즈니스최성진',
 'Rt김근형',
 'Rt무역학과이용석',
 'Rt무역학과장우용',
 'Rt법학차진영',
 'Rt정보통신공학과전민재',
 'Rt회계학과 양윤철',
 '이현수'}

In [15]:
# Group the rows by user; `.groups` maps each user to the index labels of
# that user's rows.
authors = data.groupby("User")
user_groups = authors.groups
pprint(user_groups)
print(type(user_groups))

{'Rt건축공학조수영': Int64Index([57, 58, 76, 77, 78, 86, 87, 88, 89, 90, 92, 93, 94, 527, 1690,
            1919, 2203],
           dtype='int64'),
 'Rt경영조현준': Int64Index([   1,    2,    3,    4,    7,    8,    9,   10,   11,   12,
            ...
            2159, 2187, 2188, 2189, 2190, 2191, 2195, 2196, 2207, 2209],
           dtype='int64', length=480),
 'Rt글로벌비즈니스최성진': Int64Index([  81,   82,  105,  114,  115,  117,  118,  125,  126,  127,
            ...
            2200, 2201, 2210, 2211, 2216, 2217, 2219, 2220, 2221, 2222],
           dtype='int64', length=521),
 'Rt김근형': Int64Index([520, 523, 1269, 1270, 1276], dtype='int64'),
 'Rt무역학과이용석': Int64Index([1543, 1545], dtype='int64'),
 'Rt무역학과장우용': Int64Index([   6,   13,   36,   37,   44,   45,   48,   49,   51,   53,
            ...
            2114, 2115, 2119, 2122, 2124, 2174, 2175, 2176, 2177, 2184],
           dtype='int64', length=612),
 'Rt법학차진영': Int64Index([   0,   29,   30,   59,   95,   96,  100,  275,  515,  516,  519,
    

In [16]:
# Convert each user's group index (a pandas Index) into a plain list, giving
# the user -> row-label-list mapping that is later passed to the model as
# `author2doc`.
author2doc = {
    user: list(doc_ids) for user, doc_ids in authors.groups.items()
}

print(author2doc)

{'Rt건축공학조수영': [57, 58, 76, 77, 78, 86, 87, 88, 89, 90, 92, 93, 94, 527, 1690, 1919, 2203], 'Rt경영조현준': [1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 20, 23, 24, 25, 26, 28, 31, 32, 33, 34, 35, 38, 39, 40, 41, 42, 43, 46, 47, 50, 55, 56, 60, 61, 62, 65, 67, 68, 69, 70, 73, 74, 75, 80, 162, 163, 168, 171, 172, 173, 177, 179, 180, 184, 186, 187, 193, 194, 195, 198, 200, 204, 205, 209, 210, 213, 216, 218, 220, 223, 224, 229, 234, 236, 238, 244, 249, 252, 256, 261, 262, 264, 265, 272, 273, 277, 279, 285, 286, 287, 289, 290, 291, 292, 293, 312, 313, 314, 351, 352, 353, 356, 357, 446, 447, 448, 450, 452, 453, 454, 460, 461, 462, 463, 466, 468, 469, 472, 473, 474, 475, 476, 478, 479, 483, 484, 485, 486, 487, 500, 502, 505, 513, 525, 535, 542, 546, 549, 551, 552, 555, 556, 569, 572, 578, 582, 588, 589, 592, 593, 594, 597, 601, 604, 608, 617, 621, 622, 625, 628, 631, 632, 637, 658, 664, 667, 669, 671, 672, 673, 674, 676, 679, 680, 683, 686, 687, 689, 693, 694, 697, 701, 703, 705, 706, 710, 71

In [17]:
# Build the input documents for gensim: split every message on whitespace.
tokenized_data = [message.split() for message in data["Message"]]
print(tokenized_data[6:11])

[['뭐'], ['뭘그만해'], ['칭찬해주는걸', '멈추라는건'], ['내겐', '가장어렵지'], ['넷마블', '어제보다', '7천원올랐다']]


### gensim을 이용한 Author Topic Model

In [18]:
from gensim.models import AuthorTopicModel
from gensim.corpora import Dictionary, bleicorpus
from gensim import corpora

import os

In [27]:
# Build the Dictionary (token <-> id mapping) for the ATM, or reuse a saved one.
# NOTE(review): the cache check looks at the file name only; delete the saved
# files after changing the input data, otherwise a stale dictionary/corpus is
# silently reused.
if not os.path.exists("kakao(ATM)_dict"):
    dictionary = Dictionary(tokenized_data)
    dictionary.save("kakao(ATM)_dict")
else:
    dictionary = Dictionary.load("kakao(ATM)_dict")

# Build the bag-of-words corpus for the ATM, or reuse the serialized one.
if not os.path.exists("kakao(ATM)_corpus"):
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_data]
    corpora.BleiCorpus.serialize("kakao(ATM)_corpus", corpus)
else:
    # Materialize the streamed corpus into a list so both branches yield the
    # same type: AuthorTopicModel (in its default, non-serialized mode)
    # requires the input corpus to be a list and fails on a BleiCorpus object.
    corpus = list(bleicorpus.BleiCorpus("kakao(ATM)_corpus"))

In [28]:
# Sanity-check the sizes of the inputs that go into the ATM.
summary = (
    ("Number of authors : %d", len(authors)),
    ("Number of unique tokens : %d", len(dictionary)),
    ("Number of documents : %d", len(corpus)),
)
for template, count in summary:
    print(template % count)

Number of authors : 10
Number of unique tokens : 3527
Number of documents : 2223


In [32]:
# Show the first three bag-of-words documents in human-readable form, i.e. as
# (token, count) pairs instead of (token id, count) pairs.
from itertools import islice

[
    [(dictionary[token_id], freq) for token_id, freq in doc]
    for doc in islice(corpus, 3)  # convert only the documents that are shown
]

[[('일어나자', 1)], [('대답들좀', 1)], [('해라', 1)]]

In [33]:
# Train the Author Topic Model, or reuse a previously saved one.
NUM_TOPICS = 4

if not os.path.exists("kakao(ATM)_model"):
    # Pass the Dictionary itself as id2word. Dictionary.id2token is filled
    # lazily (only once the dictionary has been indexed), so handing that dict
    # to the model can give it an empty vocabulary depending on which cells
    # happened to run before this one.
    model = AuthorTopicModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=NUM_TOPICS,
        author2doc=author2doc,
        passes=5,
    )
    model.save("kakao(ATM)_model")
else:
    model = AuthorTopicModel.load("kakao(ATM)_model")

In [37]:
# Inspect the training result: the ten highest-weighted words of one topic.
topic_id = 1
model.show_topic(topic_id, topn=10)

[('메시지입니다', 0.06998926472390145),
 ('삭제된', 0.06942961420509),
 ('사진', 0.016675978480968377),
 ('야', 0.007292905226777227),
 ('윤철이', 0.00706231464006189),
 ('조넛츠', 0.006293122024314025),
 ('술병', 0.005944345011876993),
 ('왜', 0.005043627407625348),
 ('양윤철', 0.004692501109712981),
 ('장우용', 0.004136543174538135)]