In [4]:
from gensim.models import AuthorTopicModel
from gensim.corpora import Dictionary, bleicorpus
from gensim import corpora
from tqdm import tqdm_notebook
from pprint import pprint
import pandas as pd

# 사용자 간의 유사성을 평가하기 위한 measure를 사용하기 위해 불러오기
from gensim.matutils import hellinger
from gensim import matutils

import os

In [6]:
# Number of latent topics the author-topic model is trained with.
NUM_TOPICS = 4
# Fixed seed for training. Without it every retrain produces different topics,
# and the topic ids / labels used further down would no longer line up.
RANDOM_SEED = 42
# On-disk cache of the trained model; delete the file to force retraining.
# NOTE(review): a cached model is loaded as-is, even if NUM_TOPICS was changed
# after it was saved.
MODEL_PATH = "kakao(ATM)_model"

if os.path.exists(MODEL_PATH):
    # Reuse the cached model instead of training again.
    model = AuthorTopicModel.load(MODEL_PATH)
else:
    # `corpus`, `dictionary` and `author2doc` are not defined in this part of
    # the notebook; presumably they are built in earlier cells.
    model = AuthorTopicModel(
        corpus=corpus,
        id2word=dictionary.id2token,
        author2doc=author2doc,
        num_topics=NUM_TOPICS,
        passes=5,
        random_state=RANDOM_SEED,
    )
    model.save(MODEL_PATH)

In [9]:
# Inspect the word distribution of a single topic (topic id 2).
model.show_topic(2)

[('회사', 0.010639931686868161),
 ('뭐', 0.008227820600482244),
 ('서울', 0.007257877682602509),
 ('결혼살짝', 0.006856398379919562),
 ('그렇게', 0.0068396719099689755),
 ('십일조', 0.006837268771586288),
 ('대덕구', 0.006806303358281884),
 ('연봉', 0.006767872960892339),
 ('진짜', 0.005039934319615734),
 ('샵검색', 0.004378986237608631)]

In [10]:
# Display label for each topic; the list index is the topic id.
topic_labels = ["Topic%d" % topic_id for topic_id in range(4)]

In [13]:
# Print every topic's label followed by its top words (topn=5).
for topic_id, _formatted in model.show_topics(model.num_topics):
    print("Label : " + topic_labels[topic_id])
    top_words = model.show_topic(topic_id, topn=5)
    # Each word is followed by one space, so the line ends with a trailing space.
    words = "".join(word + " " for word, _weight in top_words)
    print("Words : ", words)
    print()

Label : Topic0
Words :  그냥 나는 그거 이제 성진아 

Label : Topic1
Words :  용석이 사진 시발 야 굿 

Label : Topic2
Words :  회사 뭐 서울 결혼살짝 그렇게 

Label : Topic3
Words :  메시지입니다 삭제된 와우 사진 조넛츠 



In [17]:
# Inspect the topic distribution of a single user.
def show_authors(name):
    """Print an author's name, document ids and labelled topic distribution.

    `name` is used as a key into `model.author2doc` and `model[...]`.
    Topic ids returned by the model are replaced with the matching entry
    of `topic_labels` before being pretty-printed. Returns None.
    """
    print("User : ", name)
    doc_ids = model.author2doc[name]
    print("Docs : ", doc_ids)
    print("Topic Distribution : ")
    labelled = [(topic_labels[topic_id], weight) for topic_id, weight in model[name]]
    pprint(labelled)

In [18]:
# Show the document ids and topic distribution of one example user.
show_authors("이현수")

User :  이현수
Docs :  [15, 19, 21, 22, 27, 111, 112, 113, 116, 123, 190, 191, 192, 196, 197, 206, 207, 217, 221, 222, 241, 247, 250, 255, 259, 260, 271, 274, 280, 282, 283, 308, 309, 310, 317, 318, 366, 407, 408, 411, 414, 419, 428, 437, 443, 444, 456, 457, 464, 465, 467, 470, 508, 528, 529, 532, 533, 534, 536, 537, 539, 540, 553, 719, 722, 735, 749, 756, 757, 758, 764, 765, 767, 771, 772, 778, 779, 782, 783, 784, 785, 787, 790, 791, 793, 797, 798, 802, 805, 820, 827, 829, 830, 859, 874, 875, 876, 877, 881, 884, 917, 918, 919, 921, 922, 926, 932, 934, 938, 991, 1362, 1404, 1405, 1406, 1407, 1449, 1450, 1452, 1460, 1475, 1478, 1483, 1486, 1487, 1488, 1492, 1497, 1498, 1500, 1501, 1503, 1504, 1509, 1510, 1513, 1515, 1516, 1519, 1520, 1521, 1524, 1527, 1530, 1531, 1533, 1551, 1552, 1598, 1600, 1602, 1604, 1605, 1606, 1608, 1609, 1610, 1611, 1612, 1614, 1615, 1618, 1619, 1620, 1621, 1623, 1624, 1625, 1626, 1636, 1637, 1638, 1639, 1644, 1645, 1648, 1650, 1652, 1654, 1655, 1713, 1870, 1871, 18

In [19]:
# Display the topic distribution of every author known to the model.
[model[author_name] for author_name in model.id2author.values()]

[[(0, 0.9900447527065799)],
 [(0, 0.05385583486371133), (3, 0.9444359339854045)],
 [(1, 0.0621073653779892), (3, 0.937276782054423)],
 [(0, 0.017754366507299703),
  (1, 0.9467440892252599),
  (2, 0.017756305873089143),
  (3, 0.017745238394351305)],
 [(0, 0.06986163076625194),
  (1, 0.39768285040534174),
  (2, 0.4609559968977104),
  (3, 0.07149952193069599)],
 [(0, 0.026821139990375515), (1, 0.2561067964521437), (2, 0.7167184277643799)],
 [(2, 0.986207537766024)],
 [(1, 0.8350050083359307), (3, 0.1640650869321945)],
 [(0, 0.1379957842916384), (2, 0.8534031409515449)],
 [(0, 0.9624618326686454), (3, 0.03681492321270207)]]

In [20]:
# The helpers in this cell use the Hellinger distance to find authors whose
# topic distribution resembles a given author's.

# Topic distribution of every author, in `model.id2author` iteration order.
author_vecs = [model[author_name] for author_name in model.id2author.values()]

def similarity(vec1, vec2):
    '''Return a similarity score for two sparse topic vectors.

    Both vectors are expanded to dense vectors of length `model.num_topics`,
    their Hellinger distance `d` is computed, and `1 / (1 + d)` is returned,
    so a distance of 0 maps to a score of 1.0.
    '''
    n_topics = model.num_topics
    dense1 = matutils.sparse2full(vec1, n_topics)
    dense2 = matutils.sparse2full(vec2, n_topics)
    return 1.0 / (1.0 + hellinger(dense1, dense2))

def get_sims(vec):
    '''Score `vec` against every entry of `author_vecs`, keeping their order.'''
    scores = []
    for other_vec in author_vecs:
        scores.append(similarity(vec, other_vec))
    return scores

def get_table(name, top_n=10, smallest_author=1):
    '''
    Rank authors by topic similarity to the author `name`.

    Parameters
    ----------
    name : key of the reference author, looked up via `model[name]`.
    top_n : maximum number of rows in the returned table.
    smallest_author : minimum number of documents an author must have
        to be included in the table.

    Returns
    -------
    pandas.DataFrame with columns "Author", "Score" and "Size" (number of
    documents), sorted by "Score" in descending order and cut to `top_n` rows.
    '''
    # Similarity of `name` to every author; position i in `sims` is treated
    # as author id i when looking up `model.id2author`.
    sims = get_sims(model[name])

    # Collect one (author, score, document count) row per qualifying author.
    table = []
    for author_id, sim in enumerate(sims):
        author_name = model.id2author[author_id]
        author_size = len(model.author2doc[author_name])
        # Skip authors with fewer documents than requested.
        if author_size >= smallest_author:
            table.append((author_name, sim, author_size))

    # Build the result table, most similar authors first.
    df = pd.DataFrame(table, columns=["Author", "Score", "Size"])
    return df.sort_values("Score", ascending=False)[:top_n]

In [21]:
# Check which users have conversation patterns similar to one example user.
get_table("이현수")

Unnamed: 0,Author,Score,Size
9,이현수,1.0,199
0,Rt건축공학조수영,0.880256,17
1,Rt경영조현준,0.566693,480
8,Rt회계학과 양윤철,0.557323,216
4,Rt무역학과이용석,0.546425,2
2,Rt글로벌비즈니스최성진,0.525767,521
5,Rt무역학과장우용,0.52196,612
3,Rt김근형,0.521283,5
7,Rt정보통신공학과전민재,0.510224,129
6,Rt법학차진영,0.500911,42
