In [70]:
import pandas as pd
import nltk
import MeCab
import re
import string
import matplotlib as mpl
import matplotlib.pyplot as plt
import platform
from wordcloud import WordCloud
import numpy as np
import pprint
import gensim
import os

# Font file used to resolve the matplotlib font family on Windows.
path = "C:/Windows/Fonts/malgun.ttf"
# Select the matplotlib font family per operating system; on any platform
# other than Windows or macOS ("Darwin") the font settings are left untouched.
_system = platform.system()
if _system == "Windows":
    # Resolve the family name from the font file itself rather than hard-coding it.
    font_name = mpl.font_manager.FontProperties(fname=path).get_name()
    mpl.rc('font', family=font_name)
elif _system == "Darwin":
    mpl.rc("font", family="AppleGothic")

In [2]:
# Matches the leading "surface<TAB>TAG" part of one tagger output line. Only the
# run of ASCII capitals directly after the tab is kept as the tag.
_TOKEN_LINE = re.compile("(.+)\t([A-Z]+)")

# Tags that nouns() keeps.
_NOUN_TAGS = frozenset(["NNG", "NNP", "NNB", "NNBC", "NP", "NR"])

# Lazily created tagger shared by pos(), morphs() and nouns(), so the tagger is
# built once instead of on every call.
_shared_tagger = None


def _tagged_tokens(text, tagger=None):
    """Parse ``text`` and return a list of ``(surface, tag)`` tuples.

    Lines of the tagger output that do not look like "surface<TAB>TAG..." are
    skipped. That covers the terminating line the previous implementation
    sliced off with ``[:-1]``, and any other non-token line that previously
    raised ``AttributeError`` on ``None.group()``.
    """
    global _shared_tagger
    if tagger is None:
        if _shared_tagger is None:
            _shared_tagger = MeCab.Tagger()
        tagger = _shared_tagger
    parsed = tagger.parse(text)
    if not parsed:
        # Nothing came back from the tagger (empty string or None).
        return []
    matches = (_TOKEN_LINE.match(line) for line in parsed.splitlines())
    return [m.groups() for m in matches if m is not None]


def pos(text, tagger=None):
    """Return ``[(surface, tag), ...]`` for every token found in ``text``.

    Args:
        text: The string to analyse.
        tagger: Optional object with a ``parse(str) -> str`` method. When
            omitted, a shared ``MeCab.Tagger()`` is created on first use.

    Returns:
        A list of 2-tuples ``(surface, tag)`` in output order.
    """
    return _tagged_tokens(text, tagger)


def morphs(text, tagger=None):
    """Return only the surface forms of the tokens found in ``text``.

    Args:
        text: The string to analyse.
        tagger: Optional tagger to reuse; see ``pos``.

    Returns:
        A list of surface strings in output order.
    """
    return [surface for surface, _tag in _tagged_tokens(text, tagger)]


def nouns(text, tagger=None):
    """Return the surface forms whose tag is one of the noun tags.

    The kept tags are NNG, NNP, NNB, NNBC, NP and NR.

    Args:
        text: The string to analyse.
        tagger: Optional tagger to reuse; see ``pos``.

    Returns:
        A list of surface strings in output order.
    """
    return [surface for surface, tag in _tagged_tokens(text, tagger) if tag in _NOUN_TAGS]

def cln(text):
    """Strip every character from ``text`` except Hangul and the space.

    Characters in the ranges ㄱ-ㅣ and 가-힣, plus the plain space character,
    are kept; everything else is removed.
    """
    disallowed = re.compile("[^ㄱ-ㅣ가-힣 ]")
    return disallowed.sub("", text)

def def_sw(path):
    """Build the stop-word set.

    The set contains every character of ``string.punctuation`` plus one entry
    per line of the UTF-8 text file at ``path`` (the text before the line's
    newline, taken verbatim).

    Args:
        path: Path of the stop-word file, one word per line.

    Returns:
        A ``set`` of strings.
    """
    stop_words = set(string.punctuation)
    with open(path, encoding="utf-8") as handle:
        stop_words.update(line.split("\n")[0] for line in handle)
    return stop_words

# Stop-word set: ASCII punctuation plus the lines of stopwords-ko.txt
# (path is relative to the notebook's working directory).
sw = def_sw("stopwords-ko.txt")

In [3]:
# Load the chat log from the Excel workbook, using its first column as the
# DataFrame index. Later cells read the "user" and "msg" columns of this frame.
raw_data = pd.read_excel("카톡.xlsx", index_col=0)

In [295]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
data = raw_data

# Group the rows by the "user" column; the groups are used below to build the
# author -> documents mapping.
auts = data.groupby("user", as_index=True)

In [296]:
# Map every author to the corpus positions of their messages.
#
# The corpus is built by iterating data["msg"] in row order, so a document's id
# is its row *position*. GroupBy.groups returns index *labels*, which only
# coincide with positions when the index is exactly 0..n-1 (not guaranteed,
# because the frame is loaded with index_col=0). GroupBy.indices returns the
# positional offsets, which is what author2doc needs.
aut2doc = {
    user: [int(doc_pos) for doc_pos in doc_positions]  # plain Python ints, not numpy scalars
    for user, doc_positions in auts.indices.items()
}

In [297]:
# Whitespace-tokenize every message, producing exactly one token list per row so
# that document positions stay aligned with the rows of `data`.
# Cells read from Excel are not guaranteed to be strings: an empty cell arrives
# as NaN and a digits-only message as a number, both of which would make
# `msg.split()` raise AttributeError. Missing cells become empty documents and
# everything else is stringified before splitting.
msgs_tkn = [[] if pd.isna(msg) else str(msg).split() for msg in data["msg"]]

# Build the token <-> id dictionary once and cache it on disk.
# NOTE: the cached file is reused whenever it exists, even if the data has
# changed since it was written; delete "kakaotalk id2word" to force a rebuild.
if os.path.exists("kakaotalk id2word"):
    id2word = gensim.corpora.Dictionary.load("kakaotalk id2word")
else:
    id2word = gensim.corpora.Dictionary(msgs_tkn)
    id2word.save("kakaotalk id2word")

In [298]:
# Bag-of-words corpus (one document per message), cached on disk in Blei format.
# NOTE: the cached file is reused whenever it exists; delete "kakaotalk dtm" to
# force a rebuild after the data or the dictionary changes.
if os.path.exists("kakaotalk dtm"):
    # Materialise the streamed corpus so `dtm` is a list in both branches.
    # AuthorTopicModel (with its default serialized=False) requires the corpus
    # to be a list and rejects a BleiCorpus object.
    dtm = list(gensim.corpora.BleiCorpus("kakaotalk dtm"))
else:
    dtm = [id2word.doc2bow(msg) for msg in msgs_tkn]
    gensim.corpora.BleiCorpus.serialize("kakaotalk dtm", dtm)

In [299]:
%%time
n_topics = 4
if not os.path.exists("kakaotalk model"):
    model = gensim.models.AuthorTopicModel(corpus=dtm, id2word=id2word, num_topics=n_topics, author2doc=aut2doc, passes=1000)
    model.save("kakaotalk model")
else:
    model = gensim.models.AuthorTopicModel.load("kakaotalk model")

Wall time: 8.98 ms


In [300]:
# Display 20 (word, probability) pairs for topic 1.
model.show_topic(1, topn=20)

[('애저', 0.0003456090116005109),
 ('ㅋㅋㅋ', 0.00033980361590891595),
 ('사진', 0.0003398034430835203),
 ('아님?', 0.00033979944199907667),
 ('ㅋㅋㅋㅋ', 0.00033979685393563684),
 ('7시10분', 0.00033979624721101285),
 ('ㅋㅋ', 0.00033979560989994245),
 ('아', 0.0003397955392544214),
 ('ㅇㅇ', 0.00033979271215195796),
 ('ㅋㅋㅋㅋㅋㅋ', 0.0003397926560857651),
 ('굳', 0.00033979199123262407),
 ('오', 0.00033979171986388436),
 ('ㅋㅋㅋㅋㅋ', 0.0003397914965882391),
 ('나', 0.0003397914806650847),
 ('골드스푼!', 0.0003397914695042316),
 ('허허', 0.0003397914397771411),
 ('가볼까?', 0.0003397913927656504),
 ('헐진짜?!', 0.0003397913776875538),
 ('구글드라이브', 0.0003397912750498104),
 ('밤에?', 0.00033979098444283484)]