In [1]:
!pip install nltk
!pip install gensim
!pip install pyLDAvis



In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import collections

In [3]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("reuters")
nltk.download("brown")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### データを取得

In [4]:
from nltk.corpus import brown as corpus

### datasetの中身を確認

In [5]:
for n,item in enumerate(corpus.words(corpus.fileids()[0])[:300]):
    print(item, end=" ")
    if (n%25) ==24:
      print(" ")

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .  
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise  
and thanks of the City of Atlanta '' for the manner in which the election was conducted . The September-October term jury had been charged  
by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan  
Allen Jr. . `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in  
the election , the number of voters and the size of this city '' . The jury said it did find that many of Georgia's  
registration and election laws `` are outmoded or inadequate and often ambiguous '' . It recommended that Fulton legislators act `

In [6]:
docs=[corpus.words(fileid) for fileid in corpus.fileids()]
print(docs[:5])
print("num of docs:", len(docs))

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...], ['Austin', ',', 'Texas', '--', 'Committee', 'approval', ...], ['Several', 'defendants', 'in', 'the', 'Summerdale', ...], ['Oslo', 'The', 'most', 'positive', 'element', 'to', ...], ['East', 'Providence', 'should', 'organize', 'its', ...]]
num of docs: 500


## 前処理編

### 例 : ストップワードリストの作成

### nltkのストップワードリスト

In [7]:
en_stop = nltk.corpus.stopwords.words('english')

### 例:【発展】記号や数字は正規表現で消してみる

In [8]:
en_stop= ["``","/",",.",".,",";","--",":",")","(",'"','&',"'",'),',',"','-','.,','.,"','.-',"?",">","<", 
          "0","1","2","3","4","5","6","7","8","9","10","11","12","86","1986","1987","000", 
          "said","say","u","v","mln","ct","net","dlrs","tonne","pct","shr","nil","company","lt","share","year","billion","price"] \
         +en_stop

### 前処理関数の作成

In [9]:
from nltk.corpus import wordnet as wn #lemmatize関数のためのimport

def preprocess_word(word, stopwordset):
    
    #1.make words lower ex: Python =>python
    word=word.lower()
    
    #2.remove "," and "."
    if word in [",",".", "!", "\"", "''"]:
        return None
    
    #3.remove stopword  ex: the => (None) 
    if word in stopwordset:
        return None
    
    #4.lemmatize  ex: cooked=>cook
    lemma = wn.morphy(word)
    if lemma is None:
        return word

    elif lemma in stopwordset: #lemmatizeしたものがstopwordである可能性がある
        return None
    else:
        return lemma
    

def preprocess_document(document):
    document=[preprocess_word(w, en_stop) for w in document]
    document=[w for w in document if w is not None]
    return document

def preprocess_documents(documents):
    return [preprocess_document(document) for document in documents]

In [10]:
import gensim
from gensim import corpora

In [11]:
#documentを，gensim LDAが読み込めるデータ構造にする

#辞書の作成
dictionary = corpora.Dictionary(preprocess_documents(docs))
#コーパスの作成
corpus_ = [dictionary.doc2bow(doc) for doc in preprocess_documents(docs)]

In [12]:
#Dictionary:gensimにおける辞書クラス
#token2id属性には単語と辞書IDとの対応が格納される

print(dictionary.token2id)



In [13]:
#corpusにはdocumentごとに単語の(ID、出現回数)のリストが得られる

print(corpus_[0][:10]) #文章での出現順でなく辞書IDの若い順なことに注意

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


In [14]:
#before
print([w.lower() for w in corpus.sents(corpus.fileids()[0])[0]])

#after
print(dictionary.doc2bow([w.lower() for w in corpus.sents(corpus.fileids()[0])[0]]))

#これを全文書の全文に適用したのがcorpus_

['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
[(68, 1), (149, 1), (196, 1), (214, 1), (247, 1), (250, 1), (273, 1), (312, 1), (327, 1), (434, 1), (454, 1), (487, 1)]


## LDA学習

In [15]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus_,
                                           num_topics=20,
                                           id2word=dictionary,
                                           alpha=0.1,                             #optional LDAのハイパーパラメータalpha
                                           eta=0.1,                                 #optional LDAのハイパーパラメータbeta
                                           #minimum_probability=0.0    #optional 学習結果に格納するトピック・単語の生起確率の下限
                                          )

## パラメータの確認

In [16]:
#(トピックID, 当該トピックにおける単語とそのprobability)  ※　のうち、上位num_words位 
topics = ldamodel.print_topics(num_words=15)
for topic in topics:
    print(topic)

(0, '0.007*"one" + 0.005*"would" + 0.003*"make" + 0.003*"could" + 0.003*"go" + 0.003*"two" + 0.003*"like" + 0.003*"know" + 0.002*"come" + 0.002*"time" + 0.002*"take" + 0.002*"may" + 0.002*"first" + 0.002*"man" + 0.002*"get"')
(1, '0.005*"would" + 0.004*"make" + 0.004*"one" + 0.003*"could" + 0.003*"new" + 0.003*"two" + 0.002*"first" + 0.002*"way" + 0.002*"even" + 0.002*"years" + 0.002*"take" + 0.002*"state" + 0.002*"get" + 0.002*"must" + 0.002*"time"')
(2, '0.005*"would" + 0.005*"one" + 0.004*"take" + 0.003*"make" + 0.003*"know" + 0.003*"may" + 0.003*"first" + 0.002*"new" + 0.002*"time" + 0.002*"two" + 0.002*"get" + 0.002*"come" + 0.002*"go" + 0.002*"state" + 0.002*"use"')
(3, '0.007*"one" + 0.003*"first" + 0.003*"take" + 0.003*"make" + 0.003*"would" + 0.003*"get" + 0.003*"new" + 0.003*"two" + 0.002*"like" + 0.002*"even" + 0.002*"man" + 0.002*"come" + 0.002*"also" + 0.002*"could" + 0.002*"long"')
(4, '0.006*"one" + 0.004*"would" + 0.003*"could" + 0.003*"make" + 0.003*"use" + 0.003*"time

In [17]:
#[(当該documentにおけるトピックIDとそのprobability　)]　 ※　のうち、minimum_probabilityの値を超えるもの

for n,item in enumerate(corpus_[:10]):
    print("document ID "+str(n)+":" ,end="")
    print(ldamodel.get_document_topics(item))

document ID 0:[(2, 0.048623264), (6, 0.6747069), (11, 0.24067448), (13, 0.03458788)]
document ID 1:[(6, 0.061847072), (13, 0.3509392), (17, 0.5787913)]
document ID 2:[(6, 0.5640871), (12, 0.013872216), (13, 0.40693364), (15, 0.011220149)]
document ID 3:[(13, 0.9982909)]
document ID 4:[(6, 0.01599767), (8, 0.023094276), (13, 0.95208645)]
document ID 5:[(6, 0.8243922), (13, 0.17398436)]
document ID 6:[(1, 0.01602458), (6, 0.0697521), (7, 0.37992334), (11, 0.12551335), (13, 0.4068964)]
document ID 7:[(13, 0.48603547), (17, 0.039738573), (18, 0.4631301)]
document ID 8:[(6, 0.33005798), (8, 0.30803716), (13, 0.18848069), (14, 0.17098513)]
document ID 9:[(6, 0.97963554), (13, 0.016176792)]


In [18]:
#documentのcategory
categories=[corpus.categories(fileid) for fileid in corpus.fileids()]

In [19]:
n=0

#n番目のdocumentのトピック分布
print(ldamodel.get_document_topics(corpus_[n]))

#n番目のdocumentのcategory
print(categories[n])

#n番目のdocumentの生の文章
print(" ".join(docs[n]))

[(2, 0.034651604), (6, 0.67422926), (11, 0.25774133), (13, 0.03197023)]
['news']


In [20]:
from gensim.models.ldamodel import CoherenceModel

for i in range(2, 21):
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus_,
                                           num_topics=i,
                                           id2word=dictionary,
                                           alpha=0.1,                             #optional LDAのハイパーパラメータalpha
                                           eta=0.1,                                 #optional LDAのハイパーパラメータbeta
                                           #minimum_probability=0.0    #optional 学習結果に格納するトピック・単語の生起確率の下限
                                          )

    cm = CoherenceModel(model = lda, corpus = corpus_, coherence = 'u_mass')
    coherence = cm.get_coherence()

    perwordbound = lda.log_perplexity(corpus_)
    perplexity = np.exp2(-perwordbound)

    print(f"num_topics = {i}, coherence = {coherence}, perplexity = {perplexity}")

num_topics = 2, coherence = -0.335760897137364, perplexity = 517.5607000892712
num_topics = 3, coherence = -0.323538456288812, perplexity = 547.9550990621244
num_topics = 4, coherence = -0.34488007872512905, perplexity = 574.2717604985168
num_topics = 5, coherence = -0.2957108043735956, perplexity = 594.7729179964059
num_topics = 6, coherence = -0.2983580210343371, perplexity = 617.1277522230955
num_topics = 7, coherence = -0.36441198704398114, perplexity = 636.0639321945965
num_topics = 8, coherence = -0.29514197648123097, perplexity = 660.7899014273956
num_topics = 9, coherence = -0.32886112637945497, perplexity = 674.7736020088561
num_topics = 10, coherence = -0.3650439005396283, perplexity = 701.5669413534793
num_topics = 11, coherence = -0.3341921607633729, perplexity = 713.3297779008767
num_topics = 12, coherence = -0.3441206448214671, perplexity = 742.2186421025128
num_topics = 13, coherence = -0.34155504993108327, perplexity = 762.937191464733
num_topics = 14, coherence = -0.31

## 可視化

In [21]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [22]:
#全documentを学習に用いた場合結構時間がかかる(20min~)
#gensimではK個のトピックに0~K-1のidが割り振られていたのに対し，pyLDAvisでは1~Kのidが割り振られていることに注意

lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus_, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [23]:
#上で可視化したモデルをgoogle drive上にsaveできる

pyLDAvis.save_html(lda_display,'vis.html')