In [4]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

In [5]:
import jieba
from langconv import *
# Load two user dictionary files into jieba, in this order.
for user_dict_path in ("dict.txt.big.txt", "dict.txt"):
    jieba.load_userdict(user_dict_path)

In [7]:

# English LDA walk-through: tokenize, drop stop words, stem, build a
# bag-of-words corpus and fit a two-topic LDA model on five sample sentences.

# Split text into runs of word characters (regex \w+).
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')
# Set copy of the stop words so each membership test below is a hash lookup
# instead of a scan over the whole list.
stop_lookup = set(en_stop)

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
# The comprehension variables are deliberately named differently from the
# outer loop variable: a Python 2 list comprehension rebinds its loop variable
# in the enclosing scope, so reusing one name would overwrite the current
# document with a token.
for doc in doc_set:

    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in stop_lookup]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
# NOTE: under Python 2 the comprehension variable `text` stays bound to the
# last document after this line.
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
# NOTE(review): no random seed is passed, so the learned topics may differ
# between runs -- confirm whether the installed gensim accepts `random_state`.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

In [8]:
# Show the three highest-weighted terms of every topic. The model in the
# previous cell was trained with num_topics=2, so two topics are requested
# (the original asked for three, yet the recorded output lists two).
print(ldamodel.print_topics(num_topics=2, num_words=3))

[(0, u'0.086*health + 0.086*brocolli + 0.086*good'), (1, u'0.068*mother + 0.068*brother + 0.068*drive')]


In [9]:
# Count how often the stem 'health' occurs in the last sample document.
# `texts[-1]` is used explicitly instead of the bare name `text`, which is only
# defined when a Python 2 list comprehension leaks its loop variable.
print(texts[-1].count('health'))

2


In [10]:
# Print the bag-of-words vector of each document, one per line.
for bow in corpus:
    print(bow)

[(0, 2), (1, 2), (2, 1), (3, 1), (4, 1), (5, 2)]
[(3, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]
[(8, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]
[(3, 1), (4, 1), (8, 1), (18, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]
[(0, 1), (1, 1), (19, 2), (30, 1), (31, 1)]


In [11]:
# Read the file "essay" line by line. Each line is a byte string (Python 2),
# so it is decoded from UTF-8 and then passed through a 'zh-hant' Converter.
# NOTE(review): Converter presumably comes from the `langconv` wildcard import
# and 'zh-hant' presumably selects Traditional Chinese -- confirm.
content = []
with open("essay") as essay_file:
    content.extend(
        Converter('zh-hant').convert(raw_line.decode('utf-8'))
        for raw_line in essay_file
    )

In [12]:
# Demo of word segmentation algorithm
print content[0]
print " / ".join(jieba.cut(content[0]))

剛剛擊敗世界圍棋冠軍的 AlphaGo，是怎樣「思考」的？"

剛剛 / 擊敗 / 世界圍棋 / 冠軍 / 的 /   / AlphaGo / ， / 是 / 怎樣 / 「 / 思考 / 」 / 的 / ？ / " / 



In [13]:
# Token filter used when segmenting the essay. Despite its name it performs no
# CJK-specific test: it only checks that the word is alphabetic.
def isCJK(w):
    """Return True if `w` is non-empty and consists only of alphabetic characters.

    Any alphabetic word passes, not just CJK ones (the notebook's recorded
    output shows tokens such as "AlphaGo" surviving this filter). Words that
    contain digits, punctuation or whitespace are rejected.
    """
    return bool(w.isalpha())

# Segment every line of the essay with jieba and keep only the tokens that are
# longer than one character and accepted by isCJK(); one token list per line.
import unicodedata
texts = []
for line_text in content:
    kept_tokens = [
        word.strip()
        for word in jieba.cut(line_text)
        if len(word) > 1 and isCJK(word)
    ]
    texts.append(kept_tokens)

In [14]:
# Build the id <-> term mapping from the segmented essay lines.
dictionary = corpora.Dictionary(texts)

# One bag-of-words vector per line, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

In [15]:
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=corpus)

In [16]:
## Frequency Count
print "Frequency Count: "
for id, cnt in corpus[40]:
    print "(%s, %d) " % (dictionary[id], cnt),

Frequency Count: 
(AlphaGo, 1)  (一個, 1)  (大腦, 1)  (落子, 1)  (選擇器, 1)  (第二個, 1)  (問題, 1)  (回答, 1)  (相對, 1) 


In [17]:
print "Term weighting by TF-IDF: "
for id, score in tfidf[corpus[1]]:
    print "(%s, %.2f) " % (dictionary[id], score),

Term weighting by TF-IDF: 
(冠軍, 0.23)  (AlphaGo, 0.06)  (歐洲, 0.27)  (首次, 0.27)  (研發, 0.27)  (宣佈, 0.27)  (選手, 0.20)  (AI, 0.12)  (職業, 0.20)  (谷歌, 0.27)  (Hui, 0.27)  (圍棋, 0.28)  (戰勝, 0.27)  (他們, 0.15)  (人類, 0.13)  (二段, 0.27)  (Fan, 0.27)  (DeepMind, 0.27)  (神經網絡, 0.13) 


In [18]:
# Fit a two-topic LDA model on the TF-IDF-weighted corpus (20 passes).
# NOTE(review): no random seed is passed, so topics may differ between runs.
ldamodel = models.LdaModel(
    corpus=tfidf[corpus],
    num_topics=2,
    id2word=dictionary,
    passes=20
)

In [19]:
for index,topic in ldamodel.print_topics(num_topics=2, num_words=10):
    print index,topic

0 0.005*大腦 + 0.005*AlphaGo + 0.005*判斷 + 0.004*落子 + 0.004*閱讀 + 0.004*局面 + 0.004*可能 + 0.004*如果 + 0.004*這個 + 0.004*下一步
1 0.006*訓練 + 0.006*通過 + 0.005*一個 + 0.005*落子 + 0.005*圍棋 + 0.005*他們 + 0.004*選擇器 + 0.004*使用 + 0.004*網絡 + 0.004*AlphaGo


In [20]:
# Fit a two-topic LSI model on the TF-IDF-weighted corpus.
lsi = models.LsiModel(corpus=tfidf[corpus], num_topics=2, id2word=dictionary)

In [21]:
for i,j in lsi.show_topic(0)[:10]: print i,j

落子 -0.223021077592
選擇器 -0.187362307902
局面 -0.185162691145
大腦 -0.178620820471
通過 -0.168648805455
訓練 -0.146857883844
AlphaGo -0.146011080108
閱讀 -0.144261517574
評估器 -0.141535235406
他們 -0.139948636181


In [22]:
print tfidf[corpus]

<gensim.interfaces.TransformedCorpus object at 0x116759210>


In [27]:
for i,j in corpus[1]: print dictionary[i],j

冠軍 1
AlphaGo 1
歐洲 1
首次 1
研發 1
宣佈 1
選手 1
AI 1
職業 1
谷歌 1
Hui 1
圍棋 2
戰勝 1
他們 1
人類 1
二段 1
Fan 1
DeepMind 1
神經網絡 1


In [50]:
# Register one hand-written document with the dictionary, then append its
# bag-of-words vector to the corpus.
# NOTE(review): `corpus` is extended in place, so re-running this cell appends
# the document again; the tfidf / LDA / LSI models in earlier cells were built
# before this addition.
document = ["圍棋", "選擇器"]
dictionary.add_documents([document])
document_bow = dictionary.doc2bow(document)
corpus.append(document_bow)

In [53]:
 # Small scratch dictionary mapping number names to their values.
 a = {'one': 1, 'two': 2, 'three': 3}

In [57]:
# Look up the key 'one' in the scratch dictionary `a`; dict.get returns None
# rather than raising when a key is absent.
a.get('one')

1

In [63]:
# Scratch string used by the length check in the next cell.
china = 'DeepMind'

In [65]:
# Length of the string bound to `china`.
# NOTE(review): the recorded output below (6) does not equal
# len("DeepMind") == 8, so it was presumably produced while `china` held a
# different value -- re-run the cell to confirm.
len(china)

6

In [None]:
# NOTE: an unfinished `jieba.` expression was removed from this scratch cell;
# it was a syntax error that stopped "Restart Kernel -> Run All".