# Gensim pakage

In [2]:
## Bag of Words encoring
## TF-IDF encording
## Topic Modeling

In [3]:
!pip install gensim

Collecting gensim
  Downloading gensim-3.8.3-cp37-cp37m-macosx_10_9_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 452 kB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-2.0.0.tar.gz (103 kB)
[K     |████████████████████████████████| 103 kB 27.0 MB/s eta 0:00:01
Collecting boto
  Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 14.4 MB/s eta 0:00:01
[?25hCollecting boto3
  Downloading boto3-1.13.11-py2.py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 24.8 MB/s eta 0:00:01
Collecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting s3transfer<0.4.0,>=0.3.0
  Downloading s3transfer-0.3.3-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 8.5 MB/s  eta 0:00:01
[?25hCollecting botocore<1.17.0,>=1.16.11
  Downloading botocore-1.16.11-py2.py3-none-any.whl (6.2 MB)
[K     |█████████████████████

## Bag of Words encoring

#### - Dictionary Class use
######     token2id 속성으로 사전저장
######     doc2bow 메서드로 BOW 인코딩
#### - TfidfModel 클래스를 이용하면 TF-IDF 인코딩도 가능

In [4]:
# step 1
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]

In [5]:
# step 2
token_list = [[text for text in doc.split()] for doc in corpus]
token_list

[['This', 'is', 'the', 'first', 'document.'],
 ['This', 'is', 'the', 'second', 'second', 'document.'],
 ['And', 'the', 'third', 'one.'],
 ['Is', 'this', 'the', 'first', 'document?'],
 ['The', 'last', 'document?']]

In [7]:
# step 3 dict. object create
from gensim.corpora import Dictionary

dictionary = Dictionary(token_list)
dictionary.token2id


{'This': 0,
 'document.': 1,
 'first': 2,
 'is': 3,
 'the': 4,
 'second': 5,
 'And': 6,
 'one.': 7,
 'third': 8,
 'Is': 9,
 'document?': 10,
 'this': 11,
 'The': 12,
 'last': 13}

In [9]:
# step 4 BOW ENCORDING
term_matrix = [dictionary.doc2bow(token) for token in token_list]
term_matrix # sparse matrix form

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (1, 1), (3, 1), (4, 1), (5, 2)],
 [(4, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (4, 1), (9, 1), (10, 1), (11, 1)],
 [(10, 1), (12, 1), (13, 1)]]

In [11]:
# step 5 TF-IDF encording
from gensim.models import TfidfModel

tfidf = TfidfModel(term_matrix)

for doc in tfidf[term_matrix]:
    print("doc:")
    for k, v in doc:
        print(k, v) # 중요도 만큼 스케일링(감중)

doc:
0 0.49633406058198626
1 0.49633406058198626
2 0.49633406058198626
3 0.49633406058198626
4 0.12087183801361165
doc:
0 0.25482305694621393
1 0.25482305694621393
3 0.25482305694621393
4 0.0620568558708622
5 0.8951785160431313
doc:
4 0.07979258234193365
6 0.5755093812740171
7 0.5755093812740171
8 0.5755093812740171
doc:
2 0.3485847413542797
4 0.08489056411237639
9 0.6122789185961829
10 0.3485847413542797
11 0.6122789185961829
doc:
10 0.37344696513776354
12 0.6559486886294514
13 0.6559486886294514


# ML에서 토픽 모델링(특정 단어의 비중) 을 할때 
<img src= 'resources/topicmodel.png'>

# Mobis_TextMining

In [17]:
# step 1 text download
from sklearn.datasets import fetch_20newsgroups

newsgroups = fetch_20newsgroups(categories = ["comp.graphics", "rec.sport.baseball", "sci.med"])

In [19]:
# setp 2 noun extract

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

tagged_list = [pos_tag(word_tokenize(doc))for doc in newsgroups.data]
nouns_list = [[t[0]for t in doc if t[1].startswith("N")] for doc in tagged_list]

In [20]:
# setp 3 표제어 extract
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()

nouns_list = [[lm.lemmatize(w, pos = 'n') for w in doc] for doc in nouns_list]

In [21]:
# setp 4 stop-words remove
import re
token_list = [[text.lower() for text in doc] for doc in nouns_list]
token_list = [[re.sub(r"[^A-Za-z]+", '',word) for word in doc] for doc in token_list]

In [22]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words += ["", "subject", "article", "line", "year", "month", "address", "keyword", "msg"]

token_list = [[word for word in doc
              if (word not in stop_words) and (2 < len(word) < 10)]
             for doc in token_list]

In [24]:
# step 5 topic modeling
from gensim import corpora

dictionary = corpora.Dictionary(token_list)
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in token_list]

In [26]:
%%time
from gensim.models.ldamodel import LdaModel

model = LdaModel(corpus=doc_term_matrix, id2word=dictionary, num_topics=3)

CPU times: user 7.93 s, sys: 40.1 ms, total: 7.97 s
Wall time: 2.54 s


In [27]:
model.print_topics()

[(0,
  '0.013*"lines" + 0.005*"game" + 0.005*"time" + 0.004*"geb" + 0.004*"banks" + 0.003*"people" + 0.003*"gordon" + 0.003*"day" + 0.003*"computer" + 0.003*"way"'),
 (1,
  '0.013*"lines" + 0.007*"image" + 0.006*"file" + 0.005*"team" + 0.005*"time" + 0.004*"game" + 0.004*"people" + 0.004*"program" + 0.004*"computer" + 0.004*"science"'),
 (2,
  '0.009*"lines" + 0.007*"image" + 0.005*"time" + 0.005*"point" + 0.005*"problem" + 0.004*"color" + 0.004*"program" + 0.003*"system" + 0.003*"anyone" + 0.003*"player"')]

In [32]:
# step viualization
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, doc_term_matrix, dictionary)
vis
