# Тематическое моделирование в gensim

gensim: https://radimrehurek.com/gensim/ 

In [2]:
import gensim
import string
import nltk

nltk.download('punkt')
nltk.download('stopwords')

In [3]:
from sklearn.datasets import fetch_20newsgroups

twenty = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

### Предобработка текста в nltk

In [33]:
from nltk import word_tokenize
from collections import Counter

from gensim import corpora, models, similarities
from nltk.corpus import stopwords

In [5]:
docs = []
for text in twenty.data:
    tokens = [w.lower() for w in word_tokenize(text) if not w in string.punctuation]
    docs.append(tokens)

In [6]:
print('Токенизированный текст:')
print(docs[4])

Токенизированный текст:
['1', 'i', 'have', 'an', 'old', 'jasmine', 'drive', 'which', 'i', 'can', 'not', 'use', 'with', 'my', 'new', 'system', 'my', 'understanding', 'is', 'that', 'i', 'have', 'to', 'upsate', 'the', 'driver', 'with', 'a', 'more', 'modern', 'one', 'in', 'order', 'to', 'gain', 'compatability', 'with', 'system', '7.0.1.', 'does', 'anyone', 'know', 'of', 'an', 'inexpensive', 'program', 'to', 'do', 'this', 'i', 'have', 'seen', 'formatters', 'for', '20', 'buit', 'have', 'no', 'idea', 'if', 'they', 'will', 'work', '2', 'i', 'have', 'another', 'ancient', 'device', 'this', 'one', 'a', 'tape', 'drive', 'for', 'which', 'the', 'back', 'utility', 'freezes', 'the', 'system', 'if', 'i', 'try', 'to', 'use', 'it', 'the', 'drive', 'is', 'a', 'jasmine', 'direct', 'tape', 'bought', 'used', 'for', '150', 'w/', '6', 'tapes', 'techmar', 'mechanism', 'essentially', 'i', 'have', 'the', 'same', 'question', 'as', 'above', 'anyone', 'know', 'of', 'an', 'inexpensive', 'beckup', 'utility', 'i', 'can

In [9]:
stop = set(stopwords.words('english'))
print(stop)

{'won', 'about', 'the', 'until', 'our', 'doing', 'o', 'before', "you're", 'when', 'haven', 'is', 'these', 'it', 'below', 'its', 'all', 'll', 'for', 'needn', 'couldn', 'out', 'they', 're', 'few', 'yourselves', 'which', 'did', 'once', 'should', 'y', 'in', 'so', 'she', "didn't", 'wouldn', 'here', "weren't", "she's", 'under', 'have', 'down', 'there', "don't", 'above', 'ain', 'while', "it's", "should've", "doesn't", 'on', 'any', 'yours', "you've", 'am', 'how', 'now', 'has', 'hadn', 'you', 'were', 'hasn', "couldn't", "haven't", 'after', 'weren', 'we', 'm', 'more', 'aren', 'his', 'only', 'own', "shouldn't", 'just', 'very', 'as', "you'd", "hadn't", 'will', 'himself', 'shouldn', 'by', 'some', 'then', "aren't", 'doesn', 'their', 'me', 'can', 'why', 'my', 'each', 'isn', "you'll", 'that', 'through', "mightn't", 'ourselves', 'what', 'if', 'because', 'during', 'this', 'themselves', 'too', 'at', 'he', 'most', 'mightn', 'who', 'again', 'having', 'had', 'those', 'against', 'into', 'her', 'such', 'are',

In [10]:
new_docs = []
for tokens in docs:
    new_docs.append([token for token in tokens if not token in stop])
docs = new_docs

In [11]:
print(docs[4])

['1', 'old', 'jasmine', 'drive', 'use', 'new', 'system', 'understanding', 'upsate', 'driver', 'modern', 'one', 'order', 'gain', 'compatability', 'system', '7.0.1.', 'anyone', 'know', 'inexpensive', 'program', 'seen', 'formatters', '20', 'buit', 'idea', 'work', '2', 'another', 'ancient', 'device', 'one', 'tape', 'drive', 'back', 'utility', 'freezes', 'system', 'try', 'use', 'drive', 'jasmine', 'direct', 'tape', 'bought', 'used', '150', 'w/', '6', 'tapes', 'techmar', 'mechanism', 'essentially', 'question', 'anyone', 'know', 'inexpensive', 'beckup', 'utility', 'use', 'system', '7.0.1']


### Построение словаря

In [12]:
dictionary = corpora.Dictionary(docs)
print(dictionary)

Dictionary(198027 unique tokens: ['actually', 'also', 'anyway', 'bashers', 'beat']...)


In [13]:
# Отфильтруем словарь
dictionary.filter_extremes(no_below=2, no_above=1., keep_n=300000)
print(dictionary)

Dictionary(60483 unique tokens: ['actually', 'also', 'anyway', 'bashers', 'beat']...)


In [14]:
new_doc = "Hello world"
new_vec = dictionary.doc2bow(new_doc.lower().split())

In [15]:
new_vec

[(770, 1), (4686, 1)]

In [16]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [44]:
' '.join(docs[4])

'1 old jasmine drive use new system understanding upsate driver modern one order gain compatability system 7.0.1. anyone know inexpensive program seen formatters 20 buit idea work 2 another ancient device one tape drive back utility freezes system try use drive jasmine direct tape bought used 150 w/ 6 tapes techmar mechanism essentially question anyone know inexpensive beckup utility use system 7.0.1'

In [21]:
corpus[4][:10]

[(57, 2),
 (129, 2),
 (136, 1),
 (141, 2),
 (187, 1),
 (214, 1),
 (217, 1),
 (218, 1),
 (219, 1),
 (220, 1)]

In [22]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [24]:
corpus_tfidf[4][:10]

[(57, 0.10784792232122509),
 (129, 0.08266735127211784),
 (136, 0.05518784389606809),
 (141, 0.06689964090667665),
 (187, 0.11251057761898882),
 (214, 0.058101154032607086),
 (217, 0.05911425797903921),
 (218, 0.12641517194194246),
 (219, 0.05790583135701678),
 (220, 0.0893675450895299)]

### Latent Semantic Indexing

In [25]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

In [26]:
corpus_lsi = lsi[corpus_tfidf]

In [34]:
corpus_lsi[4]

[(0, 0.020816924779021428),
 (1, 0.1296091996844934),
 (2, -0.13854490188808927),
 (3, -0.050746207069905326),
 (4, -0.04186608141893042),
 (5, -0.0504692546443486),
 (6, -0.04893042866907269),
 (7, 0.11353057515691871),
 (8, -0.01601140910569015),
 (9, -0.02067360103502217)]

In [27]:
lsi.print_topics(num_topics=10, num_words=20)

[(0,
  '0.987*"--" + 0.036*"\'\'" + 0.030*"``" + 0.026*"..." + 0.023*"n\'t" + 0.023*"\'s" + 0.019*"1" + 0.017*"would" + 0.016*"one" + 0.016*"0" + 0.014*"people" + 0.014*"get" + 0.014*"know" + 0.014*"2" + 0.014*"like" + 0.013*"\'m" + 0.012*"use" + 0.012*"think" + 0.012*"also" + 0.011*"time"'),
 (1,
  '0.214*"\'\'" + 0.184*"``" + 0.166*"n\'t" + 0.162*"..." + 0.154*"\'s" + -0.151*"--" + 0.140*"would" + 0.119*"one" + 0.112*"people" + 0.102*"like" + 0.100*"know" + 0.095*"get" + 0.095*"think" + 0.094*"god" + 0.090*"\'m" + 0.086*"could" + 0.078*"good" + 0.075*"also" + 0.075*"use" + 0.074*"time"'),
 (2,
  '0.235*"god" + 0.215*"\'\'" + 0.208*"``" + -0.203*"windows" + -0.173*"thanks" + -0.153*"drive" + -0.151*"card" + -0.115*"please" + -0.109*"dos" + 0.107*"jesus" + 0.106*"people" + -0.104*"anyone" + -0.102*"file" + -0.093*"software" + -0.091*"program" + -0.088*"disk" + -0.087*"system" + -0.086*"pc" + -0.086*"advance" + -0.084*"scsi"'),
 (3,
  '0.653*"..." + -0.263*"god" + -0.177*"\'\'" + 0.155*

In [28]:
lsi.show_topic(6, topn=30)

[('god', -0.4306688039179093),
 ('drive', -0.1877007193040164),
 ('1', -0.1766646012860656),
 ('jesus', -0.17619198407151593),
 ("''", 0.16712517166703753),
 ('key', 0.1623093176872031),
 ('``', 0.16106272509196076),
 ('government', 0.15609403662028445),
 ('game', -0.13855296093607627),
 ('2', -0.1352844816797491),
 ('0', -0.1337306608460705),
 ('encryption', 0.12332604394341512),
 ('chip', 0.10554805262607204),
 ('clipper', 0.10048071747132058),
 ('scsi', -0.09680810612120865),
 ('keys', 0.09123301141181202),
 ('games', -0.09074507569966152),
 ('card', -0.08773809300599353),
 ('bible', -0.08687077436950377),
 ('christ', -0.08595694045309701),
 ('3', -0.08225086526971159),
 ('x', 0.07943780311442146),
 ('ide', -0.07846652958591761),
 ('faith', -0.0724904444311264),
 ('4', -0.07098078342969166),
 ('file', 0.07080471936365003),
 ('sin', -0.07019960764297747),
 ('program', 0.06837328136714965),
 ('controller', -0.06782883197559879),
 ('escrow', 0.06671773341378624)]

### LDA

In [29]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20, passes=10)

In [30]:
corpus_lda = lda[corpus]

In [42]:
corpus_lda[4]

[(4, 0.09660939),
 (7, 0.056115594),
 (8, 0.54729444),
 (14, 0.19318286),
 (17, 0.0935933)]

In [31]:
lda.print_topics(20)

[(0,
  '0.016*"\'s" + 0.009*"medical" + 0.008*"number" + 0.007*"gm" + 0.007*"disease" + 0.007*"year" + 0.007*"health" + 0.006*"study" + 0.006*"patients" + 0.006*"aids"'),
 (1,
  '0.009*"\'s" + 0.009*"one" + 0.009*"people" + 0.007*"would" + 0.006*"said" + 0.005*"children" + 0.005*"could" + 0.005*"gun" + 0.004*"us" + 0.004*"n\'t"'),
 (2,
  '0.114*"..." + 0.006*".." + 0.006*"new" + 0.006*"_/" + 0.005*"canada" + 0.005*"newsletter" + 0.005*"insurance" + 0.004*"10" + 0.004*"apr" + 0.004*"1993"'),
 (3,
  '0.020*"\'s" + 0.010*"game" + 0.009*"one" + 0.008*"n\'t" + 0.007*"year" + 0.007*"team" + 0.007*"last" + 0.006*"first" + 0.006*"games" + 0.006*"car"'),
 (4,
  '0.080*"1" + 0.066*"2" + 0.056*"0" + 0.045*"3" + 0.041*"4" + 0.029*"5" + 0.024*"6" + 0.024*"25" + 0.018*"8" + 0.018*"7"'),
 (5,
  '0.013*"one" + 0.011*"\'s" + 0.010*"would" + 0.008*"\'\'" + 0.007*"``" + 0.007*"people" + 0.006*"may" + 0.005*"many" + 0.005*"n\'t" + 0.004*"even"'),
 (6,
  '0.020*"adl" + 0.020*"san" + 0.017*"**" + 0.011*"fra

In [32]:
lda.show_topic(6, topn=30)

[('adl', 0.020037087),
 ('san', 0.019549584),
 ('**', 0.016578296),
 ('francisco', 0.011214948),
 ('undefined', 0.011157134),
 ('mercury', 0.01080147),
 ('bullock', 0.010738022),
 ('sea', 0.009864705),
 ('cross', 0.009534127),
 ('bob', 0.009395483),
 ('linked', 0.0086471215),
 ('african', 0.0076269344),
 ('gopher', 0.0075768246),
 ('gerard', 0.0075175473),
 ('pioneer', 0.006932547),
 ('blah', 0.006878086),
 ('south', 0.0065655424),
 ('search', 0.006030813),
 ('ear', 0.0059703784),
 ('allocation', 0.00577314),
 ('fm', 0.0052724774),
 ('league', 0.005091593),
 ('1972', 0.004782156),
 ('richmond', 0.0047606123),
 ('immaculate', 0.004598279),
 ('blew', 0.004542653),
 ('group', 0.004525175),
 ('said', 0.004482768),
 ('unit', 0.003919986),
 ('los', 0.003909707)]

### Similarities

In [35]:
index = similarities.MatrixSimilarity(corpus_lsi)

In [41]:
doc = docs[0]
vec_bow = dictionary.doc2bow(doc)
vec_lsi = lsi[tfidf[vec_bow]]
vec_lda = lda[vec_bow]
print(vec_lsi)
print(vec_lda)

[(0, 0.012297394207041167), (1, 0.08793741360107815), (2, 0.011102033572175549), (3, 0.08200957226929101), (4, -0.0876069861633575), (5, 0.0352729419930182), (6, -0.05713668475512029), (7, -0.06460791718160311), (8, 0.05558379038201229), (9, 0.025246421484046107)]
[(0, 0.01366062), (1, 0.052016526), (3, 0.49014208), (4, 0.040521912), (13, 0.24438803), (15, 0.1017427), (16, 0.029942267), (19, 0.018849857)]


In [25]:
sims = index[vec_lsi] # ищет похожие вектора
print(list(enumerate(sims)))

[(0, 1.0), (1, -0.016020186), (2, 0.24476933), (3, -0.13402167), (4, 0.018832482), (5, 0.52422357), (6, 0.30389732), (7, 0.9549281), (8, 0.8806156), (9, 0.0059505217), (10, 0.19897309), (11, -0.016295858), (12, 0.39718354), (13, 0.12901296), (14, -0.0032772087), (15, 0.17627211), (16, -0.018927034), (17, 0.21632572), (18, 0.14980677), (19, 0.5041377), (20, 0.6059541), (21, 0.19476672), (22, 0.4055552), (23, 0.72389776), (24, 0.67794764), (25, 0.4351244), (26, 0.5370468), (27, 0.032690026), (28, 0.07486583), (29, 0.19004422), (30, 0.47575748), (31, 0.14667453), (32, 0.0076875687), (33, 0.86102754), (34, 0.55279696), (35, 0.18732095), (36, 0.38883895), (37, 0.14185339), (38, 0.48589802), (39, 0.428982), (40, 0.66759175), (41, 0.26409405), (42, 0.62091035), (43, -0.06595177), (44, 0.079058126), (45, 0.037893936), (46, 0.11635432), (47, 0.6117934), (48, 0.31287283), (49, -0.092176564), (50, 0.25913638), (51, 0.57340825), (52, 0.0030790581), (53, -0.21169293), (54, 0.5895906), (55, 0.616497