In [1]:
import pandas as pd
import numpy as np
import itertools
import wikipedia
import re
import nltk

# gensim
from gensim.utils import simple_preprocess, dict_from_corpus
from gensim.models import LdaModel, TfidfModel
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string, preprocess_documents


%matplotlib inline

## Wikipedia API

In [2]:
# search Wikipedia for the query 'AAPL'; the returned list of page titles is the cell output
wikipedia.search(query='AAPL')

['AAPL',
 'Apple Inc.',
 'History of Apple Inc.',
 'American Association of Professional Landmen',
 'Option symbol',
 'Energy management (degree)',
 'Alpha Indexes',
 'I Am Rich',
 'Landman (oil worker)',
 'Jeff Williams (Apple)']

In [3]:
# Load the page by its exact title (copied from the search results).
# auto_suggest is disabled because the title is already exact; with the
# default (auto_suggest=True) the library may substitute a suggested title
# and load a different page than the one requested.
wikipage = wikipedia.page('Apple Inc.', auto_suggest=False)
# preview only the first 1000 characters of the article text
print(wikipage.content[:1000])

Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services. The company's hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, the Apple Watch smartwatch, the Apple TV digital media player, and the HomePod smart speaker. Apple's software includes the macOS and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites, as well as professional applications like Final Cut Pro, Logic Pro, and Xcode. Its online services include the iTunes Store, the iOS App Store and Mac App Store, Apple Music, and iCloud.
Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976 to develop and sell Wozniak's Apple I personal computer. It was incorporated as Apple Computer, Inc. in January 1977, and sal

## LDA -- Topic Modeling

## gensim implementation

## note:
1. In the gensim implementation you never pass raw text to gensim.models. Each doc (or sentence) must first be tokenized from one giant string into a list of string tokens, and then vectorized into a list of tuples (word id, count). This list of tuples is called the "bag of words" representation. You can then pass a "corpus" (a list of docs / a list of lists of tuples) to gensim.models.
2. How to transform a doc into its bag-of-words form (one element of the corpus): first, tokenize the text so that you have a list of single-word / n-gram strings. Second, use gensim.corpora.Dictionary to create a mapping between ids and tokens (words / n-grams). Lastly, use the dictionary you just built to call doc2bow(['This', 'is', 'a', 'tokenized', 'document']), which converts a tokenized doc to its "bag of words" representation (a list of tuples). Doing this for all the docs gives you a "corpus" (a list of docs / a list of lists of tuples).
3. How to transform docs to a TF-IDF corpus: first, transform the docs to a bag-of-words corpus. Second, instantiate TfidfModel(corpus=corpus, id2word=id2word). Last, transform the bag-of-words corpus to a TF-IDF corpus via tfidf[corpus].
4. How to tokenize n-grams in gensim: there is no n-gram implementation in gensim, but there is a collocation (phrase) detection implementation. Use gensim.models.phrases.Phrases to train on a list of docs (a list of lists of string tokens), then use gensim.models.phrases.Phraser to tokenize each list of string tokens. Repeating this process on the resulting tokens will yield tri-grams.

In [4]:
## split the article into sentences; each sentence is treated as one doc
sentences = nltk.sent_tokenize(wikipage.content)

## tokenize every sentence into a list of string tokens
# (alternative tokenizer: tokens = [simple_preprocess(sent) for sent in sentences])
tokens = [preprocess_string(sent) for sent in sentences]

# ## (optional) tokenize sentences using collocation (bi-gram)
# # first-run: train the bi-gram collocation detector
# phrases = Phrases(sentences=tokens, min_count=5, threshold=10.0)
# # create a performant Phraser object to execute the phrase model
# bigram = Phraser(phrases)
# tokens = [bigram[sent] for sent in tokens]
# # second-run: train the tri-gram collocation detector (usually worse)
# phrases = Phrases(sentences=tokens, min_count=1, threshold=1.0)
# # create a performant Phraser object to execute the phrase model
# bigram = Phraser(phrases)
# tokens = [bigram[sent] for sent in tokens]

## build the id -> token dictionary from the tokenized docs
id2word = Dictionary(tokens)
## build the bag-of-words corpus: one doc2bow result per tokenized doc
corpus = [id2word.doc2bow(doc) for doc in tokens]

# ## (optional) build a Tfidf-transformed corpus (which will hurt LDA performance)
# tfidf = TfidfModel(corpus=corpus, id2word=id2word)
# tfidf_corpus = tfidf[corpus]
# corpus = tfidf_corpus

## train the model (random_state is fixed so the run is repeatable)
lda = LdaModel(corpus=corpus, num_topics=5, id2word=id2word, random_state=1)
# lda.update(corpus=corpus)

## topic distribution of every doc: a list of (topic id, probability) pairs per doc
topics = [lda.get_document_topics(bow, minimum_probability=0) for bow in corpus]

## id of the most probable topic of every doc.
# Derived from `topics` rather than calling get_document_topics a second time
# per doc: this halves the inference work and guarantees that `topic[i]` is
# always the arg-max of `topics[i]`.
topic = [max(doc_topics, key=lambda pair: pair[1])[0] for doc_topics in topics]

### Convert lda transformed matrix to numpy dense representation

In [5]:
from gensim.matutils import corpus2dense
# Convert the LDA-transformed corpus to a dense numpy matrix.
# corpus2dense is given num_terms = number of topics and num_docs = number of
# docs; the result is transposed so that rows are docs and columns are topics.
# The width is read from the trained model instead of repeating the literal 5,
# so it stays correct if num_topics is changed where the model is trained.
X_lda = corpus2dense(lda[corpus], num_docs=len(corpus), num_terms=lda.num_topics).T
X_lda.shape

(536, 5)

### Similarity Queries in gensim

In [6]:
from gensim.similarities import MatrixSimilarity
# Create the index for the cosine similarity measure over the LDA vectors of
# the whole corpus. num_features is passed explicitly (one feature per topic):
# without it gensim has to scan the whole transformed corpus an extra time and
# infers the width from the largest topic id it happens to see.
index_lda = MatrixSimilarity(corpus=lda[corpus], num_features=lda.num_topics)
# LDA vector of the first doc, used as the query
query_lda = lda[corpus[0]]
# similarities between corpus[0] and every doc in the entire corpus
sims_list = index_lda[query_lda]
sims_list[:3]

  if np.issubdtype(vec.dtype, np.int):


array([0.99999994, 0.31030023, 0.0178149 ], dtype=float32)