In [1]:
import numpy as np
import pandas as pd


In [2]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer

lemtzer = WordNetLemmatizer()

def lemmatize_stemming(text):
    return lemtzer.lemmatize(text, pos='v')

# Write a function to perform the pre processing steps on the entire dataset
def preprocess(text):
    result=[]
    for token in simple_preprocess(text) :
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result



In [3]:
import nltk

In [4]:
document_num = 50
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replace']


In [5]:
bitcoin_news = pd.read_csv('bitcoin_news.csv')
bitcoin_news['Title_Summary'] = bitcoin_news['Title'] + str(' ')+ bitcoin_news['Summary']
bitcoin_news.head()

Unnamed: 0,Date,Summary,Tags,Title,Title_Summary
0,2018-09-04T08:00:04-04:00,The Tuesday edition of our daily roundup is ja...,"ATM,Blockchain,Brave,Deutsche Börse,Ethereum,G...","The Daily: Wirex Adds Ethereum, Deutsche Börse...","The Daily: Wirex Adds Ethereum, Deutsche Börse..."
1,2018-08-02T17:15:26-04:00,Japan’s SBI Group is reportedly planning to cr...,"ATM,BCH,Bitcoin,bitcoin cash,BTC,BTM,coin asse...","SBI Plans Derivatives Platform, Huobi Eyes 30%...","SBI Plans Derivatives Platform, Huobi Eyes 30%..."
2,2018-09-05T01:15:31-04:00,Non-custodial crypto trading platform Shapeshi...,"AML,anti-money laundering,Bitcoin,BTC,crypto,C...",Shapeshift Moves to Membership Model Requiring...,Shapeshift Moves to Membership Model Requiring...
3,2018-08-04T04:55:54-04:00,"Intercontinental Exchange (ICE), owner of argu...","401k,Bitcoin,Boston Consulting Group,Canada,CF...",NYSE Owner: Bitcoin Should Be in Retirement Fu...,NYSE Owner: Bitcoin Should Be in Retirement Fu...
4,2018-08-06T03:45:21-04:00,Thailand’s central bank has announced the rule...,"bank of thailand,banks,Bitcoin,Bot,BTC,commerc...",Bank of Thailand Green-Lights Financial Compan...,Bank of Thailand Green-Lights Financial Compan...


In [6]:
processed_docs  = []

for doc in bitcoin_news['Title_Summary']:
    processed_docs.append(preprocess(doc))

In [7]:
import gensim
dictionary = gensim.corpora.Dictionary(processed_docs)

In [8]:
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

In [9]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 add
1 atms
2 board
3 brave
4 brief
5 bring
6 browser
7 daily
8 edition
9 embrace
10 ethereum


In [10]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# LDA with word2vec

In [11]:
%%time
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary, passes=2)
lda_model.save('lda.model')

Wall time: 12.3 s


In [12]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}\n'.format(idx, topic))

Topic: 0 Word: 0.023*"bank" + 0.016*"mine" + 0.007*"world" + 0.006*"week" + 0.006*"central" + 0.006*"price" + 0.006*"report" + 0.006*"currency" + 0.006*"digital" + 0.006*"launch"

Topic: 1 Word: 0.012*"price" + 0.010*"digital" + 0.008*"cash" + 0.007*"value" + 0.007*"week" + 0.007*"past" + 0.006*"launch" + 0.006*"update" + 0.006*"platform" + 0.006*"year"

Topic: 2 Word: 0.015*"bank" + 0.013*"brief" + 0.012*"mine" + 0.010*"coin" + 0.009*"financial" + 0.008*"offer" + 0.008*"today" + 0.006*"daily" + 0.005*"coinbase" + 0.005*"launch"

Topic: 3 Word: 0.013*"announce" + 0.012*"launch" + 0.010*"fund" + 0.009*"securities" + 0.008*"bank" + 0.007*"mine" + 0.007*"digital" + 0.007*"cash" + 0.006*"offer" + 0.006*"coin"

Topic: 4 Word: 0.024*"cash" + 0.009*"service" + 0.008*"launch" + 0.008*"japanese" + 0.007*"state" + 0.007*"network" + 0.007*"platform" + 0.007*"coin" + 0.006*"year" + 0.006*"south"

Topic: 5 Word: 0.012*"report" + 0.009*"government" + 0.008*"south" + 0.008*"financial" + 0.007*"korean

In [13]:
num = 100
unseen_document = bitcoin_news['Title_Summary'][num]
print(unseen_document)

Bitmain Launches Key Crypto Mining Facility in Texas Bitmain Technologies Ltd. is launching a crypto mining facility and blockchain data center in the U.S. state of Texas at a former aluminum smelting facility with a recently-retired coal power station. The company is reportedly getting a tax abatement phased in over 10 years.


In [14]:
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.9599984288215637	 Topic: 0.023*"bank" + 0.016*"mine" + 0.007*"world" + 0.006*"week" + 0.006*"central"


In [15]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


# LDA with doc2vec

In [16]:
from gensim import models
import warnings
warnings.filterwarnings('ignore')

In [17]:
tfidf = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

In [18]:
%%time
lda_model_tfidf = gensim.models.LdaMulticore(tfidf_corpus, num_topics=6, id2word=dictionary, passes=2)
lda_model_tfidf.save('lda_tfidf.model')

Wall time: 20.1 s


In [19]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"mine" + 0.005*"coinbase" + 0.005*"bank" + 0.004*"offer" + 0.004*"service" + 0.004*"launch" + 0.004*"city" + 0.004*"world" + 0.004*"brief" + 0.004*"platform"
Topic: 1 Word: 0.008*"bank" + 0.006*"revolution" + 0.004*"price" + 0.004*"financial" + 0.004*"satoshi" + 0.003*"central" + 0.003*"world" + 0.003*"country" + 0.003*"service" + 0.003*"rise"
Topic: 2 Word: 0.005*"launch" + 0.005*"cash" + 0.005*"currency" + 0.004*"digital" + 0.004*"accept" + 0.004*"bank" + 0.004*"wallet" + 0.004*"mine" + 0.004*"platform" + 0.004*"service"
Topic: 3 Word: 0.006*"launch" + 0.005*"price" + 0.005*"wallet" + 0.005*"mine" + 0.004*"week" + 0.004*"fund" + 0.004*"report" + 0.004*"cash" + 0.003*"time" + 0.003*"million"
Topic: 4 Word: 0.007*"cash" + 0.004*"chinese" + 0.004*"mine" + 0.004*"bank" + 0.004*"south" + 0.004*"coin" + 0.003*"money" + 0.003*"week" + 0.003*"digital" + 0.003*"offer"
Topic: 5 Word: 0.007*"bank" + 0.006*"mine" + 0.004*"cash" + 0.004*"announce" + 0.004*"price" + 0.004*"lau

In [20]:
pyLDAvis.gensim.prepare(lda_model_tfidf, tfidf_corpus, dictionary)