In [24]:
import numpy as np
import pandas as pd

import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy

import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import random

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from src import nlp_utils
from src.process_text_variables import contracted_words_dict, stop_words_dict, punc, stop_words_incl_in_sentiment_dict 
from collections import Counter

  and should_run_async(code)


# 1) Import and Process Data

In [3]:
wsm = pd.read_csv('data/wallstreet_master.csv')
stop_words_dict = stop_words_dict + stop_words_incl_in_sentiment_dict
wsm['tokens'] = wsm['contentWithHTMLTag'].apply(lambda x: nlp_utils.process_text(x, contracted_words_dict, punc, stop_words_dict, min_len=2))

  and should_run_async(code)


In [4]:
train_split_idx = 41000

all_docs_tokenized = wsm['tokens'].tolist()
# create copy to have record of original order

all_docs_randomized = all_docs_tokenized[:]
random.shuffle(all_docs_randomized)
train_docs_tokenized = all_docs_randomized[:train_split_idx]
test_docs_tokenized = all_docs_randomized[train_split_idx]

  and should_run_async(code)


## 2) Create Bigrams/Trigrams and Remove Neglible Words Based on Parts of Sentence (PoS)

In [5]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# English multi-task CNN trained on OntoNotes. Assigns context-specific token vectors, POS tags, dependency parse and named entities.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADJ', 'PROPN']):
    texts_revised =[]
    for post in texts:
        doc = nlp(' '.join(post))
        texts_revised.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_revised

  and should_run_async(code)


**Function Used to Calculate Score that Corresponds to "threshold" hyperparameter**

```
def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
   #...
   """
    worda_count : int
        Number of occurrences for first word.
    wordb_count : int
        Number of occurrences for second word.
    bigram_count : int
        Number of co-occurrences for phrase "worda_wordb".
    len_vocab : int
        Size of vocabulary.
    min_count: int
        Minimum collocation count threshold.
    corpus_word_count : int
        Not used in this particular scoring technique.
    """
    #...

    return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
  ```

In [6]:
min_count = 10
# greater the threshold, the lower the number of words.
threshold = 50


bigram = gensim.models.Phrases(all_docs_tokenized, min_count=min_count, threshold=threshold)
trigram = gensim.models.Phrases(bigram[all_docs_tokenized], threshold=threshold)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

  and should_run_async(code)


In [7]:
docs_incl_bigrams = make_bigrams(all_docs_tokenized)

docs_lemmatized = lemmatization(docs_incl_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADJ', 'PROPN'])

  and should_run_async(code)


In [None]:
docs_lemmatized[22222]

## 3) Create Corpus

In [8]:
# Bag of Words corpus
id2word = corpora.Dictionary(docs_lemmatized)

# Create unique ID for each word in corpus.
bow_corpus = [id2word.doc2bow(post) for post in  docs_lemmatized]

# Use the unique id as the index in id2word to see the corresponding token.
print(id2word[3])

  and should_run_async(code)


go


In [9]:
# Tfidf corpus
tfidf = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

  and should_run_async(code)


## 4) Develop LDA Model

In [10]:
# BoW 
lda_model_bow = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                               id2word=id2word,
                                               num_topics=20, 
                                               random_state=3,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)

  and should_run_async(code)


In [11]:
# Tf-Idf
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus=tfidf_corpus,
                                               id2word=id2word,
                                               num_topics=20, 
                                               random_state=3,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)

  and should_run_async(code)


In [13]:
print('BoW TOPICS:')
for t in lda_model_bow.print_topics(): 
    print(t)
    print('\n')

BoW TOPICS:
(0, '0.155*"cash" + 0.100*"custom" + 0.078*"eu" + 0.074*"access" + 0.073*"setup" + 0.049*"release" + 0.038*"tutorial" + 0.035*"multiple" + 0.030*"worry" + 0.029*"bin"')


(1, '0.306*"bank" + 0.225*"account" + 0.127*"credit" + 0.067*"fullz" + 0.040*"cc" + 0.035*"paypal" + 0.032*"fraud" + 0.027*"login" + 0.011*"serious" + 0.011*"asap"')


(2, '0.219*"allow" + 0.117*"external_contact" + 0.042*"passport" + 0.019*"hologram" + 0.000*"_" + 0.000*"phished_rule" + 0.000*"delivery_dot" + 0.000*"transfer" + 0.000*"fake" + 0.000*"real"')


(3, '0.416*"guide" + 0.054*"nice" + 0.049*"place" + 0.046*"interested" + 0.040*"s" + 0.030*"drug" + 0.029*"learn" + 0.024*"word" + 0.023*"opsec" + 0.022*"safe"')


(4, '0.188*"note" + 0.088*"sign" + 0.048*"euro" + 0.040*"pas" + 0.035*"cannabis" + 0.034*"print" + 0.031*"normal" + 0.025*"grade" + 0.024*"water" + 0.021*"like"')


(5, '0.151*"name" + 0.062*"escrow" + 0.057*"cheap" + 0.038*"scan" + 0.035*"topic" + 0.032*"join" + 0.030*"worth" + 0.028*"lar

  and should_run_async(code)


In [14]:
print('Tf-Idf TOPICS:')
for t in lda_model_tfidf.print_topics(): 
    print(t)
    print('\n')

Tf-Idf TOPICS:
(0, '0.039*"put" + 0.033*"problem" + 0.033*"allow" + 0.027*"eu" + 0.026*"kind" + 0.023*"setup" + 0.023*"thing" + 0.016*"external_contact" + 0.016*"item" + 0.011*"tutorial"')


(1, '0.069*"cc" + 0.035*"asap" + 0.023*"serious" + 0.016*"canada" + 0.000*"bump" + 0.000*"_" + 0.000*"record_cashout" + 0.000*"phished_rule" + 0.000*"room" + 0.000*"everythingcc_bank"')


(2, '0.021*"atm" + 0.011*"moment" + 0.010*"solution" + 0.000*"bump" + 0.000*"_" + 0.000*"record_cashout" + 0.000*"phished_rule" + 0.000*"room" + 0.000*"everythingcc_bank" + 0.000*"picsou"')


(3, '0.058*"profile" + 0.055*"guide" + 0.053*"order" + 0.041*"write" + 0.036*"pm" + 0.032*"message" + 0.026*"go" + 0.025*"buy" + 0.022*"day" + 0.021*"thank"')


(4, '0.193*"bank" + 0.088*"drop" + 0.035*"cash" + 0.028*"end" + 0.012*"e" + 0.012*"pas" + 0.011*"depend" + 0.010*"trouble" + 0.008*"wall_street" + 0.008*"decrypt"')


(5, '0.030*"sign" + 0.030*"scan" + 0.026*"cheap" + 0.026*"join" + 0.022*"topic" + 0.017*"worth" + 0.0

  and should_run_async(code)


## 4) Model Performance Evaluation - Baseline

### Bag-of-Words

In [15]:
# The lower, the better.
print('Perplexity: ', lda_model_bow.log_perplexity(bow_corpus))

coherence_model_bow = CoherenceModel(model=lda_model_bow, texts=docs_lemmatized, dictionary=id2word, coherence='c_v')
coherence_bow = coherence_model_bow.get_coherence()
print('Coherence Score: ', coherence_bow)

  and should_run_async(code)


Perplexity:  -17.098545791997772
Coherence Score:  0.35361736822624434


### Tf-Idf

In [16]:
print('Perplexity: ', lda_model_tfidf.log_perplexity(tfidf_corpus))

coherence_model_tfidf = CoherenceModel(model=lda_model_tfidf, texts=docs_lemmatized, dictionary=id2word, coherence='c_v')
coherence_tfidf = coherence_model_tfidf.get_coherence()
print('Coherence Score: ', coherence_tfidf)

  and should_run_async(code)


Perplexity:  -19.951403084908748
Coherence Score:  0.31366446152744454


# 5) Visualize Topics-Keywords

In [28]:
# Bag-of-Words
pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, id2word)

  and should_run_async(code)


In [29]:
# Tf-Idf
pyLDAvis.gensim.prepare(lda_model_tfidf, tfidf_corpus, id2word)

  and should_run_async(code)


# 6) Hyperparameter Optimization - 

1) **Alpha** controls the number of topics expected in the document. A low value of ‘α’ implies a fewer number of topics and a higher value implies a higher number topics in the mix. 

2) **Beta** controls the distribution of words per topic. At lower values of ‘β’, the topics will likely have fewer words and at higher values topics will likely have more words.

3) If the same keywords being repeated in multiple topics, it’s probably a sign that the ‘k’ is too large.

In [46]:
def compute_coherence_values(corpus, id2word, k, alpha, beta, texts):
    '''
    Compute c_v coherence for a range of number of topics.
    '''
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=k, id2word=id2word, random_state=3, update_every=1, chunksize=100, passes=10, alpha=alpha, eta=beta)
    coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    return coherence_model.get_coherence()

  and should_run_async(code)


In [47]:
# Function inputs
corpus = bow_corpus
id2word = id2word
texts = docs_lemmatized
file_name = 'bow_lda_tuning_results.csv'

grid = {}
grid['validation_set'] = {}

# Topic range
min_topics = 3
max_topics =20
step_size = 2
num_topics_range = list(range(min_topics, max_topics, step_size))

# Alpha = document-topic density
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta = topic-word density
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

for k in num_topics_range:
    for a in alpha:
        for b in beta:
            cv = compute_coherence_values(corpus, id2word, k, a, b, texts)
            
            model_results['Topics'].append(k)
            model_results['Alpha'].append(a)
            model_results['Beta'].append(b)
            model_results['Coherence'].append(cv)

pd.DataFrame(model_results).to_csv('data/'+file_name, index= False)

  and should_run_async(code)


KeyboardInterrupt: 

In [45]:
print(num_topics_range)

[3, 5, 7, 9, 11, 13, 15, 17, 19]


  and should_run_async(code)
