In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import Text8Corpus

# Plotting tools
#import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
#import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
f = open("sentences_covid19.txt",'r',encoding='utf-8')
message = f.readlines()

In [32]:
message[:3]

['analysis title regaining perspective sarscov2 molecular tracing its implications\n',
 'during past three months new coronavirus sarscov2 epidemic has been growing exponentially affecting over 100 thousand people worldwide causing enormous distress economies societies affected countries \n',
 'a plethora analyses based viral sequences has already been published scientific journals well through nonpeer reviewed channels investigate sarscov2 genetic heterogeneity spatiotemporal dissemination \n']

In [21]:
message[0]

'analysis title regaining perspective sarscov2 molecular tracing its implications\n'

In [22]:
message[0].strip().split(' ')

['analysis',
 'title',
 'regaining',
 'perspective',
 'sarscov2',
 'molecular',
 'tracing',
 'its',
 'implications']

In [99]:
def read_sentences(file_path):
    data_lemmatized = []
    with open(file_path, 'r',encoding='utf-8') as in_file:
        sentences = in_file.readlines()
        sentences = [sentence.strip() for sentence in sentences]
        for sentence in sentences:
            tokens = sentence.split(' ')
            data_lemmatized.append(tokens)
    return data_lemmatized

In [None]:
data_lemmatized = read_sentences('../../../../workspace/kaggle/covid19/data/sentences_all_articles.txt')

In [79]:
len(data_lemmatized)

6329418

In [80]:
data_lemmatized[:1]

[['the',
  'rna',
  'pseudoknots',
  'footandmouth',
  'disease',
  'virus',
  'dispensable',
  'genome',
  'replication',
  'essential',
  'production',
  'infectious',
  'virus',
  '2',
  '3']]

## Using Gensim - https://radimrehurek.com/gensim/corpora/textcorpus.html



1) Define Dictionary - using id2word 


    * Dictionary encapsulates the mapping between normalized words and their integer ids
    
    
2) Structure Corpus - using doc2bow

    * Convert each document into the bag-of-words (BoW) format = list of (token_id, token_count) tuples.




In [18]:
# Create dictionary
id2word = corpora.Dictionary(data_lemmatized)

In [26]:
# Create Corpus
texts = data_lemmatized

In [27]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [28]:
# VIew
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]]


In [30]:
# pass id for corresponding text embedding 
id2word[0]

'analysis'

In [35]:
# interpret format of term frequency
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:2]]

[[('analysis', 1),
  ('implications', 1),
  ('its', 1),
  ('molecular', 1),
  ('perspective', 1),
  ('regaining', 1),
  ('sarscov2', 1),
  ('title', 1),
  ('tracing', 1)],
 [('sarscov2', 1),
  ('100', 1),
  ('affected', 1),
  ('affecting', 1),
  ('been', 1),
  ('causing', 1),
  ('coronavirus', 1),
  ('countries', 1),
  ('distress', 1),
  ('during', 1),
  ('economies', 1),
  ('enormous', 1),
  ('epidemic', 1),
  ('exponentially', 1),
  ('growing', 1),
  ('has', 1),
  ('months', 1),
  ('new', 1),
  ('over', 1),
  ('past', 1),
  ('people', 1),
  ('societies', 1),
  ('thousand', 1),
  ('three', 1),
  ('worldwide', 1)]]

# Building the Topic Model

1) Train the LDA model
    * define topic scope
    * tune parameter sparcity 
    
        - alpha
        - eta
2) Build model
    * chunk training size
    * update frequency
    * number of training passes

In [37]:
# Build the LDA model

lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                           id2word = id2word,
                                           num_topics = 10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [38]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.079*"•" + 0.019*"blood" + 0.015*"associated" + 0.013*"cells" + '
  '0.012*"ace2" + 0.012*"protein" + 0.011*"important" + 0.011*"levels" + '
  '0.010*"diseases" + 0.010*"cell"'),
 (1,
  '0.110*"who" + 0.088*"it" + 0.056*"response" + 0.043*"could" + 0.036*"under" '
  '+ 0.035*"available" + 0.032*"major" + 0.032*"as" + 0.030*"without" + '
  '0.024*"muscle"'),
 (2,
  '0.173*"■" + 0.020*"some" + 0.013*"drugs" + 0.010*"transmission" + '
  '0.009*"surgery" + 0.009*"must" + 0.009*"drug" + 0.009*"there" + '
  '0.008*"increased" + 0.008*"high"'),
 (3,
  '0.071*"de" + 0.036*"were" + 0.025*"patient" + 0.023*"1" + 0.022*"after" + '
  '0.018*"a" + 0.018*"more" + 0.015*"all" + 0.015*"when" + 0.014*"cause"'),
 (4,
  '0.044*"the" + 0.035*"from" + 0.028*"should" + 0.019*"other" + 0.018*"risk" '
  '+ 0.017*"oral" + 0.017*"which" + 0.017*"in" + 0.013*"use" + 0.013*"cases"'),
 (5,
  '0.069*"may" + 0.033*"can" + 0.031*"have" + 0.029*"disease" + 0.028*"dental" '
  '+ 0.022*"also" + 0.020*"has" + 0.

# Train bi-gram model

Train bi-gram model on all sentences in order to create meaningful prhases like "worth_nothing and "fake_news""

In [81]:
phrases = Phrases(data_lemmatized, min_count=5, threshold=10, progress_per=100)

# Export the trained model = use less RAM, faster processing. Model updates no longer possible.
bigram = Phraser(phrases)

Save model and load it again to check everything is OK:

In [82]:
bigram.save("../../../../workspace/kaggle/covid19/data/covid_bigram_model_v0.pkl")

In [83]:
bigram_reloaded = Phraser.load("../../../../workspace/kaggle/covid19/data/covid_bigram_model_v0.pkl")

Check our bigram model on a ransom sentence:

In [84]:
bigram_reloaded["despite social media often vehicle fake news boast news hype also worth noting tremendous effort scientific community provide free uptodate information ongoing studies well critical evaluations".split()]

['despite',
 'social_media',
 'often',
 'vehicle',
 'fake_news',
 'boast',
 'news',
 'hype',
 'also',
 'worth_noting',
 'tremendous_effort',
 'scientific_community',
 'provide',
 'free',
 'uptodate_information',
 'ongoing',
 'studies',
 'well',
 'critical',
 'evaluations']

That looks really cool!

Now let's convert our sentences to bigrams using our trained bigram model and save the new sentences:

In [86]:
data_with_bigrams = bigram_reloaded[data_lemmatized]

In [87]:
with open("../../../../workspace/kaggle/covid19/data/sentences_all_articles_bigrams.txt", 'w+') as out_fp:
    for sentence in data_with_bigrams:
        out_fp.write(' '.join(sentence))
        out_fp.write('\n')

# Train tri-gram model

Let's repeat the process, but now on the sentences after converting them to bigram, in order to create meaningful trigrams. Now we need to lower our threshold since meaningful trigrams are more rare than bigrams.

In [88]:
phrases = gensim.models.Phrases(data_with_bigrams, min_count=5, threshold=5)
trigram = Phraser(phrases)
trigram.save("../../../../workspace/kaggle/covid19/data/covid_trigram_model_v0.pkl")

In [89]:
trigram["exploration temporal structure ie presence molecular clock data assessed regression divergence roottotip genetic distanceagainst sampling time using tempest 19".split()]

['exploration',
 'temporal_structure',
 'ie',
 'presence',
 'molecular_clock',
 'data',
 'assessed',
 'regression',
 'divergence',
 'roottotip_genetic',
 'distanceagainst',
 'sampling_time',
 'using',
 'tempest',
 '19']

In [90]:
pprint(data_with_trigrams[1])

['during_past',
 'three_months',
 'new_coronavirus',
 'sarscov2',
 'epidemic',
 'has_been',
 'growing_exponentially',
 'affecting_over',
 '100_thousand',
 'people_worldwide',
 'causing',
 'enormous',
 'distress',
 'economies',
 'societies',
 'affected_countries']


Now let's convert our sentences to trigrams using our trained trigram model and save the new sentences:

In [91]:
data_with_trigrams = trigram[data_with_bigrams]

In [92]:
with open("../../../../workspace/kaggle/covid19/data/sentences_all_articles_trigrams.txt", 'w+') as out_fp:
    for sentence in data_with_trigrams:
        out_fp.write(' '.join(sentence))
        out_fp.write('\n')

After we have our trained tri-grams model, let's convert our subset sentences corpus (only COVID-19 articles):

In [100]:
data_lemmatized = read_sentences('../../../../workspace/kaggle/covid19/data/sentences_covid19.txt')
data_with_bigrams = bigram_reloaded[data_lemmatized]
data_with_trigrams = trigram[data_with_bigrams]
with open("../../../../workspace/kaggle/covid19/data/sentences_covid19_trigrams.txt", 'w+') as out_fp:
    for sentence in data_with_trigrams:
        out_fp.write(' '.join(sentence))
        out_fp.write('\n')

In [53]:
# Compute Perplexity
#print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


In [62]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

KeyboardInterrupt: 