In [1]:
import pandas as pd
import numpy as np 
import re
% matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
import pickle
import time

In [2]:
# Load the pickled trigram documents. The context manager closes the file
# handle even if unpickling raises; a bare open() would leave it open.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('./lem_texts/trigram_text.sav', 'rb') as fileObject:
    trigram_nest = pickle.load(fileObject)  ## load trigram

In [3]:
type(trigram_nest)  # should be a list: one tokenized document per entry (element type is not checked here)

list

In [4]:
from gensim.corpora import Dictionary

# Build the token <-> integer-id mapping from all tokenized documents.
dictionary = Dictionary(trigram_nest)

# Drop tokens that occur in fewer than 20 documents or in more than 50% of
# the documents. The result is not reassigned: `dictionary` itself is what
# the following cells keep using.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [5]:
# Vectorize: turn every tokenized document into its bag-of-words
# representation using the filtered dictionary.
corpus = list(map(dictionary.doc2bow, trigram_nest))

In [11]:
# Persist the dictionary and the bag-of-words corpus of the full data set.
# Each file is written inside a `with` block so the handle is flushed and
# closed immediately instead of being left open by a bare open().

# saving dictionary
filename = 'dictionary_full_lda.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(dictionary, out_file)

# saving corpus
filename = 'corpus_full_lda.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(corpus, out_file)

In [6]:
# Report the vocabulary size and the number of documents after filtering.
vocab_size = len(dictionary)
doc_count = len(corpus)
print('Number of unique tokens: %d' % vocab_size)
print('Number of documents: %d' % doc_count)

Number of unique tokens: 33960
Number of documents: 34611


In [7]:
# Train LDA model.

from gensim.models import LdaModel

# Set training parameters.
num_topics = 20
chunksize = 10000
passes = 20
iterations = 400
eval_every = None  # takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

CPU times: user 47min 33s, sys: 9.68 s, total: 47min 43s
Wall time: 55min 51s


In [8]:
# saving lda model
filename = 'model_full_lda.sav'
# The context manager guarantees the pickle is flushed and the handle closed.
with open(filename, 'wb') as out_file:
    pickle.dump(model, out_file)

In [9]:
%%time
top_topics = model.top_topics(corpus, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -325.0113.
[([(0.025654092221442389, 'not'),
   (0.013569916014509797, 'be'),
   (0.013142664993534198, 's'),
   (0.012935766079922115, 'like'),
   (0.011971328629241029, 'think'),
   (0.011631612922307756, 'go'),
   (0.011108151725114089, 'people'),
   (0.010324159161057072, 'know'),
   (0.010219447406554963, 'time'),
   (0.010194906566912815, 'good'),
   (0.0091914809178482906, 'thing'),
   (0.0089801714610546635, 'want'),
   (0.0084053129400516804, 'come'),
   (0.007248324920441215, 'get'),
   (0.0070384223125490326, 'way'),
   (0.0069047590622133724, 'have'),
   (0.0060106630012768045, 'tell'),
   (0.0058667032273318825, 'day'),
   (0.0053949137445681878, 'work'),
   (0.0052563999324205055, 'look')],
  -152.01674105130331),
 ([(0.012882699515740498, 'people'),
   (0.0066404939230112463, 'government'),
   (0.0063746816857770108, 'world'),
   (0.0055890667697934618, 'country'),
   (0.0054377478200693455, 'power'),
   (0.0052430865382744538, 'america'),
   (0.

In [10]:
# saving the top topics (with coherence scores) of the full-corpus model
filename = 'top_topics_lda.sav'
# The context manager guarantees the pickle is flushed and the handle closed.
with open(filename, 'wb') as out_file:
    pickle.dump(top_topics, out_file)

In [39]:
# Topic distribution of the document at index 6 under the current model.
x = model[corpus[6]]

In [57]:
x  # display the (topic_id, probability) pairs computed for that document

[(0, 0.41900108650983153),
 (3, 0.10498739678237198),
 (4, 0.040009345806446224),
 (6, 0.025438986130653329),
 (14, 0.40172714962675016)]

In [55]:
# One row per document, one column per topic of the current model.
# Initialise with 0.0 (float) rather than integer 0 so the topic
# probabilities written later match the column dtype, and take the column
# count from the model instead of a hard-coded 20.
corp_df = pd.DataFrame(0.0, index=np.arange(len(corpus)), columns=range(model.num_topics))
corp_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
34606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34607,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34608,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34609,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34610,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [58]:
# converting our returned topic distributions into a dataframe
# Collect the probabilities in a float NumPy array and build the frame once:
# writing scalars one at a time through .iloc is very slow for tens of
# thousands of documents and forces pandas to upcast integer-initialised
# columns. Starting from zeros also makes the cell safe to re-run.
# NOTE: model[bow] only returns topics above the model's minimum-probability
# threshold, so a row can sum to slightly less than 1.
topic_matrix = np.zeros(corp_df.shape, dtype=float)
for doc_idx, bow in enumerate(corpus):
    for topic_id, prob in model[bow]:
        topic_matrix[doc_idx, topic_id] = prob
corp_df = pd.DataFrame(topic_matrix, index=corp_df.index, columns=corp_df.columns)

In [64]:
corp_df.tail()  # inspect the last rows after the topic probabilities were filled in

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
34606,0.0,0.0,0.299633,0.0,0.0,0.109582,0.216531,0.0,0.0,0.203056,0.077235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06406,0.0
34607,0.0,0.0,0.682718,0.046531,0.0,0.0,0.0,0.236786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34608,0.0,0.0,0.0,0.0,0.0,0.056222,0.287629,0.0,0.0,0.0,0.17079,0.359693,0.0,0.0,0.0,0.0,0.0,0.070038,0.0,0.041209
34609,0.0,0.0,0.0,0.0,0.0,0.098432,0.162241,0.0,0.0,0.0,0.0,0.722731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.359376,0.0,0.0,0.144841,0.474512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Sanity check: summarise the per-document (row) totals of the topic
# probabilities, which are expected to be close to 1.
row_totals = corp_df.sum(axis=1)
print('The mean of each row is: {0:.3f}'.format(row_totals.mean()))
print('The standard deviation of each row is: {0:.3f}'.format(row_totals.std()))

The mean of each row is: 0.989
The standard deviation of each row is: 0.011


In [64]:
corp_df.sum(axis = 1).mean()  # mean of the per-document row sums, shown unrounded

0.9890312351906609

In [65]:
# Writes the per-document topic table to the current working directory.
# NOTE(review): the reload cell below reads './lda_topics/corpus_lda_topics.csv'
# -- confirm the file is moved there, or align the two paths.
corp_df.to_csv('corpus_lda_topics.csv')

In [39]:
# Reload the saved per-document topic table; the first CSV column becomes the index.
corp_df = pd.read_csv("./lda_topics/corpus_lda_topics.csv", encoding = "utf8", index_col = 0)

## LDA on Real News articles 

In [12]:
# The first 23571 documents -- presumably the real-news articles; confirm
# against how trigram_nest was assembled.
trigram_nest_real = trigram_nest[:23571]
len(trigram_nest_real)

23571

In [13]:
from gensim.corpora import Dictionary

# Build the token <-> integer-id mapping from the real-news documents only.
# This rebinds the notebook-level name `dictionary`.
dictionary = Dictionary(trigram_nest_real)

# Drop tokens that occur in fewer than 20 documents or in more than 50% of
# the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [14]:
# Vectorize: bag-of-words representation of every real-news document.
corpus = list(map(dictionary.doc2bow, trigram_nest_real))

In [15]:
# Report the vocabulary size and the number of documents after filtering.
vocab_size = len(dictionary)
doc_count = len(corpus)
print('Number of unique tokens: %d' % vocab_size)
print('Number of documents: %d' % doc_count)

Number of unique tokens: 24272
Number of documents: 23571


In [16]:
# Persist the real-news dictionary and corpus. Each file is written inside a
# `with` block so the handle is flushed and closed immediately.

# saving dictionary
filename = 'dictionary_real_lda.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(dictionary, out_file)

# saving corpus
filename = 'corpus_real_lda.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(corpus, out_file)

In [17]:
# Train LDA model.

from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 10000
passes = 20
iterations = 400
eval_every = None  # takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

CPU times: user 19min 27s, sys: 12 s, total: 19min 39s
Wall time: 19min 42s


In [18]:
# saving real_news lda model
filename = 'lda_model_real.sav'
# The context manager guarantees the pickle is flushed and the handle closed.
with open(filename, 'wb') as out_file:
    pickle.dump(model, out_file)

In [20]:
%%time
top_topics_real = model.top_topics(corpus, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics_real)

Average topic coherence: -650.0225.
[([(0.018627010539670447, 'game'),
   (0.01420273217697615, 'team'),
   (0.011759487035628537, 'win'),
   (0.011021663160987964, 'play'),
   (0.009113029020988133, 'season'),
   (0.0077076454254798067, 'player'),
   (0.0068436916748552514, 'time'),
   (0.0065813309145944512, 'good'),
   (0.006445627855908239, 'not'),
   (0.0057064586263921909, "'s"),
   (0.0055145125751286097, 's'),
   (0.0051938018454682506, 'week'),
   (0.0051470809992809402, 'run'),
   (0.0050762221537930087, 'go'),
   (0.0049610676906314882, 'lead'),
   (0.0048160728978449254, 'point'),
   (0.0044303533165206653, 'start'),
   (0.0042296946732828982, 'come'),
   (0.0037202049183443641, 'second'),
   (0.0036473854414749581, 'get')],
  -174.71358388361747),
 ([(0.014395491039362759, 'not'),
   (0.011315068488262844, 'people'),
   (0.0080043269125720587, 'think'),
   (0.0076614736577502798, 'like'),
   (0.0073346852297150045, 'be'),
   (0.0068078994542647847, 'time'),
   (0.006363060

In [21]:
# saving real top topics
filename = 'top_topics_real_lda.sav'
# The context manager guarantees the pickle is flushed and the handle closed.
with open(filename, 'wb') as out_file:
    pickle.dump(top_topics_real, out_file)

In [22]:
def explore_topic(topic_number, topn=25, lda_model=None):
    """
    Print a formatted table of the top terms of one LDA topic.

    Parameters
    ----------
    topic_number : int
        Id of the topic to display.
    topn : int, optional
        Number of highest-weighted terms to print (default 25). The value is
        forwarded to ``show_topic``.
    lda_model : optional
        Object exposing ``show_topic(topic_number, topn=...)`` that yields
        ``(term, weight)`` pairs. When omitted, the notebook-level ``model``
        is looked up at call time.

    Returns
    -------
    None
        The table is written to stdout only.
    """
    if lda_model is None:
        # Fall back to whichever model the notebook currently has bound.
        lda_model = model

    print (u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [24]:
explore_topic(topic_number = 0)  # print the top terms of topic 0 of the current `model`

term                 frequency

u.s.                 0.009
percent              0.008
's                   0.008
market               0.007
china                0.006
government           0.005
month                0.005
week                 0.005
country              0.005
bank                 0.005
high                 0.004
economy              0.004
price                0.004
report               0.004
deal                 0.004
investor             0.004
accord               0.004
expect               0.004
trade                0.004
time                 0.004
big                  0.003
rise                 0.003
day                  0.003
company              0.003
share                0.003


## LDA on Fake News articles

In [25]:
# Everything from index 23571 onward -- presumably the fake-news articles;
# confirm against how trigram_nest was assembled.
trigram_nest_fake = trigram_nest[23571:]
len(trigram_nest_fake)

11040

In [26]:
# Build the token <-> integer-id mapping from the fake-news documents only.
# This rebinds the notebook-level name `dictionary`.
dictionary = Dictionary(trigram_nest_fake)

# Drop tokens that occur in fewer than 20 documents or in more than 50% of
# the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [27]:
# Vectorize: bag-of-words representation of every fake-news document.
corpus = list(map(dictionary.doc2bow, trigram_nest_fake))

In [28]:
# Report the vocabulary size and the number of documents after filtering.
vocab_size = len(dictionary)
doc_count = len(corpus)
print('Number of unique tokens: %d' % vocab_size)
print('Number of documents: %d' % doc_count)

Number of unique tokens: 13822
Number of documents: 11040


In [29]:
# Persist the fake-news dictionary and corpus. Each file is written inside a
# `with` block so the handle is flushed and closed immediately.

# saving dictionary
filename = 'dictionary_fake_lda.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(dictionary, out_file)

# saving corpus
filename = 'corpus_fake_lda.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(corpus, out_file)

In [30]:
# Train LDA model.

from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 10000
passes = 20
iterations = 400
eval_every = None  # takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

CPU times: user 11min 14s, sys: 10 s, total: 11min 24s
Wall time: 11min 24s


In [31]:
# saving fake_news lda model
filename = 'lda_model_fake.sav'
# The context manager guarantees the pickle is flushed and the handle closed.
with open(filename, 'wb') as out_file:
    pickle.dump(model, out_file)

In [32]:
%%time
top_topics_fake = model.top_topics(corpus, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -650.0225.
[([(0.025654092221442389, 'not'),
   (0.013569916014509797, 'be'),
   (0.013142664993534198, 's'),
   (0.012935766079922115, 'like'),
   (0.011971328629241029, 'think'),
   (0.011631612922307756, 'go'),
   (0.011108151725114089, 'people'),
   (0.010324159161057072, 'know'),
   (0.010219447406554963, 'time'),
   (0.010194906566912815, 'good'),
   (0.0091914809178482906, 'thing'),
   (0.0089801714610546635, 'want'),
   (0.0084053129400516804, 'come'),
   (0.007248324920441215, 'get'),
   (0.0070384223125490326, 'way'),
   (0.0069047590622133724, 'have'),
   (0.0060106630012768045, 'tell'),
   (0.0058667032273318825, 'day'),
   (0.0053949137445681878, 'work'),
   (0.0052563999324205055, 'look')],
  -152.01674105130331),
 ([(0.012882699515740498, 'people'),
   (0.0066404939230112463, 'government'),
   (0.0063746816857770108, 'world'),
   (0.0055890667697934618, 'country'),
   (0.0054377478200693455, 'power'),
   (0.0052430865382744538, 'america'),
   (0.

In [4]:
# loading the saved fake-news top topics (the topic list, not the model)
# NOTE(review): no cell shown in this notebook writes this file -- confirm
# where './lda_topics/top_topics_fake_lda.sav' is produced.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('./lda_topics/top_topics_fake_lda.sav', 'rb') as fileObject:
    top_topics_fake = pickle.load(fileObject)

In [5]:
def explore_topic(topic_number, topn=25, lda_model=None):
    """
    Print a formatted table of the top terms of one LDA topic.

    Parameters
    ----------
    topic_number : int
        Id of the topic to display.
    topn : int, optional
        Number of highest-weighted terms to print (default 25). The value is
        forwarded to ``show_topic``.
    lda_model : optional
        Object exposing ``show_topic(topic_number, topn=...)`` that yields
        ``(term, weight)`` pairs. When omitted, the notebook-level ``model``
        is looked up at call time.

    Returns
    -------
    None
        The table is written to stdout only.
    """
    if lda_model is None:
        # Fall back to whichever model the notebook currently has bound.
        lda_model = model

    print (u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [65]:
explore_topic(topic_number = 9)  # print the top terms of topic 9 of the current `model`

term                 frequency

government           0.009
's                   0.005
case                 0.005
people               0.005
report               0.005
state                0.005
public               0.004
information          0.004
post                 0.004
official             0.004
work                 0.004
year                 0.004
law                  0.004
time                 0.003
know                 0.003
story                0.003
include              0.003
not                  0.003
source               0.003
u.s.                 0.003
group                0.003
list                 0.003
news                 0.003
note                 0.003
call                 0.003


In [75]:
type(dictionary)  # check which kind of object the name `dictionary` is currently bound to

gensim.corpora.dictionary.Dictionary