In [1]:
import gensim
from gensim import corpora
import json



In [2]:
with open('cleaned_article.json', 'r') as f:
     data = json.load(f)

In [4]:
data = filter(None, data)

2246

In [6]:
for i, article in enumerate (data):
    data[i] = [s.encode('utf-8') for s in article] # decode unicode str to str

In [10]:
dictionary = corpora.Dictionary(data)
# Represents entire corpus in bag-of-words format i.e list of
 # (id, count) tuples
doc_term_matrix = [dictionary.doc2bow(doc) for doc in data]

In [22]:
    # Create variable to store LDA class
Lda = gensim.models.ldamodel.LdaModel 
    # Initialize LDA object; estimates LDA model parameters on 
    # corpus in bag-of-words format 
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word=dictionary, passes=50, minimum_probability = 0.0001) # lower probability threshold for more precise prediction
    # Save model
ldamodel.save('10ap.model')

In [23]:
# Print word distribution (top words only) of each topic
print("----------- Word distribution for each topic -----------")
topics = ldamodel.print_topics(num_topics=10, num_words=20)
for topic in topics:
    dist = [item.split("*\"")[1][:-1].encode("utf8") for item in topic[1].split(" + ")]
    print(dist)

----------- Word distribution for each topic -----------
['water', 'area', 'county', 'national', 'people', 'city', 'state', 'year', 'fire', 'mile', 'venus', 'nordstrom', 'space', 'galileo', 'river', 'high', 'ohio', 'company', 'police', 'department']
['year', 'percent', 'bill', 'tax', 'money', 'committee', 'program', 'defense', 'state', 'federal', 'fund', 'budget', 'month', 'government', 'fiscal', 'work', 'health', 'law', 'public', 'campaign']
['south', 'school', 'africa', 'people', 'student', 'year', 'black', 'african', 'percent', 'virus', 'mandela', 'group', 'report', 'computer', 'state', 'system', 'release', 'time', 'day', 'official']
['government', 'official', 'us', 'president', 'year', 'child', 'party', 'military', 'time', 'panama', 'country', 'people', 'human', 'china', 'force', 'union', 'american', 'tuesday', 'bank', 'month']
['court', 'attorney', 'case', 'trial', 'judge', 'charge', 'us', 'law', 'united', 'states', 'general', 'federal', 'document', 'dress', 'jury', 'drug', 'gover

In [25]:
result = []
for i in range(len(data[:10])):
    article = dict((x,y) for x,y in ldamodel[dictionary.doc2bow(data[:10][i])])
    for h in range (0, 10):
        if h not in article.keys():
            article[h] = 0
    result.append(article)

In [26]:
result

[{0: 0.00037885144884944007,
  1: 0.00037886674911592787,
  2: 0.15118729702940722,
  3: 0.00037885438177925489,
  4: 0.0075547909027124261,
  5: 0.00037885257873212694,
  6: 0.79810822555647898,
  7: 0.00037883473494782237,
  8: 0.040876520351527758,
  9: 0.00037890626644917981},
 {0: 0.00038469630523947716,
  1: 0.00038473254378890712,
  2: 0.0003846737339858172,
  3: 0.11800092146610167,
  4: 0.26970083991083915,
  5: 0.066224501594359775,
  6: 0.14180447679078889,
  7: 0.12230333976804572,
  8: 0.23373947963142705,
  9: 0.047072338255423521},
 {0: 0.029127816617169209,
  1: 0.027839876603177556,
  2: 0.016506793722749578,
  3: 0.00043488657489497374,
  4: 0.2918822088697256,
  5: 0.00043487195840120885,
  6: 0.44628603227335389,
  7: 0.00043490917822858486,
  8: 0.18661775719017409,
  9: 0.00043484701212517232},
 {0: 0.00043490068934013657,
  1: 0.0004348986624472305,
  2: 0.048417980924407286,
  3: 0.00043489606543127465,
  4: 0.00043496069855696371,
  5: 0.080549433534952072,
  6

In [38]:
# Print word distribution (top words only) and correspondence percentage of each topic
print("----------- Word distribution for each topic -----------")
topics = ldamodel.print_topics(num_topics=10, num_words=5)
for topic in topics:
    dist = dict((item.split("*\"")[1][:-1].encode("utf8"),item.split("*\"")[0].encode("utf8"))  for item in topic[1].split(" + "))
    print(dist)

----------- Word distribution for each topic -----------
{'water': '0.006', 'county': '0.005', 'national': '0.005', 'people': '0.005', 'area': '0.006'}
{'money': '0.006', 'tax': '0.006', 'bill': '0.007', 'percent': '0.008', 'year': '0.015'}
{'school': '0.008', 'africa': '0.008', 'student': '0.006', 'south': '0.009', 'people': '0.006'}
{'president': '0.006', 'official': '0.007', 'year': '0.005', 'us': '0.007', 'government': '0.012'}
{'case': '0.010', 'trial': '0.009', 'attorney': '0.012', 'court': '0.014', 'judge': '0.009'}
{'price': '0.008', 'trade': '0.007', 'percent': '0.013', 'market': '0.009', 'year': '0.009'}
{'city': '0.004', 'official': '0.005', 'police': '0.010', 'day': '0.004', 'people': '0.007'}
{'company': '0.008', 'plane': '0.006', 'offer': '0.005', 'ship': '0.005', 'air': '0.006'}
{'american': '0.003', 'company': '0.005', 'book': '0.004', 'york': '0.004', 'year': '0.007'}
{'president': '0.009', 'bush': '0.013', 'soviet': '0.008', 'state': '0.007', 'party': '0.007'}
