In [60]:
from gensim import corpora, models, similarities, matutils
import string
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import pymongo

In [61]:
# connect to MongoDB (default client settings) and select the
# `articles` collection of the `mashable` database
client = pymongo.MongoClient()
db = client.mashable
collection = db.articles

In [None]:
# pull article content from MongoDB; only the `content` field is
# requested, and each article is encoded to a UTF-8 byte string
cursor = collection.find({}, {'_id': 0, 'content': 1})
content = [doc['content'].encode('utf8') for doc in cursor]

In [None]:
# check number of documents is equal to expected
# (bare expression: the notebook shows the count as the cell output)
len(content)

In [62]:
# define stop words to exclude from LDA topic modeling
# Stored as a set: the tokenizing cells and get_lda_features test every
# word with `not in stop`, which is a linear scan on a list.
stop = set(stopwords.words('english'))

In [None]:
# remove punctuation: drop every character listed in string.punctuation
content_no_punc = []
for text in content:
    kept_chars = [char for char in text
                  if not (char in string.punctuation)]
    content_no_punc.append("".join(kept_chars))

In [None]:
# remove stopwords and tokenize
# Each article is lower-cased and split on whitespace; every word is
# decoded from UTF-8 once and kept unless it is a stop word.
documents = []
for text in content_no_punc:
    decoded_words = (raw.decode('utf-8') for raw in text.lower().split())
    documents.append([word for word in decoded_words if word not in stop])

In [63]:
# define lemmatizer: one WordNetLemmatizer instance, reused by the
# lemmatizing cells and by get_lda_features
lmtzr = WordNetLemmatizer()

In [None]:
# lemmatize vocabulary: replace every token by its lemma
documents = [list(map(lmtzr.lemmatize, doc)) for doc in documents]

In [None]:
# remove words that appear only once across the whole corpus
from collections import defaultdict

# count every token occurrence over all documents
frequency = defaultdict(int)
for token in (token for doc in documents for token in doc):
    frequency[token] += 1

# keep only tokens counted at least twice
documents = [[token for token in doc if frequency[token] >= 2]
             for doc in documents]

In [None]:
# create dictionary from the tokenized, lemmatized documents
# NOTE(review): presumably maps each distinct token to an integer id -- confirm against the gensim docs
dictionary = corpora.Dictionary(documents)
# store the dictionary, for future reference (the next cell reloads it from this path)
dictionary.save('mashable_LDA_dictionary.dict')

In [64]:
# load dictionary from the file written by dictionary.save, so later cells
# can use it without rebuilding it from `documents`
dictionary = corpora.Dictionary.load('mashable_LDA_dictionary.dict')

In [None]:
# create corpus for model: one doc2bow vector per tokenized document
corpus = list(map(dictionary.doc2bow, documents))

In [None]:
# store to disk, for later use
# (the file name is spelled 'corpara'; the cell that loads the corpus uses
# the same spelling, so the two paths must be changed together)
corpora.MmCorpus.serialize('mashable_LDA_corpara.mm', corpus) 

In [65]:
# load corpus from the file written by MmCorpus.serialize; this rebinds
# `corpus` to the on-disk version
corpus = corpora.MmCorpus('mashable_LDA_corpara.mm')

In [66]:
# train LDA model
# alpha and eta are hyperparameters that affect the sparsity of the
# document-topic (theta) and topic-word (lambda) distributions.
# Both default to a symmetric 1.0/num_topics prior. Setting them to 'auto'
# learns an asymmetric prior directly from the data.
# random_state fixes the seed of the model's random number generator so
# that retraining on the same corpus is repeatable.
# NOTE(review): requires a gensim release whose LdaModel accepts
# `random_state` -- confirm against the installed version.

lda = models.LdaModel(corpus,
                      id2word=dictionary,
                      alpha='auto',
                      eta='auto',
                      num_topics=10,
                      random_state=42)

In [67]:
# save model to disk so it can be reloaded without retraining
lda.save('mashable.lda')

In [None]:
# load model from the file written by lda.save
lda = models.LdaModel.load('mashable.lda')

In [68]:
# display the model's topics; in the recorded output each entry is a
# (topic id, 'weight*word + ...' string) pair
lda.show_topics()

[(0,
  u'0.018*2014 + 0.016*\u2014 + 0.007*team + 0.005*music + 0.005*december + 0.004*ferguson + 0.004*world + 0.004*fan + 0.004*game + 0.004*one'),
 (1,
  u'0.009*\u2014 + 0.008*image + 0.008*also + 0.007*see + 0.006*video + 0.005*one + 0.005*like + 0.004*something + 0.004*time + 0.004*share'),
 (2,
  u'0.011*\u2014 + 0.008*people + 0.006*one + 0.006*like + 0.006*say + 0.005*time + 0.005*job + 0.005*work + 0.005*new + 0.005*get'),
 (3,
  u'0.015*apple + 0.010*phone + 0.010*iphone + 0.008*device + 0.008*camera + 0.008*new + 0.007*\u2014 + 0.007*screen + 0.007*also + 0.006*samsung'),
 (4,
  u'0.008*also + 0.008*price + 0.008*device + 0.006*car + 0.005*\u2014 + 0.005*one + 0.005*like + 0.004*make + 0.004*home + 0.004*get'),
 (5,
  u'0.017*company + 0.009*year + 0.008*million + 0.007*also + 0.007*said + 0.006*apple + 0.006*new + 0.006*business + 0.006*\u2014 + 0.005*share'),
 (6,
  u'0.014*said + 0.010*police + 0.008*\u2014 + 0.006*also + 0.006*government + 0.006*u + 0.006*officer + 0.00

In [49]:
# test LDA feature generation
# fetch a single document from the collection to walk through the
# preprocessing and inference steps by hand
test_doc = collection.find_one()

In [69]:
# pull content from Mongo Doc, encoded to a UTF-8 byte string
test_doc_content = test_doc['content'].encode('utf8')
# bare expression: displays the article text as the cell output
test_doc_content

'Having trouble finding something to watch on Amazon Instant Video? The retailer launched Monday an experimental browsing tool that lets users discover movies and TV shows based on their genre preferences or simply the mood they\'re in. Movies and shows are divided up into categories, some of which bear the names of genres (i.e., "Comedy" and "Mystery/Thriller"), and others which are labeled by mood, such as "Feel-Good" and "Exciting." Users can toggle between TV shows and movies, and apply filters to show only videos that are available for free viewing to Prime subscribers, or ones that bear G or PG ratings. It\'s pretty basic, but it sure beats the haphazard organization of Amazon\'s current Instant Video page, which mixes rows of new releases with bestsellers and personal recommendations. The move is the latest in a series of investment\'s Amazon is making in its streaming video platform. Last week, the company inked a licensing agreement with A+E Networks to bring past seasons of s

In [70]:
# remove punctuation: keep only characters not listed in string.punctuation
kept_chars = [char for char in test_doc_content
              if not (char in string.punctuation)]
test_doc_content = "".join(kept_chars)
test_doc_content

'Having trouble finding something to watch on Amazon Instant Video The retailer launched Monday an experimental browsing tool that lets users discover movies and TV shows based on their genre preferences or simply the mood theyre in Movies and shows are divided up into categories some of which bear the names of genres ie Comedy and MysteryThriller and others which are labeled by mood such as FeelGood and Exciting Users can toggle between TV shows and movies and apply filters to show only videos that are available for free viewing to Prime subscribers or ones that bear G or PG ratings Its pretty basic but it sure beats the haphazard organization of Amazons current Instant Video page which mixes rows of new releases with bestsellers and personal recommendations The move is the latest in a series of investments Amazon is making in its streaming video platform Last week the company inked a licensing agreement with AE Networks to bring past seasons of shows from AE History Lifetime and Bio 

In [71]:
# remove stopwords and tokenize
# lower-case, split on whitespace, decode each word from UTF-8 once,
# then keep the words that are not stop words
decoded_words = (raw.decode('utf-8')
                 for raw in test_doc_content.lower().split())
test_doc_content = [word for word in decoded_words if word not in stop]
test_doc_content

[u'trouble',
 u'finding',
 u'something',
 u'watch',
 u'amazon',
 u'instant',
 u'video',
 u'retailer',
 u'launched',
 u'monday',
 u'experimental',
 u'browsing',
 u'tool',
 u'lets',
 u'users',
 u'discover',
 u'movies',
 u'tv',
 u'shows',
 u'based',
 u'genre',
 u'preferences',
 u'simply',
 u'mood',
 u'theyre',
 u'movies',
 u'shows',
 u'divided',
 u'categories',
 u'bear',
 u'names',
 u'genres',
 u'ie',
 u'comedy',
 u'mysterythriller',
 u'others',
 u'labeled',
 u'mood',
 u'feelgood',
 u'exciting',
 u'users',
 u'toggle',
 u'tv',
 u'shows',
 u'movies',
 u'apply',
 u'filters',
 u'show',
 u'videos',
 u'available',
 u'free',
 u'viewing',
 u'prime',
 u'subscribers',
 u'ones',
 u'bear',
 u'g',
 u'pg',
 u'ratings',
 u'pretty',
 u'basic',
 u'sure',
 u'beats',
 u'haphazard',
 u'organization',
 u'amazons',
 u'current',
 u'instant',
 u'video',
 u'page',
 u'mixes',
 u'rows',
 u'new',
 u'releases',
 u'bestsellers',
 u'personal',
 u'recommendations',
 u'move',
 u'latest',
 u'series',
 u'investments',
 u'a

In [72]:
# lemmatize vocabulary: replace every token by its lemma
test_doc_content = list(map(lmtzr.lemmatize, test_doc_content))
test_doc_content

[u'trouble',
 u'finding',
 u'something',
 u'watch',
 u'amazon',
 u'instant',
 u'video',
 u'retailer',
 u'launched',
 u'monday',
 u'experimental',
 u'browsing',
 u'tool',
 u'let',
 u'user',
 u'discover',
 u'movie',
 u'tv',
 u'show',
 u'based',
 u'genre',
 u'preference',
 u'simply',
 u'mood',
 u'theyre',
 u'movie',
 u'show',
 u'divided',
 u'category',
 u'bear',
 u'name',
 u'genre',
 u'ie',
 u'comedy',
 u'mysterythriller',
 u'others',
 u'labeled',
 u'mood',
 u'feelgood',
 u'exciting',
 u'user',
 u'toggle',
 u'tv',
 u'show',
 u'movie',
 u'apply',
 u'filter',
 u'show',
 u'video',
 u'available',
 u'free',
 u'viewing',
 u'prime',
 u'subscriber',
 u'one',
 u'bear',
 u'g',
 u'pg',
 u'rating',
 u'pretty',
 u'basic',
 u'sure',
 u'beat',
 u'haphazard',
 u'organization',
 u'amazon',
 u'current',
 u'instant',
 u'video',
 u'page',
 u'mix',
 u'row',
 u'new',
 u'release',
 u'bestseller',
 u'personal',
 u'recommendation',
 u'move',
 u'latest',
 u'series',
 u'investment',
 u'amazon',
 u'making',
 u'strea

In [73]:
# get topic distribution
# indexing the model with the document's doc2bow vector yields
# (topic id, probability) pairs; the recorded output lists only 5 of the
# 10 topics for this article
lda[dictionary.doc2bow(test_doc_content)]

[(1, 0.050505202651516756),
 (2, 0.095144610420514142),
 (5, 0.27145301884279049),
 (8, 0.22502994302259652),
 (9, 0.35400241682931877)]

In [74]:
# get set of topics: the ids of the topics returned for the test document
test_doc_bow = dictionary.doc2bow(test_doc_content)
test_doc_topics = set(pair[0] for pair in lda[test_doc_bow])
test_doc_topics

{1, 2, 5, 8, 9}

In [75]:
# get LDA topic dictionary
# Run inference exactly once and reuse the result. The recorded outputs of
# the cells that index `lda` repeatedly differ in the later decimals, so
# calling it again per topic mixes values from different runs and can fail
# if a topic is returned by one run but not by another.
test_topic_probs = dict(lda[dictionary.doc2bow(test_doc_content)])

# one entry per model topic; topics missing from the result get 0
test_LDA_topics = {i: test_topic_probs.get(i, 0)
                   for i in range(lda.num_topics)}

test_LDA_topics

{0: 0,
 1: 0.050514317571294882,
 2: 0.095143317982485001,
 3: 0,
 4: 0,
 5: 0.2714529385906066,
 6: 0,
 7: 0,
 8: 0.22501907893871126,
 9: 0.35400231751729327}

In [76]:
# get LDA topic dictionary
# Start with probability 0 for every model topic, then overwrite the
# entries for the topics returned by a single inference call. Indexing
# `lda` once per topic, as before, re-ran inference each time and could
# leave a key unset when a topic was missing from a later run.
test_LDA_topics = dict.fromkeys(range(lda.num_topics), 0)
test_LDA_topics.update(lda[dictionary.doc2bow(test_doc_content)])

test_LDA_topics

{0: 0,
 1: 0.050509546326685383,
 2: 0.095137970142599934,
 3: 0,
 4: 0,
 5: 0.27145370838496719,
 6: 0,
 7: 0,
 8: 0.22501073517084158,
 9: 0.35400004898534237}

In [80]:
def get_lda_features(doc):

    """
    Generate LDA topic probabilities for one Mashable article and store
    them on its MongoDB document.

    Arguments:
    doc -- MongoDB document (dict-like); its '_id' and 'content' fields
           are read.

    Output:
    Returns None. Sets one field per model topic, named "LDA_<i>_prob",
    on the matching document in `collection`. Topics that the model does
    not return for this article are stored as 0.

    Uses the notebook-level names string, stop, lmtzr, dictionary, lda
    and collection.
    """

    # pull content from Mongo Doc as a UTF-8 byte string
    raw_text = doc['content'].encode('utf8')

    # remove punctuation (the characters listed in string.punctuation)
    raw_text = "".join(char for char in raw_text
                       if char not in string.punctuation)

    # tokenize on whitespace, decoding each word only once
    words = (word.decode('utf-8') for word in raw_text.lower().split())

    # drop stopwords, then lemmatize the remaining vocabulary
    tokens = [lmtzr.lemmatize(word) for word in words if word not in stop]

    # run inference once; the model yields (topic id, probability) pairs
    topic_probs = dict(lda[dictionary.doc2bow(tokens)])

    # one field per model topic, defaulting to 0 for topics not returned
    lda_fields = {"LDA_%d_prob" % topic: topic_probs.get(topic, 0)
                  for topic in range(lda.num_topics)}

    collection.update_one({"_id": doc["_id"]}, {"$set": lda_fields})

In [81]:
# get LDA features for all Mongo docs
# (only `content`, plus the `_id` MongoDB returns by default, is fetched)

progress_counter = 0

for progress_counter, doc in enumerate(collection.find({}, {"content": 1}), 1):

    get_lda_features(doc)

    # show progress after every 100 documents
    if not progress_counter % 100:
        print(progress_counter)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
1850