In [1]:
from gensim import corpora, models, similarities, matutils
import string
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import pymongo

In [2]:
# Connect to MongoDB with pymongo's default client settings and keep
# handles on the `mashable` database and its `articles` collection.
client = pymongo.MongoClient()
db = client["mashable"]
collection = db["articles"]

In [3]:
# Fetch only the `content` field of every article (no `_id`) and keep
# each article body as a UTF-8 encoded byte string.
content = [article['content'].encode('utf8')
           for article in collection.find({}, {'_id': 0, 'content': 1})]

In [4]:
# check number of documents is equal to expected
# (the bare expression is what the cell displays)
len(content)

39494

In [5]:
# define stop words to exclude from LDA topic modeling
# Kept as a set: in the cells shown here `stop` is only used for
# `not in` membership tests, once per token of every article, and a set
# lookup avoids scanning the whole stop word list each time.
stop = set(stopwords.words('english'))

In [6]:
# Strip every character listed in string.punctuation from each article
# body; all other characters are kept in their original order.
content_no_punc = []
for text in content:
    content_no_punc.append(
        "".join(char for char in text if char not in string.punctuation))

In [7]:
# Tokenize: lower-case each article, split it on whitespace, decode every
# word from UTF-8 and keep only the words that are not stop words.
documents = []
for text in content_no_punc:
    decoded_words = (raw_word.decode('utf-8')
                     for raw_word in text.lower().split())
    documents.append([word for word in decoded_words if word not in stop])

In [8]:
# define lemmatizer: a single WordNetLemmatizer instance that the
# lemmatization steps in this notebook share
lmtzr = WordNetLemmatizer()

In [9]:
# lemmatize vocabulary: replace every token of every document by the
# result of lmtzr.lemmatize
documents = [list(map(lmtzr.lemmatize, doc)) for doc in documents]

In [10]:
# Drop tokens that occur only once across the whole corpus.
from collections import defaultdict

# corpus-wide occurrence count of every token
frequency = defaultdict(int)
for token in (tok for doc in documents for tok in doc):
    frequency[token] += 1

# keep a token only if it was counted at least twice
filtered_documents = []
for doc in documents:
    filtered_documents.append(
        [token for token in doc if frequency[token] >= 2])
documents = filtered_documents

In [11]:
# create dictionary from the tokenized, lemmatized documents
dictionary = corpora.Dictionary(documents)
# store the dictionary, for future reference
# (relative path: the file is written to the current working directory)
dictionary.save('mashable_LDA_dictionary.dict')

In [12]:
# load dictionary from the file written by dictionary.save;
# `dictionary` is rebound to the loaded copy
dictionary = corpora.Dictionary.load('mashable_LDA_dictionary.dict')

In [13]:
# Build the corpus for the model: one dictionary.doc2bow result
# per document, in document order.
corpus = []
for doc in documents:
    corpus.append(dictionary.doc2bow(doc))

In [14]:
# store to disk, for later use
# NOTE: the file name is spelled 'corpara' (sic); the cell that loads the
# corpus uses the same spelling, so change both together if it is renamed
corpora.MmCorpus.serialize('mashable_LDA_corpara.mm', corpus) 

In [15]:
# load corpus from the file written by MmCorpus.serialize; `corpus` is
# rebound from the in-memory list to an MmCorpus object
corpus = corpora.MmCorpus('mashable_LDA_corpara.mm')

In [16]:
# train LDA model
# alpha and eta are hyperparameters that affect sparsity of the
# document-topic (theta) and topic-word (lambda) distributions.
# Both default to a symmetric 1.0/num_topics prior. Setting them to 'auto'
# learns an asymmetric prior directly from the data.
# random_state seeds the model's random number generator so that
# re-running the notebook gives reproducible topics instead of a
# different model on every run.

lda = models.LdaModel(corpus,
                      id2word=dictionary,
                      alpha='auto',
                      eta='auto',
                      num_topics=10,
                      random_state=42)

In [17]:
# save model to disk (relative path, current working directory)
lda.save('mashable.lda')

In [18]:
# load model from the file written by lda.save; `lda` is rebound to
# the loaded copy
lda = models.LdaModel.load('mashable.lda')

In [19]:
# display the model's topics, each described by its 20 weighted words
lda.show_topics(num_words=20)

[(0,
  u'0.011*device + 0.008*also + 0.008*phone + 0.007*\u2014 + 0.006*iphone + 0.006*new + 0.006*screen + 0.006*one + 0.005*camera + 0.005*apple + 0.005*like + 0.005*tablet + 0.004*price + 0.004*see + 0.004*car + 0.004*ipad + 0.004*display + 0.004*design + 0.004*samsung + 0.003*watch'),
 (1,
  u'0.020*image + 0.010*photo + 0.008*also + 0.006*see + 0.006*make + 0.006*courtesy + 0.005*like + 0.004*food + 0.004*add + 0.004*look + 0.004*dog + 0.004*youre + 0.004*one + 0.004*take + 0.004*get + 0.004*holiday + 0.003*say + 0.003*day + 0.003*cat + 0.003*gift'),
 (2,
  u'0.028*game + 0.009*player + 0.009*team + 0.008*\u2014 + 0.007*world + 0.007*one + 0.005*sport + 0.005*get + 0.005*also + 0.005*play + 0.004*sony + 0.004*see + 0.004*time + 0.004*could + 0.004*new + 0.004*fan + 0.004*cup + 0.004*thats + 0.004*video + 0.004*last'),
 (3,
  u'0.012*said + 0.006*government + 0.006*u + 0.005*also + 0.005*\u2014 + 0.005*new + 0.005*state + 0.004*job + 0.004*medium + 0.004*people + 0.004*security + 0

In [20]:
# test LDA feature generation
# fetch a single article document to walk through the steps by hand
test_doc = collection.find_one()

In [21]:
# pull content from Mongo Doc
# (encoded to a UTF-8 byte string, the same way the training articles were)
test_doc_content = test_doc['content'].encode('utf8')
test_doc_content

'The Associated Press is the latest news organization to experiment with trying to make money from Twitter by using its feed to advertise for other companies.  The AP announced Monday that it will share sponsored tweets from Samsung throughout this week for the International CES taking place in Las Vegas. The news service will let Samsung post two tweets per day to the AP\'s Twitter account, which has more than 1.5 million users, and each of these tweets will be labeled "SPONSORED TWEETS." This marks the first time that the AP has sold advertising on its Twitter feed, and the company says it spent months developing guidelines to pave the way for this and other new media business models.  For this particular promotion, Samsung will provide the sponsored tweets and non-editorial staff at the AP will handle the publishing side. In this way, the company hopes to maintain a clear dividing line between its editorial and advertising operations on Twitter. "We are thrilled to be taking this ne

In [22]:
# drop every character listed in string.punctuation from the test article
kept_chars = [char for char in test_doc_content
              if char not in string.punctuation]
test_doc_content = "".join(kept_chars)
test_doc_content

'The Associated Press is the latest news organization to experiment with trying to make money from Twitter by using its feed to advertise for other companies  The AP announced Monday that it will share sponsored tweets from Samsung throughout this week for the International CES taking place in Las Vegas The news service will let Samsung post two tweets per day to the APs Twitter account which has more than 15 million users and each of these tweets will be labeled SPONSORED TWEETS This marks the first time that the AP has sold advertising on its Twitter feed and the company says it spent months developing guidelines to pave the way for this and other new media business models  For this particular promotion Samsung will provide the sponsored tweets and noneditorial staff at the AP will handle the publishing side In this way the company hopes to maintain a clear dividing line between its editorial and advertising operations on Twitter We are thrilled to be taking this next step in social 

In [23]:
# lower-case, split on whitespace, decode each word from UTF-8 and
# drop the stop words
decoded_words = (raw_word.decode('utf-8')
                 for raw_word in test_doc_content.lower().split())
test_doc_content = [word for word in decoded_words if word not in stop]
test_doc_content

[u'associated',
 u'press',
 u'latest',
 u'news',
 u'organization',
 u'experiment',
 u'trying',
 u'make',
 u'money',
 u'twitter',
 u'using',
 u'feed',
 u'advertise',
 u'companies',
 u'ap',
 u'announced',
 u'monday',
 u'share',
 u'sponsored',
 u'tweets',
 u'samsung',
 u'throughout',
 u'week',
 u'international',
 u'ces',
 u'taking',
 u'place',
 u'las',
 u'vegas',
 u'news',
 u'service',
 u'let',
 u'samsung',
 u'post',
 u'two',
 u'tweets',
 u'per',
 u'day',
 u'aps',
 u'twitter',
 u'account',
 u'15',
 u'million',
 u'users',
 u'tweets',
 u'labeled',
 u'sponsored',
 u'tweets',
 u'marks',
 u'first',
 u'time',
 u'ap',
 u'sold',
 u'advertising',
 u'twitter',
 u'feed',
 u'company',
 u'says',
 u'spent',
 u'months',
 u'developing',
 u'guidelines',
 u'pave',
 u'way',
 u'new',
 u'media',
 u'business',
 u'models',
 u'particular',
 u'promotion',
 u'samsung',
 u'provide',
 u'sponsored',
 u'tweets',
 u'noneditorial',
 u'staff',
 u'ap',
 u'handle',
 u'publishing',
 u'side',
 u'way',
 u'company',
 u'hopes',

In [24]:
# lemmatize vocabulary: pass every token through lmtzr.lemmatize
test_doc_content = list(map(lmtzr.lemmatize, test_doc_content))
test_doc_content

[u'associated',
 u'press',
 u'latest',
 u'news',
 u'organization',
 u'experiment',
 u'trying',
 u'make',
 u'money',
 u'twitter',
 u'using',
 u'feed',
 u'advertise',
 u'company',
 u'ap',
 u'announced',
 u'monday',
 u'share',
 u'sponsored',
 u'tweet',
 u'samsung',
 u'throughout',
 u'week',
 u'international',
 u'ce',
 u'taking',
 u'place',
 u'la',
 u'vega',
 u'news',
 u'service',
 u'let',
 u'samsung',
 u'post',
 u'two',
 u'tweet',
 u'per',
 u'day',
 u'aps',
 u'twitter',
 u'account',
 u'15',
 u'million',
 u'user',
 u'tweet',
 u'labeled',
 u'sponsored',
 u'tweet',
 u'mark',
 u'first',
 u'time',
 u'ap',
 u'sold',
 u'advertising',
 u'twitter',
 u'feed',
 u'company',
 u'say',
 u'spent',
 u'month',
 u'developing',
 u'guideline',
 u'pave',
 u'way',
 u'new',
 u'medium',
 u'business',
 u'model',
 u'particular',
 u'promotion',
 u'samsung',
 u'provide',
 u'sponsored',
 u'tweet',
 u'noneditorial',
 u'staff',
 u'ap',
 u'handle',
 u'publishing',
 u'side',
 u'way',
 u'company',
 u'hope',
 u'maintain',
 

In [25]:
# get topic distribution
# the result is a list of (topic id, probability) pairs; as the output
# shows, only some of the model's topics are listed for a document
lda[dictionary.doc2bow(test_doc_content)]

[(3, 0.17629512356881796),
 (6, 0.19677207416550102),
 (7, 0.01741500406999354),
 (9, 0.60377688639372074)]

In [26]:
# collect the ids of the topics the model reports for the test document
test_doc_topics = set(
    topic for (topic, prob) in lda[dictionary.doc2bow(test_doc_content)])
test_doc_topics

{3, 6, 7, 9}

In [27]:
# get LDA topic dictionary: topic id -> probability for every topic of
# the model, with 0 for topics the model does not report for the document
#
# Inference is run exactly once here. Its results vary slightly from call
# to call, so checking membership against one call and reading the
# probability from another can leave a topic without an entry when it
# sits close to the reporting threshold.
test_doc_topic_probs = dict(lda[dictionary.doc2bow(test_doc_content)])

test_LDA_topics = dict()
# lda.num_topics instead of a hard-coded 10 keeps this in step with the model
for i in range(lda.num_topics):
    test_LDA_topics[i] = test_doc_topic_probs.get(i, 0)

test_LDA_topics

{0: 0,
 1: 0,
 2: 0,
 3: 0.17628670319338244,
 4: 0,
 5: 0,
 6: 0.19677729199618429,
 7: 0.017397160437935408,
 8: 0,
 9: 0.60378533152714498}

In [28]:
# define function to get LDA features

def get_lda_features(doc):
    
    """
    Generate LDA topic probabilities for one Mashable article and store
    them on the article's MongoDB document.
    
    The article text goes through the same steps as the notebook's
    training data: punctuation removal, lower-casing, whitespace
    tokenization, stop word removal and lemmatization.
    
    Relies on the notebook globals `stop`, `lmtzr`, `dictionary`, `lda`
    and `collection`.
    
    Arguments:
    doc -- MongoDB document (dict-like) providing the keys 'content'
           (the article text) and '_id'
    
    Output:
    Returns None. Sets the fields LDA_0_prob ... LDA_<n-1>_prob (one per
    topic of `lda`) on the matching document in `collection`; topics the
    model does not report for the article are stored as 0.
    """
    
    # pull content from Mongo Doc; it is handled as a UTF-8 byte string so
    # that tokens come out exactly as in the training cells
    content = doc['content'].encode('utf8')
    
    # remove punctuation
    content = "".join(char for char 
                      in content 
                      if char 
                      not in string.punctuation)
    
    # tokenize, remove stopwords and lemmatize; each word is decoded once
    # and the stop word check is applied before lemmatization
    tokens = []
    for raw_word in content.lower().split():
        word = raw_word.decode('utf-8')
        if word not in stop:
            tokens.append(lmtzr.lemmatize(word))
    
    # get LDA features for Model: lda[...] yields (topic id, probability)
    # pairs for only some topics, so look each topic up with a default of 0
    topic_probs = dict(lda[dictionary.doc2bow(tokens)])
    fields = {"LDA_%d_prob" % topic_id: topic_probs.get(topic_id, 0)
              for topic_id in range(lda.num_topics)}
    
    collection.update_one({"_id": doc["_id"]}, {"$set": fields})