In [1]:
from __future__ import division
import cPickle
import numpy as np
import pandas as pd
from gensim import corpora, models, similarities, matutils
import string
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from textblob import TextBlob
import string
from textstat.textstat import textstat
import statsmodels.api as sm
import sklearn

In [2]:
# load Poisson Regression Model for Share Count Predictions
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# 'pois_regress.pkl' must come from a trusted source.
with open('pois_regress.pkl', 'rb') as f:
    pois_reg = cPickle.load(f)

In [3]:
# load Random Forest Classification Model for Virality Probability Predictions
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# 'rf_class.pkl' must come from a trusted source.
with open('rf_class.pkl', 'rb') as f:
    RF_class = cPickle.load(f)

In [4]:
# load topic model processing tools and LDA model
# English stopword list; the feature functions test membership against it.
stop = stopwords.words('english')
# Lemmatizer applied to every token before the LDA lookup.
lmtzr = WordNetLemmatizer()
# Token -> id mapping used to turn a token list into a bag-of-words.
dictionary = corpora.Dictionary.load('mashable_LDA_dictionary.dict')
# NOTE(review): `corpus` is not referenced by any other visible cell -- confirm
# it is still needed before keeping this load.
corpus = corpora.MmCorpus('mashable_LDA_corpara.mm')
# LDA model queried in create_lda_features for per-topic probabilities.
lda = models.LdaModel.load('mashable.lda')

In [5]:
# Article headline to score; read as a notebook-level global by create_NLP_features.
headline = 'Facebook rolls out suicide prevention tools in Australia'

In [6]:
content = "Many of us confess more about our feelings on Facebook than we might ever say face-to-face with another person. Recognising it could have a role to play in preventing self-harm, Facebook has come up with a number of suicide prevention tools to help people who may be depressed. The program launched in the U.S. in February, and is now rolling out in Australia, a Facebook spokesperson confirmed to Mashable Australia.  In the U.S., Facebook partnered with local mental health initiatives including Forefront, Now Matters Now and the National Suicide Prevention Lifeline to develop the language around the initiative. In Australia, Facebook is working with BeyondBlue and Headspace in a collaboration announced Friday.  Keeping people safe is our most important responsibility on Facebook, the spokesperson said.  If someone thinks another person is considering suicide based on their Facebook posts, they are urged to call emergency services, but also to report the material to Facebook.  The company said it has teams working around the clock to review the reports. Depending on the seriousness of the threat, those assessing the posts can encourage the author to speak to a mental health expert through a private pop-up message or to reach out to a friend, or even provide advice on how to come to terms with their feelings.  We're also providing new resources and support to the person who flagged the troubling post, the spokesperson said, including options for them to call or message their distressed friend letting them know they care, or reaching out to another friend or a trained professional at a suicide hotline for support.  Globally there are multiple millions of users all going through these same problems, Chris Tanti, Headspace CEO told Fairfax Media about the tools. People can be notified and help can be provided just about anywhere in Australia, which is fantastic."
# Keep the article body as a UTF-8 byte string (Python 2 `str`): the feature
# functions strip punctuation on this form and call .decode('utf-8') on the
# individual words / the full text afterwards.
content = content.encode('utf8')

In [7]:
# Comma-separated tag list; create_metadata_fields counts its entries for num_tags.
tags = 'Australia, Facebook, Mental Health, Social Media'

In [8]:
# Publication weekday; create_metadata_fields compares it verbatim against the
# capitalized English names 'Monday' ... 'Sunday'.
day_published = 'Sunday'

In [9]:
# Article channel; create_metadata_fields recognizes 'Business', 'Entertainment',
# 'Lifestyle', 'Social Media', 'Technology' and 'World'. Any other value leaves
# every data_channel_is_* flag at 0.
channel = 'Social Media'

In [10]:
# Number of images in the article; copied unchanged into the num_imgs feature.
num_imgs = 2

In [11]:
# Create data dictionary and add fields tied to inputs
def create_metadata_fields(data, num_imgs=None, tags=None, day_published=None,
                           channel=None, num_videos=0):
    """
    Add the article-metadata model inputs to ``data`` in place.

    Writes ``num_imgs``, ``num_tags``, ``num_videos``, the seven
    ``weekday_is_*`` flags, ``is_weekend`` and the six ``data_channel_is_*``
    flags. Nothing is returned.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        Any container supporting ``data[key] = value``.
    num_imgs : int, optional
        Image count. Defaults to the notebook-level ``num_imgs``.
    tags : str, optional
        Comma-separated tags. Defaults to the notebook-level ``tags``.
    day_published : str, optional
        Capitalized English weekday name ('Monday' ... 'Sunday'). Defaults to
        the notebook-level ``day_published``. An unrecognized value leaves
        every weekday flag at 0.
    channel : str, optional
        One of 'Business', 'Entertainment', 'Lifestyle', 'Social Media',
        'Technology', 'World'. Defaults to the notebook-level ``channel``.
        An unrecognized value leaves every channel flag at 0.
    num_videos : int, optional
        Video count (default 0, the value that used to be hard-coded).

    Raises
    ------
    KeyError
        If an input is omitted and the matching notebook-level variable has
        not been defined.
    """
    # Fall back to the notebook-level inputs so the original one-argument
    # call ``create_metadata_fields(data)`` keeps working unchanged.
    notebook_vars = globals()
    if num_imgs is None:
        num_imgs = notebook_vars['num_imgs']
    if tags is None:
        tags = notebook_vars['tags']
    if day_published is None:
        day_published = notebook_vars['day_published']
    if channel is None:
        channel = notebook_vars['channel']

    # add values for content metadata
    data['num_imgs'] = num_imgs
    # Count only non-blank entries: ''.split(',') yields [''], so an empty tag
    # string (or a trailing comma) used to be counted as an extra tag.
    data['num_tags'] = len([tag for tag in tags.split(',') if tag.strip()])
    data['num_videos'] = num_videos

    # add values for weekday published (one-hot flags)
    weekdays = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday')
    for day in weekdays:
        data['weekday_is_' + day.lower()] = 1 if day_published == day else 0
    data['is_weekend'] = 1 if day_published in ('Saturday', 'Sunday') else 0

    # add values for channel (one-hot flags; column suffixes are abbreviations,
    # so they are listed explicitly rather than derived from the name)
    channel_columns = (
        ('Business', 'data_channel_is_bus'),
        ('Entertainment', 'data_channel_is_entertainment'),
        ('Lifestyle', 'data_channel_is_lifestyle'),
        ('Social Media', 'data_channel_is_socmed'),
        ('Technology', 'data_channel_is_tech'),
        ('World', 'data_channel_is_world'),
    )
    for channel_name, column in channel_columns:
        data[column] = 1 if channel == channel_name else 0

In [12]:
# Single row label: the feature frame built from this index holds one article.
index = [0]

In [13]:
# Feature columns of the one-row model-input frame.
# NOTE(review): presumably this set and order must match what the pickled
# models were fitted on -- confirm before adding, removing or reordering names.
columns = [u'LDA_0_prob', u'LDA_1_prob', u'LDA_2_prob', u'LDA_3_prob',
       u'LDA_4_prob', u'LDA_5_prob', u'LDA_6_prob', u'LDA_7_prob',
       u'LDA_8_prob', u'LDA_9_prob', u'average_token_length_content',
       u'average_token_length_title', u'avg_negative_polarity',
       u'avg_positive_polarity', u'data_channel_is_bus',
       u'data_channel_is_entertainment', u'data_channel_is_lifestyle',
       u'data_channel_is_socmed', u'data_channel_is_tech',
       u'data_channel_is_world', u'global_grade_level',
       u'global_rate_negative_words', u'global_rate_positive_words',
       u'global_reading_ease', u'global_sentiment_abs_polarity',
       u'global_sentiment_polarity', u'global_subjectivity', u'is_weekend',
       u'max_abs_polarity', u'max_negative_polarity', u'max_positive_polarity',
       u'min_negative_polarity', u'min_positive_polarity', u'n_tokens_content',
       u'n_tokens_title', u'num_imgs', u'num_tags', u'num_videos',
       u'r_non_stop_unique_tokens', u'r_non_stop_words', u'r_unique_tokens',
       u'rate_negative_words', u'rate_positive_words',
       u'title_sentiment_abs_polarity', u'title_sentiment_polarity',
       u'title_subjectivity', u'weekday_is_friday',
       u'weekday_is_monday', u'weekday_is_saturday', u'weekday_is_sunday',
       u'weekday_is_thursday', u'weekday_is_tuesday', u'weekday_is_wednesday']

In [14]:
# One-row frame with every feature column present but unset (NaN); the
# create_*_features functions fill it in column by column.
data_df = pd.DataFrame(index=index, columns=columns)

In [15]:
data_df

Unnamed: 0,LDA_0_prob,LDA_1_prob,LDA_2_prob,LDA_3_prob,LDA_4_prob,LDA_5_prob,LDA_6_prob,LDA_7_prob,LDA_8_prob,LDA_9_prob,...,title_sentiment_abs_polarity,title_sentiment_polarity,title_subjectivity,weekday_is_friday,weekday_is_monday,weekday_is_saturday,weekday_is_sunday,weekday_is_thursday,weekday_is_tuesday,weekday_is_wednesday
0,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Fills the count, weekday and channel columns of data_df in place (returns None).
create_metadata_fields(data_df)

In [17]:
data_df

Unnamed: 0,LDA_0_prob,LDA_1_prob,LDA_2_prob,LDA_3_prob,LDA_4_prob,LDA_5_prob,LDA_6_prob,LDA_7_prob,LDA_8_prob,LDA_9_prob,...,title_sentiment_abs_polarity,title_sentiment_polarity,title_subjectivity,weekday_is_friday,weekday_is_monday,weekday_is_saturday,weekday_is_sunday,weekday_is_thursday,weekday_is_tuesday,weekday_is_wednesday
0,,,,,,,,,,,...,,,,0,0,0,1,0,0,0


In [18]:
def _strip_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return "".join(c for c in text if c not in string.punctuation)


def _ratio(numerator, denominator):
    """Float division that yields 0 for a zero denominator instead of raising."""
    if not denominator:
        return 0
    return numerator / float(denominator)


def create_NLP_features(data, headline=None, content=None):
    """
    Add headline and content text features to ``data`` in place.

    Writes the title features (token count, subjectivity, polarity, absolute
    polarity, average token length), the content token statistics, the
    TextBlob sentiment of the whole content, the per-word polarity rates and
    extrema, and the two textstat readability scores. Nothing is returned.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        Any container supporting ``data[key] = value``.
    headline : str, optional
        Article headline. Defaults to the notebook-level ``headline``.
    content : str, optional
        Article body as a UTF-8 encoded byte string (``.decode('utf-8')`` is
        called on it). Defaults to the notebook-level ``content``.

    Notes
    -----
    * Every rate and mean falls back to 0 when its denominator is empty, so
      an empty headline or content no longer raises ``ZeroDivisionError`` or
      produces NaN for those features. The textstat calls are unchanged.
    * Values for non-empty input are the same as before; the text is now
      stripped, tokenized and scored once instead of once per feature.

    Raises
    ------
    KeyError
        If an input is omitted and the matching notebook-level variable has
        not been defined.
    """
    # Fall back to the notebook-level inputs so the original one-argument
    # call ``create_NLP_features(data)`` keeps working unchanged.
    notebook_vars = globals()
    if headline is None:
        headline = notebook_vars['headline']
    if content is None:
        content = notebook_vars['content']

    # ---- headline features -------------------------------------------------
    headline_blob = TextBlob(headline)
    title_polarity = headline_blob.polarity
    title_words = _strip_punctuation(headline).split()

    # number of words in title (whitespace split, punctuation kept)
    data['n_tokens_title'] = len(headline.split())
    data['title_subjectivity'] = headline_blob.subjectivity
    data['title_sentiment_polarity'] = title_polarity
    data['title_sentiment_abs_polarity'] = abs(title_polarity)
    # average word length, measured after punctuation removal
    data['average_token_length_title'] = (
        np.mean([len(w) for w in title_words]) if title_words else 0)

    # ---- content token statistics -----------------------------------------
    # number of words (whitespace split, punctuation kept); this is the
    # denominator of the three token rates below
    n_tokens = len(content.split())
    # Punctuation is stripped on the encoded form; words are decoded one by one.
    words = _strip_punctuation(content).split()
    words_decoded = [w.decode('utf-8') for w in words]
    words_lowered = [w.lower().decode('utf-8') for w in words]
    # NOTE(review): the stopword test uses the word as written, not the
    # lowercased form, so capitalized stopwords ("The", "In") are kept. This
    # is preserved deliberately -- presumably the pickled models were fitted
    # on features computed the same way; confirm before changing.
    non_stop = [low for (low, raw) in zip(words_lowered, words_decoded)
                if raw not in stop]

    data['n_tokens_content'] = n_tokens
    data['r_unique_tokens'] = _ratio(len(set(words_lowered)), n_tokens)
    data['r_non_stop_words'] = _ratio(len(non_stop), n_tokens)
    data['r_non_stop_unique_tokens'] = _ratio(len(set(non_stop)), n_tokens)
    # average word length, measured on the encoded words (as before)
    data['average_token_length_content'] = (
        np.mean([len(w) for w in words]) if words else 0)

    # ---- whole-content sentiment ------------------------------------------
    content_text = content.decode('utf-8')
    content_blob = TextBlob(content_text)
    global_polarity = content_blob.polarity

    data['global_subjectivity'] = content_blob.subjectivity
    data['global_sentiment_polarity'] = global_polarity
    data['global_sentiment_abs_polarity'] = abs(global_polarity)

    # ---- per-word polarity -------------------------------------------------
    polarities = [TextBlob(w).polarity for w in words_decoded]
    positive = [p for p in polarities if p > 0]
    negative = [p for p in polarities if p < 0]
    n_polar = len(positive) + len(negative)   # words with non-zero polarity

    # rates over all words
    data['global_rate_positive_words'] = _ratio(len(positive), len(polarities))
    data['global_rate_negative_words'] = _ratio(len(negative), len(polarities))
    # rates among non-neutral words only
    data['rate_positive_words'] = _ratio(len(positive), n_polar)
    data['rate_negative_words'] = _ratio(len(negative), n_polar)

    max_positive = max(positive) if positive else 0
    min_negative = min(negative) if negative else 0

    data['avg_positive_polarity'] = np.mean(positive) if positive else 0
    data['min_positive_polarity'] = min(positive) if positive else 0
    data['max_positive_polarity'] = max_positive
    data['avg_negative_polarity'] = np.mean(negative) if negative else 0
    data['min_negative_polarity'] = min_negative
    data['max_negative_polarity'] = max(negative) if negative else 0

    # Despite the name this is a SUM: strongest positive polarity plus the
    # magnitude of the strongest negative polarity (kept as originally defined).
    data['max_abs_polarity'] = max_positive + abs(min_negative)

    # ---- readability -------------------------------------------------------
    # Flesch Reading Ease
    data['global_reading_ease'] = textstat.flesch_reading_ease(content_text)
    # Flesch Kincaid Grade Level
    data['global_grade_level'] = textstat.flesch_kincaid_grade(content_text)

In [19]:
# Fills the headline/content text-feature columns of data_df in place (returns None).
create_NLP_features(data_df)

In [20]:
data_df

Unnamed: 0,LDA_0_prob,LDA_1_prob,LDA_2_prob,LDA_3_prob,LDA_4_prob,LDA_5_prob,LDA_6_prob,LDA_7_prob,LDA_8_prob,LDA_9_prob,...,title_sentiment_abs_polarity,title_sentiment_polarity,title_subjectivity,weekday_is_friday,weekday_is_monday,weekday_is_saturday,weekday_is_sunday,weekday_is_thursday,weekday_is_tuesday,weekday_is_wednesday
0,,,,,,,,,,,...,0,0,0,0,0,0,1,0,0,0


In [21]:
# Container for the per-word polarity records; a later cell stores them under
# the 'children' key.
polarity_data = {}

In [22]:
# (word, polarity) pair for every content word once punctuation is removed;
# the same scoring create_NLP_features performs internally.
polarity_list = []
for w in "".join(c for c in content if c not in string.punctuation).split():
    polarity_list.append((w.decode('utf-8'), TextBlob(w.decode('utf-8')).polarity))

In [23]:
# One record per non-neutral word: polarity magnitude plus a colour that
# encodes the sign ('#1A79BB' for positive, '#bb1a29' for negative).
polarity_data['children'] = [
    {'word': w, 'polarity': abs(p), 'color': '#1A79BB' if p > 0 else '#bb1a29'}
    for (w, p) in polarity_list
    if p != 0
]

In [24]:
polarity_data

{'children': [{'color': '#1A79BB', 'polarity': 0.5, 'word': u'Many'},
  {'color': '#1A79BB', 'polarity': 0.5, 'word': u'more'},
  {'color': '#1A79BB', 'polarity': 0.4, 'word': u'confirmed'},
  {'color': '#bb1a29', 'polarity': 0.1, 'word': u'mental'},
  {'color': '#1A79BB', 'polarity': 0.5, 'word': u'safe'},
  {'color': '#1A79BB', 'polarity': 0.5, 'word': u'most'},
  {'color': '#1A79BB', 'polarity': 0.4, 'word': u'important'},
  {'color': '#bb1a29', 'polarity': 0.1, 'word': u'mental'},
  {'color': '#1A79BB', 'polarity': 0.13636363636363635, 'word': u'new'},
  {'color': '#1A79BB', 'polarity': 0.1, 'word': u'professional'},
  {'color': '#1A79BB', 'polarity': 0.4, 'word': u'fantastic'}]}

In [25]:
def create_lda_features(data, content=None, num_topics=10):
    """
    Add the per-topic LDA probabilities of the content to ``data`` in place.

    The content is stripped of punctuation, lowercased, split on whitespace,
    filtered against ``stop``, lemmatized with ``lmtzr``, converted to a
    bag-of-words with ``dictionary`` and passed to ``lda``. For each topic id
    in ``range(num_topics)`` the key ``'LDA_<id>_prob'`` is set to the
    probability the model returned, or 0 when the model did not return that
    topic. Nothing is returned.

    Parameters
    ----------
    data : dict or pandas.DataFrame
        Any container supporting ``data[key] = value``.
    content : str, optional
        Article body as a UTF-8 encoded byte string (each word is decoded with
        ``.decode('utf-8')``). Defaults to the notebook-level ``content``.
    num_topics : int, optional
        Number of ``LDA_<id>_prob`` keys to write (default 10, the value that
        used to be hard-coded).

    Raises
    ------
    KeyError
        If ``content`` is omitted and no notebook-level ``content`` exists.
    """
    # Fall back to the notebook-level input so the original one-argument
    # call ``create_lda_features(data)`` keeps working unchanged.
    if content is None:
        content = globals()['content']

    # remove punctuation
    no_punct = "".join(ch for ch in content if ch not in string.punctuation)

    # tokenize (lowercased) and remove stopwords
    tokens = [word.decode('utf-8')
              for word in no_punct.lower().split()
              if word.decode('utf-8') not in stop]

    # lemmatize vocabulary
    lemmas = [lmtzr.lemmatize(token) for token in tokens]

    # The model yields (topic_id, probability) pairs only for some topics;
    # a dict lookup with a 0 default fills in the ones it left out.
    topic_probs = dict(lda[dictionary.doc2bow(lemmas)])
    for topic_id in range(num_topics):
        data['LDA_%d_prob' % topic_id] = topic_probs.get(topic_id, 0)

In [26]:
# Fills the LDA_0_prob ... LDA_9_prob columns of data_df in place (returns None).
create_lda_features(data_df)

In [27]:
data_df

Unnamed: 0,LDA_0_prob,LDA_1_prob,LDA_2_prob,LDA_3_prob,LDA_4_prob,LDA_5_prob,LDA_6_prob,LDA_7_prob,LDA_8_prob,LDA_9_prob,...,title_sentiment_abs_polarity,title_sentiment_polarity,title_subjectivity,weekday_is_friday,weekday_is_monday,weekday_is_saturday,weekday_is_sunday,weekday_is_thursday,weekday_is_tuesday,weekday_is_wednesday
0,0,0,0.395167,0,0,0.031147,0.250184,0.022346,0,0.297654,...,0,0,0,0,0,0,1,0,0,0


In [28]:
# Plain-dict version of the feature set, filled by the same three functions
# that populated data_df.
data = {}

In [29]:
# Recompute every feature into the dict. This is a second, independent run:
# the LDA values shown for `data` differ slightly from those in data_df.
# NOTE(review): presumably LDA inference is not deterministic -- confirm.
create_metadata_fields(data)
create_NLP_features(data)
create_lda_features(data)

In [30]:
data

{'LDA_0_prob': 0,
 'LDA_1_prob': 0,
 'LDA_2_prob': 0.39520162159397149,
 'LDA_3_prob': 0,
 'LDA_4_prob': 0,
 'LDA_5_prob': 0.031060534226415401,
 'LDA_6_prob': 0.25022956502977983,
 'LDA_7_prob': 0.022314919231455951,
 'LDA_8_prob': 0,
 'LDA_9_prob': 0.29769134615999893,
 'average_token_length_content': 5.0657894736842106,
 'average_token_length_title': 6.125,
 'avg_negative_polarity': -0.10000000000000001,
 'avg_positive_polarity': 0.38181818181818178,
 'data_channel_is_bus': 0,
 'data_channel_is_entertainment': 0,
 'data_channel_is_lifestyle': 0,
 'data_channel_is_socmed': 1,
 'data_channel_is_tech': 0,
 'data_channel_is_world': 0,
 'global_grade_level': 11.5,
 'global_rate_negative_words': 0.006578947368421052,
 'global_rate_positive_words': 0.029605263157894735,
 'global_reading_ease': 50.16,
 'global_sentiment_abs_polarity': 0.20227272727272724,
 'global_sentiment_polarity': 0.20227272727272724,
 'global_subjectivity': 0.3971590909090909,
 'is_weekend': 1,
 'max_abs_polarity': 0.6

In [31]:
# Predicted share count for the single row of data_df, rounded to the nearest
# hundred (ndigits=-2).
round(pois_reg.predict(sm.add_constant(data_df))[0],-2)

3200.0

In [32]:
# Second column of predict_proba for the single row of data_df, rounded to two
# decimals.
# NOTE(review): presumably column 1 is the "viral" class -- confirm against
# RF_class.classes_.
round(RF_class.predict_proba(sm.add_constant(data_df))[0][1],2)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


0.54

In [33]:
# Build the model input once and reuse it for both models.
# NOTE(review): on a one-row frame, `add_constant` with its default
# has_constant='skip' may decide a constant column already exists and add
# nothing, depending on the statsmodels version -- verify that `model_input`
# really carries the constant column the models were fitted with.
model_input = sm.add_constant(data_df)

# The predictions are computed from data_df but stored in `data`, whose LDA
# probabilities came from a separate inference run and differ slightly.
# Share estimate is rounded to the nearest hundred, probability to 2 decimals.
data['est_shares'] = round(pois_reg.predict(model_input)[0], -2)
data['est_prob'] = round(RF_class.predict_proba(model_input)[0][1], 2)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [34]:
data

{'LDA_0_prob': 0,
 'LDA_1_prob': 0,
 'LDA_2_prob': 0.39520162159397149,
 'LDA_3_prob': 0,
 'LDA_4_prob': 0,
 'LDA_5_prob': 0.031060534226415401,
 'LDA_6_prob': 0.25022956502977983,
 'LDA_7_prob': 0.022314919231455951,
 'LDA_8_prob': 0,
 'LDA_9_prob': 0.29769134615999893,
 'average_token_length_content': 5.0657894736842106,
 'average_token_length_title': 6.125,
 'avg_negative_polarity': -0.10000000000000001,
 'avg_positive_polarity': 0.38181818181818178,
 'data_channel_is_bus': 0,
 'data_channel_is_entertainment': 0,
 'data_channel_is_lifestyle': 0,
 'data_channel_is_socmed': 1,
 'data_channel_is_tech': 0,
 'data_channel_is_world': 0,
 'est_prob': 0.54,
 'est_shares': 3200.0,
 'global_grade_level': 11.5,
 'global_rate_negative_words': 0.006578947368421052,
 'global_rate_positive_words': 0.029605263157894735,
 'global_reading_ease': 50.16,
 'global_sentiment_abs_polarity': 0.20227272727272724,
 'global_sentiment_polarity': 0.20227272727272724,
 'global_subjectivity': 0.3971590909090909,


In [35]:
polarity_data

{'children': [{'color': '#1A79BB', 'polarity': 0.5, 'word': u'Many'},
  {'color': '#1A79BB', 'polarity': 0.5, 'word': u'more'},
  {'color': '#1A79BB', 'polarity': 0.4, 'word': u'confirmed'},
  {'color': '#bb1a29', 'polarity': 0.1, 'word': u'mental'},
  {'color': '#1A79BB', 'polarity': 0.5, 'word': u'safe'},
  {'color': '#1A79BB', 'polarity': 0.5, 'word': u'most'},
  {'color': '#1A79BB', 'polarity': 0.4, 'word': u'important'},
  {'color': '#bb1a29', 'polarity': 0.1, 'word': u'mental'},
  {'color': '#1A79BB', 'polarity': 0.13636363636363635, 'word': u'new'},
  {'color': '#1A79BB', 'polarity': 0.1, 'word': u'professional'},
  {'color': '#1A79BB', 'polarity': 0.4, 'word': u'fantastic'}]}