In [None]:
from __future__ import division

import numpy as np
import pandas as pd

In [None]:
# Load the dataset from a CSV file expected in the current working directory.
mash_df = pd.read_csv('OnlineNewsPopularity.csv')

In [None]:
# Plain Python list holding the `url` column of the dataset.
mash_url = mash_df['url'].values.tolist()

In [None]:
# Display the column labels of the loaded dataset.
mash_df.columns.values

In [None]:
# Number of URLs extracted from the dataset.
len(mash_url)

In [None]:
# Peek at the first two URLs.
mash_url[:2]

In [None]:
# Single article URL used to prototype the feature extraction in the cells below.
test_url = 'http://mashable.com/2013/01/07/hillary-clinton-helmet/'

In [None]:
# Echo the prototype URL.
test_url

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint

In [None]:
# request html data and create soup
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(test_url, timeout=30)
# Raise requests.HTTPError on a 4xx/5xx answer. Unlike `assert`, this check is
# not stripped when Python runs with -O, and the error names the status code.
response.raise_for_status()
# Name the parser explicitly so the parsed tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed on this machine.
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# The headline is the text of the <h1 class="title"> element.
title_tag = soup.find('h1', class_='title')
headline = title_tag.get_text()
headline

In [None]:
# Collect the text of every <p> inside <section class="article-content">,
# UTF-8 encoded, one entry per paragraph.
article_section = soup.find('section', class_='article-content')
content_list = [paragraph.text.encode('utf-8')
                for paragraph in article_section.find_all('p')]
# Join with single spaces. Unlike reduce() without an initial value, join()
# returns an empty string for an article with no paragraphs instead of raising
# TypeError, and it does not rebuild the string once per paragraph.
content = ' '.join(content_list)
content

### Header Features

In [None]:
# Whitespace-separated tokens of the headline.
headline.split()

In [None]:
# number of words in title
# (whitespace-separated tokens; punctuation is not stripped before counting)
n_tokens_title = len(headline.split())
n_tokens_title

In [None]:
import nltk
from textblob import TextBlob

In [None]:
# subjectivity score TextBlob assigns to the headline
title_blob = TextBlob(headline)
title_subjectivity = title_blob.subjectivity
title_subjectivity

In [None]:
# polarity score TextBlob assigns to the headline
title_blob = TextBlob(headline)
title_sentiment_polarity = title_blob.polarity
title_sentiment_polarity

In [None]:
# absolute value of the headline polarity
title_sentiment_abs_polarity = abs(title_sentiment_polarity)
title_sentiment_abs_polarity

In [None]:
import string

In [None]:
# average word length in the headline, measured after removing every
# character listed in string.punctuation
title_without_punct = "".join(ch for ch in headline
                              if ch not in string.punctuation)
title_word_lengths = [len(word) for word in title_without_punct.split()]
average_token_length_title = np.mean(title_word_lengths)
average_token_length_title

### Content Features

In [None]:
# number of whitespace-separated tokens in the article body
content_tokens = content.split()
n_tokens_content = len(content_tokens)
n_tokens_content

In [None]:
from nltk.corpus import stopwords
# English stop-word list from NLTK; used below to filter tokens.
stop = stopwords.words('english')

In [None]:
# rate of unique words
# Tokens are taken from the body with string.punctuation removed. Each token is
# decoded *before* lowercasing: lower() on a UTF-8 byte string only folds ASCII
# letters, so words differing only in the case of an accented letter would
# otherwise be counted as distinct.
unique_words = set(w.decode('utf-8').lower()
                   for w in "".join(c for c in content
                                    if c not in string.punctuation).split())
# Defined as 0 for an empty body, mirroring the zero-fallback used by the
# polarity cells, instead of raising ZeroDivisionError.
if n_tokens_content:
    r_unique_tokens = len(unique_words) / n_tokens_content
else:
    r_unique_tokens = 0
r_unique_tokens

In [None]:
# rate of non-stop word
# Tokens are taken from the body with string.punctuation removed, decoded to
# unicode and then lowercased.
lowered_words = [w.decode('utf-8').lower()
                 for w in "".join(c for c in content
                                  if c not in string.punctuation).split()]
# The stop-word test must run on the lowercased token: testing the
# original-case token lets capitalised stop words such as "The" slip through.
# A set makes each membership test constant-time.
stop_lookup = set(stop)
non_stop_words = [w for w in lowered_words if w not in stop_lookup]
# Defined as 0 for an empty body instead of raising ZeroDivisionError.
if n_tokens_content:
    r_non_stop_words = len(non_stop_words) / n_tokens_content
else:
    r_non_stop_words = 0
r_non_stop_words

In [None]:
# rate of unique non-stop word
# Tokens are taken from the body with string.punctuation removed, decoded to
# unicode and then lowercased.
lowered_tokens = [w.decode('utf-8').lower()
                  for w in "".join(c for c in content
                                   if c not in string.punctuation).split()]
# Filter on the lowercased token so capitalised stop words ("The", "And") are
# removed too; the original-case test only matched already-lowercase words.
stop_lookup = set(stop)
unique_non_stop = set(w for w in lowered_tokens if w not in stop_lookup)
# Defined as 0 for an empty body instead of raising ZeroDivisionError.
if n_tokens_content:
    r_non_stop_unique_tokens = len(unique_non_stop) / n_tokens_content
else:
    r_non_stop_unique_tokens = 0
r_non_stop_unique_tokens

In [None]:
# average word length in the article body, measured after removing every
# character listed in string.punctuation
body_without_punct = "".join(ch for ch in content
                             if ch not in string.punctuation)
body_word_lengths = [len(word) for word in body_without_punct.split()]
average_token_length_content = np.mean(body_word_lengths)
average_token_length_content

In [None]:
# subjectivity score TextBlob assigns to the whole article body
content_blob = TextBlob(content.decode('utf-8'))
global_subjectivity = content_blob.subjectivity
global_subjectivity

In [None]:
# polarity score TextBlob assigns to the whole article body
content_blob = TextBlob(content.decode('utf-8'))
global_sentiment_polarity = content_blob.polarity
global_sentiment_polarity

In [None]:
# absolute value of the whole-article polarity
global_sentiment_abs_polarity = abs(global_sentiment_polarity)
global_sentiment_abs_polarity

In [None]:
# Build a list of (word, polarity) pairs, one per token of the body after
# removing string.punctuation; each token is scored on its own by TextBlob.
body_without_punct = "".join(ch for ch in content
                             if ch not in string.punctuation)
polarity_list = []
for raw_token in body_without_punct.split():
    word = raw_token.decode('utf-8')
    polarity_list.append((word, TextBlob(word).polarity))
len(polarity_list)

In [None]:
# global positive word rate
# Share of all scored words whose polarity is strictly positive. Defined as 0
# when no words were scored, matching the zero-fallback of the other polarity
# cells, so an empty article does not raise ZeroDivisionError.
if polarity_list:
    global_rate_positive_words = (sum(1 for (w, p) in polarity_list if p > 0)
                                  / len(polarity_list))
else:
    global_rate_positive_words = 0
global_rate_positive_words

In [None]:
# global negative word rate
# Share of all scored words whose polarity is strictly negative. Defined as 0
# when no words were scored, matching the zero-fallback of the other polarity
# cells, so an empty article does not raise ZeroDivisionError.
if polarity_list:
    global_rate_negative_words = (sum(1 for (w, p) in polarity_list if p < 0)
                                  / len(polarity_list))
else:
    global_rate_negative_words = 0
global_rate_negative_words

In [None]:
# positive word rate (among non-neutral words)
# Falls back to 0 when every word has polarity exactly 0.
non_neutral_count = len([score for (word, score) in polarity_list if score != 0])
positive_count = len([score for (word, score) in polarity_list if score > 0])
rate_positive_words = positive_count / non_neutral_count if non_neutral_count else 0
rate_positive_words

In [None]:
# negative word rate (among non-neutral words)
# Falls back to 0 when every word has polarity exactly 0.
non_neutral_count = len([score for (word, score) in polarity_list if score != 0])
negative_count = len([score for (word, score) in polarity_list if score < 0])
rate_negative_words = negative_count / non_neutral_count if non_neutral_count else 0
rate_negative_words

In [None]:
# average polarity of positive words (0 when there are none)
positive_scores = [score for (word, score) in polarity_list if score > 0]
avg_positive_polarity = np.mean(positive_scores) if positive_scores else 0
avg_positive_polarity

In [None]:
# minimum polarity of positive words (0 when there are none)
positive_scores = [score for (word, score) in polarity_list if score > 0]
min_positive_polarity = min(positive_scores) if positive_scores else 0
min_positive_polarity

In [None]:
# maximum polarity of positive words (0 when there are none)
positive_scores = [score for (word, score) in polarity_list if score > 0]
max_positive_polarity = max(positive_scores) if positive_scores else 0
max_positive_polarity

In [None]:
# average polarity of negative words (0 when there are none)
negative_scores = [score for (word, score) in polarity_list if score < 0]
avg_negative_polarity = np.mean(negative_scores) if negative_scores else 0
avg_negative_polarity

In [None]:
# minimum polarity of negative words, i.e. the most negative score
# (0 when there are none)
negative_scores = [score for (word, score) in polarity_list if score < 0]
min_negative_polarity = min(negative_scores) if negative_scores else 0
min_negative_polarity

In [None]:
# maximum polarity of negative words, i.e. the negative score closest to zero
# (0 when there are none)
negative_scores = [score for (word, score) in polarity_list if score < 0]
max_negative_polarity = max(negative_scores) if negative_scores else 0
max_negative_polarity

In [None]:
# abs maximum polarity: the sum of the max positive polarity and the absolute
# value of the min negative polarity.
# NOTE(review): the name suggests a maximum, but the value is a sum of the two
# extremes -- confirm this is the intended definition.
max_abs_polarity = max_positive_polarity + abs(min_negative_polarity)
max_abs_polarity

In [None]:
from textstat.textstat import textstat

In [None]:
# Flesch Reading Ease of the article body, computed on the decoded text
decoded_content = content.decode('utf-8')
global_reading_ease = textstat.flesch_reading_ease(decoded_content)
global_reading_ease

In [None]:
# Flesch Kincaid Grade Level of the article body, computed on the decoded text
decoded_content = content.decode('utf-8')
global_grade_level = textstat.flesch_kincaid_grade(decoded_content)
global_grade_level

In [None]:
import pymongo

In [None]:
# Open a MongoClient with default connection settings and take handles to the
# `mashable` database and its `articles` collection.
client = pymongo.MongoClient()
db = client['mashable']
collection = db['articles']

In [None]:
# Fetch a single document from the collection (None if the collection is empty).
test = collection.find_one()

In [None]:
# Display the fetched document.
test