In [1]:
import numpy as np
import pandas as pd
from __future__ import division

In [2]:
# define test URL
test_url = 'http://mashable.com/2013/01/07/hillary-clinton-helmet/'

In [3]:
test_url

'http://mashable.com/2013/01/07/hillary-clinton-helmet/'

In [4]:
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint

In [5]:
# request html data and create soup
response = requests.get(test_url)
assert response.status_code == 200
soup = BeautifulSoup(response.text)

In [6]:
headline = soup.find('h1', class_='title').text
headline

u"Hillary Clinton's Coworkers Welcome Her Back With a Helmet"

In [7]:
content_list = [text.text.encode('utf-8') 
                for text 
                in soup.find('section', 
                class_='article-content').find_all('p')]
content = reduce(lambda x,y: x + ' ' + y, content_list)
content

"On her first day back to work following a concussion and blood clot, Hillary Clinton's coworkers gave her a bit of a gag gift \xe2\x80\x94 a football helmet. Clinton, 65, was presented with the helmet, along with a matching jersey, during a morning meeting. Both sport official State Department insignia.  The Secretary of State was away from Washington for about a month after suffering a stomach virus then a concussion which led to a blood clot near her brain. Clinton was hospitalized for three days at New York-Presbyterian Hospital Columbia when doctors first discovered the clot on Dec. 30. SEE ALSO: Yes, Secretary Clinton Submitted Her Own 'Text From Hillary'\n Doctors expect Clinton to make a full recovery. Photos via Nick Merrill, State Department"

### Header Features

In [8]:
headline.split()

[u'Hillary',
 u"Clinton's",
 u'Coworkers',
 u'Welcome',
 u'Her',
 u'Back',
 u'With',
 u'a',
 u'Helmet']

In [9]:
# number of words in title
n_tokens_title = len(headline.split())
n_tokens_title

9

In [10]:
import nltk
from textblob import TextBlob

In [11]:
# subjectivity
title_subjectivity = TextBlob(headline).subjectivity
title_subjectivity

0.45

In [12]:
# polarity
title_sentiment_polarity = TextBlob(headline).polarity
title_sentiment_polarity

0.4

In [13]:
# absolute value polarirty
title_sentiment_abs_polarity = abs(title_sentiment_polarity)
title_sentiment_abs_polarity

0.4

In [14]:
import string

In [15]:
# average word length
average_token_length_title = np.mean([len(w) for w 
                                      in "".join(c for c in headline 
                                                 if c not in string.punctuation).split()])
average_token_length_title

5.4444444444444446

### Content Features

In [16]:
# number of words
n_tokens_content = len([w for w in content.split()])
n_tokens_content

124

In [17]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [18]:
# rate of unique words
r_unique_tokens = len(set([w.lower().decode('utf-8')
                           for w 
                           in "".join(c for c in content 
                                      if c not in string.punctuation).split()]))/n_tokens_content
r_unique_tokens

0.6854838709677419

In [19]:
# rate of non-stop word
r_non_stop_words = len([w.lower().decode('utf-8') 
                        for w in "".join(c for c in content 
                                         if c not in string.punctuation).split() 
                        if w.decode('utf-8') 
                        not in stop])/n_tokens_content
r_non_stop_words

0.6935483870967742

In [20]:
# rate of unique non-stop word
r_non_stop_unique_tokens = len(set([w.lower().decode('utf-8') 
                           for w in "".join(c for c in content 
                                            if c not in string.punctuation).split() 
                           if w.decode('utf-8') 
                           not in stop]))/n_tokens_content
r_non_stop_unique_tokens

0.5725806451612904

In [21]:
# average word length
average_token_length_content = np.mean([len(w) for w 
                                        in "".join(c for c in content
                                                   if c not in string.punctuation).split()])
average_token_length_content

4.887096774193548

In [22]:
# subjectivity
global_subjectivity = TextBlob(content.decode('utf-8')).subjectivity
global_subjectivity

0.3964015151515151

In [23]:
# polarity
global_sentiment_polarity = TextBlob(content.decode('utf-8')).polarity
global_sentiment_polarity

0.21079545454545456

In [24]:
# absolute polarity
global_sentiment_abs_polarity = abs(global_sentiment_polarity)
global_sentiment_abs_polarity

0.21079545454545456

In [25]:
# get polarity by word
polarity_list = [(w.decode('utf-8'), TextBlob(w.decode('utf-8')).polarity) 
                 for w in "".join(c for c in content 
                                  if c not in string.punctuation).split()]
len(polarity_list)

124

In [26]:
# global positive word rate
global_rate_positive_words = len([(w,p) 
                                  for (w,p) 
                                  in polarity_list 
                                  if p > 0])/len(polarity_list)
global_rate_positive_words

0.04838709677419355

In [27]:
# global negative word rate
global_rate_negative_words = len([(w,p) 
                                  for (w,p) 
                                  in polarity_list 
                                  if p < 0])/len(polarity_list)
global_rate_negative_words

0.0

In [28]:
# positive word rate (among non-nuetral words)
if [(w,p) for (w,p) in polarity_list if p != 0]:
    rate_positive_words = len([(w,p) 
                               for (w,p) 
                               in polarity_list 
                               if p > 0])/len([(w,p) 
                                               for (w,p) 
                                               in polarity_list 
                                               if p != 0])
else:
    rate_positive_words = 0
rate_positive_words

1.0

In [29]:
# negative word rate (among non-nuetral words)
if [(w,p) for (w,p) in polarity_list if p != 0]:
    rate_negative_words = len([(w,p) 
                               for (w,p) 
                               in polarity_list 
                               if p < 0])/len([(w,p) 
                                               for (w,p) 
                                               in polarity_list 
                                               if p != 0])
else:
    rate_negative_words = 0
rate_negative_words

0.0

In [30]:
# average polarity of positive words
if [p for (w,p) in polarity_list if p > 0]:
    avg_positive_polarity = np.mean([p for (w,p) 
                                     in polarity_list 
                                     if p > 0])
else:
    avg_positive_polarity = 0
    
avg_positive_polarity

0.28106060606060607

In [31]:
# minimum polarity of positive words
if [p for (w,p) in polarity_list if p > 0]:
    min_positive_polarity = min([p for (w,p) 
                                 in polarity_list 
                                 if p > 0])
else:
    min_positive_polarity = 0
min_positive_polarity

0.1

In [32]:
# maximum polarity of positive words
if [p for (w,p) in polarity_list if p > 0]:
    max_positive_polarity = max([p for (w,p) 
                                 in polarity_list 
                                 if p > 0])
else: 
    max_positive_polarity = 0
max_positive_polarity

0.6

In [33]:
# average polarity of negative words
if [p for (w,p) in polarity_list if p < 0]:
    avg_negative_polarity = np.mean([p for (w,p) 
                                     in polarity_list 
                                     if p < 0])
else:
    avg_negative_polarity = 0
avg_negative_polarity

0

In [34]:
# minimum polarity of negative words
if [p for (w,p) in polarity_list if p < 0]:
    min_negative_polarity = min([p for (w,p) 
                                 in polarity_list 
                                 if p < 0])
else:
    min_negative_polarity = 0
min_negative_polarity

0

In [35]:
# maximum polarity of negative words
if [p for (w,p) in polarity_list if p < 0]:
    max_negative_polarity = max([p for (w,p) 
                             in polarity_list 
                             if p < 0])
else:
    max_negative_polarity = 0
max_negative_polarity

0

In [36]:
# abs maximum polarity, sum of abs of max positive and abs of min negative polarity
max_abs_polarity = max_positive_polarity + abs(min_negative_polarity)
max_abs_polarity

0.6

In [37]:
from textstat.textstat import textstat

In [38]:
# Flesch Reading Ease
global_reading_ease = textstat.flesch_reading_ease(content.decode('utf-8'))
global_reading_ease

62.68

In [39]:
# Flesch Kincaid Grade Level
global_grade_level = textstat.flesch_kincaid_grade(content.decode('utf-8'))
global_grade_level

8.7