# Using Scattertext to Explore the Effectiveness of Headlines
### Jason S. Kessler ([@jasonkessler](http://www.twitter.com/JasonKessler))

The code in this notebook shows how you can use the Python package Scattertext to explore how language used in headlines 
can correlate with social engagement.

For background on the term-class association scores used and semiotic squares, please see https://github.com/JasonKessler/PuPPyTalk and https://github.com/JasonKessler/SemioticSquaresTalk

This notebook makes heavy use of the library Scattertext (https://github.com/JasonKessler/scattertext) for language processing and visualizations.

The data used were scraped from Facebook by Max Woolf.  Please see his original notebook at https://github.com/minimaxir/clickbait-cluster.

In [1]:
import os
import re
import sys
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scattertext as st
import spacy
import umap
from gensim.models import word2vec
from IPython.core.display import display, HTML
from IPython.display import IFrame
from scipy.stats import rankdata

display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
# Load spaCy's English pipeline. The bare 'en' shortcut only resolves on
# installs where that shortcut link exists; when it is missing spaCy raises
# OSError, so fall back to the explicit package name.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [6]:
# you may need to clone https://github.com/JasonKessler/PuPPyTalk to find this data
# Read every tab-separated headline file and tag its rows with the publication
# name, taken from the part of the file name before the first underscore.
headline_paths = glob('../../PuPPyTalk/notebooks/fb_headlines/*')
publication_frames = []
for headline_path in headline_paths:
    publication_name = headline_path.split('/')[-1].split('_')[0]
    publication_frame = pd.read_csv(headline_path, sep='\t')
    publication_frames.append(publication_frame.assign(publication=publication_name))
df = pd.concat(publication_frames).reset_index()
# Parse the publication timestamps so they can be filtered by year later on.
df['status_published'] = pd.to_datetime(df.status_published)

In [8]:
# Keep posts published in 2016 or later, discard exact duplicate rows, and
# drop posts that have no headline text (`link_name`).
published_since_2016 = df.status_published.apply(lambda published: published.year >= 2016)
df_2016 = df[published_since_2016].drop_duplicates()
has_headline = df_2016['link_name'].notna()
df_2016 = df_2016[has_headline]
df_2016.publication.value_counts()

NYTimes     10326
CNN          9284
BuzzFeed     5386
Upworthy      824
Name: publication, dtype: int64

In [9]:
# Run every headline (`link_name`) through the spaCy pipeline `nlp` and keep
# the resulting parsed document in a new `parse` column.
df_2016['parse'] = df_2016['link_name'].apply(nlp)

In [10]:
# Restrict to headlines whose parse is longer than two tokens
parse_lengths = df_2016['parse'].apply(len)
df_2016 = df_2016[parse_lengths > 2]

In [11]:
# Rank each headline's reaction count against the other headlines from the
# same publication and divide by the group size, giving a percentile in (0, 1].
# `transform` returns values aligned to df_2016's own index; `groupby().apply`
# can prepend the group key to the result index (the default in pandas >= 2.0),
# which makes the column assignment fail. `rankdata` comes from the import cell.
df_2016['reaction_percentile'] = (
    df_2016.groupby('publication')['num_reactions']
    .transform(lambda reactions: rankdata(reactions) / len(reactions))
)

# Bucket the percentile into thirds: above 2/3 is 'Hi', below 1/3 is 'Lo',
# everything else is 'Mid'.
df_2016['reaction_bin'] = df_2016.reaction_percentile.apply(
    lambda pct: 'Hi' if pct > 2./3 else 'Lo' if pct < 1./3 else 'Mid')

In [26]:
# Keep only rows whose `page_id` is 'BuzzFeed' or 'NYTimes'.
# NOTE(review): the other cells select publications through the `publication`
# column (derived from the file name) -- confirm `page_id` holds the same labels.
df_2016 = df_2016[df_2016.page_id.isin(['BuzzFeed', 'NYTimes'])]

In [33]:
# Print the built-in documentation for scattertext's ClassPercentageCompactor.
help(st.ClassPercentageCompactor)

Help on class ClassPercentageCompactor in module scattertext.termcompaction.ClassPercentageCompactor:

class ClassPercentageCompactor(builtins.object)
 |  Methods defined here:
 |  
 |  __init__(self, term_ranker=<class 'scattertext.termranking.AbsoluteFrequencyRanker.AbsoluteFrequencyRanker'>, term_count=2)
 |      Limit terms to ones that make up a minimum percentage
 |      of documents in a category.  Given a term_count, set the threshold
 |      to that of the smallest class.
 |      
 |      Parameters
 |      ----------
 |      term_ranker : TermRanker
 |      term_count : int
 |  
 |  compact(self, term_doc_matrix)
 |      Parameters
 |      -------
 |      term_doc_matrix : TermDocMatrix
 |      
 |      Returns
 |      -------
 |      New term doc matrix
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)

In [None]:
# Build a corpus from the parsed headlines, categorized by reaction bin and
# using PhraseMachinePhrases as the feature extractor, then compact its terms.
reaction_corpus_factory = st.CorpusFromParsedDocuments(
    df_2016,
    category_col='reaction_bin',
    parsed_col='parse',
    feats_from_spacy_doc=st.PhraseMachinePhrases(),
)
reaction_corpus = reaction_corpus_factory.build().compact(st.CompactTerms(slack=5))

In [36]:
# Display the corpus object.
# NOTE(review): the saved output of this cell is a per-category term-frequency
# table, so it was presumably produced by a different expression -- confirm.
reaction_corpus

Unnamed: 0_level_0,Lo freq,Mid freq,Hi freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
michael phelps,2,3,13
19 tweets,0,5,8
disney princess,7,3,0
taylor swift,9,26,23
hilarious tweets,6,15,21
23 things,10,4,11
25 things,4,9,4
pokémon go,9,14,26
donald trump,157,216,263
simone biles,1,3,9


In [34]:
def get_metadata_from_corpus(corpus):
    """Build one descriptive label per document in ``corpus``.

    Parameters
    ----------
    corpus : object exposing ``get_df()``
        ``get_df()`` must return a DataFrame with ``page_id``,
        ``reaction_percentile`` and ``status_published`` columns.

    Returns
    -------
    pandas.Series
        Strings of the form ``"<page_id>, <percentile>%, <YYYY-MM-DD>"``,
        where the percentile is ``reaction_percentile * 100`` truncated to
        an integer.
    """
    docs = corpus.get_df()
    percentile_labels = docs.reaction_percentile.apply(
        lambda pct: str(int(pct * 100)) + '%')
    date_labels = docs.status_published.apply(lambda ts: str(ts.date()))
    return docs.page_id + ', ' + percentile_labels + ', ' + date_labels

In [35]:
# Render an interactive frequency explorer contrasting the 'Hi' reaction bin
# against 'Lo', with 'Mid' as the neutral category, scored by scaled F-score.
html = st.produce_frequency_explorer(reaction_corpus,
                                     category='Hi',
                                     not_categories=['Lo'],
                                     neutral_categories=['Mid'],
                                     minimum_term_frequency=1,
                                     pmi_filter_thresold=0,
                                     use_full_doc = True,
                                     term_scorer = st.ScaledFScorePresets(beta=1, one_to_neg_one=True),
                                     grey_threshold=0,
                                     width_in_pixels=1000,
                                     metadata=get_metadata_from_corpus(reaction_corpus),
                                     show_characteristic=False)
file_name = 'reaction_scaled_f_score.html'
# Write through a context manager so the handle is flushed and closed before
# the IFrame below loads the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [21]:
# Label each headline with its publication and reaction bin, then eliminate
# the other categories from the dataset (e.g., Upworthy or mid-engagement)
df_2016['category'] = df_2016.publication + ' ' + df_2016.reaction_bin
is_four_square_publication = df_2016.publication.isin(['BuzzFeed', 'NYTimes'])
is_extreme_reaction_bin = df_2016.reaction_bin.isin(['Hi', 'Lo'])
df_2016_four_square = df_2016[is_four_square_publication & is_extreme_reaction_bin]
# Create corpus and filter terms, one compaction step at a time
four_square_factory = st.CorpusFromParsedDocuments(df_2016_four_square,
                                                   category_col='category',
                                                   parsed_col='parse')
four_square_corpus = four_square_factory.build()
four_square_corpus = four_square_corpus.compact(st.CompactTerms(minimum_term_count=2, slack=5))
four_square_corpus = four_square_corpus.compact(st.ClassPercentageCompactor(term_count=2))

In [None]:
# Render an interactive projection explorer for the 'Hi' vs. 'Lo' reaction
# bins ('Mid' as neutral), laying terms out with a cosine-metric UMAP model
# and scoring them by rank difference.
html = st.produce_projection_explorer(reaction_corpus,
                                      category='Hi', 
                                      not_categories=['Lo'], 
                                      neutral_categories=['Mid'],
                                      term_scorer = st.RankDifference(),
                                      neutral_category_name='Mid',
                                      width_in_pixels=1000,
                                      use_full_doc=True,
                                      projection_model = umap.UMAP(metric='cosine'),
                                      term_acceptance_re=re.compile(''),
                                      metadata=get_metadata_from_corpus(reaction_corpus))
file_name = 'output/reaction_umap_projection.html'
# Create the target directory if it is missing so the write below does not
# fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Write through a context manager so the handle is flushed and closed before
# the IFrame below loads the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)