In [2]:
import tarfile, urllib, io
import urllib.request

import pandas as pd
import scattertext as st
import spacy
from IPython.core.display import display, HTML
from IPython.display import IFrame
# Inject a CSS override so the notebook's .container element spans 90% of the page width.
container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(container_css))

In [3]:
'''From Bo Pang's website: https://www.cs.cornell.edu/people/pabo/movie-review-data/

Data from:
"A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization
Based on Minimum Cuts", Proceedings of the ACL, 2004

The tarball is downloaded into memory, and three of its members are read as raw bytes:
`readme` (subjdata.README.1.0), `quote` (quote.tok.gt9.5000) and `plot` (plot.tok.gt9.5000).
'''
SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'

# Context managers guarantee the HTTP response and the tar archive are closed,
# even if the download or an extraction step raises.
with urllib.request.urlopen(SUBJECTIVITY_URL) as response:
    data = io.BytesIO(response.read())

with tarfile.open(fileobj=data, mode='r:gz') as tarball:
    readme = tarball.extractfile('subjdata.README.1.0').read()
    quote = tarball.extractfile('quote.tok.gt9.5000').read()
    plot = tarball.extractfile('plot.tok.gt9.5000').read()

In [4]:
#print(readme.decode('utf-8'))

In [5]:
# Peek at the first three subjective sentences in the corpus.
# Undecodable bytes are dropped (errors='ignore') rather than raising.
subjective_sentences = quote.decode('utf-8', errors='ignore').split('\n')
subjective_sentences[:3]

['smart and alert , thirteen conversations about one thing is a small gem . ',
 'color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . ',
 'it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . ']

In [6]:
'''Build the subjective vs. objective pandas dataframe.

Review quotes are labelled "subjective" and plot points "objective".
Each non-blank line is stripped and parsed with spaCy before it is stored
in the "text" column.

The first row is displayed as a sanity check.
'''
# NOTE(review): `spacy.en.English()` looks like the spaCy 1.x entry point; newer
# releases load pipelines via `spacy.load(...)` -- confirm against the installed version.
nlp = spacy.en.English()

# One pass over both sources, subjective first, so the row order matches the
# order in which the raw files are listed here.
df = pd.DataFrame([
    {'text': nlp(sentence), 'label': label}
    for label, raw_bytes in (('subjective', quote), ('objective', plot))
    for sentence in map(str.strip, raw_bytes.decode('utf-8', errors='ignore').split('\n'))
    if sentence
])
df.iloc[0]

label                                           subjective
text     (smart, and, alert, ,, thirteen, conversations...
Name: 0, dtype: object

In [7]:
'''Report the corpus size: total documents, documents per label, and words per label.'''
print("Number documents:", len(df), end="\n\n")

print("Subjective vs. objective document counts:")
print(df['label'].value_counts(), end="\n\n")

print("Subjective vs. objective word counts:")
# The word count of a label is the sum of len() over that label's parsed documents.
df.groupby('label').apply(
    lambda label_rows: label_rows['text'].apply(len).sum()
)

Number documents: 10000

Subjective vs. objective document counts:
objective     5000
subjective    5000
Name: label, dtype: int64

Subjective vs. objective word counts:


label
objective     130336
subjective    121183
dtype: int64

In [8]:
'''Turn the pandas dataframe into a scattertext corpus (term-document matrix).
The "label" column supplies each document's category and the "text" column
supplies its parsed document.'''

corpus_factory = st.CorpusFromParsedDocuments(
    df,
    category_col='label',
    parsed_col='text',
)
corpus = corpus_factory.build()

In [9]:
'''
Filter out bigrams with PMI < 2 * 4 (pmi_filter_thresold=4), and unigrams and bigrams
that occur fewer than 10 times (minimum_term_frequency=10).
The variable html is a string containing the HTML that makes up the scattertext visualization.
'''
html = st.produce_scattertext_explorer(corpus, 
                                       category='subjective', 
                                       category_name='Subjective', 
                                       not_category_name='Objective',
                                       protocol='https',
                                       pmi_filter_thresold=4,
                                       minimum_term_frequency=10,
                                       width_in_pixels=1200)

# Hack to display HTML with D3 in Jupyter Notebook: write the page to disk and
# load it in an IFrame. The `with` block closes (and therefore flushes) the file
# before the IFrame below points at it.
with open('subj_obj_scatter.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter.html', width = 1400, height=1000)

In [11]:
'''
See how violence (or another word of your choice) is used in subjective or objective contexts.

Change target_term to explore a different word. The resulting page is written to
subj_obj_scatter.html, the same file the scatterplot explorer cell writes, so it
replaces that file's contents on disk.
'''
target_term = 'violence'
html = st.word_similarity_explorer(corpus, 
                                   category='subjective', 
                                   category_name='Subjective', 
                                   not_category_name='Objective',
                                   protocol='https',
                                   target_term = target_term,
                                   pmi_filter_thresold=4,
                                   minimum_term_frequency=10,
                                   width_in_pixels=1200,
                                   alpha=0.01,
                                   max_p_val=0.05)

# Hack to display HTML with D3 in Jupyter Notebook: write the page to disk and
# load it in an IFrame. The `with` block closes (and therefore flushes) the file
# before the IFrame below points at it.
with open('subj_obj_scatter.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter.html', width = 1400, height=1000)