In [2]:
import io
import tarfile
import urllib
import urllib.request

import pandas as pd
import scattertext as st
import spacy
from IPython.core.display import display, HTML
from IPython.display import IFrame
# Inject a CSS rule that sets the notebook's .container width to 90%
container_css = HTML("<style>.container { width:90% !important; }</style>")
display(container_css)

# Exploring subjective vs. objective language in Movie Reviews

From Bo Pang's website: https://www.cs.cornell.edu/people/pabo/movie-review-data/

Data from:
Bo Pang and Lillian Lee, "A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization 
Based on Minimum Cuts", Proceedings of the ACL, 2004

In [20]:
SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'

# Download the whole archive into an in-memory buffer; the `with` block
# closes the HTTP response as soon as the bytes have been read.
with urllib.request.urlopen(SUBJECTIVITY_URL) as response:
    data = io.BytesIO(response.read())

# Pull the three archive members out as raw bytes, then close the archive.
# `quote` and `plot` are decoded later, where they are used.
with tarfile.open(fileobj=data, mode='r:gz') as tarball:
    readme = tarball.extractfile('subjdata.README.1.0').read()
    quote = tarball.extractfile('quote.tok.gt9.5000').read()
    plot = tarball.extractfile('plot.tok.gt9.5000').read()

In [21]:
# Optional: uncomment the next line to print the README extracted from the archive.
#print(readme.decode('utf-8'))

In [22]:
# Show the first three lines of the `quote` file (the subjective sentences).
# Undecodable bytes are dropped rather than raising.
str(quote, encoding='utf-8', errors='ignore').split('\n')[0:3]

['smart and alert , thirteen conversations about one thing is a small gem . ',
 'color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . ',
 'it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . ']

Construct subjective vs. objective pandas dataframe, 
treating review quotes as subjective, and plot points as objective.  
Use spaCy to parse documents before putting them into the dataframe.

Print out the first and last few lines (note that text is in spaCy 
format), and examine the size of the corpus.

In [42]:
# NOTE(review): `spacy.en.English()` looks like the spaCy 1.x entry point; newer
# releases may need `spacy.load(...)` instead -- confirm against the installed version.
nlp = spacy.en.English()

# (label, raw bytes) pairs, listed in the order their rows should appear in the frame.
labeled_sources = [('subjective', quote), ('objective', plot)]

# One row per non-blank line: the stripped line parsed by spaCy, plus its label.
df = pd.DataFrame([
    {'text': nlp(line.strip()), 'label': label}
    for label, raw in labeled_sources
    for line in raw.decode('utf-8', errors='ignore').split('\n')
    if line.strip()
])

# Show the first three and last three rows.
pd.concat([df.iloc[:3], df.iloc[-3:]])

Unnamed: 0,label,text
0,subjective,"(smart, and, alert, ,, thirteen, conversations..."
1,subjective,"(color, ,, musical, bounce, and, warm, seas, l..."
2,subjective,"(it, is, not, a, mass, -, market, entertainmen..."
9997,objective,"(enter, the, beautiful, and, mysterious, secre..."
9998,objective,"(after, listening, to, a, missionary, from, ch..."
9999,objective,"(looking, for, a, short, cut, to, fame, ,, gla..."


In [43]:
# Corpus size, with a blank line after each printed section
print("Number documents:", len(df), end="\n\n")

print("Subjective vs. objective document counts:")
print(df['label'].value_counts(), end="\n\n")

print("Subjective vs. objective word counts:")
# Sum len() of every parsed document within each label group
# (presumably the token count of each spaCy document -- confirm).
token_totals = df.groupby('label').apply(lambda group: group['text'].map(len).sum())
token_totals

Number documents: 10000

Subjective vs. objective document counts:
objective     5000
subjective    5000
Name: label, dtype: int64

Subjective vs. objective word counts:


label
objective     130336
subjective    121183
dtype: int64

Convert Pandas dataframe to a term-document matrix, indicating
the category column is "label" and the text column name is "text"

In [44]:
# Configure the corpus factory with the label and parsed-text columns, then build it.
corpus_factory = st.CorpusFromParsedDocuments(
    df,
    category_col='label',
    parsed_col='text',
)
corpus = corpus_factory.build()

Filter out bigrams with PMI < 2 * 4 (`pmi_filter_thresold=4`), and unigrams and bigrams that occur fewer than 10 times. The variable `html` is a string containing the HTML that makes up the scattertext visualization. In order to get the visualization to render in a notebook, we need to write it to a file and read it back in.

In [45]:
# Build the explorer page as an HTML string, with 'subjective' as the
# focus category and the filters described in the text above.
html = st.produce_scattertext_explorer(corpus, 
                                       category='subjective', 
                                       category_name='Subjective', 
                                       not_category_name='Objective',
                                       protocol='https',
                                       pmi_filter_thresold=4,
                                       minimum_term_frequency=10,
                                       width_in_pixels=1200)

# Hack to display HTML with D3 in Jupyter Notebook: write the page to disk and
# embed it in an IFrame. The `with` block makes sure the file is flushed and
# closed before the IFrame reads it.
with open('subj_obj_scatter.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter.html', width = 1400, height=1000)

Alternatively, view this on a log axis.  This disproportionately emphasizes stopwords.

In [39]:
# Same explorer as the previous plot, but with the log-scale-standardize
# scaler passed as `transform`.
html = st.produce_scattertext_explorer(corpus, 
                                       category='subjective', 
                                       category_name='Subjective', 
                                       not_category_name='Objective',
                                       protocol='https',
                                       pmi_filter_thresold=4,
                                       minimum_term_frequency=10,
                                       width_in_pixels=1200,
                                       transform=st.Scalers.log_scale_standardize)

# Hack to display HTML with D3 in Jupyter Notebook: write the page to disk and
# embed it in an IFrame. The `with` block makes sure the file is flushed and
# closed before the IFrame reads it.
with open('subj_obj_scatter_log.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter_log.html', width = 1400, height=1000)

See how "war" (or another word of your choice) is used in subjective or objective contexts.  This uses GloVe word vectors via spaCy.

In [27]:
# Term whose usage is explored; change it to inspect another word.
target_term = 'war'

# Build the word-similarity explorer page for `target_term` as an HTML string.
html = st.word_similarity_explorer(corpus, 
                                   category='subjective', 
                                   category_name='Subjective', 
                                   not_category_name='Objective',
                                   protocol='https',
                                   target_term = target_term,
                                   pmi_filter_thresold=4,
                                   minimum_term_frequency=10,
                                   width_in_pixels=1200,
                                   alpha=0.01,
                                   max_p_val=0.05)

# Hack to display HTML with D3 in Jupyter Notebook: write the page to disk and
# embed it in an IFrame. The `with` block makes sure the file is flushed and
# closed before the IFrame reads it.
with open('subj_obj_scatter_war.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter_war.html', width = 1400, height=1000)