In [11]:
import scattertext as st
import tarfile, urllib, io
import pandas as pd
from IPython.display import IFrame
from IPython.core.display import display, HTML
import spacy
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
'''From Bo Pang's website: https://www.cs.cornell.edu/people/pabo/movie-review-data/

Data from:
A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization 
Based on Minimum Cuts'', Proceedings of the ACL, 2004
'''
SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'
data = io.BytesIO(urllib.request.urlopen(SUBJECTIVITY_URL).read())
tarball = tarfile.open(fileobj=data, mode = 'r:gz')
readme = tarball.extractfile('subjdata.README.1.0').read()
quote = tarball.extractfile('quote.tok.gt9.5000').read()
plot = tarball.extractfile('plot.tok.gt9.5000').read()

In [3]:
# Examples of subjective sentences in corpus
quote.decode('utf-8', errors='ignore').split('\n')[:3]

['smart and alert , thirteen conversations about one thing is a small gem . ',
 'color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . ',
 'it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . ']

In [65]:
'''Construct subjective vs. objective pandas dataframe, 
treating review quotes as subjective, and plot points as objective.  
Use spaCy to parse documents before putting them into the dataframe.

Print out the first line.
'''
nlp = spacy.en.English()
df = pd.DataFrame(
    [{'text': nlp(text.strip()), 'label': 'subjective'} 
     for text 
     in quote.decode('utf-8', errors='ignore').split('\n') 
     if text.strip()] 
    + [{'text': nlp(text.strip()), 'label': 'objective'} for text 
       in plot.decode('utf-8', errors='ignore').split('\n') 
       if text.strip()]
)
df.iloc[0]

label                                           subjective
text     (smart, and, alert, ,, thirteen, conversations...
Name: 0, dtype: object

In [66]:
'''Look at the size of the corpus'''
print("Number documents:", len(df))
print()
print("Subjective vs. objective document counts:")
print(df['label'].value_counts())
print()
print("Subjective vs. objective word counts:")
df.groupby('label').apply(lambda subdf: subdf['text'].apply(len).sum())



Number documents: 10000

Subjective vs. objective document counts:
subjective    5000
objective     5000
Name: label, dtype: int64

Subjective vs. objective word counts:


label
objective     130336
subjective    121183
dtype: int64

In [67]:
'''Convert Pandas dataframe to a term-document matrix, indicating
the category column is "label" and the text column name is "text".'''

corpus = st.CorpusFromParsedDocuments(df, category_col = 'label', parsed_col = 'text').build()

In [68]:
'''
Filter out bigrams with PMI < 3, and unigrams and bigrams that occur less than 20 times.  
The variable html is a string containing the HTML that makes up the scattertext visualization
'''
html = st.produce_scattertext_explorer(corpus, 
                                       category='subjective', 
                                       category_name='Subjective', 
                                       not_category_name='Objective',
                                       protocol='https',
                                       pmi_filter_thresold=3,
                                       minimum_term_frequency=20,
                                       width_in_pixels=1000)

# Hack to display HTML with D3 in Jupyter Notebook
open('subj_obj_scatter.html', 'wb').write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter.html', width = 1200, height=1000)

In [72]:
''' Display top 20 terms that are characteristic of a subjective document-label (using Scaled F-Score) and their frequencies.
'''
term_freq_df = corpus.get_term_freq_df()
corpus.get_scaled_f_scores('subjective')#, scaler_algo='percentile')
term_freq_df['Subjective Score'] = corpus.get_scaled_f_scores('subjective', scaler_algo='percentile')
term_freq_df = term_freq_df.sort_values(by='Subjective Score', ascending=False)
term_freq_df.iloc[:20]

Unnamed: 0_level_0,objective freq,subjective freq,Subjective Score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
movie that,0,73,0.802352
it does,0,51,0.77937
entertaining,2,73,0.771975
film 's,2,69,0.767833
but it,6,156,0.76711
i,13,273,0.756298
interesting,3,70,0.752625
film that,4,77,0.745429
performances,5,89,0.743702
of its,6,103,0.742833


In [78]:
''' Display top 20 terms that are characteristic of being related to movies.
'''
term_freq_df = corpus.get_scaled_f_scores_vs_background()
term_freq_df.iloc[:20]


Unnamed: 0,corpus,background,Scaled f-score
nete,12.0,0.0,0.000113
roxie,19.0,211370.0,8.9e-05
rupi,9.0,0.0,8.4e-05
clichs,9.0,0.0,8.4e-05
stifler,10.0,34086.0,8.1e-05
huppert,10.0,69896.0,7.1e-05
melodrama,22.0,422158.0,6.9e-05
forgettable,13.0,170591.0,6.8e-05
devdas,9.0,56498.0,6.7e-05
cinematic,49.0,1255895.0,6.7e-05
