In [3]:
%load_ext autoreload
%autoreload 2
import os, json, re, sys, time, warnings, datetime, glob
sys.path.insert(0, '../scattertext/')

import scipy.stats as ss    
import pandas as pd
import scattertext as st
import numpy as np
import spacy

from tqdm.auto import tqdm
tqdm.pandas()

from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
nlp = spacy.load('en_core_web_sm')

# Load the Rotten Tomatoes Movie Review Data Set (Pang and Lee 2002)

We parse each review with spaCy. On a 2018 Macbook Pro, this takes about 31 seconds. The progress bar tells us it has processed documents. Reviews include polarity (positive or negative) and the name of the movie being reviewed.  4,866 reviews are retained after filtering for plot descriptions.

Data set is from http://www.cs.cornell.edu/people/pabo/movie-review-data/

References:
* Bo Pang, Lillian Lee, and Shivakumar Vaithyanathan, Thumbs up? Sentiment Classification using Machine Learning Techniques, Proceedings of EMNLP 2002.

* Bo Pang and Lillian Lee, A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts, Proceedings of ACL 2004.

In [5]:
movie_df = st.SampleCorpora.RottenTomatoes.get_data().assign(
    category = lambda df: df.category.apply(
        lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x]),
    SpacyParse=lambda df: df.text.progress_apply(nlp)
)[lambda df: df.category.isin(['Negative', 'Positive'])]

  0%|          | 0/5022 [00:00<?, ?it/s]

In [6]:
movie_df.describe()

Unnamed: 0,category,filename,text,movie_name,SpacyParse
count,4866,4866,4866,4866,4866
unique,2,156,4863,156,4866
top,Positive,subjectivity_html/subj/2002/Abandon_2002.html,What we have is a character faced with the pos...,abandon,"(Gaghan, captures, the, half, -, lit, ,, somet..."
freq,2455,35,2,35,1


## Creating a corpus object in Scattertext
- The most straightforward way to visualize documents in Scattertext is to create a corpus object from a Pandas data frame. Each row corresponds to a single document, while columns indicate other data about each document. Scattertext has numerous classes available to represent corpora, but the one we will begin with is a `ParsedCorpus` generated by the `CorpusFromParsedDocuments` factory.
- Typically, each document has a category about which Scattertext can generate various keyness metrics and visualizations. The columns containing the category and parse are passed as `parse_col` and `parsed_col`, respectively.
- The corpus object contains feature representations of each document. These either be tokens that occur in the document (using a case-insensitive search) or other features, some of which are linked to character offsets or a searchable lexicon. These non-searchable features are called "non_text" features. For now, we'll generate simple token features. 
- To turn each document into a feature vector, we will use the `FlexibleNGrams` class. Here, we output all unigrams found in the spaCy parses, filtering out blank spaces. We also exclude terms used in less than 6 documents.

In [7]:
unigram_corpus = st.CorpusFromParsedDocuments(
    movie_df,
    category_col='category',
    parsed_col='SpacyParse',
    feats_from_spacy_doc=st.FlexibleNGrams(ngram_sizes=[1])
).build().filter_out(
    lambda x: len(x.strip()) < 1
).remove_terms_used_in_less_than_num_docs(
    threshold=6
)


## Exploring the corpus

In [8]:
print("Number of terms in corpus:", unigram_corpus.get_num_terms())
print("Number of reviews in corpus:", unigram_corpus.get_num_docs())
print("Number of categories in corpus:", unigram_corpus.get_num_categories())
print("Categories in corpus:", unigram_corpus.get_categories())


Number of terms in corpus: 2083
Number of reviews in corpus: 4866
Number of categories in corpus: 2
Categories in corpus: ['Negative', 'Positive']


## Statistics about terms in the corpus 

= We can retrieve the term-document matrix as a sparse matrix using the `get_term_doc_mat` method.

In [9]:
tdm = unigram_corpus.get_term_doc_mat()
tdm.todense().shape

(4866, 2083)

## Statistics about terms in the corpus 

- We can use classes called `TermRanker`s to get term frequency statistics from each document. 
- N.B.
    - Currently, only three are implemented, and the ones which rely on document size use feature counts found in the term-document matrix. 
    - This means that if >1-grams are present or features have been removed, these will provide inaccurate estimates of document size. Use caution when using these, but they may still yield interesting results.
- The first and simplest is called the `AbsoluteFrequencyRanker`, where the sum of the number of terms in each category is returned. The `label_append` is a string concatenated to each category's name. The `label_append` is useful if other category-specific metrics will be added to the data frame.

In [21]:
st.AbsoluteFrequencyRanker(unigram_corpus).get_ranks(label_append=' Count').head()

Unnamed: 0_level_0,Negative Count,Positive Count
term,Unnamed: 1_level_1,Unnamed: 2_level_1
captures,1,5
the,2287,2351
half,27,14
-,949,902
",",2148,2466


`OncePerDocFrequencyRanker` ignores terms that occur in more than one document.

In [20]:
st.OncePerDocFrequencyRanker(unigram_corpus).get_ranks(label_append=' Documents').head()

Unnamed: 0_level_0,Negative Documents,Positive Documents
term,Unnamed: 1_level_1,Unnamed: 2_level_1
captures,1,5
the,1364,1414
half,26,13
-,634,642
",",1366,1488


`DocLengthNormalizedFrequencyRanker` weights each term by its document percentage, i.e., 

$$\mbox{weight}_{t,c} = \sum_{\mbox{doc } d \ \in \mbox{category } c} \frac{\#(t, d)}{|d|}$$

In [13]:
st.DocLengthNormalizedFrequencyRanker(unigram_corpus).get_ranks(label_append=' Percentages').head()

  return np.true_divide(self.todense(), other)


Unnamed: 0,Negative Percentages,Positive Percentages
captures,0.041667,0.298333
the,115.924483,117.177789
half,1.340761,0.711467
-,50.754314,45.370685
",",108.902535,123.622976


### Custom TermRanker

We can implement our own term ranker by creating a subclass of `scattertext.termranking.TermRanker.TermRanker` or `st.TermRanker`. 

This custom ranker will return mean tf.idf scores for each document in a category. Note that the term frequencies are square-root scaled.

The `TermRanker.get_term_doc_mat()` returns a document-row/term-column CSR sparse matrix, with values indicating the number of times a term appeared in that document.

A data frame should be returned indexed on terms and with the index named "term".

In [59]:
class MeanTFIDF(st.TermRanker):
    def get_ranks(self, label_append: str=' mean tf.idf') -> pd.DataFrame:
        sqrt_tf = np.sqrt(self.get_term_doc_mat())
        idf = np.log(self._corpus.get_num_docs()/(tdm>0).sum(axis=0).A1)
        tfidf = sqrt_tf.multiply(idf).tocsr()
        y = self._corpus.get_category_ids()        
        return pd.DataFrame({
            cat+label_append : tfidf[y == cat_i, :].mean(axis=0).A1
            for cat_i, cat 
            in enumerate(self._corpus.get_categories())
        }).assign(
            term = self._corpus.get_terms()
        ).set_index('term')
            
        

In [60]:
MeanTFIDF(unigram_corpus).get_ranks().head()

Unnamed: 0_level_0,Negative mean tf.idf,Positive mean tf.idf
term,Unnamed: 1_level_1,Unnamed: 2_level_1
captures,0.002778,0.013642
the,0.397556,0.403445
half,0.052877,0.026372
-,0.416547,0.403839
",",0.368237,0.403435


## Plotting language difference between positive and negative reviews

- Scattertext projects unigrams on a scatterplot based on the rank of their frequencies in each category. 
- The placement and color of each term are given in the `plot_df` data frame, where `Xpos` and `Ypos` indicate term coordinates ($\in [0,1]^2$).
- Scattertext attempts to label as many points as possible on the plot.
- The ColorScore indicates which terms receive bluer (more positive) or more red (more negative) point colors. 
  - The score is based on the difference between the scaled category frequency ranks.
  - Terms receiving the highest and lowest scores are labeled "Top Newer" and "Top Older."
- The unusual terms in the corpus (as judged by a set of general-purpose term frequencies) are displayed in the Characteristic column. 
- The newer function `dataframe_scattertext` function renders the scatter plot.
- Clicking on a term displays its usage in context.
- The get_heading(corpus) function returns the metadata text for each document.

In [25]:
def get_heading(corpus: st.ParsedCorpus):
    return corpus.get_df().movie_name

plot_df = st.AbsoluteFrequencyRanker(unigram_corpus).get_ranks(label_append='').assign(
    X=lambda df: df.Positive,
    Y=lambda df: df.Negative,
    PosRank = lambda df: ss.rankdata(df.X, method='dense'),
    NegRank = lambda df: ss.rankdata(df.Y, method='dense'),
    Xpos=lambda df: st.scale(df.NegRank),
    Ypos=lambda df: st.scale(df.PosRank),
    ColorScore=lambda df: st.Scalers.scale_center_zero(df.Ypos - df.Xpos),
)

line_df = pd.DataFrame({
    'x': np.arange(0, 1, 0.01),
    'y' :np.arange(0, 1, 0.01),
})


html = st.dataframe_scattertext(
    unigram_corpus,
    plot_df=plot_df,
    category='Positive', 
    category_name='Positive',
    not_category_name='Negative',
    width_in_pixels=1000, 
    ignore_categories=False,    
    metadata=get_heading,
    color_score_column='ColorScore',
    left_list_column='ColorScore',
    show_characteristic=False,
    y_label='Positive Frequency Rank',
    x_label='Negative Frequency Rank',
    tooltip_columns=['PosRank', 'NegRank'],
    header_names={'upper': 'Top Positive', 'lower': 'Top Negative'},
    line_coordinates = line_df.to_dict('records'),   
)

fn = 'movie_denserank_unigrams.html'
with open(fn, 'w') as of:
    of.write(html)


IFrame(src=fn, width = 1300, height=700)

  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


Let's perform the same action, but let's swap absolute frequencies for Mean TF.IDF.

In [64]:
def get_heading(corpus: st.ParsedCorpus):
    return corpus.get_df().movie_name

plot_df = MeanTFIDF(unigram_corpus).get_ranks(label_append='').assign(
    X=lambda df: df.Positive,
    Y=lambda df: df.Negative,
    PosRank = lambda df: ss.rankdata(df.X, method='dense'),
    NegRank = lambda df: ss.rankdata(df.Y, method='dense'),
    Xpos=lambda df: st.scale(df.NegRank),
    Ypos=lambda df: st.scale(df.PosRank),
    ColorScore=lambda df: st.Scalers.scale_center_zero(df.X - df.Y),
)

line_df = pd.DataFrame({
    'x': np.arange(0, 1, 0.01),
    'y' :np.arange(0, 1, 0.01),
})


html = st.dataframe_scattertext(
    unigram_corpus,
    plot_df=plot_df,
    category='Positive', 
    category_name='Positive',
    not_category_name='Negative',
    width_in_pixels=1000, 
    ignore_categories=False,    
    metadata=get_heading,
    color_score_column='ColorScore',
    left_list_column='ColorScore',
    show_characteristic=False,
    y_label='Positive Mean TF.IDF Rank',
    x_label='Negative Mean TF.IDF Ranak',
    tooltip_columns=['Positive', 'Negative'],
    header_names={'upper': 'Top Positive', 'lower': 'Top Negative'},
    line_coordinates = line_df.to_dict('records'),   
)

fn = 'movie_mean_tf_idf_unigrams.html'
with open(fn, 'w') as of:
    of.write(html)


IFrame(src=fn, width = 1300, height=700)

In [50]:
plot_df

Unnamed: 0_level_0,Negative,Positive,X,Y,Xpos,Ypos,ColorScore
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
captures,0.998378,1.001622,1.001622,0.998378,0.320782,0.679218,0.507622
the,0.989773,1.010227,1.010227,0.989773,0.376565,0.623435,0.548060
half,1.005489,0.994511,0.994511,1.005489,0.274688,0.725312,0.442662
-,1.001454,0.998546,0.998546,1.001454,0.300840,0.699160,0.484807
",",0.960460,1.039540,1.039540,0.960460,0.566590,0.433410,0.685813
...,...,...,...,...,...,...,...
unfaithful,1.000045,0.999955,0.999955,1.000045,0.309979,0.690021,0.499534
lyne,1.000437,0.999563,0.999563,1.000437,0.307434,0.692566,0.495434
wallace,1.001267,0.998733,0.998733,1.001267,0.302057,0.697943,0.486768
oleander,0.999200,1.000800,1.000800,0.999200,0.315452,0.684548,0.503759


# Keyness in Scattertext: Term Scorers

- Given a corpus, we can produce scores and statistics for how associated features are to a particular category.
- Subclasses of `CorpusBasedTermScorer` can produce these scores and use a `TermRanker` in the process.
- We will look at some built-in term scorers and see how to write our own.
- First, let's examine a simple term scorer, the smoothed log odds ratio, implemented in the `LogOddsRatioScorer` class.

The log odds ratio, comparing a category $a$ against a category $b$, is defined as:

$$ \mbox{Log-Odds-Ratio}(\mbox{term}_i, \mbox{category}_a, \mbox{category}_b) = \log \frac{\#_{ai}}{|a| - \#_{ai}} - \log \frac{\#_{bi}}{|b| - \#_{bi}} $$

where $\#_{\mbox{category}, \mbox{term index}}$ is the number of times a term occurred in a category, and $|\mbox{category}|$ is the number of terms in a category.

- Pseudo counts of a small $\alpha$ (often 0.01) are added to each term count to prevent undefined values when a term does not appear in a category,

- A single "positive" category and multiple "negative" categories are added to the TermScorer. By default, all categories which are not positive are considered as negative categories.

We will first view the scores as a data frame, then plot them.


In [28]:
st.LogOddsRatioScorer(
    unigram_corpus,
    constant=0.001
).set_term_ranker(
    term_ranker=st.AbsoluteFrequencyRanker
).set_categories(
    category_name='Positive',
    not_category_names=['Negative']
).get_score_df()

Unnamed: 0_level_0,Negative freq,Positive freq,Smoothed Log Odds Ratio
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
boring,17,0,-9.776894
seagal,15,0,-9.651694
benigni,14,0,-9.582684
pinocchio,13,0,-9.508559
stale,12,0,-9.428501
...,...,...,...
touching,0,12,9.357511
jones,0,12,9.357511
haynes,0,12,9.357511
riveting,0,12,9.357511


# Score-centered visualization

- Alternatively, we can visualize various keyness metrics against term frequency. 
- We can use `dataframe_scattertext`, or `produce_frequency_explorer`, which is both more conscise at the expense of being and less customizable.
- Points are colored by the term scores, with bluer points with higher scores and redder points with lower scors.
- It takes the corpus, names of the positive and negative categories, and the list of negative categories (the `not_categories` parameter).
- It includes a `minimum_term_frequency` threshold (you should typically set this to zero) and a `grey threshold` parameter, which gives an absolute value of range of scores to gery out. This is useful when the score ia p-value or z-score and a significance threshold is used.
- Examining the plot, we can see that terms closely related to films, such as actors and directors score highly. These are mostly associated with movies which had exclusively positive or negative. "Segal" for example, appeared in all reviews for Half Past Dead, a movie which was panned in our corpus.

In [42]:
html = st.produce_frequency_explorer(
    unigram_corpus,
    category='Positive',
    category_name='Positive',
    not_category_name='Negative',
    not_categories=['Negative'],
    term_scorer=st.LogOddsRatioScorer,
    term_ranker=st.AbsoluteFrequencyRanker,
    metadata=get_heading,
    minimum_term_frequency=0,
    grey_threshold=0,
)

fn = 'movies_log_odds_ratio_smoothed.html'
with open(fn, 'w') as of:
    of.write(html)


IFrame(src=fn, width = 1300, height=700)

x


## Alternative: Cohen's $d$

Cohen's $d$ computes an effect size for each term. An effect size is the number of pooled standard deviations which separate the category means.


In [None]:
html = st.produce_frequency_explorer(
    unigram_corpus,
    category='Positive',
    category_name='Negative',
    not_category_name='Negative',
    term_scorer=st.CohensD,
    metadata=get_heading,
    minimum_term_frequency=0,
    grey_threshold=0,
)

fn = 'movies_cohensd.html'
with open(fn, 'w') as of:
    of.write(html)


IFrame(src=fn, width = 1300, height=700)

In [None]:
html = st.produce_frequency_explorer(
    unigram_corpus,
    category='Positive',
    category_name='Negative',
    not_category_name='Negative',
    term_scorer=st.CohensD,
    metadata=get_heading,
    minimum_term_frequency=0,
    grey_threshold=0,
)

fn = 'movies_cohensd.html'
with open(fn, 'w') as of:
    of.write(html)


IFrame(src=fn, width = 1300, height=700)

# Available term scorers:

- `st.BetaPosterior` Beta Posterior (Bamman et al 2014) as reported by (Chang and McKeown, 2019).
- `st.RelativeEntropy` Relative Entropy (Fankhauser et al 2014)
- `st.BNSScorer` Bi-normal separation (Forman 2008)
- `st.CohensD` Cohen's d; Hedges r, p-values and z-scores available in `CohensD.get_score_df`
- `st.HedgesR` Hedge's r
- `st.LogOddsRatio` Log odds ratio
- `st.DeltaJSDivergenceScrorer` JS Divergence
- `st.CraigsZetaScorer



David Bamman, Jacob Eisenstein, and Tyler Schnoebelen.  GENDER IDENTITY AND LEXICAL VARIATION IN SOCIAL MEDIA. 2014.

Serina Chang and Kathleen McKeown. Automatically Inferring Gender Associations from Language. EMNLP 2019

Peter Fankhauser, Jorg Knappen, Elke Teich. Exploring and visualizing variation in language resources. LREC 2014.
    
George Forman. BNS feature scaling: an improved representation over tf-idf for svm text classification. CIKM 2008.

In [None]:
html = st.produce_frequency_explorer(
    unigram_corpus,
    category='Positive',
    category_name='Positive',
    not_category_name='Negative',
    term_scorer=st.DeltaJSDivergenceScorer,
    term_ranker=MeanTFIDF,
    metadata=get_heading,
    minimum_term_frequency=0,
    grey_threshold=0,
)

fn = 'movies_deltajsd.html'
with open(fn, 'w') as of:
    of.write(html)


IFrame(src=fn, width = 1300, height=700)

In [43]:
html = st.produce_frequency_explorer(
    unigram_corpus,
    category='Positive',
    category_name='Negative',
    not_category_name='Negative',
    term_scorer=st.LogOddsRatioUniformativePriorScorer,
    metadata=get_heading,
    minimum_term_frequency=0,
    grey_threshold=0,
)

fn = 'movies_lorups.html'
with open(fn, 'w') as of:
    of.write(html)


IFrame(src=fn, width = 1300, height=700)

TypeError: Can't instantiate abstract class LogOddsRatioUniformativePriorScorer with abstract methods _set_scorer_args

## Or just use the dense rank difference in the original chart



In [None]:
html = st.produce_frequency_explorer(
    unigram_corpus,
    category='Positive',
    category_name='Negative',
    not_category_name='Negative',
    term_scorer=st.RankDifferensceScorer(unigram_corpus),
    metadata=get_heading,
    minimum_term_frequency=0,
    grey_threshold=0,
)

fn = 'movies_rank_diff.html'
with open(fn, 'w') as of:
    of.write(html)


IFrame(src=fn, width = 1300, height=700)

In [None]:
movie_corpus = unigram_corpus.recategorize(unigram_corpus.get_df()['movie_name'])

dispersion = st.Dispersion(
    movie_corpus,
    use_categories=True
)
dispersion_df=dispersion.get_df().assign(
    DA=dispersion.dp(),
    X=lambda df: df.Frequency,
    Xpos=lambda df: st.Scalers.log_scale(df.X),
    Y=lambda df: df.DA,
    Ypos=lambda df: st.Scalers.scale(df.Y)
)



In [None]:
movie_corpus = unigram_corpus.recategorize(unigram_corpus.get_df()['movie_name'])

dispersion = st.Dispersion(
    movie_corpus,
    use_categories=True
)
dispersion_df=dispersion.get_df().assign(
    DA=dispersion.dp(),
    X=lambda df: df.Frequency,
    Xpos=lambda df: st.Scalers.log_scale(df.X),
    Y=lambda df: df.DA,
    Ypos=lambda df: st.Scalers.scale(df.Y),
    Expected=lambda df: st.smoothing.lowess.Lowess().fit(
        np.array([df.X.values]).T,
        df.Y.values
    ).predict(np.array([df.X.values]).T).T[0],
    Residual=lambda df: df.Ypos - st.scale(df.Y, df.Expected),
    ColorScore=lambda df: 1 #st.Scalers.scale(df.DA)
)

line_df = pd.DataFrame({
    'x': dispersion_df.Xpos.values,
    'y': dispersion_df.Expected.values,
}).sort_values(by='x')

html = st.dataframe_scattertext(
    movie_corpus,
    plot_df=dispersion_df,
    metadata=get_heading,
    ignore_categories=False,
    x_label='Log Frequency',
    y_label='DA',
    y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
    color_score_column='ColorScore',
    tooltip_columns=['Frequency', 'DA'],
    header_names={'upper': 'Top DA', 'lower': 'Bottom DA'},
    left_list_column='DA',
    show_characteristic=False,
    line_coordinates = line_df.to_dict('records')
)

fn = 'movie_dispersion.html'
open(fn, 'w').write(html)
IFrame(src=fn, width = 1300, height=700)

In [None]:

residual_df = dispersion_df.assign(
    Y = lambda df: df.Residual,
    Ypos = lambda df: st.Scalers.scale_center_zero(df.Y)
)

line_df = pd.DataFrame({
    'x': residual_df.Xpos.values,
    'y': 0.5,
}).sort_values(by='x')

html = st.dataframe_scattertext(
    unigram_corpus,
    category='Positive',
    category_name='Positive',
    not_category_name='Negative',
    plot_df=residual_df,
    metadata=get_heading,
    ignore_categories=False,
    sort_doc_labels_by_name=True,
    x_label='Log Frequency',
    y_label='Residual: DA - E-hat[DA]',
    y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
    color_score_column='ColorScore',
    tooltip_columns=['Frequency', 'Residual'],
    header_names={'upper': 'Lower than Expected', 'lower': 'More than Expected', 'right': 'Frequency'},
    left_list_column='Residual',
    right_order_column='Frequency',    
    line_coordinates = line_df.to_dict('records'),
    show_corpus_stats=False
)

fn = 'movie_dispersion_residual.html'
open(fn, 'w').write(html)
IFrame(src=fn, width = 1300, height=700)

In [None]:
dispersion_df=st.get_category_dispersion(
    corpus=unigram_corpus,
    corpus_to_parts=lambda corpus: corpus.get_df()['movie_name'],
    metric='DA',
    non_text=False
)

coordinates = st.Scalers.scale_jointly(
    x=st.Scalers.log_scale(dispersion_df.Negative_DA + 0.01), 
    y=st.Scalers.log_scale(dispersion_df.Positive_DA + 0.01)
)

dispersion_df = dispersion_df.assign(
    X=lambda df: df.Negative_DA,
    Xpos=lambda df: coordinates.x,
    Y=lambda df: df.Positive_DA,
    Ypos=lambda df: coordinates.y,
    ColorScore=lambda df: st.Scalers.scale_center_zero(df.Y-df.X),
    Frequency=lambda df: df.Positive_Frequency+df.Negative_Frequency
)

html = st.dataframe_scattertext(
    unigram_corpus,
    category='Positive',
    category_name='Positive',
    not_category_name='Negative',    
    plot_df=dispersion_df,
    metadata=get_heading,
    ignore_categories=False,
    x_label='Negative Log DA',
    y_label='Positive Log DA',
    y_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
    x_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
    color_score_column='ColorScore',
    tooltip_columns=['Positive_DA', 'Negative_DA'],
    header_names={'upper': 'Top Positive', 'lower': 'Top Negative', 'right': 'Frequency'},
    left_list_column='ColorScore',
    right_order_column='Frequency',    
)

fn = 'movie_pos_da_neg_da.html'
open(fn, 'w').write(html)
IFrame(src=fn, width = 1300, height=700)

In [None]:
positive_corpus = unigram_corpus.remove_categories(['Negative']).recategorize(
    lambda corpus: corpus.get_df()['movie_name']
)

plot_df = st.Dispersion(
    positive_corpus, use_categories=True
).get_adjusted_metric_df()

plot_df = plot_df.assign(
    X=lambda df: df.Frequency,
    Xpos=lambda df: st.Scalers.log_scale(df.X),
    Y=lambda df: df.Residual,
    Ypos=lambda df: st.Scalers.scale_center_zero_abs(df.Residual),
    ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual),
)

html = st.dataframe_scattertext(
    positive_corpus,
    plot_df=plot_df,
    metadata=get_heading,
    unified_context=True,
    ignore_categories=False,
    show_corpus_stats=False,
    x_label='Log Frequency',
    y_label='Residual DA',
    y_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
    #x_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
    color_score_column='ColorScore',
    tooltip_columns=['Frequency', 'Residual'],
    header_names={'upper': 'Top Residual', 'lower': 'Bottom Residual', 'right': 'Frequency'},
    left_list_column='ColorScore',
    right_order_column='Frequency',    
)

fn = 'movie_pos_residual.html'
open(fn, 'w').write(html)
IFrame(src=fn, width = 1300, height=700)

In [None]:
negative_corpus = unigram_corpus.remove_categories(['Positive']).recategorize(
    lambda corpus: corpus.get_df()['movie_name']
)

plot_df = st.Dispersion(
    negative_corpus, use_categories=True
).get_adjusted_metric_df()

plot_df = plot_df.assign(
    X=lambda df: df.Frequency,
    Xpos=lambda df: st.Scalers.log_scale(df.X),
    Y=lambda df: df.Residual,
    Ypos=lambda df: st.Scalers.scale_center_zero_abs(df.Residual),
    ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual),
)

html = st.dataframe_scattertext(
    negative_corpus,
    plot_df=plot_df,
    metadata=get_heading,
    unified_context=True,
    ignore_categories=False,
    show_corpus_stats=False,
    x_label='Log Frequency',
    y_label='Residual DA',
    y_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
    #x_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
    color_score_column='ColorScore',
    tooltip_columns=['Frequency', 'Residual'],
    header_names={'upper': 'Top Residual', 'lower': 'Bottom Residual', 'right': 'Frequency'},
    left_list_column='ColorScore',
    right_order_column='Frequency',    
)

fn = 'movie_neg_residual.html'
open(fn, 'w').write(html)
IFrame(src=fn, width = 1300, height=700)

In [None]:

plot_df = st.Dispersion(
    positive_corpus, use_categories=True
).get_adjusted_metric_df()

plot_df = plot_df.assign(
    X=lambda df: df.Frequency,
    Xpos=lambda df: st.Scalers.log_scale(df.X),
    Y=lambda df: df.Residual,
    Ypos=lambda df: st.Scalers.scale_center_zero_abs(df.Residual),
    ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual),
)

html = st.dataframe_scattertext(
    positive_corpus,
    plot_df=plot_df,
    metadata=get_heading,
    unified_context=True,
    ignore_categories=False,
    show_corpus_stats=False,
    x_label='Log Frequency',
    y_label='Residual DA',
    y_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
    #x_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
    color_score_column='ColorScore',
    tooltip_columns=['Frequency', 'Residual'],
    header_names={'upper': 'Top Residual', 'lower': 'Bottom Residual', 'right': 'Frequency'},
    left_list_column='ColorScore',
    right_order_column='Frequency',    
)

fn = 'movie_pos_residual.html'
open(fn, 'w').write(html)
IFrame(src=fn, width = 1300, height=700)

In [None]:
st.Scalers