In [540]:
%load_ext autoreload
%autoreload 2
import os, json, re, sys, time, warnings, datetime, glob
#sys.path.insert(0, '../scattertext/')
import spacy 
import pandas as pd
import numpy as np
import scattertext as st

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [541]:
nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7fa9fad22500>

In [542]:
# From https://www.kaggle.com/datasets/gpreda/bbc-news
bbc_df = pd.read_csv('bbc_news.csv.gz').assign(
    Date = lambda df: pd.to_datetime(df.pubDate),
    Parse = lambda df: df.description.progress_apply(nlp),
    DateStr=lambda df: df.Date.apply(lambda x: x.strftime("%Y-%m-%d")),    
)

  0%|          | 0/13955 [00:00<?, ?it/s]

In [543]:
bbc_df = bbc_df[lambda df: df.DateStr > '2022-02-17'].sort_values(by='Date')

In [544]:
word_number_matcher = re.compile('^[A-Za-z0-9 ]+$')

def exclude_ngrams_which_do_not_start_and_end_with_function_words(ngram: spacy.tokens.Span) -> bool:
    return any([ngram[0].lower_.strip() in st.MY_ENGLISH_STOP_WORDS,
                ngram[-1].lower_.strip() in st.MY_ENGLISH_STOP_WORDS,
                word_number_matcher.match(ngram[0].lower_.strip()) is None,
                word_number_matcher.match(ngram[-1].lower_.strip()) is None])

corpus = st.OffsetCorpusFactory(
    bbc_df,
    category_col='DateStr',
    parsed_col='Parse',
    feat_and_offset_getter=st.FlexibleNGramFeatures(
        ngram_sizes=[1, 2, 3, 4, 5],
        exclude_ngram_filter = exclude_ngrams_which_do_not_start_and_end_with_function_words
    )
).build().compact(
    compactor=st.NPMICompactor(
        minimum_term_count = 3,
        number_terms_per_length = 2000,
    ),
    non_text=True
).compact(
    st.NgramPercentageCompactor(
        usage_portion=0.6,
    ),
    non_text=True
).filter_out(
    lambda x: len(x) == 1,
    non_text=True
).compact(
    compactor=st.AssociationCompactor(
        2000,
        scorer=st.DeltaJSDivergenceScorer,
        term_ranker=st.OncePerDocFrequencyRanker,
        use_non_text_features=True,
    ),
    non_text=True
)

  0%|          | 0/203879 [00:00<?, ?it/s]

  0%|          | 0/203879 [00:00<?, ?it/s]

In [559]:
def get_heading(corpus):
    df = corpus.get_df()
    return df['Date'].astype(str) + ': <a href="' + df.link + '">' + df.title + '</a>'

In [560]:
divided_corpus = corpus.recategorize(
    np.array(['Pre-Sept 2022', 'Sept 2022 and after'])[
        np.array(corpus.get_category_names_by_row() > '2022-09-00').astype(int)])


html = st.produce_scattertext_explorer(
    divided_corpus,
    category='Pre-Sept 2022', 
    category_name='Pre-Sept 2022', 
    not_category_name='Sept 2022 and after',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    use_offsets=True,
    use_non_text_features=True,
    width_in_pixels=1000, 
    metadata=get_heading,
    transform=st.Scalers.dense_rank
)
fn = 'bbc_divided.html'
open(fn, 'w').write(html)


8373763

In [558]:
from time import time
t0 = time()
html = st.produce_scattertext_table(
    corpus=corpus,
    #heading_categories = heading_categories,
    #heading_category_order = heading_category_order,
    category_order=list(sorted(corpus.get_categories())),
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=1000,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.DispersionPlotSettings(
        category_order=list(sorted(corpus.get_categories())),
        metric='DP',
        use_residual=True,
        #dispersion_scaler=st.Scalers.log_scale_with_negatives
    )
)
print(time() - t0)
fn = 'demo_dispersion_table.html'
with open(fn, 'w') as of:
    of.write(html)



  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


103.74240303039551


In [549]:
from time import time
t0 = time()
category_order = list(sorted(corpus.get_categories()))
html = st.produce_scattertext_table(
    corpus=corpus,
    category_order=category_order,
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=1000,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.DispersionPlotSettings(
        category_order=category_order,
        metric='DA',
        use_residual=False,
    )
)
print(time() - t0)
fn = 'bbc_dispersion_table.html'
with open(fn, 'w') as of:
    of.write(html)



  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


120.83623814582825


In [561]:
grouper = st.CharacteristicGrouper(
    corpus, 
    non_text=True,
    rank_embedder=st.RankEmbedder(
        term_scorer=st.DeltaJSDivergenceScorer(corpus), 
        rank_threshold=10
    ),
    window_size=1,
    to_text=' to '
)

heading_categories, heading_category_order = grouper.get_new_doc_categories(
    number_of_splits=5,
    category_order=list(sorted(corpus.get_categories())),
    verbose=True
)

  0%|          | 0/360 [00:00<?, ?it/s]

In [563]:
from time import time
t0 = time()
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=list(sorted(corpus.get_categories())),
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=700,
    plot_height=400,
    top_terms_length=5,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.DispersionPlotSettings(
        category_order=list(sorted(corpus.get_categories())),
        metric='DA',
        use_residual=False,
    )
)
print(time() - t0)
fn = 'bbc_grouped_dispersion_table.html'
with open(fn, 'w') as of:
    of.write(html)



  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


3.5177550315856934


In [554]:
from time import time
t0 = time()
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=list(sorted(corpus.get_categories())),
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=700,
    plot_height=400,
    top_terms_length=8,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.DispersionPlotSettings(
        category_order=list(sorted(corpus.get_categories())),
        metric='DA',
        use_residual=True,
    )
)
print(time() - t0)
fn = 'bbc_grouped_residual_dispersion_table.html'
with open(fn, 'w') as of:
    of.write(html)

  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


4.820051908493042


In [564]:
from time import time
t0 = time()
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=list(sorted(corpus.get_categories())),
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=700,
    plot_height=400,
    top_terms_length=8,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.TimePlotSettings(
        category_order=list(sorted(corpus.get_categories())),
        y_axis_metric='DA',
        dispersion_scaler=st.Scalers.dense_rank
        #dispersion_scaler=st.Scalers.log_scale_with_negatives
    )
)
print(time() - t0)
fn = 'bbc_grouped_time_table.html'
with open(fn, 'w') as of:
    of.write(html)



2.5676259994506836


In [556]:
from time import time
t0 = time()
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=list(sorted(corpus.get_categories())),
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=1000,
    plot_height=400,
    top_terms_length=8,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.TimePlotSettings(
        category_order=list(sorted(corpus.get_categories())),
        y_axis_metric='DA',
        dispersion_scaler=st.Scalers.dense_rank
        #dispersion_scaler=st.Scalers.log_scale_with_negatives
    )
)
print(time() - t0)
fn = 'bbc_grouped_time_table_bigger.html'
with open(fn, 'w') as of:
    of.write(html)

4.784108638763428


In [557]:
from time import time
t0 = time()
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=list(sorted(corpus.get_categories())),
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=1000,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.CorrelationPlotSettings(
        category_order=list(sorted(corpus.get_categories()))
    )
)
print(time() - t0)
fn = 'bbc_grouped_correlation_table_bigger.html'
with open(fn, 'w') as of:
    of.write(html

9.761425971984863
