In [1]:
%load_ext autoreload
%autoreload 2
import os, json, re, sys, time, warnings, datetime, glob
sys.path.insert(0, '../scattertext/')
import spacy 
import pandas as pd
import numpy as np
import scattertext as st
from tqdm.auto import tqdm
tqdm.pandas()

# `display` and `HTML` come from the public `IPython.display` module.
# The original imported them from `IPython.core.display`, an internal path
# that is deprecated for this purpose.
from IPython.display import HTML, IFrame, display

# Inject CSS so elements with class `container` use 98% of the page width.
display(HTML("<style>.container { width:98% !important; }</style>"))

# Fail early, with a readable message, if the local scattertext checkout is
# older than the version this notebook was written against.
assert st.version >= [0, 1, 15], (
    f"scattertext >= 0.1.15 is required, found {st.version}"
)

In [2]:
# Blank English pipeline with only a sentencizer component added.
# The `add_pipe` call is intentionally the last expression in the cell, so
# the notebook echoes the created component.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7fb75dadbf50>

In [3]:
# Data source: https://www.kaggle.com/datasets/gpreda/bbc-news
#
# Columns added, in this order:
#   Date    - `pubDate` parsed with pd.to_datetime
#   Parse   - `description` run through the `nlp` pipeline (with progress bar)
#   DateStr - `Date` formatted as YYYY-MM-DD
bbc_df = (
    pd.read_csv('bbc_news.csv.gz')
    .assign(Date=lambda df: pd.to_datetime(df['pubDate']))
    .assign(Parse=lambda df: df['description'].progress_apply(nlp))
    .assign(
        DateStr=lambda df: df['Date'].apply(
            lambda timestamp: timestamp.strftime("%Y-%m-%d")
        )
    )
)

  0%|          | 0/13955 [00:00<?, ?it/s]

In [4]:
# Keep rows whose DateStr sorts after '2022-02-17', then order them by Date.
# DateStr is formatted as YYYY-MM-DD, so the string comparison follows
# calendar order.
bbc_df = (
    bbc_df
    .loc[lambda df: df['DateStr'] > '2022-02-17']
    .sort_values(by='Date')
)

In [5]:
# Matches strings made up solely of ASCII letters, digits and spaces.
word_number_matcher = re.compile('^[A-Za-z0-9 ]+$')


def exclude_ngrams_which_do_not_start_and_end_with_function_words(ngram: spacy.tokens.Span) -> bool:
    """Decide whether an n-gram is flagged by the boundary-token filter.

    Only the first and last tokens of ``ngram`` are inspected (lower-cased
    and stripped). The function returns ``True`` if either of them is in
    ``st.MY_ENGLISH_STOP_WORDS`` or fails ``word_number_matcher``, and
    ``False`` otherwise.

    NOTE(review): despite the name, the code flags n-grams that *do* start or
    end with a stop word (or a non-alphanumeric token). The name is kept
    because it is referenced where the corpus is built. It is passed as
    ``exclude_ngram_filter``, so presumably ``True`` means "exclude" --
    confirm against scattertext.

    :param ngram: candidate n-gram span.
    :return: ``True`` if a boundary token is a stop word or not alphanumeric.
    """
    for boundary_token in (ngram[0], ngram[-1]):
        boundary_text = boundary_token.lower_.strip()
        if boundary_text in st.MY_ENGLISH_STOP_WORDS:
            return True
        if word_number_matcher.match(boundary_text) is None:
            return True
    return False

# Feature extractor: 1- to 5-grams, screened by the boundary-token filter
# defined above.
ngram_feature_getter = st.FlexibleNGramFeatures(
    ngram_sizes=[1, 2, 3, 4, 5],
    exclude_ngram_filter=exclude_ngrams_which_do_not_start_and_end_with_function_words,
)

# Compactors applied, in order, to the freshly built corpus.
npmi_compactor = st.NPMICompactor(
    minimum_term_count=3,
    number_terms_per_length=2000,
)
ngram_percentage_compactor = st.NgramPercentageCompactor(
    usage_portion=0.6,
)
association_compactor = st.AssociationCompactor(
    2000,
    scorer=st.DeltaJSDivergenceScorer,
    term_ranker=st.OncePerDocFrequencyRanker,
    use_non_text_features=True,
)

# One category per calendar day (DateStr). Every step works on the non-text
# (offset) features.
corpus = (
    st.OffsetCorpusFactory(
        bbc_df,
        category_col='DateStr',
        parsed_col='Parse',
        feat_and_offset_getter=ngram_feature_getter,
    )
    .build()
    .compact(compactor=npmi_compactor, non_text=True)
    .compact(ngram_percentage_compactor, non_text=True)
    # Drop single-character features.
    .filter_out(lambda x: len(x) == 1, non_text=True)
    .compact(compactor=association_compactor, non_text=True)
)

  0%|          | 0/203879 [00:00<?, ?it/s]

  0%|          | 0/203879 [00:00<?, ?it/s]

In [6]:
def get_heading(corpus):
    """Build one HTML heading per document: ``<date>: <a href=link>title</a>``.

    The link and title come from the source data and are HTML-escaped before
    being placed in the markup, so characters such as ``&``, ``<`` or ``"``
    cannot break the anchor tag or the surrounding page.

    :param corpus: object whose ``get_df()`` returns a DataFrame with
        ``Date``, ``link`` and ``title`` columns.
    :return: pandas Series of heading strings, aligned with ``get_df()``.
        Missing links or titles stay missing, so the heading for that row is
        NaN, as it was before escaping was added.
    """
    # Imported locally: this notebook rebinds the global name `html` to the
    # generated page source, so a module-level `import html` would be lost.
    from html import escape

    df = corpus.get_df()
    safe_link = df['link'].map(escape, na_action='ignore')
    safe_title = df['title'].map(escape, na_action='ignore')
    return df['Date'].astype(str) + ': <a href="' + safe_link + '">' + safe_title + '</a>'

In [7]:
# Collapse the per-day categories into two periods.
# Category names are YYYY-MM-DD strings, so anything that sorts after
# '2022-09-00' is dated 1 September 2022 or later.
period_labels = np.array(['Pre-Sept 2022', 'Sept 2022 and after'])
is_sept_or_later = np.array(corpus.get_category_names_by_row() > '2022-09-00')

# False -> index 0 ('Pre-Sept 2022'), True -> index 1 ('Sept 2022 and after').
divided_corpus = corpus.recategorize(period_labels[is_sept_or_later.astype(int)])


# Scattertext explorer contrasting the two periods, using the offset
# (non-text) features and dense-rank scaling.
html = st.produce_scattertext_explorer(
    divided_corpus,
    category='Pre-Sept 2022',
    category_name='Pre-Sept 2022',
    not_category_name='Sept 2022 and after',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    use_offsets=True,
    use_non_text_features=True,
    width_in_pixels=1000,
    metadata=get_heading,
    transform=st.Scalers.dense_rank
)
fn = 'bbc_divided.html'
# Write as UTF-8 explicitly: the page embeds article text, and the platform
# default encoding is not guaranteed to represent every character in it.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
IFrame(src=fn, width=1300, height=700)


In [8]:

# Frequency explorer for the same two-period split, with terms scored by
# st.DeltaJSDivergence().
html = st.produce_frequency_explorer(
    divided_corpus,
    category='Pre-Sept 2022',
    category_name='Pre-Sept 2022',
    not_category_name='Sept 2022 and after',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    use_offsets=True,
    use_non_text_features=True,
    width_in_pixels=1000,
    metadata=get_heading,
    term_scorer=st.DeltaJSDivergence(),
    transform=st.Scalers.dense_rank
)
fn = 'bbc_divided_jsd.html'
# Write as UTF-8 explicitly: the page embeds article text, and the platform
# default encoding is not guaranteed to represent every character in it.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
IFrame(src=fn, width=1300, height=700)


In [9]:

# Frequency explorer for the same two-period split, with terms scored by
# st.LogOddsRatioSmoothed().
html = st.produce_frequency_explorer(
    divided_corpus,
    category='Pre-Sept 2022',
    category_name='Pre-Sept 2022',
    not_category_name='Sept 2022 and after',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    use_offsets=True,
    use_non_text_features=True,
    width_in_pixels=1000,
    metadata=get_heading,
    term_scorer=st.LogOddsRatioSmoothed(),
    transform=st.Scalers.dense_rank
)
fn = 'bbc_divided_lor.html'
# Write as UTF-8 explicitly: the page embeds article text, and the platform
# default encoding is not guaranteed to represent every character in it.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
IFrame(src=fn, width=1300, height=700)


In [10]:
# `perf_counter` is used instead of `from time import time`: that import
# rebinds the name `time`, hiding the `time` module imported in the first
# cell. perf_counter is also a monotonic clock meant for measuring durations.
from time import perf_counter

t0 = perf_counter()
# Per-day categories in chronological order (names are YYYY-MM-DD strings).
category_order = sorted(corpus.get_categories())
html = st.produce_scattertext_table(
    corpus=corpus,
    category_order=category_order,
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=1000,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.DispersionPlotSettings(
        category_order=category_order,
        metric='DA',
        use_residual=False,
    )
)
# Elapsed seconds for building the table.
print(perf_counter() - t0)
fn = 'bbc_dispersion_table.html'
# Write as UTF-8 explicitly: the page embeds article text, and the platform
# default encoding is not guaranteed to represent every character in it.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
IFrame(src=fn, width=1300, height=700)


  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


120.16814517974854


In [11]:
# Rank embedder built on the Delta-JS-divergence scorer for this corpus.
rank_embedder = st.RankEmbedder(
    term_scorer=st.DeltaJSDivergenceScorer(corpus),
    rank_threshold=10,
)

grouper = st.CharacteristicGrouper(
    corpus,
    non_text=True,
    rank_embedder=rank_embedder,
    window_size=1,
    to_text=' to ',
)

# Per-day category names, sorted (YYYY-MM-DD strings sort chronologically).
sorted_categories = sorted(corpus.get_categories())

# Ask the grouper for 5 splits over the ordered days. It returns the new
# per-document heading categories and the order in which to show them.
heading_categories, heading_category_order = grouper.get_new_doc_categories(
    number_of_splits=5,
    category_order=sorted_categories,
    verbose=True,
)

In [22]:
# `perf_counter` is used instead of `from time import time`: that import
# rebinds the name `time`, hiding the `time` module imported in the first
# cell. perf_counter is also a monotonic clock meant for measuring durations.
from time import perf_counter

t0 = perf_counter()
# Per-day categories in chronological order, computed once and shared by the
# table and its trend plot.
category_order = sorted(corpus.get_categories())
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=category_order,
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=700,
    plot_height=400,
    top_terms_length=8,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.DispersionPlotSettings(
        category_order=category_order,
        metric='DA',
        use_residual=False,
    )
)
# Elapsed seconds for building the table.
print(perf_counter() - t0)
fn = 'bbc_grouped_dispersion_table.html'
# Write as UTF-8 explicitly: the page embeds article text, and the platform
# default encoding is not guaranteed to represent every character in it.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
IFrame(src=fn, width=1300, height=700)



  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


3.9367170333862305


In [18]:
# `perf_counter` is used instead of `from time import time`: that import
# rebinds the name `time`, hiding the `time` module imported in the first
# cell. perf_counter is also a monotonic clock meant for measuring durations.
from time import perf_counter

t0 = perf_counter()
# Per-day categories in chronological order, computed once and shared by the
# table and its trend plot.
category_order = sorted(corpus.get_categories())
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=category_order,
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=700,
    plot_height=400,
    top_terms_length=8,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    # Same dispersion plot as the previous table, but with use_residual=True.
    trend_plot_settings=st.DispersionPlotSettings(
        category_order=category_order,
        metric='DA',
        use_residual=True,
    )
)
# Elapsed seconds for building the table.
print(perf_counter() - t0)
fn = 'bbc_grouped_residual_dispersion_table.html'
# Write as UTF-8 explicitly: the page embeds article text, and the platform
# default encoding is not guaranteed to represent every character in it.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
IFrame(src=fn, width=1300, height=700)


  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


4.476346015930176


In [25]:
# `perf_counter` is used instead of `from time import time`: that import
# rebinds the name `time`, hiding the `time` module imported in the first
# cell. perf_counter is also a monotonic clock meant for measuring durations.
from time import perf_counter

t0 = perf_counter()
# Per-day categories in chronological order, computed once and shared by the
# table and its trend plot.
category_order = sorted(corpus.get_categories())
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=category_order,
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=700,
    plot_height=400,
    top_terms_length=8,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    # Time plot settings, with the 'DA' dispersion metric dense-rank scaled.
    trend_plot_settings=st.TimePlotSettings(
        category_order=category_order,
        dispersion_metric='DA',
        dispersion_scaler=st.Scalers.dense_rank
    )
)
# Elapsed seconds for building the table.
print(perf_counter() - t0)
fn = 'bbc_grouped_time_table.html'
# Write as UTF-8 explicitly: the page embeds article text, and the platform
# default encoding is not guaranteed to represent every character in it.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
IFrame(src=fn, width=1300, height=700)



2.7514150142669678


In [26]:
# `perf_counter` is used instead of `from time import time`: that import
# rebinds the name `time`, hiding the `time` module imported in the first
# cell. perf_counter is also a monotonic clock meant for measuring durations.
from time import perf_counter

t0 = perf_counter()
# Per-day categories in chronological order, computed once and shared by the
# table and its trend plot.
category_order = sorted(corpus.get_categories())
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=category_order,
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    # Wider variant of the time table (plot_width 1000 instead of 700).
    plot_width=1000,
    plot_height=400,
    top_terms_length=8,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.TimePlotSettings(
        category_order=category_order,
        dispersion_metric='DA',
        dispersion_scaler=st.Scalers.dense_rank
    )
)
# Elapsed seconds for building the table.
print(perf_counter() - t0)
fn = 'bbc_grouped_time_table_bigger.html'
# Write as UTF-8 explicitly: the page embeds article text, and the platform
# default encoding is not guaranteed to represent every character in it.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
IFrame(src=fn, width=1300, height=700)


2.5758211612701416


In [24]:
# `perf_counter` is used instead of `from time import time`: that import
# rebinds the name `time`, hiding the `time` module imported in the first
# cell. perf_counter is also a monotonic clock meant for measuring durations.
from time import perf_counter

t0 = perf_counter()
# Per-day categories in chronological order, computed once and shared by the
# table and its trend plot.
category_order = sorted(corpus.get_categories())
html = st.produce_scattertext_table(
    corpus=corpus,
    heading_categories = heading_categories,
    heading_category_order = heading_category_order,
    category_order=category_order,
    all_category_scorer=lambda c: st.AllCategoryTermScorer(c, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = lambda c: c.get_df()['Date'].astype(str),
    ignore_categories=False,
    plot_width=1000,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    trend_plot_settings=st.CorrelationPlotSettings(
        category_order=category_order
    )
)
# Elapsed seconds for building the table.
print(perf_counter() - t0)
fn = 'bbc_grouped_correlation_table_bigger.html'
# Write as UTF-8 explicitly: the page embeds article text, and the platform
# default encoding is not guaranteed to represent every character in it.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
# Show the result inline, as every other table cell in this notebook does.
IFrame(src=fn, width=1300, height=700)

SyntaxError: unexpected EOF while parsing (<ipython-input-24-21f217abd976>, line 22)