In [99]:
import os
import sys
import re

sys.path.insert(0, '../scattertext/')
%load_ext autoreload
%autoreload 2

import pandas as pd
import scattertext as st
import numpy as np
import spacy

from typing import List, Dict, Optional, Union, Tuple

from tqdm.auto import tqdm
tqdm.pandas()

from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

assert st.version >= [0, 1, 17] 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load spaCy's small English pipeline; it is used below to parse whole books and segments.
nlp = spacy.load('en_core_web_sm')

## Arthur Conan Doyle Corpus

Theme corpus of the Using and Developing Software for Keyness Analysis workshop.


In [5]:
def _read_book_text(idno: str, corpus_dir: str = 'doyle/corpus') -> str:
    """Return the full text of the book stored at `<corpus_dir>/<idno>.txt`.

    The file is opened in a `with` block so its handle is closed as soon as
    the text has been read.
    """
    with open(os.path.join(corpus_dir, idno + '.txt')) as book_file:
        return book_file.read()


# One row per book: metadata from the TSV, plus the raw text (Text) and the
# spaCy parse of the whole book (Parse).
doyle_whole_df = pd.read_csv('doyle/metadata.csv', sep='\t').assign(
    Text = lambda df: df.idno.apply(_read_book_text),
    Parse = lambda df: df.Text.progress_apply(nlp)
)

  0%|          | 0/12 [00:00<?, ?it/s]

In [6]:
# Show the key metadata columns for each book, ordered by publication year.
doyle_whole_df[['title', 'pubyear', 'subgenre', 'narration']].sort_values(by='pubyear')

Unnamed: 0,title,pubyear,subgenre,narration
0,StudyScarlet,1887,detective,homodiegetic
8,MysteryCloomber,1889,other,homodiegetic
1,SignFour,1890,detective,homodiegetic
9,FirmGirdlestone,1890,other,heterodiegetic
4,WhiteCompany,1891,historical,heterodiegetic
6,RafflesHaw,1891,horror,homodiegetic
5,Refugees,1893,historical,heterodiegetic
7,Parasite,1894,horror,homodiegetic
3,HoundBaskervilles,1902,detective,homodiegetic
10,LostWorld,1912,adventure,homodiegetic


## Task 1: explore summarizing Study Scarlet 
## Task 2: understand how the style of Doyle's writing changed

Before beginning task 1, let's break up the books into chunks at most 500 words long. We'll ensure that chunks obey sentence boundaries so we can properly part-of-speech tag all text in each chunk.

To do this, we use the `SentenceSequenceSegmenter` class.

The chunking takes 1 minute 30 seconds on my 2018 MacBook Pro.

Note that we include linguistic processing in our spaCy model (i.e., we are using "en_core_web_sm" instead of a "blank" model).

In [7]:
# Split every parsed book (the 'Parse' column) into segments using
# SentenceSequenceSegmenter with segment_length=500, then re-parse each
# segment's Text with spaCy so every segment row carries its own Parse.
doyle_segment_df = st.SentenceSequenceSegmenter(
    segment_length=500
).whole_df_to_segmented_df(
    df=doyle_whole_df,
    parsed_col='Parse',
    verbose=True
).assign(
    Parse = lambda df: df.Text.progress_apply(nlp)
)



  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/1873 [00:00<?, ?it/s]

We merge back in metadata about the other ACD books to the data frame.

In [9]:
# Step 1: attach book-level metadata to every segment. Each segment's DocIdx
# is matched against the row index of doyle_whole_df.
doyle_segment_augmented_df = doyle_segment_df.merge(
    doyle_whole_df[['idno', 'author-name', 'author-gender', 'title',
                    'pubyear', 'genre', 'subgenre', 'detective', 'horror', 'historical',
                    'adventure', 'narration', 'availability', 'decade']],
    left_on='DocIdx',
    right_index=True,
)

# Step 2: add NumSegments (the largest SegmentIdx within each title) and label
# each segment by which third of its book it falls in.
doyle_segment_augmented_df = doyle_segment_augmented_df.merge(
    doyle_segment_augmented_df.groupby('title').SegmentIdx.max().to_frame('NumSegments'),
    left_on='title',
    right_index=True
).assign(
    # Relative position below 1/3 is 'First', below 2/3 is 'Second', anything
    # else (including an undefined ratio) is 'Third'.
    Third = lambda df: np.select(
        [df.SegmentIdx / df.NumSegments < 1/3, df.SegmentIdx / df.NumSegments < 2/3],
        ['First', 'Second'],
        default='Third'
    )
)

We'll write crude HTML versions of each book, with chunk boundaries marked, so we can link to them from the HTML files of our visualizations.

In [10]:
# Write one crude HTML file per book into book_html/. Each segment gets a named
# anchor ("section<SegmentIdx>") so the visualizations can deep-link to it.
from html import escape as html_escape  # aliased: `html` is reused as a variable name later on

# Portable replacement for `mkdir -p`.
os.makedirs('book_html', exist_ok=True)
for title, book_df in doyle_segment_augmented_df.groupby('title'):
    book_fn = f'book_html/{title}.html'
    # Escape the book text so literal '<', '>' and '&' characters cannot be
    # interpreted as markup by the browser.
    sections = [
        f'<h3><a name="section{segment_idx}">Section {segment_idx}</a></h3>'
        f'<div style="white-space: pre-wrap;">{html_escape(str(parse), quote=False)}</div>'
        for segment_idx, parse in zip(book_df.SegmentIdx, book_df.Parse)
    ]
    # Write UTF-8 explicitly and declare it, so the result does not depend on
    # the platform's default encoding.
    with open(book_fn, 'w', encoding='utf-8') as of:
        of.write('<meta charset="utf-8">'
                 + f'<h2>Title: {html_escape(str(title))}</h2>'
                 + '\n'.join(sections))

## Offset-based feature identification

As opposed to the unigram features of Part 1, we will experiment with n-gram features when studying Doyle.

We create an `OffsetCorpus`, which can include character offsets that indicate the location of features in the corpus. Like the lexicon memberships we saw in Part 1, these are considered "non-text" features.

The `OffsetCorpusFactory` is used to build an offset corpus.

A `FeatAndOffsetGetter` descendent, `FlexibleNGramFeatures`, is used to build these features. The class creates n-grams of the specified number of tokens from a document. It does not break sentence boundaries when extracting features.
- The n-gram sizes are given in the `ngram_sizes` parameter. 
- Tokens are excluded from n-gram construction if they fail the `validate_token` filter. Since the start and end character offsets will be used to represent the found n-grams, filtered tokens can still be parts of these n-grams, though they will neither be part of the name of the n-gram nor count toward its size.
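- For each n-gram, a spaCy span is created, and the function `exclude_ngram_filter` is run to determine if the n-gram should be discarded. Here, we are excluding n-grams that start or end with a function word (a stop word, a non-alphanumeric token, or a token tagged with a closed-class part of speech), so only n-grams that begin and end with content words are kept.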

After the corpus creation, there were 21,486 features created (see the output of the feature extraction cell below). As a rule of thumb, visualizing more than 4,000 features will result in a very long load time, and about 2,000 or under is ideal. Moreover, having multiple parts of a collocation present can serve as a distraction when examining the associations of these features.

The next cells involve reducing this feature count.

In [51]:
# Matches strings made up solely of ASCII letters and digits.
word_number_matcher = re.compile('^[A-Za-z0-9]+$')

def exclude_ngrams_which_do_not_start_and_end_with_function_words(ngram: spacy.tokens.Span) -> bool:
    """Return True if `ngram` should be discarded.

    Despite the name, an n-gram is excluded when its first or last token
    - is in st.MY_ENGLISH_STOP_WORDS (after lowercasing and stripping),
    - is not purely alphanumeric, or
    - has one of the part-of-speech tags listed below.
    Only n-grams whose two edge tokens pass all three checks are kept.
    """
    excluded_edge_pos = {"DET", "AUX", "ADP", "PRON", "PUNCT", 'CCONJ', 'PART'}
    for edge_token in (ngram[0], ngram[-1]):
        edge_text = edge_token.lower_.strip()
        if edge_text in st.MY_ENGLISH_STOP_WORDS:
            return True
        if word_number_matcher.match(edge_text) is None:
            return True
        if edge_token.pos_ in excluded_edge_pos:
            return True
    return False

# Feature extractor for 1- to 5-token n-grams, named by their lowercased tokens.
# Tokens whose stripped lowercase form is empty fail validate_token, and
# n-grams flagged by the exclude filter above are dropped.
offset_feature_getter = st.FlexibleNGramFeatures(
    ngram_sizes=[1, 2, 3, 4, 5],
    exclude_ngram_filter = exclude_ngrams_which_do_not_start_and_end_with_function_words,
    text_from_token = lambda tok: tok.lower_,
    validate_token = lambda tok: tok.lower_.strip() != ''
)


In [53]:
# Try the extractor on a toy sentence; each result is (feature, [count, [(start, end), ...]]).
offset_feature_getter.get_metadata_offsets(nlp("This is a very big test and Joe Biden is presidenet."))

[('big', [1, [(15, 18)]]),
 ('test', [1, [(19, 23)]]),
 ('joe', [1, [(28, 31)]]),
 ('biden', [1, [(32, 37)]]),
 ('presidenet', [1, [(41, 51)]]),
 ('big test', [1, [(15, 23)]]),
 ('joe biden', [1, [(28, 37)]]),
 ('test and joe', [1, [(19, 31)]]),
 ('biden is presidenet', [1, [(32, 51)]]),
 ('big test and joe', [1, [(15, 31)]]),
 ('test and joe biden', [1, [(19, 37)]]),
 ('joe biden is presidenet', [1, [(28, 51)]]),
 ('big test and joe biden', [1, [(15, 37)]])]

# Sidebar: making custom feature representations

This is implemented as a subclass of `FeatAndOffsetGetter`

The class will contain a method `get_metadata_offsets`, which returns a list of tuples formatted as: 

```
[
    (Feature Name 1, [Count of occurrences, [
        (Occurrence 1 start character offset, Occurrence 1 end character offset),
        ...
    ]]),
    ...
]
```

This allows you to use feature names that may not appear in the text (lemmata, parts-of-speech, etc.) but are linked to examples. Note that this structure will be stored in memory when the visualization is loaded. 

In [104]:
class RegexFeatAndOffsetGetter(st.FeatAndOffsetGetter):
    '''Feature getter which counts regular-expression matches under a label.

    Every match of a labeled pattern in the document text counts as one
    occurrence of that label, and the character span of the match is recorded.
    '''

    def __init__(self, regex_dict: Dict[str, str]):
        r'''
        :param regex_dict: dict
            This should be a dictionary mapping a label to the text of a regular expression
            e.g., {'AorB': r'(a|b)', 'Number': r'\d+'}
            Patterns are compiled case-insensitively and in multiline mode.
        '''
        self.labeled_regexes = [[k, re.compile(v, re.I | re.M)] for k, v in regex_dict.items()]

    def get_metadata_offsets(
        self,
        doc: spacy.tokens.doc.Doc
    ) -> List[Tuple[str, List[Union[int, List[Tuple[int, int]]]]]]:
        '''
        :param doc: spacy.tokens.doc.Doc
            Parsed document; the patterns are matched against str(doc).
        :return: list of (label, [match count, [(start, end), ...]]) tuples.
            Labels with no match in the document are omitted.
        '''
        text = str(doc)
        offset_tokens = {}
        for label, regex in self.labeled_regexes:
            for match in regex.finditer(text):
                # The entry is created on first match, so unmatched labels never appear.
                token_stats = offset_tokens.setdefault(label, [0, []])
                token_stats[0] += 1
                token_stats[1].append(match.span())
        return list(offset_tokens.items())

In [105]:
# Both names are counted under the single 'Detective' label, each with its character span.
RegexFeatAndOffsetGetter({'Detective':'(watson|sherlock)'}).get_metadata_offsets(
    nlp("watson told sherlock that he was a being a little pedantic.")
)

[('Detective', [2, [(0, 6), (12, 20)]])]

## Running the feature extractor

In [106]:
# Build an offset corpus from the StudyScarlet segments only. Each segment
# index is its own category, and features come from offset_feature_getter.
studyscarlet_corpus = st.OffsetCorpusFactory(
    doyle_segment_augmented_df[lambda df: df.title == 'StudyScarlet'],
    category_col='SegmentIdx',
    parsed_col='Parse',
    feat_and_offset_getter=offset_feature_getter
).build(
    show_progress=True
)

# Report the number of non-text (offset) features before any compaction.
print('# features in after initial creation', studyscarlet_corpus.get_num_terms(non_text=True))


  0%|          | 0/99 [00:00<?, ?it/s]

# features in after initial creation 21486


Next, let's apply NPMI (Bouma 2009) scoring to all n-grams and, for each token length, keep the top 3000 n-grams. NPMI stands for normalized pointwise mutual information.

Where PMI is the log ratio of a whole n-gram probability to the product of its component unigram probabilities, NPMI scales this ratio by the negative log probability of the entire n-gram. 

NPMI has the effect of ranking more frequent collocation candidates a little higher than PMI. 

Instead of setting an NPMI threshold, we select the K highest scoring n-grams of each length (other than 1), which occur at least twice. We will then go on to remove n-grams that are mostly used as part of a longer retained n-gram.

Bouma, Gerlof. Normalized (Pointwise) Mutual Information in Collocation Extraction. GSCL. 2009. 

In [107]:
# Compact the non-text features with NPMICompactor, configured with a minimum
# term count of 2 and 3000 terms per n-gram length.
npmi_studyscarlet_corpus = studyscarlet_corpus.compact(
    compactor=st.NPMICompactor(
        minimum_term_count = 2,
        number_terms_per_length = 3000,        
        show_progress=True,
    ),
    non_text=True
)
# Report how many non-text features survive the NPMI step.
print('# features after NPMI compaction', npmi_studyscarlet_corpus.get_num_terms(non_text=True))

  0%|          | 0/16284 [00:00<?, ?it/s]

  0%|          | 0/16284 [00:00<?, ?it/s]

# features after NPMI compaction 2600


We then remove features which are either one character long or where 60% of their usages are in the same longer (token-wise) phrase.

In [108]:
# Two further reductions of the non-text features:
#   1. NgramPercentageCompactor with usage_portion=0.6,
#   2. filter_out drops every feature whose name is exactly one character long.
filtered_npmi_studyscarlet_corpus = npmi_studyscarlet_corpus.compact(
    st.NgramPercentageCompactor(
        usage_portion=0.6, 
    ), 
    non_text=True
).filter_out(
    lambda x: len(x) == 1, 
    non_text=True
)
# Report the number of features left after both steps.
print("# features after usage compaction and single-letter filtering", 
      filtered_npmi_studyscarlet_corpus.get_num_terms(non_text=True))

# features after usage compaction and single-letter filtering 2455


We can see the features retained:

In [109]:
# For each n-gram length (number of whitespace-separated tokens in the term),
# list the three retained features with the highest total frequency across categories.
# NOTE(review): the positional True passed to get_freq_df presumably selects the
# non-text (offset) features -- confirm against the scattertext API.
pd.DataFrame({
    'Frequency': filtered_npmi_studyscarlet_corpus.get_freq_df(True).sum(axis=1)
}).reset_index().assign(
    NumTokens = lambda ser: ser.term.str.split().apply(len)
).sort_values(by='NumTokens', ascending=False).groupby('NumTokens').apply(
    lambda gdf: gdf.sort_values(by='Frequency', ascending=False).head(3)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,term,Frequency,NumTokens
NumTokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,155,said,200,1
1,87,man,148,1
1,188,holmes,91,1
2,190,sherlock holmes,45,2
2,1901,jefferson hope,31,2
2,2087,john ferrier,22,2
3,2161,salt lake city,9,3
3,942,said sherlock holmes,7,3
3,530,sitting - room,5,3
4,331,sprang to his feet,5,4


In [110]:
def book_metadata_factory(book_fn):
    """Return a function that builds per-document link labels for a corpus.

    :param book_fn: path or URL of the book's HTML file; each link points at
        the anchor ``#section<SegmentIdx>`` inside it.
    :return: callable taking a corpus ``c`` (anything exposing ``get_df()``
        with a ``SegmentIdx`` column) and returning a pandas Series of
        ``<a>`` tags, one per document, labelled "Segment <idx> of <count>".
    """
    def get_segment_links(c):
        # Fetch the frame once instead of once per use.
        segment_idx = c.get_df()['SegmentIdx'].astype(str)
        # The href attribute is closed with a single quote character; a
        # doubled '""' here would yield a malformed anchor tag.
        return (f'<a href="{book_fn}#section' + segment_idx + '" target="_"> Segment '
                + segment_idx
                + ' of ' + str(len(segment_idx)) + '</a>')
    return get_segment_links

## Scatterchron

Let's create a Scatterchron plot of the sections in the compacted version of Study Scarlet. A lot of sections will be displayed in the parallel tag cloud at the top of the visualization. You can scroll horizontally through them.

Below, terms are positioned at their average time step, and their height is the dense rank of their DA dispersion scores.

In [111]:
# Category (segment index) order used for the Study in Scarlet plots below.
# NOTE(review): assumes the categories sort into segment order, i.e. they are
# numeric rather than strings -- confirm.
ordered_categories = sorted(filtered_npmi_studyscarlet_corpus.get_categories())

In [115]:
# Render the Scatterchron for Study in Scarlet, with one tag-cloud column per segment.
from time import time
t0 = time()
html = st.produce_scattertext_table(
    corpus=filtered_npmi_studyscarlet_corpus,
    category_order=ordered_categories,
    all_category_scorer=lambda corpus: st.AllCategoryTermScorer(corpus, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = book_metadata_factory('./book_html/StudyScarlet.html'),
    top_terms_length=8,
    ignore_categories=False,
    plot_width=1000,
    plot_height=400,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    use_full_doc=True,
    trend_plot_settings=st.TimePlotSettings(
        category_order=ordered_categories,
        dispersion_metric='DA',
        dispersion_scaler=st.Scalers.dense_rank,
        use_residual=False
    )
)
# Elapsed rendering time in seconds.
print(time() - t0)
fn = 'acd_studyscarlet_timeplot_table.html'
# Write UTF-8 explicitly so the output does not depend on the platform's
# default encoding.
with open(fn, 'w', encoding='utf-8') as of:
    of.write(html)

IFrame(src=fn, width = 1300, height=700)


19.286139011383057


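With one column per section, the parallel tag cloud above is hard to navigate. To deal with this issue, we find the top break points in the sequence of sections (their number is set by `number_of_splits` below) and use them as cluster boundaries.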

In [113]:
# Group consecutive segments into larger headings. The grouper works on the
# non-text features, using a RankEmbedder built on DeltaJSDivergenceScorer
# with rank_threshold=10.
# NOTE(review): to_text=' to ' presumably joins the first and last segment of
# a group in its label -- confirm against the scattertext API.
grouper = st.CharacteristicGrouper(
    filtered_npmi_studyscarlet_corpus, 
    non_text=True,
    rank_embedder=st.RankEmbedder(
        term_scorer=st.DeltaJSDivergenceScorer, 
        rank_threshold=10
    ),
    to_text=' to '
)

# Returns the per-document heading categories and the order of those headings,
# requesting 16 splits over the ordered segment categories.
heading_categories, heading_category_order = grouper.get_new_doc_categories(
    number_of_splits=16,
    category_order=ordered_categories
)

## The resulting Scatterchron is easier to navigate

In [119]:
# Render the Scatterchron again, this time with segments grouped under the
# headings computed by the grouper.
from time import time
t0 = time()
html = st.produce_scattertext_table(
    corpus=filtered_npmi_studyscarlet_corpus,
    category_order=ordered_categories,
    heading_categories=heading_categories,
    heading_category_order=heading_category_order,
    all_category_scorer=lambda corpus: st.AllCategoryTermScorer(corpus, term_scorer=st.DeltaJSDivergenceScorer),
    metadata = book_metadata_factory('./book_html/StudyScarlet.html'),
    top_terms_length=8,
    ignore_categories=False,
    plot_width=1000,
    plot_height=400,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    use_full_doc=True,
    trend_plot_settings=st.TimePlotSettings(
        category_order=ordered_categories,
        dispersion_metric='DA',
        dispersion_scaler=st.Scalers.dense_rank,
        use_residual=False
    )
)
# Elapsed rendering time in seconds.
print(time() - t0)
fn = 'acd_studyscarlet_grouped_timeplot_table.html'
# Write UTF-8 explicitly so the output does not depend on the platform's
# default encoding.
with open(fn, 'w', encoding='utf-8') as of:
    of.write(html)

IFrame(src=fn, width = 1300, height=700)


4.05072283744812


## Stylistic change of Doyle over time
We can model Doyle's style by looking at trigrams of either parts-of-speech or, in the case of closed-class words, their lowercased form.

We filter out redundant features and pick the 2000 features with the highest JSD association.

We display the Spearman correlation coefficient of each feature, as well as the highest log odds ratio terms during each year.

In [216]:
def get_heading(c):
    """Build per-document link labels for a corpus spanning several books.

    :param c: corpus (anything exposing ``get_df()`` with ``title`` and
        ``SegmentIdx`` columns).
    :return: pandas Series of ``<a>`` tags, one per document, pointing at
        ``book_html/<title>.html#section<SegmentIdx>`` and labelled
        "Segment <idx> of <number of segments in that book>".
    """
    df = c.get_df()
    segment_idx = df['SegmentIdx'].astype(str)
    # Count segments per book: the frame holds every book, so its overall
    # length would overstate the total for any single title.
    segments_in_book = df.groupby('title')['SegmentIdx'].transform('count').astype(str)
    # The href attribute is closed with a single quote character; a doubled
    # '""' here would yield a malformed anchor tag.
    return ('<a href="book_html/' + df['title'] + '.html#section' + segment_idx
            + '" target="_"> Segment ' + segment_idx
            + ' of ' + segments_in_book + '</a>')

In [219]:
# Build an offset corpus over every segment of every book, with the
# publication year as the category. Features are 1- to 3-token n-grams.
# A token is represented by its fine-grained tag (tok.tag_) when it is not a
# stop word or when its tag starts with VB, NN, JJ, RB or FW; all other tokens
# are represented by their lowercased form.
doyle_style_corpus = st.OffsetCorpusFactory(
    doyle_segment_augmented_df,
    category_col='pubyear',
    parsed_col='Parse',
    feat_and_offset_getter=st.FlexibleNGramFeatures(
        ngram_sizes=[1, 2, 3],
        text_from_token=(
            lambda tok: (tok.tag_
                         if (tok.lower_ not in st.MY_ENGLISH_STOP_WORDS
                             or tok.tag_[:2] in ['VB', 'NN', 'JJ', 'RB', 'FW'])
                         else tok.lower_)
        )
    )
).build(
    show_progress=True
)

# Report the number of non-text (offset) features before any compaction.
print('# features in after initial creation', doyle_style_corpus.get_num_terms(non_text=True))


  0%|          | 0/1873 [00:00<?, ?it/s]

# features in after initial creation 89451


In [221]:
# Reduce the style features in three chained compaction steps:
#   1. NPMICompactor (minimum term count 2, 3000 terms per n-gram length),
#   2. NgramPercentageCompactor with usage_portion=0.6,
#   3. AssociationCompactor configured with 2000 and DeltaJSDivergenceScorer.
doyle_style_corpus_compact = doyle_style_corpus.compact(
    compactor=st.NPMICompactor(
        minimum_term_count = 2,
        number_terms_per_length = 3000,        
        show_progress=True,
    ),
    non_text=True    
).compact(
    st.NgramPercentageCompactor(
        usage_portion=0.6, 
    ), 
    non_text=True
).compact(
    compactor=st.AssociationCompactor(2000, scorer=st.DeltaJSDivergenceScorer, use_non_text_features=True),
    non_text=True
)

  0%|          | 0/89232 [00:00<?, ?it/s]

  0%|          | 0/89232 [00:00<?, ?it/s]

In [222]:
# Render the style-over-time table, with one tag-cloud column per publication year.
from time import time

# The style corpus is categorised by publication year, so its category order
# is derived from this corpus itself. `ordered_categories` holds the Study in
# Scarlet segment indices and does not describe these categories.
style_category_order = sorted(doyle_style_corpus_compact.get_categories())

t0 = time()
html = st.produce_scattertext_table(
    corpus=doyle_style_corpus_compact,
    category_order=style_category_order,
    all_category_scorer=lambda corpus: st.AllCategoryTermScorer(corpus, term_scorer=st.LogOddsRatio),
    metadata = get_heading,
    top_terms_length=8,
    ignore_categories=False,
    plot_width=1000,
    plot_height=400,
    sort_doc_labels_by_name=True,
    use_offsets=True,
    non_text=True,
    use_full_doc=True,
    trend_plot_settings=st.CorrelationPlotSettings(
        category_order=style_category_order
    )
)
# Elapsed rendering time in seconds.
print(time() - t0)
fn = 'acd_style_timeplot_table.html'
# Write UTF-8 explicitly so the output does not depend on the platform's
# default encoding.
with open(fn, 'w', encoding='utf-8') as of:
    of.write(html)

IFrame(src=fn, width = 1300, height=700)


8.264957904815674
