In [None]:
#*
# If you are NOT using Google Colab you'll need to run this cell to install spacy and its model
import sys
# sys.executable is the Python interpreter this notebook is running on, so the
# package and the model are installed into the same environment we import from.
!{sys.executable} -m pip install spacy
!{sys.executable} -m spacy download en_core_web_sm

# Applying it to Guardian Data

`poverty_articles.parquet` is a dataset of articles from The Guardian API, retrieved and prepped using the processes we used in SC207.
- Retrieving from the API using the simple query of `"poverty"` with a limit of 3,000 articles, ordered newest first
- Unpacking nested data into its own columns and setting the correct data types
- Removing articles that were outliers such as sponsored content


In [1]:
import pandas as pd
from bs4 import BeautifulSoup

# Load the prepared Guardian articles into a DataFrame.
# NOTE(review): the introduction above calls the file `poverty_articles.parquet` and mentions
# 3,000 articles, while this reads 'dataset.parquet' and reports 500 rows - confirm which is intended.
articles = pd.read_parquet('dataset.parquet')
# Summarise the columns: how many non-null values each has and what type it is.
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   id                  500 non-null    object             
 1   type                500 non-null    object             
 2   sectionId           500 non-null    object             
 3   sectionName         500 non-null    object             
 4   webPublicationDate  500 non-null    datetime64[ns, UTC]
 5   webTitle            500 non-null    object             
 6   webUrl              500 non-null    object             
 7   apiUrl              500 non-null    object             
 8   tags                500 non-null    object             
 9   isHosted            500 non-null    bool               
 10  pillarId            500 non-null    object             
 11  pillarName          500 non-null    object             
 12  byline              500 non-null    

In [2]:
#*
# We turn our pandas column of texts into a simpler list to make it compatible with BeautifulSoup and Spacy
# Each item is one article's body, which at this point still contains its HTML markup
texts = articles['body'].tolist()

In [3]:
#*
# For teaching purposes only - finds the first article whose body contains an <aside> element
# regex=False searches for the literal text '<aside'; na=False counts a missing body as "no match"
has_aside = articles['body'].str.contains('<aside', regex=False, na=False)
idx = articles[has_aside].first_valid_index()

# first_valid_index() gives None when nothing matched; stop with a clear message
# rather than letting texts[None] fail with a confusing TypeError.
if idx is None:
    raise ValueError("No article body contains an <aside> element")

# idx is an index label; it also works as a list position because articles has a default RangeIndex
test_text = texts[idx]

# Prints out the URL of the story so we can view it as it's meant to look and compare to the text we have.
print(articles.loc[idx,'webUrl'])
print('----')
print(test_text)


https://www.theguardian.com/politics/live/2025/aug/11/palestine-action-protest-arrests-reform-labour-keir-starmer-uk-politics-live-latest-updates-news
----
<div id="block-689a00788f08c69345b795cf" class="block is-summary" data-block-contributor=""> <p class="block-time published-time"> <time datetime="2025-08-11T14:46:35.747Z">3.46pm <span class="timezone">BST</span></time> </p>   <h2 class="block-title">Closing summary</h2>  <div class="block-elements">  <ul> <li><p>Downing Street defended the controversial proscription of <strong>Palestine Action</strong>, labelling the protest group as “violent”, with the justice minister saying supporters of the “terrorist organisation” will be subjected to the “full force of the law”. No 10 said Palestine Action has committed “significant injury” as well as criminal damage after more than 500 arrests were made at a protest linked to the group in London over the weekend. </p></li> <li><p><strong>Kemi Badenoch </strong>appeared to suggest the settin

If a text contains more complex elements these will be wrapped in different tags that help lay it out on the website, change its formatting etc. We simply want the text inside the most basic 'paragraph' `<p>` elements. There may even be `<p>` elements that do extra things. These will have an associated `class` which tells the website to format it differently.

Sometimes there will be other elements *inside* `p` elements, such as sidebar related stories. Generally these are wrapped in `span` or `aside` tags. We will manually `decompose` these from the text - i.e. cut them out, before then identifying all the `p` elements and getting their text.

Generally for text analysis we want the content text rather than headings, web addresses, embedded side content etc. Every website will differ in the best way to extract this material. Though there are general standards of tagging HTML elements it is usually necessary to customise what elements you decompose, what you keep and in what order to maximise the content you want to retain.

In [4]:
# Parse the article's HTML so that individual elements can be found and removed
soup = BeautifulSoup(test_text, 'html.parser')

# Cut every span and aside element out of the document, along with anything nested inside it
remove_elements = ('span','aside')
for element in soup.find_all():
    if element.name in remove_elements:
        element.decompose()

# Keep only the text of the plain p elements, i.e. those with no associated class
paras = [para.text for para in soup.find_all('p', class_=None)]
cleaned_item = '\n'.join(paras)
print(cleaned_item)

Downing Street defended the controversial proscription of Palestine Action, labelling the protest group as “violent”, with the justice minister saying supporters of the “terrorist organisation” will be subjected to the “full force of the law”. No 10 said Palestine Action has committed “significant injury” as well as criminal damage after more than 500 arrests were made at a protest linked to the group in London over the weekend. 
Kemi Badenoch appeared to suggest the setting up of “camps” when speaking about possible alternatives to using hotels to house asylum seekers while out in Essex on a media event. 
The education secretary, Bridget Phillipson, reaffirmed her commitment to try to reduce the number of children missing classes at schools, and pledged to make a priority of tackling British white working-class young people falling behind their peers in the year ahead.
The prison population of England and Wales has jumped to the highest number in nearly a year and is nearing record le

We can do this for every article in our list. First we'll build a function to do the job of cleaning, then we'll apply it to every item in the list of texts.

In [5]:
def clean_guardian_text(text, remove_elements=('span','aside')):
    """Extract the plain paragraph text from one Guardian article's HTML.

    Parameters
    ----------
    text : str or None
        Raw HTML of a single article body. ``None`` or an empty string is
        treated as an article with no content.
    remove_elements : iterable of str, optional
        Names of the tags to cut out of the document, together with
        everything nested inside them, before the paragraphs are read.

    Returns
    -------
    str
        The text of every ``<p>`` element that has no ``class`` attribute,
        joined with newlines. An empty string when there is nothing to keep.
    """
    # A missing body has nothing to parse. Returning '' stops one empty row
    # from aborting a bulk run over the whole column.
    if text is None or text == '':
        return ''

    soup = BeautifulSoup(text, 'html.parser')

    # decompose() is called purely for its side effect, so use a plain loop
    # instead of building a throwaway list of None values.
    for element in soup.find_all():
        if element.name in remove_elements:
            element.decompose()

    # Paragraphs carrying a class are special-purpose (e.g. timestamps), so only unclassed ones are kept.
    return '\n'.join(p.text for p in soup.find_all('p', class_=None))

# Clean every article body in turn; the result lines up one-to-one with `texts`
cleaned_texts = [clean_guardian_text(t) for t in texts]

In [6]:
# Check that the first cleaned article now reads as plain paragraphs
print(cleaned_texts[0])

Downing Street defended the controversial proscription of Palestine Action, labelling the protest group as “violent”, with the justice minister saying supporters of the “terrorist organisation” will be subjected to the “full force of the law”. No 10 said Palestine Action has committed “significant injury” as well as criminal damage after more than 500 arrests were made at a protest linked to the group in London over the weekend. 
Kemi Badenoch appeared to suggest the setting up of “camps” when speaking about possible alternatives to using hotels to house asylum seekers while out in Essex on a media event. 
The education secretary, Bridget Phillipson, reaffirmed her commitment to try to reduce the number of children missing classes at schools, and pledged to make a priority of tackling British white working-class young people falling behind their peers in the year ahead.
The prison population of England and Wales has jumped to the highest number in nearly a year and is nearing record le

In [10]:
# Store the cleaned text as a new column and save a copy to disk,
# so the bulk tokenising section can reload it without repeating the HTML cleaning
articles['cleaned_text'] = cleaned_texts
articles.to_parquet('dataset_cleaned.parquet')

# Tokenising

In [None]:
#*
import spacy
# Load the small English model (the one downloaded in the install cell at the top)
nlp = spacy.load('en_core_web_sm')
# Run the first cleaned article through the model; `doc` is the example document used in the cells that follow
doc = nlp(cleaned_texts[0])

In [12]:
# Spacy can tell us how many 'tokens' are in the document - i.e. how many words (but also other things)
# The "other things" include punctuation marks and line breaks, which count as tokens of their own
len(doc)

4702

In [15]:
# How many sentences in the document?
# doc.sents is wrapped in list() so that len() can count the sentences
len(list(doc.sents))

150

In [16]:
#*
# Tokens are units of text in natural language processing. Exactly how a text is 'tokenised' varies depending on the tool
# and many debates are had about the best way to do it.

# The goal is to render a text down into individual units of information that can be processed by different analysis techniques

# This is how spacy breaks up the document
# Notice that punctuation marks and '\n' line breaks come out as separate tokens
[w.text for w in doc]

['Downing',
 'Street',
 'defended',
 'the',
 'controversial',
 'proscription',
 'of',
 'Palestine',
 'Action',
 ',',
 'labelling',
 'the',
 'protest',
 'group',
 'as',
 '“',
 'violent',
 '”',
 ',',
 'with',
 'the',
 'justice',
 'minister',
 'saying',
 'supporters',
 'of',
 'the',
 '“',
 'terrorist',
 'organisation',
 '”',
 'will',
 'be',
 'subjected',
 'to',
 'the',
 '“',
 'full',
 'force',
 'of',
 'the',
 'law',
 '”',
 '.',
 'No',
 '10',
 'said',
 'Palestine',
 'Action',
 'has',
 'committed',
 '“',
 'significant',
 'injury',
 '”',
 'as',
 'well',
 'as',
 'criminal',
 'damage',
 'after',
 'more',
 'than',
 '500',
 'arrests',
 'were',
 'made',
 'at',
 'a',
 'protest',
 'linked',
 'to',
 'the',
 'group',
 'in',
 'London',
 'over',
 'the',
 'weekend',
 '.',
 '\n',
 'Kemi',
 'Badenoch',
 'appeared',
 'to',
 'suggest',
 'the',
 'setting',
 'up',
 'of',
 '“',
 'camps',
 '”',
 'when',
 'speaking',
 'about',
 'possible',
 'alternatives',
 'to',
 'using',
 'hotels',
 'to',
 'house',
 'asylum',


In [14]:

#*
# Spacy uses the context of the surrounding words and grammar to work out if the word is a noun, verb, adjective etc.
# They call this the 'part-of-speech' or POS
[(w.text, w.pos_) for w in doc]

[('More', 'ADJ'),
 ('than', 'ADP'),
 ('200', 'NUM'),
 ('refugee', 'NOUN'),
 ('organisations', 'NOUN'),
 (',', 'PUNCT'),
 ('charities', 'NOUN'),
 ('and', 'CCONJ'),
 ('trade', 'NOUN'),
 ('unions', 'NOUN'),
 ('have', 'AUX'),
 ('signed', 'VERB'),
 ('an', 'DET'),
 ('open', 'ADJ'),
 ('letter', 'NOUN'),
 ('calling', 'VERB'),
 ('on', 'ADP'),
 ('Britain', 'PROPN'),
 ('’s', 'PART'),
 ('political', 'ADJ'),
 ('leaders', 'NOUN'),
 ('to', 'PART'),
 ('end', 'VERB'),
 ('“', 'PUNCT'),
 ('pernicious', 'ADJ'),
 ('and', 'CCONJ'),
 ('insidious', 'ADJ'),
 ('currents', 'NOUN'),
 ('”', 'PUNCT'),
 ('of', 'ADP'),
 ('racism', 'NOUN'),
 ('and', 'CCONJ'),
 ('hatred', 'NOUN'),
 ('that', 'PRON'),
 ('underpin', 'VERB'),
 ('a', 'DET'),
 ('slew', 'NOUN'),
 ('of', 'ADP'),
 ('anti', 'ADJ'),
 ('-', 'ADJ'),
 ('migrant', 'ADJ'),
 ('protests', 'NOUN'),
 ('.', 'PUNCT'),
 ('\n', 'SPACE'),
 ('The', 'DET'),
 ('letter', 'NOUN'),
 (',', 'PUNCT'),
 ('coordinated', 'VERB'),
 ('by', 'ADP'),
 ('the', 'DET'),
 ('campaign', 'NOUN'),
 ('

In [17]:
#*
# Spacy tokens have helpful attributes...
# Is it alphabetical (i.e. not numerical or punctuation)?
[(w.text, w.is_alpha) for w in doc]

[('Downing', True),
 ('Street', True),
 ('defended', True),
 ('the', True),
 ('controversial', True),
 ('proscription', True),
 ('of', True),
 ('Palestine', True),
 ('Action', True),
 (',', False),
 ('labelling', True),
 ('the', True),
 ('protest', True),
 ('group', True),
 ('as', True),
 ('“', False),
 ('violent', True),
 ('”', False),
 (',', False),
 ('with', True),
 ('the', True),
 ('justice', True),
 ('minister', True),
 ('saying', True),
 ('supporters', True),
 ('of', True),
 ('the', True),
 ('“', False),
 ('terrorist', True),
 ('organisation', True),
 ('”', False),
 ('will', True),
 ('be', True),
 ('subjected', True),
 ('to', True),
 ('the', True),
 ('“', False),
 ('full', True),
 ('force', True),
 ('of', True),
 ('the', True),
 ('law', True),
 ('”', False),
 ('.', False),
 ('No', True),
 ('10', False),
 ('said', True),
 ('Palestine', True),
 ('Action', True),
 ('has', True),
 ('committed', True),
 ('“', False),
 ('significant', True),
 ('injury', True),
 ('”', False),
 ('as', Tr

In [16]:
#*
# Is it punctuation?
# (line breaks such as '\n' are whitespace rather than punctuation, so they come back False)
[(w.text, w.is_punct) for w in doc]

[('More', False),
 ('than', False),
 ('200', False),
 ('refugee', False),
 ('organisations', False),
 (',', True),
 ('charities', False),
 ('and', False),
 ('trade', False),
 ('unions', False),
 ('have', False),
 ('signed', False),
 ('an', False),
 ('open', False),
 ('letter', False),
 ('calling', False),
 ('on', False),
 ('Britain', False),
 ('’s', False),
 ('political', False),
 ('leaders', False),
 ('to', False),
 ('end', False),
 ('“', True),
 ('pernicious', False),
 ('and', False),
 ('insidious', False),
 ('currents', False),
 ('”', True),
 ('of', False),
 ('racism', False),
 ('and', False),
 ('hatred', False),
 ('that', False),
 ('underpin', False),
 ('a', False),
 ('slew', False),
 ('of', False),
 ('anti', False),
 ('-', True),
 ('migrant', False),
 ('protests', False),
 ('.', True),
 ('\n', False),
 ('The', False),
 ('letter', False),
 (',', True),
 ('coordinated', False),
 ('by', False),
 ('the', False),
 ('campaign', False),
 ('coalition', False),
 ('Together', False),
 ('Wit

In [18]:
#*
# Is it a stop word?
[(w.text, w.is_stop) for w in doc]

[('Downing', False),
 ('Street', False),
 ('defended', False),
 ('the', True),
 ('controversial', False),
 ('proscription', False),
 ('of', True),
 ('Palestine', False),
 ('Action', False),
 (',', False),
 ('labelling', False),
 ('the', True),
 ('protest', False),
 ('group', False),
 ('as', True),
 ('“', False),
 ('violent', False),
 ('”', False),
 (',', False),
 ('with', True),
 ('the', True),
 ('justice', False),
 ('minister', False),
 ('saying', False),
 ('supporters', False),
 ('of', True),
 ('the', True),
 ('“', False),
 ('terrorist', False),
 ('organisation', False),
 ('”', False),
 ('will', True),
 ('be', True),
 ('subjected', False),
 ('to', True),
 ('the', True),
 ('“', False),
 ('full', True),
 ('force', False),
 ('of', True),
 ('the', True),
 ('law', False),
 ('”', False),
 ('.', False),
 ('No', True),
 ('10', False),
 ('said', False),
 ('Palestine', False),
 ('Action', False),
 ('has', True),
 ('committed', False),
 ('“', False),
 ('significant', False),
 ('injury', False),

### Stop Words?
Stop words are typically defined as the most common words in a language. Often incredibly common words can make it harder to find patterns in text. For example the most common words in a piece of text might be 'the', 'a', 'and' etc. That doesn't tell us much about the text even though the result is correct.

In [19]:
#*
# These are the stop words for this model
# They are held in a set, so the order they print in carries no meaning
print(nlp.Defaults.stop_words)


{'more', 'first', 'beforehand', 'however', 'an', 'fifteen', 'would', 'does', 'many', 'nothing', 'anyway', 'therefore', 'meanwhile', 'otherwise', 'seemed', 'will', 'between', 'onto', 'three', '’ll', 'becomes', 'whereafter', 'one', 'formerly', 'moreover', 'due', 'if', 'along', 'everything', 'empty', 'regarding', 'all', 'been', "'ll", 'how', 'sometime', 'per', 'latterly', 'can', 'their', 'whom', 'there', 'have', 'being', 'whereupon', 'she', 'back', 'ever', 'hereby', 'beyond', 'nor', 'thereupon', 'mine', 'sometimes', 'almost', 'six', 'into', 'somewhere', 'never', 'though', 'at', 'rather', 'for', 'indeed', 'most', 'than', '’m', '‘d', 'towards', '’s', 'call', 'some', 'by', 'them', 'his', 'might', 'namely', 'us', 'take', 'four', 'really', 'now', 'forty', 'off', 'although', 'down', 'make', 'using', 'whereas', 'doing', 'becoming', 'i', 'name', 'has', 'get', 'since', 'did', 'elsewhere', 'no', 'or', 'much', 'twelve', 'could', 'eight', 'anything', 'last', 'myself', 'became', 'five', '‘ve', 'only',

In [20]:
# We can use these token attributes to filter our text based on what type of token it is

# This ensures only alphabetical tokens that aren't stop words are retained.
# Numbers such as '10' are dropped as well, because is_alpha is False for them
[w.text for w in doc if w.is_alpha and not w.is_stop]

['Downing',
 'Street',
 'defended',
 'controversial',
 'proscription',
 'Palestine',
 'Action',
 'labelling',
 'protest',
 'group',
 'violent',
 'justice',
 'minister',
 'saying',
 'supporters',
 'terrorist',
 'organisation',
 'subjected',
 'force',
 'law',
 'said',
 'Palestine',
 'Action',
 'committed',
 'significant',
 'injury',
 'criminal',
 'damage',
 'arrests',
 'protest',
 'linked',
 'group',
 'London',
 'weekend',
 'Kemi',
 'Badenoch',
 'appeared',
 'suggest',
 'setting',
 'camps',
 'speaking',
 'possible',
 'alternatives',
 'hotels',
 'house',
 'asylum',
 'seekers',
 'Essex',
 'media',
 'event',
 'education',
 'secretary',
 'Bridget',
 'Phillipson',
 'reaffirmed',
 'commitment',
 'try',
 'reduce',
 'number',
 'children',
 'missing',
 'classes',
 'schools',
 'pledged',
 'priority',
 'tackling',
 'British',
 'white',
 'working',
 'class',
 'young',
 'people',
 'falling',
 'peers',
 'year',
 'ahead',
 'prison',
 'population',
 'England',
 'Wales',
 'jumped',
 'highest',
 'number',

In [21]:
# This allows numbers as well, but filters out space symbols like \r and \n and punctuation

# Stop words are still removed, so '10' and '500' now appear in the result but 'the' and 'of' do not
[w.text for w in doc if not w.is_space and not w.is_punct and not w.is_stop]

['Downing',
 'Street',
 'defended',
 'controversial',
 'proscription',
 'Palestine',
 'Action',
 'labelling',
 'protest',
 'group',
 'violent',
 'justice',
 'minister',
 'saying',
 'supporters',
 'terrorist',
 'organisation',
 'subjected',
 'force',
 'law',
 '10',
 'said',
 'Palestine',
 'Action',
 'committed',
 'significant',
 'injury',
 'criminal',
 'damage',
 '500',
 'arrests',
 'protest',
 'linked',
 'group',
 'London',
 'weekend',
 'Kemi',
 'Badenoch',
 'appeared',
 'suggest',
 'setting',
 'camps',
 'speaking',
 'possible',
 'alternatives',
 'hotels',
 'house',
 'asylum',
 'seekers',
 'Essex',
 'media',
 'event',
 'education',
 'secretary',
 'Bridget',
 'Phillipson',
 'reaffirmed',
 'commitment',
 'try',
 'reduce',
 'number',
 'children',
 'missing',
 'classes',
 'schools',
 'pledged',
 'priority',
 'tackling',
 'British',
 'white',
 'working',
 'class',
 'young',
 'people',
 'falling',
 'peers',
 'year',
 'ahead',
 'prison',
 'population',
 'England',
 'Wales',
 'jumped',
 'highe

### Lemmatization

A word's lemma is the simpler 'root' word that best represents the word's meaning. It reduces the possible range of words whilst still ensuring the words left convey the appropriate meaning.

To make this clearer we can use some examples:

In [22]:
#*
# Here we have essentially the same sentences, just a variation in that one uses a contraction "don't" rather than "do not".
rabbit_1 = nlp("I don't like rabbits in space")
rabbit_2 = nlp("I do not like rabbits in space")
# Print the lemma of every token in each sentence so the two can be compared side by side
print( [token.lemma_ for token in rabbit_1])
print( [token.lemma_ for token in rabbit_2])


['I', 'do', 'not', 'like', 'rabbit', 'in', 'space']
['I', 'do', 'not', 'like', 'rabbit', 'in', 'space']


In [23]:
#*
# Even differing text can be brought at least closer in similarity using lemmas, reducing loving to love
# (rabbit_1 and rabbit_2 are reused here, replacing the sentences from the previous example)
rabbit_1 = nlp("I'm loving these rabbits")
rabbit_2 = nlp("I love this rabbit!")

print( [token.lemma_ for token in rabbit_1])
print( [token.lemma_ for token in rabbit_2])

['I', 'be', 'love', 'these', 'rabbit']
['I', 'love', 'this', 'rabbit', '!']


If you are doing any text analysis that counts the frequency of words, relies on word similarity etc, it is usually a good idea to reduce the range of words being used so long as it can retain the same underlying semantic meaning.

In [24]:
# Lemmatise and lower-case each token, keeping only alphabetical tokens that aren't stop words
filtered_tokens = [w.lemma_.lower() for w in doc if not w.is_stop and  w.is_alpha]
filtered_tokens

['downing',
 'street',
 'defend',
 'controversial',
 'proscription',
 'palestine',
 'action',
 'label',
 'protest',
 'group',
 'violent',
 'justice',
 'minister',
 'say',
 'supporter',
 'terrorist',
 'organisation',
 'subject',
 'force',
 'law',
 'say',
 'palestine',
 'action',
 'commit',
 'significant',
 'injury',
 'criminal',
 'damage',
 'arrest',
 'protest',
 'link',
 'group',
 'london',
 'weekend',
 'kemi',
 'badenoch',
 'appear',
 'suggest',
 'setting',
 'camp',
 'speak',
 'possible',
 'alternative',
 'hotel',
 'house',
 'asylum',
 'seeker',
 'essex',
 'medium',
 'event',
 'education',
 'secretary',
 'bridget',
 'phillipson',
 'reaffirm',
 'commitment',
 'try',
 'reduce',
 'number',
 'child',
 'miss',
 'class',
 'school',
 'pledge',
 'priority',
 'tackle',
 'british',
 'white',
 'working',
 'class',
 'young',
 'people',
 'fall',
 'peer',
 'year',
 'ahead',
 'prison',
 'population',
 'england',
 'wales',
 'jump',
 'high',
 'number',
 'nearly',
 'year',
 'near',
 'record',
 'level',

In [25]:
from collections import Counter
# Count how often each filtered token occurs, then show the ten most frequent
counts = Counter(filtered_tokens)
counts.most_common(10)

[('say', 34),
 ('government', 22),
 ('action', 16),
 ('minister', 16),
 ('year', 16),
 ('palestine', 14),
 ('people', 13),
 ('uk', 13),
 ('group', 12),
 ('criminal', 12)]

In [26]:
# If you want to convert your filtered tokens to text you simply join them together again
# (one space between tokens; the punctuation and stop words that were filtered out are not restored)


filtered_text = " ".join(filtered_tokens)
filtered_text

'downing street defend controversial proscription palestine action label protest group violent justice minister say supporter terrorist organisation subject force law say palestine action commit significant injury criminal damage arrest protest link group london weekend kemi badenoch appear suggest setting camp speak possible alternative hotel house asylum seeker essex medium event education secretary bridget phillipson reaffirm commitment try reduce number child miss class school pledge priority tackle british white working class young people fall peer year ahead prison population england wales jump high number nearly year near record level despite early release ten thousand offender official figure show foreign criminal country include india bulgaria australia face deportation chance appeal decision remove widening government deport appeal later scheme thank join close blog find late coverage uk politic detail downing street view palestine action courtesy pa news agency post comment 

# Tokenising in bulk
Spacy does some pretty heavy lifting so we should tokenise once, and then save the result to avoid having to rerun the process. Spacy also has a method that speeds up tokenising on large numbers of documents. Now we're getting into analysis we're going to start encountering the actual nuts and bolts of using a computer because the size of our datasets and the complexity of what we're doing can put a real strain on the actual hardware used.

Depending on what kind of computer we have available we may have to tweak different settings to avoid analysis failing or hardware crashing. Often the things we have to balance are...
- How much information can the computer keep in its memory at one time (RAM) controlled by `batch_size=`
- How many workers can run at the same time (CPUS) controlled by `n_process=`
- How long are things going to take to finish (Your patience) controlled by `how_close_the_deadline_is=`<sup>*</sup>

Spacy's `.pipe` method can help us here. It can take a stack of texts and we can tell it how many workers to start running and how many texts each worker should handle at a time.

 Generally if you're using Google Colab it takes around 4 minutes to process 500 articles. To avoid the hardware being overloaded and failing to finish you should set the batch_size to be between 150 and 200 and leave it using just 1 worker. 
 
If you have a more powerful laptop with multiple cores you can increase the number of workers and if you have a lot of RAM you can increase the batch size.

<sub>* Unfortunately not a real argument</sub>

In [28]:
from collections import Counter

import pandas as pd
import spacy

# Re-import and reload everything the bulk tokenising cells below rely on.
# Counter is imported here too because the cell that prints the most common
# tokens uses it; without it that cell fails when this section is run on a
# fresh kernel without the walkthrough above.
articles = pd.read_parquet('dataset_cleaned.parquet')
cleaned_texts = articles['cleaned_text'].tolist()
nlp = spacy.load('en_core_web_sm')

In [None]:
def tokenise_doc(doc):
    """Reduce a parsed document to a single string of filtered lemmas.

    Only tokens that are alphabetical and are not stop words are kept.
    Each kept token is replaced by its lower-cased lemma, and the lemmas
    are joined with single spaces in their original order.
    """
    kept_lemmas = (
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and not token.is_stop
    )
    return ' '.join(kept_lemmas)

# BATCH_SIZE is how many texts are handed over at a time, WORKERS is how many processes run in parallel.
# Raise them only if the machine has the RAM / CPU cores to spare.
BATCH_SIZE = 150
WORKERS = 1

# nlp.pipe works through all the texts in batches. A comprehension keeps its
# loop variable private, so the example `doc` created earlier in the notebook
# is not overwritten by the last article processed here.
tokens = [
    tokenise_doc(parsed)
    for parsed in nlp.pipe(cleaned_texts, batch_size=BATCH_SIZE, n_process=WORKERS)
]

# Save the tokens alongside the articles so this slow step doesn't have to be repeated
articles['tokens'] = tokens
articles.to_parquet('dataset_cleaned_tokens.parquet')


In [32]:
# Show the ten most common tokens for each of the first five articles.
# Each entry in `tokens` is one space-joined string, so it is split back into words before counting.
for toks in tokens[:5]:
    print(Counter(toks.split()).most_common(10))

[('say', 34), ('government', 22), ('action', 16), ('minister', 16), ('year', 16), ('palestine', 14), ('people', 13), ('uk', 13), ('group', 12), ('criminal', 12)]
[('refugee', 10), ('uk', 8), ('country', 7), ('end', 5), ('protest', 5), ('asylum', 5), ('say', 5), ('claim', 5), ('people', 5), ('letter', 4)]
[('food', 13), ('price', 12), ('inflation', 11), ('labour', 8), ('cost', 7), ('year', 5), ('bad', 5), ('bank', 5), ('britain', 5), ('high', 5)]
[('nomad', 6), ('capitalist', 6), ('event', 6), ('global', 6), ('kwarteng', 5), ('international', 5), ('speaker', 4), ('tax', 4), ('mobility', 4), ('say', 4)]
[('crew', 7), ('say', 6), ('lowe', 5), ('royal', 5), ('migrant', 5), ('charity', 4), ('illegal', 4), ('boat', 4), ('coastguard', 4), ('ask', 4)]
