# Text Preprocessing

In [6]:
import nltk
import string
from collections import Counter

In [7]:
raw_text = "Rick and Morty is an American adult animated science fiction sitcom created by Justin Roiland and Dan Harmon for Cartoon Network's nighttime programming block Adult Swim. The series follows the misadventures of Rick Sanchez, a cynical mad scientist, and his good-hearted but fretful grandson Morty Smith, who split their time between domestic life and interdimensional adventures that take place across an infinite number of realities, often traveling to other planets and dimensions through portals and on Rick's flying saucer. The general concept of Rick and Morty relies on two conflicting scenarios: domestic family drama, and a misanthropic grandfather dragging his grandson into hijinks.Roiland voiced the eponymous characters, with Chris Parnell, Spencer Grammer, and Sarah Chalke voicing the rest of Rick and Morty's family. The series originated from an animated short parody film of Back to the Future created by Roiland for Channel 101, a short-film festival cofounded by Harmon. Since its debut, the series has received critical acclaim for its originality, creativity, and humor. It has been nominated for three Primetime Emmy Awards for Outstanding Animated Program and won the award in 2018 and 2020. The series has also received two Annie Awards. At times, the series has been the most viewed television comedy for adults between 18 and 24. The popularity of Rick and Morty has made it a hundred-million dollar merchandising and media franchise.The sixth season premiered on September 4, 2022, and consists of ten episodes. A seventh season was confirmed as part of a long-term deal with Cartoon Network that ordered 70 new episodes, which renewed the series through to a tenth season. Adult Swim cut ties with Roiland in 2023 amid allegations of domestic abuse and announced it would be recasting his roles with soundalike actors."

In [8]:
print(raw_text)

Rick and Morty is an American adult animated science fiction sitcom created by Justin Roiland and Dan Harmon for Cartoon Network's nighttime programming block Adult Swim. The series follows the misadventures of Rick Sanchez, a cynical mad scientist, and his good-hearted but fretful grandson Morty Smith, who split their time between domestic life and interdimensional adventures that take place across an infinite number of realities, often traveling to other planets and dimensions through portals and on Rick's flying saucer. The general concept of Rick and Morty relies on two conflicting scenarios: domestic family drama, and a misanthropic grandfather dragging his grandson into hijinks.Roiland voiced the eponymous characters, with Chris Parnell, Spencer Grammer, and Sarah Chalke voicing the rest of Rick and Morty's family. The series originated from an animated short parody film of Back to the Future created by Roiland for Channel 101, a short-film festival cofounded by Harmon. Since its

## Sentence splitting

Splitting text into sentences.

In [9]:
from nltk.tokenize import sent_tokenize

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
sentences = sent_tokenize(raw_text)

In [12]:
print(f'There are {len(sentences)} sentences')

There are 11 sentences


## Tokenisation

Dividing a string into a list of tokens.

In [13]:
from nltk.tokenize import word_tokenize

In [14]:
tokens_list = [word_tokenize(s) for s in sentences]

In [15]:
tokens_list

[['Rick',
  'and',
  'Morty',
  'is',
  'an',
  'American',
  'adult',
  'animated',
  'science',
  'fiction',
  'sitcom',
  'created',
  'by',
  'Justin',
  'Roiland',
  'and',
  'Dan',
  'Harmon',
  'for',
  'Cartoon',
  'Network',
  "'s",
  'nighttime',
  'programming',
  'block',
  'Adult',
  'Swim',
  '.'],
 ['The',
  'series',
  'follows',
  'the',
  'misadventures',
  'of',
  'Rick',
  'Sanchez',
  ',',
  'a',
  'cynical',
  'mad',
  'scientist',
  ',',
  'and',
  'his',
  'good-hearted',
  'but',
  'fretful',
  'grandson',
  'Morty',
  'Smith',
  ',',
  'who',
  'split',
  'their',
  'time',
  'between',
  'domestic',
  'life',
  'and',
  'interdimensional',
  'adventures',
  'that',
  'take',
  'place',
  'across',
  'an',
  'infinite',
  'number',
  'of',
  'realities',
  ',',
  'often',
  'traveling',
  'to',
  'other',
  'planets',
  'and',
  'dimensions',
  'through',
  'portals',
  'and',
  'on',
  'Rick',
  "'s",
  'flying',
  'saucer',
  '.'],
 ['The',
  'general',
  'c

The top-10 most common tokens.

In [16]:
Counter([w for x in tokens_list for w in x]).most_common(10)

[('and', 18),
 (',', 16),
 ('.', 11),
 ('the', 9),
 ('of', 9),
 ('Rick', 6),
 ('for', 6),
 ('series', 6),
 ('a', 6),
 ('Morty', 5)]

## Removing punctuation and stop words

Stopwords and punctuation are usually not helpful for many IR tasks, and removing them can reduce the number of tokens we need to process. 

In [17]:
from nltk.corpus import stopwords

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
stopwords_en = set(stopwords.words('english'))

In [20]:
stopwords_en

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [21]:
tokens_list[:] = [[w for w in x if w not in string.punctuation and w not in stopwords_en] for x in tokens_list]

In [22]:
tokens_list

[['Rick',
  'Morty',
  'American',
  'adult',
  'animated',
  'science',
  'fiction',
  'sitcom',
  'created',
  'Justin',
  'Roiland',
  'Dan',
  'Harmon',
  'Cartoon',
  'Network',
  "'s",
  'nighttime',
  'programming',
  'block',
  'Adult',
  'Swim'],
 ['The',
  'series',
  'follows',
  'misadventures',
  'Rick',
  'Sanchez',
  'cynical',
  'mad',
  'scientist',
  'good-hearted',
  'fretful',
  'grandson',
  'Morty',
  'Smith',
  'split',
  'time',
  'domestic',
  'life',
  'interdimensional',
  'adventures',
  'take',
  'place',
  'across',
  'infinite',
  'number',
  'realities',
  'often',
  'traveling',
  'planets',
  'dimensions',
  'portals',
  'Rick',
  "'s",
  'flying',
  'saucer'],
 ['The',
  'general',
  'concept',
  'Rick',
  'Morty',
  'relies',
  'two',
  'conflicting',
  'scenarios',
  'domestic',
  'family',
  'drama',
  'misanthropic',
  'grandfather',
  'dragging',
  'grandson',
  'hijinks.Roiland',
  'voiced',
  'eponymous',
  'characters',
  'Chris',
  'Parnell',

The top-10 most common tokens.

In [23]:
Counter([w for x in tokens_list for w in x]).most_common(10)

[('Rick', 6),
 ('series', 6),
 ('Morty', 5),
 ('The', 5),
 ('Roiland', 3),
 ("'s", 3),
 ('domestic', 3),
 ('season', 3),
 ('animated', 2),
 ('created', 2)]

## Stemming

Turning words into stems.

In [24]:
from nltk.stem import PorterStemmer

In [25]:
stemmer = PorterStemmer()

In [26]:
tokens_stem = [stemmer.stem(w) for x in tokens_list for w in x]

In [27]:
tokens_stem

['rick',
 'morti',
 'american',
 'adult',
 'anim',
 'scienc',
 'fiction',
 'sitcom',
 'creat',
 'justin',
 'roiland',
 'dan',
 'harmon',
 'cartoon',
 'network',
 "'s",
 'nighttim',
 'program',
 'block',
 'adult',
 'swim',
 'the',
 'seri',
 'follow',
 'misadventur',
 'rick',
 'sanchez',
 'cynic',
 'mad',
 'scientist',
 'good-heart',
 'fret',
 'grandson',
 'morti',
 'smith',
 'split',
 'time',
 'domest',
 'life',
 'interdimension',
 'adventur',
 'take',
 'place',
 'across',
 'infinit',
 'number',
 'realiti',
 'often',
 'travel',
 'planet',
 'dimens',
 'portal',
 'rick',
 "'s",
 'fli',
 'saucer',
 'the',
 'gener',
 'concept',
 'rick',
 'morti',
 'reli',
 'two',
 'conflict',
 'scenario',
 'domest',
 'famili',
 'drama',
 'misanthrop',
 'grandfath',
 'drag',
 'grandson',
 'hijinks.roiland',
 'voic',
 'eponym',
 'charact',
 'chri',
 'parnel',
 'spencer',
 'grammer',
 'sarah',
 'chalk',
 'voic',
 'rest',
 'rick',
 'morti',
 "'s",
 'famili',
 'the',
 'seri',
 'origin',
 'anim',
 'short',
 'paro

In [28]:
Counter(tokens_stem).most_common(10)

[('rick', 6),
 ('seri', 6),
 ('morti', 5),
 ('the', 5),
 ('adult', 4),
 ('anim', 3),
 ('roiland', 3),
 ("'s", 3),
 ('domest', 3),
 ('award', 3)]

### Exercise

Try other NLTK stemmers (e.g. SnowballStemmer, RegexpStemmer), you may need to download additional data packages, see https://www.nltk.org/data.html

In [29]:
from nltk.stem import SnowballStemmer, RegexpStemmer

stemmer_snow = SnowballStemmer('english')
tokens_stem_snow = [stemmer_snow.stem(w) for x in tokens_list for w in x]
print(tokens_stem_snow)

['rick', 'morti', 'american', 'adult', 'anim', 'scienc', 'fiction', 'sitcom', 'creat', 'justin', 'roiland', 'dan', 'harmon', 'cartoon', 'network', "'s", 'nighttim', 'program', 'block', 'adult', 'swim', 'the', 'seri', 'follow', 'misadventur', 'rick', 'sanchez', 'cynic', 'mad', 'scientist', 'good-heart', 'fret', 'grandson', 'morti', 'smith', 'split', 'time', 'domest', 'life', 'interdimension', 'adventur', 'take', 'place', 'across', 'infinit', 'number', 'realiti', 'often', 'travel', 'planet', 'dimens', 'portal', 'rick', "'s", 'fli', 'saucer', 'the', 'general', 'concept', 'rick', 'morti', 'reli', 'two', 'conflict', 'scenario', 'domest', 'famili', 'drama', 'misanthrop', 'grandfath', 'drag', 'grandson', 'hijinks.roiland', 'voic', 'eponym', 'charact', 'chris', 'parnel', 'spencer', 'grammer', 'sarah', 'chalk', 'voic', 'rest', 'rick', 'morti', "'s", 'famili', 'the', 'seri', 'origin', 'anim', 'short', 'parodi', 'film', 'back', 'futur', 'creat', 'roiland', 'channel', '101', 'short-film', 'festiv'

## Lemmatisation

Turning words into lemmas (entries in a dictionary). It requires knowledge of the context (typically the intended
Part-of-Speech of a word in the context).

In [30]:
from nltk.stem import WordNetLemmatizer

In [31]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

POS tagging for lemmatisation.

In [32]:
nltk.download('averaged_perceptron_tagger')
tags_list = nltk.pos_tag_sents(tokens_list)
# tags_list

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


A heuristic to convert POS tags to the [four syntactic categories that wordnet recognizes (i.e. **noun**, **verb**, **adj** and **adv**)](https://wordnet.princeton.edu/):
- `n` for nouns
- `v` for verbs
- `a` for adjectives
- `r` for adverbs

In [33]:
tags_list

[[('Rick', 'NNP'),
  ('Morty', 'NNP'),
  ('American', 'NNP'),
  ('adult', 'NN'),
  ('animated', 'VBD'),
  ('science', 'NN'),
  ('fiction', 'NN'),
  ('sitcom', 'NN'),
  ('created', 'VBD'),
  ('Justin', 'NNP'),
  ('Roiland', 'NNP'),
  ('Dan', 'NNP'),
  ('Harmon', 'NNP'),
  ('Cartoon', 'NNP'),
  ('Network', 'NNP'),
  ("'s", 'POS'),
  ('nighttime', 'NN'),
  ('programming', 'NN'),
  ('block', 'NN'),
  ('Adult', 'NNP'),
  ('Swim', 'NNP')],
 [('The', 'DT'),
  ('series', 'NN'),
  ('follows', 'VBZ'),
  ('misadventures', 'NNS'),
  ('Rick', 'NNP'),
  ('Sanchez', 'NNP'),
  ('cynical', 'JJ'),
  ('mad', 'JJ'),
  ('scientist', 'NN'),
  ('good-hearted', 'JJ'),
  ('fretful', 'JJ'),
  ('grandson', 'NN'),
  ('Morty', 'NNP'),
  ('Smith', 'NNP'),
  ('split', 'NN'),
  ('time', 'NN'),
  ('domestic', 'JJ'),
  ('life', 'NN'),
  ('interdimensional', 'JJ'),
  ('adventures', 'NNS'),
  ('take', 'VBP'),
  ('place', 'NN'),
  ('across', 'IN'),
  ('infinite', 'JJ'),
  ('number', 'NN'),
  ('realities', 'NNS'),
  ('ofte

In [34]:
wordnet_tag = lambda t: 'a' if t == 'j' else (t if t in ['n', 'v', 'r'] else 'n')

Lemmatising

In [35]:
lemmatizer = WordNetLemmatizer()

In [36]:
tokens_lemma = [lemmatizer.lemmatize(w.lower(), pos=wordnet_tag(t[0].lower())) for x in tags_list for (w, t) in x]

In [37]:
tokens_lemma

['rick',
 'morty',
 'american',
 'adult',
 'animate',
 'science',
 'fiction',
 'sitcom',
 'create',
 'justin',
 'roiland',
 'dan',
 'harmon',
 'cartoon',
 'network',
 "'s",
 'nighttime',
 'programming',
 'block',
 'adult',
 'swim',
 'the',
 'series',
 'follow',
 'misadventure',
 'rick',
 'sanchez',
 'cynical',
 'mad',
 'scientist',
 'good-hearted',
 'fretful',
 'grandson',
 'morty',
 'smith',
 'split',
 'time',
 'domestic',
 'life',
 'interdimensional',
 'adventure',
 'take',
 'place',
 'across',
 'infinite',
 'number',
 'reality',
 'often',
 'travel',
 'planet',
 'dimension',
 'portal',
 'rick',
 "'s",
 'fly',
 'saucer',
 'the',
 'general',
 'concept',
 'rick',
 'morty',
 'rely',
 'two',
 'conflict',
 'scenario',
 'domestic',
 'family',
 'drama',
 'misanthropic',
 'grandfather',
 'drag',
 'grandson',
 'hijinks.roiland',
 'voice',
 'eponymous',
 'character',
 'chris',
 'parnell',
 'spencer',
 'grammer',
 'sarah',
 'chalke',
 'voice',
 'rest',
 'rick',
 'morty',
 "'s",
 'family',
 'the'

In [39]:
Counter(tokens_lemma).most_common(10)

[('rick', 6),
 ('series', 6),
 ('morty', 5),
 ('the', 5),
 ('adult', 4),
 ('roiland', 3),
 ("'s", 3),
 ('domestic', 3),
 ('award', 3),
 ('season', 3)]