In [1]:
#!pip install -U pip
!pip3 install -U dill
!pip3 install -U nltk==3.8

Collecting dill
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ---------------------------------------- 0.0/116.3 kB ? eta -:--:--
   ---------- ----------------------------- 30.7/116.3 kB 1.4 MB/s eta 0:00:01
   -------------------- ------------------ 61.4/116.3 kB 656.4 kB/s eta 0:00:01
   -------------------------------------- - 112.6/116.3 kB 1.1 MB/s eta 0:00:01
   -------------------------------------- 116.3/116.3 kB 757.2 kB/s eta 0:00:00
Installing collected packages: dill
Successfully installed dill-0.3.8



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting nltk==3.8
  Downloading nltk-3.8-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk==3.8)
  Downloading regex-2023.12.25-cp311-cp311-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     --------------------------- ---------- 30.7/42.0 kB 640.0 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 503.7 kB/s eta 0:00:00
Collecting tqdm (from nltk==3.8)
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ---------------------------------------- 57.6/57.6 kB 1.5 MB/s eta 0:00:00
Downloading nltk-3.8-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.5 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.5 MB ? eta -:--:--
   ---- ----------------------------------- 0.2/1.5 MB 1.3 MB/s eta 0:00


[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Colab-only: mount Google Drive so that files stored there can be opened like local files.
from google.colab import drive
drive.mount('/content/drive')

# Base folder for the notebook's data files (relative path).
# NOTE(review): this only points inside the mounted drive if the working directory is /content — confirm.
path = 'drive/MyDrive/Colab Notebooks/'

Mounted at /content/drive


# N-grams Language Models (N-grams LM)

Nowadays, everything seems to be going neural...

Traditionally, we can use n-grams to generate language models to predict which word comes next given a history of words.

We'll use the `lm` module in `nltk` to get a sense of how non-neural language modelling is done.

(**Source:** The content in this notebook is largely based on [language model tutorial in NLTK documentation by Ilia Kurenkov](https://github.com/nltk/nltk/blob/develop/nltk/lm/__init__.py))

In [3]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

If we want to train a bigram model, we need to turn this text into bigrams. Here's what the first sentence of our text would look like if we use the `ngrams` function from NLTK for this.

In [4]:
text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

In [5]:
list(bigrams(text[0]))

[('a', 'b'), ('b', 'c')]

In [6]:
list(ngrams(text[1], n=3))

[('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'e'), ('c', 'e', 'f')]

Notice how "b" occurs both as the first and second member of different bigrams but "a" and "c" don't?

Wouldn't it be nice to somehow indicate how often sentences start with "a" and end with "c"?


A standard way to deal with this is to add special "padding" symbols to the sentence before splitting it into ngrams. Fortunately, NLTK also has a function for that, let's see what it does to the first sentence.


In [7]:
from nltk.util import pad_sequence
list(pad_sequence(text[0],
                  pad_left=True, left_pad_symbol="<s>",
                  pad_right=True, right_pad_symbol="</s>",
                  n=3)) # The n order of n-grams, if it's 2-grams, you pad once, 3-grams pad twice, etc.

['<s>', '<s>', 'a', 'b', 'c', '</s>', '</s>']

In [8]:
padded_sent = list(pad_sequence(text[0], pad_left=True, left_pad_symbol="<s>",
                                pad_right=True, right_pad_symbol="</s>", n=2))
list(ngrams(padded_sent, n=2))

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

In [9]:
list(pad_sequence(text[0],
                  pad_left=True, left_pad_symbol="<s>",
                  pad_right=True, right_pad_symbol="</s>",
                  n=3)) # The n order of n-grams, if it's 2-grams, you pad once, 3-grams pad twice, etc.

['<s>', '<s>', 'a', 'b', 'c', '</s>', '</s>']

In [10]:
padded_sent = list(pad_sequence(text[0], pad_left=True, left_pad_symbol="<pad>",
                                pad_right=True, right_pad_symbol="</pad>", n=3))
list(ngrams(padded_sent, n=3))

[('<pad>', '<pad>', 'a'),
 ('<pad>', 'a', 'b'),
 ('a', 'b', 'c'),
 ('b', 'c', '</pad>'),
 ('c', '</pad>', '</pad>')]

Note the `n` argument: it tells the function how much padding to add, namely `n - 1` symbols on each side (one for bigrams with `n=2`, two for trigrams with `n=3`).

Now, passing all these parameters every time is tedious and in most cases they can be safely assumed as defaults anyway.

Thus the `nltk.lm` module provides a convenience function that has all these arguments already set while the other arguments remain the same as for `pad_sequence`.

In [11]:
from nltk.lm.preprocessing import pad_both_ends
list(pad_both_ends(text[0], n=2))


['<s>', 'a', 'b', 'c', '</s>']

Combining the two parts discussed so far we get the following preparation steps for one sentence.

In [12]:
from nltk.util import bigrams

list(bigrams(pad_both_ends(text[0], n=2)))

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

To make our model more robust we could also train it on unigrams (single words) as well as bigrams, its main source of information.
NLTK once again helpfully provides a function called `everygrams`.

While not the most efficient, it is conceptually simple.

In [13]:
from nltk.util import everygrams
padded_bigrams = list(pad_both_ends(text[0], n=2))
list(everygrams(padded_bigrams, max_len=2))

[('<s>',),
 ('<s>', 'a'),
 ('a',),
 ('a', 'b'),
 ('b',),
 ('b', 'c'),
 ('c',),
 ('c', '</s>'),
 ('</s>',)]

We are almost ready to start counting ngrams, just one more step left.

During training and evaluation our model will rely on a vocabulary that defines which words are "known" to the model.

To create this vocabulary we need to pad our sentences (just like for counting ngrams) and then combine the sentences into one flat stream of words.


In [14]:
from nltk.lm.preprocessing import flatten
list(flatten(pad_both_ends(sent, n=2) for sent in text))

['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In most cases we want to use the same text as the source for both vocabulary and ngram counts.

Now that we understand what this means for our preprocessing, we can simply import a function that does everything for us.

In [15]:
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)

So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy iterators. They are evaluated on demand at training time.

For the sake of understanding the output of `padded_everygram_pipeline`, we'll "materialize" the lazy iterators by casting them into a list.

In [16]:
# Build a fresh pair of lazy iterators and consume them so their contents can be inspected.
training_ngrams, padded_sentences = padded_everygram_pipeline(2, text)
for sentence_everygrams in training_ngrams:
    # One line per sentence, followed by an empty line.
    print(list(sentence_everygrams), end='\n\n')
print('#############')
list(padded_sentences)

[('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', '</s>'), ('</s>',)]

[('<s>',), ('<s>', 'a'), ('a',), ('a', 'c'), ('c',), ('c', 'd'), ('d',), ('d', 'c'), ('c',), ('c', 'e'), ('e',), ('e', 'f'), ('f',), ('f', '</s>'), ('</s>',)]

#############


['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In [17]:
import nltk
# Download the 'punkt' tokenizer data that NLTK's default tokenizers rely on.
nltk.download('punkt')
from nltk import word_tokenize, sent_tokenize

# Smoke test: split the string into sentences, then word-tokenize the second one.
# Sometimes this doesn't work on some machines because of setup issues.
print(word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[1]))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Yes', 'it', 'is', '.']


## Let's get some real data and tokenize it

In [18]:
try:  # Prefer NLTK's default tokenizers.
    from nltk import word_tokenize, sent_tokenize
    # Smoke test. On machines where the tokenizer data is missing this raises
    # LookupError, which is the setup issue the fallback below exists for.
    print(word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0]))
except (ImportError, LookupError):
    # Only the expected setup failures are caught; a bare `except:` would also
    # hide typos, KeyboardInterrupt, etc. and silently switch tokenizers.
    import re
    from nltk.tokenize import ToktokTokenizer

    def sent_tokenize(x):
        """Naive sentence splitter used when NLTK's sentence tokenizer is unavailable.

        Splits on spaces that follow '.' or '?' and precede an ASCII capital letter.
        See https://stackoverflow.com/a/25736515/610569
        NOTE: the pattern only recognises A-Z as a sentence start, so text in
        other scripts (e.g. Cyrillic) will not be split by this fallback.
        """
        return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)

    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = toktok.tokenize

['This', 'is', 'a', 'foobar', 'sentence', '.']


In [19]:
path = 'drive/MyDrive/Colab Notebooks/monash/NLP/week2/'
print(path)

drive/MyDrive/Colab Notebooks/monash/NLP/week2/


In [20]:
import os
import requests


# Read the whole training corpus from the data folder as UTF-8 text.
# NOTE(review): judging by its first lines, 'haranhui_had.txt' is a Mongolian text
# ("Харанхуй хад", Д.Нацагдорж), not the Kilgarriff paper referenced below — confirm.
#
# The commented-out lines are the original tutorial's loader for the text version of
# https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf and are kept for reference only.
#if os.path.isfile('language-never-random.txt'):
with open(path+'haranhui_had.txt', encoding='utf8') as fin:
  text = fin.read()
# else:
#     url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
#     text = requests.get(url).content.decode('utf8')
#     with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
#         fout.write(text)

In [21]:
# Step-by-step version: split the text into sentences, then word-tokenize and
# lowercase each sentence, collecting one list of tokens per sentence.
tokenized_text = []
list_sentences = sent_tokenize(text)

for sent in list_sentences:
  lowered_tokens = [token.lower() for token in word_tokenize(sent)]
  tokenized_text.append(lowered_tokens)

In [22]:
#Tokenize the text.
tokenized_text = [list(map(str.lower, word_tokenize(sent)))
                  for sent in sent_tokenize(text)]

In [23]:
tokenized_text[0]

['д.нацагдорж',
 'харанхуй',
 'хад',
 'зуны',
 'шөнө',
 'богино',
 'тул',
 ',',
 'өглөөний',
 'найман',
 'цагт',
 'нар',
 'нэгэнт',
 'дээр',
 'гарчээ',
 '.']

In [24]:
print(text[:500])

Д.Нацагдорж

Харанхуй хад

Зуны шөнө богино тул, өглөөний найман цагт нар нэгэнт дээр гарчээ. Унтсан нойрноос арайхан сэрмэгц нэг янжуур асааж, шившив. Чилсэн биеийг талбируулан түр зуур хэвтэхийн завсар энэ өдрийн элдэв хэргийг бодох бөгөөд өдрийн тэмдэглэлийн дэвтрээ дэрэн доороосоо авч үзвэл, химич харандаагаар бичсэн хэдэн үсэг бараг бүрэг үзэгдэхэд түүнийг шүлсээрээ норгон улмаар ажиглавал: «Наймдугаар сарын гучны Бямба гариг, Харанхуй хад Ина » хэмээх хэдэн үгсийг тэмдэглэсэн байх нь утга 


In [25]:
# Preprocess the tokenized text for 3-grams language modelling
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

# Training an N-gram Model

Having prepared our data we are ready to start training a model. As a simple example, let us train a Maximum Likelihood Estimator (MLE).

We only need to specify the highest ngram order to instantiate it.

In [26]:
from nltk.lm import MLE
model = MLE(n) # Lets train a 3-grams model, previously we set n=3

Initializing the MLE model creates an empty vocabulary...

In [27]:
len(model.vocab)

0

... which gets filled as we fit the model.

In [28]:
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 938 items>


In [29]:
# Print every vocabulary entry next to its position.
# `enumerate` supplies the index, so no hand-maintained counter is needed.
for it, word in enumerate(model.vocab):
  print(it, word)

0 <s>
1 д.нацагдорж
2 харанхуй
3 хад
4 зуны
5 шөнө
6 богино
7 тул
8 ,
9 өглөөний
10 найман
11 цагт
12 нар
13 нэгэнт
14 дээр
15 гарчээ
16 .
17 </s>
18 унтсан
19 нойрноос
20 арайхан
21 сэрмэгц
22 нэг
23 янжуур
24 асааж
25 шившив
26 чилсэн
27 биеийг
28 талбируулан
29 түр
30 зуур
31 хэвтэхийн
32 завсар
33 энэ
34 өдрийн
35 элдэв
36 хэргийг
37 бодох
38 бөгөөд
39 тэмдэглэлийн
40 дэвтрээ
41 дэрэн
42 доороосоо
43 авч
44 үзвэл
45 химич
46 харандаагаар
47 бичсэн
48 хэдэн
49 үсэг
50 бараг
51 бүрэг
52 үзэгдэхэд
53 түүнийг
54 шүлсээрээ
55 норгон
56 улмаар
57 ажиглавал
58 :
59 «
60 наймдугаар
61 сарын
62 гучны
63 бямба
64 гариг
65 ина
66 »
67 хэмээх
68 үгсийг
69 тэмдэглэсэн
70 байх
71 нь
72 утга
73 ба
74 учир
75 тодорхойгүй
76 юун
77 тухай
78 огт
79 мартагдсан
80 дахин
81 удаа
82 алгуурхнаар
83 уншиж
84 эцсийн
85 хүрвээс
86 миний
87 хуучин
88 амраг
89 охины
90 нэр
91 цахилгаан
92 адил
93 хоромхон
94 тархины
95 дотор
96 гэрэлтсэн
97 мөнхүү
98 охиныг
99 хэдийнээ
100 хайлаас
101 модны
102 сүүдэрт
103 тэ

The vocabulary helps us handle words that have not occurred during training.

In [30]:
print(model.vocab.lookup(tokenized_text[0]))

('д.нацагдорж', 'харанхуй', 'хад', 'зуны', 'шөнө', 'богино', 'тул', ',', 'өглөөний', 'найман', 'цагт', 'нар', 'нэгэнт', 'дээр', 'гарчээ', '.')


In [33]:
# If we lookup the vocab on unseen sentences not from the training data,
# it automatically replace words not in the vocabulary with `<UNK>`.
print(model.vocab.lookup('харанхуй хад бол сонин аймшигтай .'.split()))

('харанхуй', 'хад', 'бол', '<UNK>', 'аймшигтай', '.')


Moreover, in some cases we want to ignore words that we did see during training but that didn't occur frequently enough to provide us with useful information.

You can tell the vocabulary to ignore such words using the `unk_cutoff` argument for the vocabulary lookup. To find out how that works, check out the docs for the [`nltk.lm.vocabulary.Vocabulary` class](https://github.com/nltk/nltk/blob/develop/nltk/lm/vocabulary.py).

**Note:** For more sophisticated ngram models, take a look at [these objects from `nltk.lm.models`](https://github.com/nltk/nltk/blob/develop/nltk/lm/models.py):

 - `Lidstone`: Provides Lidstone-smoothed scores.
 - `Laplace`: Implements Laplace (add one) smoothing.
 - `InterpolatedLanguageModel`: Logic common to all interpolated language models (Chen & Goodman 1995).
 - `WittenBellInterpolated`: Interpolated version of Witten-Bell smoothing.

# Using the N-gram Language Model

When it comes to ngram models the training boils down to counting up the ngrams from the training corpus.

In [34]:
print(model.counts)

<NgramCounter with 3 ngram orders and 6222 ngrams>


This provides a convenient interface to access counts for unigrams...

In [35]:
model.counts['харанхуй'] # i.e. Count('харанхуй')

0

...and bigrams for the phrase "харанхуй хад"

In [36]:
model.counts[['харанхуй']]['хад'] # i.e. Count('хад'|'харанхуй')

7

... and trigrams for the phrase "харанхуй хад хэмээх"

In [38]:
model.counts[['харанхуй', 'хад']]['хэмээх'] # i.e. Count('хэмээх'|'харанхуй хад')

2

And so on. However, the real purpose of training a language model is to have it score how probable words are in certain contexts.

This being MLE, the model returns the item's relative frequency as its score.

In [39]:
model.score('харанхуй') # P('харанхуй')

0.00510204081632653

In [42]:
model.score('хад', 'харанхуй'.split())  # P('хад'|'харанхуй')

0.6363636363636364

In [44]:
model.score('хэмээх', 'харанхуй хад'.split())  # P('хэмээх'|'харанхуй хад')

0.2857142857142857

Items that are not seen during training are mapped to the vocabulary's "unknown label" token.  This is "<UNK>" by default.


In [45]:
model.score("<UNK>") == model.score("сонин")

True

In [47]:
model.score("<UNK>") == model.score("маш")

True

In [46]:
model.score("<UNK>") == model.score("ерөнхийлөгч")

True

To avoid underflow when working with many small score values it makes sense to take their logarithm.

For convenience this can be done with the `logscore` method.


In [48]:
model.logscore("хэмээх", "харанхуй хад".split())

-1.8073549220576042

# Generation using N-gram Language Model

One cool feature of ngram models is that they can be used to generate text.

In [49]:
print(model.generate(20, random_seed=8))

['<s>', 'үүнд', 'би', 'нэг', 'зэрэг', 'бодож', 'итгэснээр', 'чамайг', 'олж', 'ирэв', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']


We can do some cleaning to the generated tokens to make it human-like.

In [50]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    """
    Generate tokens from an n-gram language model and join them into one string.

    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    :return: The generated tokens passed through the module-level `detokenize`;
        `<s>` tokens are skipped and generation output is cut at the first `</s>`.
    """
    generated = model.generate(num_words, random_seed=random_seed)
    # When asked for a single word, `generate` returns that word as a bare
    # string rather than a list. Wrap it so the loop below walks over tokens
    # instead of over the characters of the word.
    if isinstance(generated, str):
        generated = [generated]

    content = []
    for token in generated:
        if token == '<s>':
            # Sentence-start padding carries no text.
            continue
        if token == '</s>':
            # End of sentence: everything after it is padding or a new sentence.
            break
        content.append(token)
    return detokenize(content)

In [53]:
generate_sent(model, 20, random_seed=8)

'үүнд би нэг зэрэг бодож итгэснээр чамайг олж ирэв.'

In [51]:
generate_sent(model, 20, random_seed=7)

'би баруун хойно нь нэг хар овоохой харагдав.'

In [52]:
print(model.generate(28, random_seed=0))

['хүлээнэ', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']


In [48]:
generate_sent(model, 28, random_seed=0)

'хүртэл нэг гудмыг өгсөн уруудан нvvж, нэг аймаг улс цөм хучин хvvгийн адил нvдээ аньж чихээ бөглөн ертөнц дээрхийг таньж мэдэхгүй утаат гэр хороололд гэрэл тусаж нэгэн'

In [49]:
generate_sent(model, 20, random_seed=1)

''

In [50]:
generate_sent(model, 20, random_seed=30)

'муу давжаагууд нь өвөр монголд ямаачин, австралид хоничин, америкад адуучин, арабад тэмээчин, энэтхэгт үхэрчин болохоор одсон'

In [None]:
generate_sent(model, 20, random_seed=42)

'more (or cold) weather, or on saturday nights, or by people in (or poorer)'

# Saving the model

Python's native `pickle` may not be able to save the lambda functions in the model, so we can use the `dill` library in place of `pickle` to save and load the language model.


In [None]:
path

'drive/MyDrive/Colab Notebooks/monash/NLP/week2/'

In [None]:
# `dill` is imported under the name `pickle`, so the usual dump/load calls below go through dill.
import dill as pickle

# Serialize the trained model into the data folder.
# NOTE(review): the file name says "kilgariff", but `model` in this notebook was fit on the
# text read from 'haranhui_had.txt' — consider renaming (the load cell must use the same name).
with open(path+'kilgariff_ngram_model.pkl', 'wb') as fout:
    pickle.dump(model, fout)

In [None]:
# Read the model back from the file written by the save cell.
# Caution: unpickling can execute arbitrary code, so only load files you created or trust.
with open(path+'kilgariff_ngram_model.pkl', 'rb') as fin:
    model_loaded = pickle.load(fin)

In [None]:
generate_sent(model_loaded, 20, random_seed=42)

'more (or cold) weather, or on saturday nights, or by people in (or poorer)'

# Let's try generating some text with Donald Trump data!!!


**Dataset:** https://www.kaggle.com/kingburrito666/better-donald-trump-tweets#Donald-Tweets!.csv


In this part, I'll be munging the data the way I would do it at work.
I really haven't seen the data before, but I hope this session will help you see how to approach new datasets with the skills you have.

In [None]:
# from google.colab import drive
# drive.mount('/gdrive')
# #replace the following path according to your Google Drive path
# %cd/gdrive/My Drive/Monash-FIT-S1-2022/week_3

In [None]:
import pandas as pd
df = pd.read_csv(path+'Donald-Tweets!.csv')
df.head()

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,
2,16-11-11,11:14:20,Love the fact that the small groups of protest...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,
3,16-11-11,2:19:44,Just had a very open and successful presidenti...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,
4,16-11-11,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,


In [None]:
trump_corpus = list(df['Tweet_Text'].apply(word_tokenize))

In [None]:
# Preprocess the tokenized text for 3-grams language modelling
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, trump_corpus)

In [None]:
from nltk.lm import MLE
trump_model = MLE(n) # Lets train a 3-grams model, previously we set n=3
trump_model.fit(train_data, padded_sents)

In [None]:
generate_sent(trump_model, num_words=1000, random_seed=42)

'call!'

In [None]:
generate_sent(trump_model, num_words=10, random_seed=0)

'picks it up! Democrats numbers are down big in'

In [None]:
generate_sent(trump_model, num_words=50, random_seed=10)

'"@ ajbruno14: @ realDonaldTrump beautiful family! Best #SNL with @ realDonaldTrump You are a total joke . No clue on immigration now because he REPLACED his LEGAL cellphone?'

In [None]:
print(generate_sent(trump_model, num_words=100, random_seed=52))

will MAKE AMERICA GREAT AGAIN! https: /_


In [None]:
# Contact e-mail (kept as a comment: as bare code this line parses as an
# expression and raises NameError when the cell is run):
# khuyagbaatar.batsuren@monash.edu