# N-Gram Language Model

In [124]:
import path_imports
import tiktoken
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from sklearn.model_selection import train_test_split
from src.read_corpus import read_corpus
from src.preprocessing.regexp_tokenizer import RegexpTokenizer
from src.preprocessing.utils import convertURL
from src.preprocessing.gpt_tokenizer import MyGPTTokenizer
from src.generation.my_n_grams import MyNgramModel

## Create model

In [125]:
categories = ["debate", "speech", "tweet"]
document = read_corpus(categories)["text"]

# Replace each URL link by TOKURL
document = document.apply(convertURL)

100%|██████████| 29/29 [00:03<00:00,  8.60it/s]


### Splitting data for training and testing purposes

In [126]:
train_df, test_df = train_test_split(document, test_size=0.2)

train_df

10374    """Things work out best for those who make the...
5615     ..Country, had to be approved by me, as Presid...
32270    """@BIGDADDY441: @realDonaldTrump run for pres...
20144    """@JHasseyjr: ""@abunnieslife: Mr Trump we ha...
26366    """@JoshuaEnglishh: @realDonaldTrump  every si...
                               ...                        
22757    """@DustinDeMoss: @realDonaldTrump I disagree ...
2146     In the meantime, an awful lot of people were j...
18959                                  @MrJPL It is Trump!
1158     Cierra, you remembered the most important thin...
3280     Iowa caucus-goers take note. Joni Ernst just s...
Name: text, Length: 49840, dtype: object

### Initialize model

In [127]:
n = 5

# Different tokenizers
gpt_tokenizer = MyGPTTokenizer()
#nlpReg_tokenizer = RegexpTokenizer()

lm = MyNgramModel(n, tokenizer=gpt_tokenizer.tokenize, detokenizer=gpt_tokenizer.detokenize)

### Tokenizing and encoding

In [128]:
train_strings = lm.preprocessing(train_df)
test_strings = lm.preprocessing(test_df)
train_strings

[['58174',
  '990',
  '704',
  '1888',
  '369',
  '1884',
  '889',
  '1304',
  '279',
  '1888',
  '315',
  '1268',
  '2574',
  '990',
  '704',
  '11371'],
 ['313',
  '40742',
  '23162',
  '1',
  '5354',
  '11389',
  '11',
  '1047',
  '311',
  '387',
  '12054',
  '555',
  '757',
  '11',
  '439',
  '4872',
  '11',
  '612',
  '1141',
  '26',
  '602',
  '1550',
  '779',
  '2085',
  '65437',
  '220',
  '477',
  '12458',
  '13'],
 ['84270',
  '279',
  '26102',
  '11',
  '602',
  '6612',
  '433',
  '574',
  '1664',
  '45547',
  '13'],
 ['72',
  '1524',
  '3288',
  '3805',
  '5457',
  '832',
  '311',
  '4546',
  '813',
  '2547',
  '11',
  '304',
  '272',
  '9746',
  '11',
  '505',
  '802',
  '15732',
  '311',
  '94771',
  '13'],
 ['275', '574', '856', '16044', '311', '656', '779', '13'],
 ['19171',
  '11',
  '602',
  '2646',
  '2663',
  '497',
  '4304',
  '31',
  '16548',
  '67',
  '23290',
  '18495',
  '25',
  '571',
  '265',
  '4852',
  '80794',
  '376',
  '1538',
  '1629',
  '369',
  '4872'

In [129]:
train, vocab = padded_everygram_pipeline(n, train_strings)

lm.fit(train, vocab)
print("Size of vocabulary: ",len(lm.vocab))
print(lm.counts)

Size of vocabulary:  25877
<NgramCounter with 5 ngram orders and 13770900 ngrams>


### Generating and decoding result

In [130]:
generated_text = lm.generate(50)
generated_text

'she is too easy!'

In [152]:
generated_text_context = lm.generate(50, text_seed=['hello', 'american', ',', 'it'])
generated_text_context

' two of you have taken very different approaches, and this is our opportunity to leave the dark, angry politics of the past.'

### Evaluating

In [132]:
#Filtering empty n-grams to be sure
train_strings_filtered = [ngram for ngram in train_strings if len(ngram) > 0]
test_strings_filtered = [ngram for ngram in test_strings if len(ngram) > 0]

lm.perplexity(train_strings_filtered)

20175.454892549256

In [133]:
lm.perplexity(test_strings_filtered)

21466.561638975534