Step 0: Load the data into a variable

In [2]:
from pypdf import PdfReader
# Open the PDF once and stitch the extracted text of every page, in page
# order, into a single string.
reader = PdfReader("war-and-peace.pdf")
text = "".join(page.extract_text() for page in reader.pages)


In [3]:
# Report the corpus size, then show only a short preview. Printing the whole
# string (about 3.3 million characters) floods the notebook output and bloats
# the saved file without telling the reader anything more.
PREVIEW_CHARS = 1000
print("Total number of characters are",len(text))
print(text[:PREVIEW_CHARS])

Total number of characters are 3348516
 
War and Peace 
Leo Tolstoy 
 
 
 
 
 
 
 
 
 
 
 
 
 
This eBook was designed and published by Planet PDF. For more 
free eBooks visit our Web site at http://www.planetpdf.com/. To hear 
about our latest releases subscribe to the Planet PDF Newsletter.
War and Peace  
2 of 2882 
BOOK ONE: 1805 
War and Peace  
3 of 2882 
Chapter I 
‘Well, Prince, so Genoa and Lucca are now just family 
estates of the Buonapartes . But I warn you, if you don’t 
tell me that this means war, if you still try to defend the 
infamies and horrors perpetrated by that Antichrist - I 
really believe he is Antichri st - I will have nothing more 
to do with you and you are no longer my friend, no longer 
my ‘faithful slave,’ as you call yourself! But how do you 
do? I see I have frightened you - sit down and tell me all 
the news.’ 
It was in July, 1805, and th e speaker was the well-
known Anna Pavlovna Schere r, maid of honor and 
favorite of the Empress Ma rya Fedorovna

Step 1: Split the text into tokens

In [4]:
import re 
# Normalise the curly apostrophe to a plain one so contractions such as
# "don't" are stored with a single spelling.
text = text.replace("’","'")
# Split on punctuation that is followed by a space, on "--", and on any single
# whitespace character. The capture group keeps the delimiters in the result.
raw_pieces = re.split(r'([,.:;\'?_!’"()] |--|\s)',text)
# Trim every piece and drop the ones that were empty or whitespace-only.
tokens = [piece for piece in (chunk.strip() for chunk in raw_pieces) if piece]
print("Total number of words are: ",len(tokens))
print(tokens[:99])

Total number of words are:  667281
['War', 'and', 'Peace', 'Leo', 'Tolstoy', 'This', 'eBook', 'was', 'designed', 'and', 'published', 'by', 'Planet', 'PDF', '.', 'For', 'more', 'free', 'eBooks', 'visit', 'our', 'Web', 'site', 'at', 'http://www.planetpdf.com/', '.', 'To', 'hear', 'about', 'our', 'latest', 'releases', 'subscribe', 'to', 'the', 'Planet', 'PDF', 'Newsletter.', 'War', 'and', 'Peace', '2', 'of', '2882', 'BOOK', 'ONE', ':', '1805', 'War', 'and', 'Peace', '3', 'of', '2882', 'Chapter', 'I', '‘Well', ',', 'Prince', ',', 'so', 'Genoa', 'and', 'Lucca', 'are', 'now', 'just', 'family', 'estates', 'of', 'the', 'Buonapartes', '.', 'But', 'I', 'warn', 'you', ',', 'if', 'you', "don't", 'tell', 'me', 'that', 'this', 'means', 'war', ',', 'if', 'you', 'still', 'try', 'to', 'defend', 'the', 'infamies', 'and', 'horrors', 'perpetrated']


Step 2: Convert tokens into token IDs

In [5]:
# The vocabulary is every distinct token in sorted order, followed by two
# special markers: one for text boundaries and one for unknown words.
# NOTE(review): "|endoftext|" is spelled without angle brackets, unlike
# "<|UNK|>"; the demo further down joins text with this exact spelling.
all_tokens = sorted(set(tokens))
all_tokens += ["|endoftext|","<|UNK|>"]
vocab_len = len(all_tokens)
vocab_len

34203

In [6]:
# Map each token string to its position in the vocabulary list.
vocab = dict(zip(all_tokens, range(len(all_tokens))))
# Show the last five (token, id) pairs; the two special markers come last.
for pair in list(vocab.items())[-5:]:
    print(pair)

('‘‘Told', 34198)
('‘‘What', 34199)
('‘‘You', 34200)
('|endoftext|', 34201)
('<|UNK|>', 34202)


In [7]:
import re
class SimpleTokenizerV1:
    """Word/punctuation tokenizer over a fixed ``{token: id}`` vocabulary.

    Tokens that are missing from the vocabulary are encoded as the id of the
    ``"<|UNK|>"`` entry. That id is looked up in the supplied vocabulary
    instead of being hard-coded, so the class works with any vocabulary that
    contains the marker.
    """

    # Vocabulary key whose id stands in for out-of-vocabulary tokens.
    UNK_TOKEN = "<|UNK|>"

    def __init__(self,vocab):
        """Store the vocabulary and build the reverse (id -> token) mapping.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        # None when the vocabulary has no unknown-token entry; encode() then
        # raises on the first out-of-vocabulary token instead of inventing an id.
        self.unk_id = vocab.get(self.UNK_TOKEN)

    def encode(self,text):
        """Split ``text`` into tokens and return their ids.

        The text is split on punctuation followed by a space, on ``--`` and on
        single whitespace characters; empty pieces are discarded.

        Args:
            text: The string to tokenize.

        Returns:
            A list of integer ids, one per token.

        Raises:
            KeyError: If a token is not in the vocabulary and the vocabulary
                has no ``"<|UNK|>"`` entry to fall back on.
        """
        pieces = re.split(r'([,.:;\'?_!’"()] |--|\s)',text)
        pieces = [item.strip() for item in pieces if item.strip()]
        ids = []
        for piece in pieces:
            token_id = self.str_to_int.get(piece,self.unk_id)
            if token_id is None:
                raise KeyError(
                    f"token {piece!r} is not in the vocabulary and the "
                    f"vocabulary has no {self.UNK_TOKEN!r} entry"
                )
            ids.append(token_id)
        return ids

    def decode(self,ids):
        """Turn a sequence of ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the listed punctuation characters is removed.

        Args:
            ids: Iterable of integer ids produced by ``encode``.

        Returns:
            The reconstructed string.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        tokens = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;\'?_!’"()])',r"\1",tokens)
        return text


In [8]:
# Try the tokenizer on two snippets joined by the end-of-text marker. The
# first ends with a word that is not in the corpus, the second with a word
# fragment.
tokenizer = SimpleTokenizerV1(vocab)
text = """ His leave was expiring. He spent every
day and whole days at the Karagins’, and every day on
thinking the matter over told himself that he would
propose tomorrow. by Hari"""
text1 = "His leave was exp"
# Join first, then apply the same curly-apostrophe normalisation that was
# applied to the corpus before the vocabulary was built.
txt = " |endoftext| ".join([text,text1]).replace("’","'")
ids = tokenizer.encode(txt)
print(ids)

[4839, 20118, 31901, 15287, 313, 4761, 28341, 15037, 12644, 8244, 32213, 12657, 8846, 29746, 5089, 305, 8244, 15037, 12644, 22698, 29896, 29746, 21042, 23143, 30215, 17885, 29727, 17639, 32525, 24728, 30228, 313, 10310, 34202, 34201, 4839, 20118, 31901, 15238]


In [9]:
# Round trip: encode the demo text, then decode the ids back to a string.
round_trip_ids = tokenizer.encode(txt)
tokenizer.decode(round_trip_ids)


"His leave was expiring. He spent every day and whole days at the Karagins', and every day on thinking the matter over told himself that he would propose tomorrow. by <|UNK|> |endoftext| His leave was exp"

In [10]:
import tiktoken
# Rebinds `tokenizer`: from this cell on it is tiktoken's 'gpt2' encoding,
# no longer the SimpleTokenizerV1 instance created earlier.
tokenizer = tiktoken.get_encoding('gpt2')

In [11]:
# Encode the sample passage with the tiktoken encoding, permitting the
# '<|endoftext|>' special token should it appear in the text.
special_tokens = {'<|endoftext|>'}
integer = tokenizer.encode(text,allowed_special=special_tokens)
print(integer)

[2399, 2666, 373, 1033, 3428, 13, 679, 3377, 790, 198, 820, 290, 2187, 1528, 379, 262, 9375, 363, 1040, 447, 247, 11, 290, 790, 1110, 319, 198, 28973, 262, 2300, 625, 1297, 2241, 326, 339, 561, 198, 1676, 3455, 9439, 13, 416, 2113, 72]


In [12]:
# Decode the ids back to text to check the round trip.
decoded_text = tokenizer.decode(integer)
decoded_text

' His leave was expiring. He spent every\nday and whole days at the Karagins’, and every day on\nthinking the matter over told himself that he would\npropose tomorrow. by Hari'

Creating Input-Target Pairs

In [13]:
# `text` was reassigned to a short sample passage earlier, so extract the
# full book again into its own variable.
raw_text = "".join(page.extract_text() for page in reader.pages)

In [14]:
# Encode the whole book with the tiktoken encoding.
# The special token is spelled "<|endoftext|>", with angle brackets, as in the
# earlier encode call. The previous value "|endoftext|" does not name a GPT-2
# special token, so it presumably had no effect.
enc_text = tokenizer.encode(raw_text,allowed_special={"<|endoftext|>"})

In [15]:
# Number of token ids produced for the whole book.
n_tokens = len(enc_text)
print(n_tokens)

953538


In [22]:
# Take a 50-token window from inside the book to experiment with, and show
# the text it corresponds to.
sample_window = slice(1500, 1550)
enc_sample = enc_text[sample_window]
tokenizer.decode(enc_sample)

'\nhas refused to evacuate Malta. She wanted to find, and \nstill seeks, some secret motive in our actions. What \nanswer did Novosiltsev get? None. The English have not \nunderstood and cannot understand'

In [23]:
# The target sequence is the input sequence shifted one token to the right.
context_size = 5
x = enc_sample[0:context_size]
y = enc_sample[1:1 + context_size]
print(f"x: {x}" )
print(f"y: {y}")

x: [198, 10134, 6520, 284, 36316]
y: [10134, 6520, 284, 36316, 35206]


In [24]:
# For each prefix length 1..context_size, show the prefix ids and the id of
# the token that follows them.
for split_at in range(1, context_size + 1):
    context, desired = enc_sample[:split_at], enc_sample[split_at]
    print(context, "---->",desired)

[198] ----> 10134
[198, 10134] ----> 6520
[198, 10134, 6520] ----> 284
[198, 10134, 6520, 284] ----> 36316
[198, 10134, 6520, 284, 36316] ----> 35206


In [25]:
# Same prefix -> next-token pairs as above, decoded back to text.
for split_at in range(1, context_size + 1):
    context, desired = enc_sample[:split_at], enc_sample[split_at]
    context_text = tokenizer.decode(context)
    target_text = tokenizer.decode([desired])
    print(context_text,"---->",target_text)


 ----> has

has ---->  refused

has refused ---->  to

has refused to ---->  evacuate

has refused to evacuate ---->  Malta
