|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Words to tokens to numbers</h1>|
|<h2>Lecture:</h2>|<h1><b>Preparing text for tokenization</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# typical libraries...
import numpy as np

# for importing and working with texts
import requests
import re

# Get text from the web

In [None]:
# get raw text from internet
# (requests has no default timeout, so set one to avoid hanging on a stalled connection)
book = requests.get('https://www.gutenberg.org/files/35/35-0.txt',timeout=30)

# stop here on an HTTP error (4xx/5xx) rather than processing an error page as if it were the book
book.raise_for_status()

# extract just the text and have a look at it
text = book.text
print(type(text))
print(len(text))

# first 2000 characters of the raw download
text[:2000]

In [None]:
# character strings to replace with space
# (the 'â\x80..' sequences are the UTF-8 bytes of typographic punctuation shown as Latin-1
#  characters; the commented example below the list converts one back)
strings2replace = [
                 '\r\n\r\nâ\x80\x9c', # new paragraph that starts with an open quote
                 'â\x80\x9c',         # open double quote
                 'â\x80\x9d',         # close double quote
                 '\r\n',              # new line
                 'â\x80\x94',         # em dash
                 'â\x80\x99',         # apostrophe / closing single quote
                 'â\x80\x98',         # opening single quote
                 '_',                 # underscore, used for stressing
                 ]

# e.g., 'â\x80\x9d'.encode('latin1').decode('utf8')

# use regular expression (re) to replace those strings with space
# (re.escape makes each entry match literally, even if an entry containing
#  regex metacharacters such as '.' or '*' is added to the list later)
for str2match in strings2replace:
  regexp = re.compile(re.escape(str2match))
  text = regexp.sub(' ',text)

# remove non-ASCII characters (each run of them becomes a single space)
text = re.sub(r'[^\x00-\x7F]+', ' ', text)

# remove numbers
# (raw string: '\d' in a normal string literal is an invalid escape sequence)
text = re.sub(r'\d+','',text)

# and make everything lower-case
text = text.lower()

# let's have a look!
text[:2000]

# Parse text into words

In [None]:
# split by punctuation
import string
print(string.punctuation)

# regex character class containing every punctuation character plus whitespace.
# - re.escape keeps characters such as ']', '\', '^' and '-' literal inside the [...]
#   (unescaped, the '\]' in string.punctuation is read as an escaped bracket and the
#   backslash itself is never treated as a delimiter)
# - the raw f-string hands '\s' to the regex engine unchanged
puncts4re = rf'[{re.escape(string.punctuation)}\s]+'

# split on runs of punctuation/whitespace; the pieces cannot contain whitespace,
# so no stripping is needed
words = re.split(puncts4re,text)

# remove empty strings and single-character words
words = [item for item in words if len(item)>1]

# let's have a look!
words[:50]

In [None]:
# build the vocab: de-duplicate the words, then put them in sorted order
vocab = list(set(words))
vocab.sort()

# sizes of the word list and of the vocab, kept for later cells
nWords,nLex = len(words),len(vocab)

print(f'{nWords} words')
print(f' {nLex} unique tokens')

# Create token dictionaries and encoder/decoder functions

In [None]:
# word -> index lookup (index is the word's position in vocab)
word2idx = dict(zip(vocab,range(len(vocab))))

# index -> word lookup (the inverse mapping)
idx2word = dict(enumerate(vocab))

# print out a few: every 87th (word,index) pair among the first 10000
for i in list(word2idx.items())[:10000:87]:
  print(i)

In [None]:
# encoder function (fills a numpy array straight from a generator of lookups)
def encoder(words,encode_dict):
  """Translate a sequence of words into a numpy array of integer indices.

  Parameters
  ----------
  words : sized iterable of hashable items (len() is called on it)
  encode_dict : mapping from each word to its integer index

  Returns
  -------
  1D numpy array (dtype=int) with one index per word, in the same order.

  Raises
  ------
  KeyError if a word is not a key of encode_dict.
  """

  # look up each word lazily; count pre-sizes the output array
  lookups = ( encode_dict[w] for w in words )
  return np.fromiter(lookups,dtype=int,count=len(words))


# decoder function: the inverse of the encoder
def decoder(idxs,decode_dict):
  """Translate indices back into words and return them as one space-separated string.

  Parameters
  ----------
  idxs : iterable of keys of decode_dict
  decode_dict : mapping from each index to its word (a string)

  Raises
  ------
  KeyError if an index is not a key of decode_dict.
  """
  lookup = decode_dict.__getitem__
  return ' '.join(map(lookup,idxs))


In [None]:
# encoder check: three words in, three vocab indices out
encoded_example = encoder(['the','time','machine'],word2idx)
print(encoded_example)

# decoder check: three indices in, one space-separated string out
decoded_example = decoder([1,3,10],idx2word)
print(decoded_example)

In [None]:
# test encode-then-decode

# number of consecutive words to round-trip (capped at the number of words available)
seqlen = min(10,nWords)

# random start location, limited so the whole window fits inside `words`
# (a start within the last seqlen-1 words would run past the end and raise an IndexError)
startidx = np.random.choice(nWords-seqlen+1)

# sequential word indices
idxs = np.arange(startidx,startidx+seqlen)

# end='\n\n' leaves one blank line after each printed result
print('Word indices:')
print(idxs,end='\n\n')

print('The words:')
wordseq = [ words[i] for i in idxs ]
print(wordseq,end='\n\n')

print('Token indices:')
tokenseq = encoder(wordseq,word2idx)
print(tokenseq,end='\n\n')

# decode (the last expression is displayed as the cell output)
print('Decoded text from indices:')
decoder(tokenseq,idx2word)