In [1]:
%cd /content/drive/MyDrive/Colab Notebooks/nlp/apps

/content/drive/MyDrive/Colab Notebooks/nlp/apps


In [2]:
# Folder that holds the corpus text files
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data'

# Only the first novel: a smaller file, handy for quick tests
testing_path = DATA_DIR + '/study in scarlet.txt'

# The whole corpus
path = DATA_DIR + '/sherlock_novels.txt'

# Preprocessing the corpus

These are the preprocessing steps that we are going to use:

- lowercase the text
- remove special characters
- split text to list of sentences
- split sentences into list of words

Notice that we will consider each line as a sentence for this language model.

In [3]:
import nltk
import re

# nltk.word_tokenize() needs the Punkt tokenizer models. Recent NLTK releases (3.9+)
# load them from the 'punkt_tab' resource instead of 'punkt', so both are fetched
# to keep the notebook runnable on old and new NLTK versions alike.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def remove_special(sentence):
    """
    Drops every character that is not an ASCII letter, a digit,
    a space or one of the punctuation marks . , ? !
    Args:
        sentence: str. Text to clean
    returns:
        str. The same text without the disallowed characters
    """
    disallowed = re.compile(r'[^a-zA-Z0-9.,?! ]+')

    return disallowed.sub('', sentence)

def get_text(path, encoding='utf-8'):
    """
    It reads a txt file and returns a string with all the corpus
    Args:
        path: str. Location of the txt file
        encoding: str. Encoding used to decode the file (utf-8 by default)
    returns:
        text: str
    """
    # An explicit encoding keeps the decoded text independent of the
    # platform's locale default
    with open(path, encoding=encoding) as f:
        text = f.read()

    return text

def get_sentences(text):
    """
    Takes a whole text, lowercases it, splits it into lines, removes the
    special characters of every line and discards the lines left empty.
    Each remaining line is treated as one sentence.
    Args:
        text: str
    returns:
        sentences: list of str. One cleaned, non-empty sentence per line
    """
    sentences = []
    for line in text.lower().split('\n'):
        # Strip after cleaning so the spaces left behind by removed characters go too
        cleaned = remove_special(line).strip()
        # Filter on the cleaned line: whitespace-only lines and lines made only of
        # special characters are empty at this point and must not become sentences
        if cleaned:
            sentences.append(cleaned)

    return sentences

# Uncomment to see the 10 first sentences

# text = get_text(testing_path)
# sentences = get_sentences(text)
# for s in sentences[:10]:
#     print(s)


# Tokenize the corpus

In [5]:
def tokenize(sentences):
    """
    Splits every sentence into its tokens with nltk.word_tokenize
    Args:
        sentences: list. The sentences as strings
    returns:
        tokenized_sentences: list. One list of tokens per sentence,
            in the same order as the input
    """
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentences.append(nltk.word_tokenize(sentence))

    return tokenized_sentences

# Uncomment to test

# text = get_text(testing_path)
# sentences = get_sentences(text)
# tokenized_sentences = tokenize(sentences[:10])
# for s in tokenized_sentences:
#     print(s)

# Create training and test datasets

In [6]:
import random

# First create a function to get the tokens
def get_tokens(path):
    """
    Full pipeline from a txt file to tokens: the file is read with
    get_text(), split into cleaned sentences with get_sentences()
    and tokenized with tokenize().
    Args:
        path: str
    returns:
        tokenized_sentences: list
    """
    return tokenize(get_sentences(get_text(path)))

# Run the read -> clean -> tokenize pipeline on the whole corpus (not the smaller testing file)
tokenized_sentences = get_tokens(path)
    

In [7]:
# Fixed seed so the shuffle, and therefore the train/test split, is reproducible
random.seed(10)
# shuffle() reorders the list in place: re-running only this cell shuffles the already
# shuffled list, so re-run the get_tokens(path) cell first to get the same order again
random.shuffle(tokenized_sentences)
print(f'Amount of sentences {len(tokenized_sentences)}')

Amount of sentences 60198


### Because the corpus is big enough we can test using just 10% of the sentences

In [8]:
# Index where the split happens: the first 90% of the (already shuffled) sentences
size = int(len(tokenized_sentences) * 0.9)
# Training data is everything before the split index, testing data is the remaining 10%
train = tokenized_sentences[:size]
test = tokenized_sentences[size:]
print(f'Training size: {len(train)}')
print(f'Testing size: {len(test)}')

Training size: 54178
Testing size: 6020


# Count words

We are going to pass through each sentence and each token, counting each token's occurrences in the corpus.

This will help us to keep the tokens that appear at least N times in the corpus and also to calculate probabilities.

In [9]:
def get_token_counts(tokenized_sentences):
    """
    Counts how many times each token appears across all the sentences
    Args:
        tokenized_sentences: list. List of list of tokens
    returns:
        token_counts: dict. Maps each token to its number of occurrences,
            with the keys in order of first appearance
    """
    token_counts = {}
    for sentence in tokenized_sentences:
        for token in sentence:
            # dict.get covers the first occurrence (default 0) and the later ones alike
            token_counts[token] = token_counts.get(token, 0) + 1

    return token_counts

# Uncomment for testing

# to = get_tokens(testing_path)
# counts = get_token_counts(to)
# from collections import Counter
# c = Counter(counts)
# c.most_common(10)     

# Handling out of vocabulary words

Because it is probable that at some point we are going to encounter words that were not in our training dataset, we need to handle out of vocabulary words. Otherwise, we won't be able to predict the next word.

In this case, we are going to add an "unk" token, which is going to replace the words with fewer than N occurrences in the training data; the remaining words are going to be our vocab.


In [13]:
# Minimum number of occurrences a token needs to stay in the vocab (checked with >=)
threshold = 2

def add_unk_token(tokenized_sentences, vocab, unk_token):
    """
    Builds a copy of the sentences in which every token that is
    missing from the vocab is swapped for the unk token
    Args:
        tokenized_sentences: list. List of list of tokens
        vocab: set. Tokens that are kept untouched
        unk_token: str. Replacement for the out of vocabulary tokens
    returns:
        tokenized_sentences_with_unk: list. New list of list of tokens;
            the input sentences are not modified
    """
    tokenized_sentences_with_unk = [
        [token if token in vocab else unk_token for token in sentence]
        for sentence in tokenized_sentences
    ]

    return tokenized_sentences_with_unk

def create_new_sentences(tokenized_sentences, mode='train', vocab=None, threshold=2, unk_token='unk'):
    """
    It replaces the out of vocabulary tokens of the sentences with the unk token.
    In 'train' mode the vocab is built from the sentences themselves: every token
    with at least `threshold` occurrences is kept. In 'test' mode the vocab
    obtained from the training data has to be passed in and is reused as is.
    Args:
        tokenized_sentences: list. List of list of tokens
        mode: str. (train or test)
        vocab: set. Only used, and required, in 'test' mode
        threshold: int. Minimum occurrences to enter the vocab. Only used in 'train' mode
        unk_token: str
    returns:
        if mode == 'train':
            tokenized_sentences_with_unk: list. Updated with the unk token
            vocab: set
        if mode == 'test':
            tokenized_sentences_with_unk: list. Updated with the unk token
    raises:
        ValueError: if mode is neither 'train' nor 'test', or if mode is 'test'
            and no vocab is given
    """
    # Reject unknown modes explicitly instead of silently returning None
    if mode not in ('train', 'test'):
        raise ValueError(f"Wrong mode {mode!r}. options = train or test")

    if mode == 'train':
        token_counts = get_token_counts(tokenized_sentences)
        # A set allows faster membership checks than a list
        vocab = {word for word, count in token_counts.items() if count >= threshold}

        return add_unk_token(tokenized_sentences, vocab, unk_token), vocab

    # mode == 'test': the vocab has to come from the training step
    if vocab is None:
        raise ValueError("mode='test' needs the vocab built from the training data")

    return add_unk_token(tokenized_sentences, vocab, unk_token)

# Quick check on the smaller testing corpus: how many tokens were replaced by 'unk'

# NOTE: this cell is live code, so it (re)binds the global `vocab` to the vocab of the testing file
tokens = get_tokens(testing_path)
updated_tokens, vocab = create_new_sentences(tokens, threshold=threshold)
from collections import Counter
# Recount after the replacement so that 'unk' shows up as a token of its own
token_counts = get_token_counts(updated_tokens)
c = Counter(token_counts)
print(f'----Training-----')
print(c['unk'])
# Ten most frequent tokens, displayed as the cell output
c.most_common(10)


3239


[('unk', 3239),
 (',', 2953),
 ('the', 2521),
 ('.', 2383),
 ('and', 1348),
 ('of', 1206),
 ('to', 1082),
 ('a', 990),
 ('i', 895),
 ('he', 794)]

# Preprocess the data

In this step, we are going to join the functions that we have been creating to process our train and test datasets.

In [None]:
def preprocess(train, test, mode='train', threshold=2):
    """
    It takes the train and test datasets (list of list of tokens)
    and preprocesses them. We will end with a train and test datasets
    updated with the unk token and the vocab.
    The vocab is built from the train dataset only and is then reused
    for the test dataset, so unseen test words become the unk token.
    Args:
        train: list
        test: list
        mode: str. Not used by this function; kept so existing calls keep working
        threshold: int
    returns:
        train_sentences: list
        test_sentences: list
        vocab: set
    """
    train_sentences, vocab = create_new_sentences(train, mode='train', threshold=threshold)
    test_sentences = create_new_sentences(test, mode='test', vocab=vocab)

    return train_sentences, test_sentences, vocab

