In [1]:
# Make the apps folder the kernel's current working directory.
# NOTE(review): the path presumably requires Google Drive to be mounted at /content/drive — confirm.
%cd /content/drive/MyDrive/Colab Notebooks/nlp/apps

/content/drive/MyDrive/Colab Notebooks/nlp/apps


In [2]:
# Complete corpus (all the novels in a single file)
path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/sherlock_novels.txt'

# Only the first novel: a smaller file, used for testing purposes
testing_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/study in scarlet.txt'

# Preprocessing the corpus

These are the preprocessing steps that we are going to use:

- lowercase the text
- remove special characters
- split text to list of sentences
- split sentences into list of words

Note that we treat each line of the text as one sentence for this language model.

In [3]:
import nltk
import re

# Download NLTK's 'punkt' package (reported as already up-to-date when it is present).
# NOTE(review): presumably required by the nltk.word_tokenize call used for tokenization;
# newer NLTK releases may look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def remove_special(sentence):
    """
    Deletes every character that is not an ASCII letter, a digit,
    a space or one of the punctuation marks . , ? !
    Args:
        sentence: str. The text to clean
    returns:
        str. The same text with all other characters removed
    """
    disallowed = re.compile(r'[^a-zA-Z0-9.,?! ]+')

    return disallowed.sub('', sentence)

def get_text(path, encoding='utf-8'):
    """
    Reads a txt file and returns its whole content as a single string.
    Args:
        path: str. Location of the txt file
        encoding: str. Codec used to decode the file. Defaults to 'utf-8'
            so the result does not depend on the platform's locale
    returns:
        text: str
    """
    # An explicit encoding keeps the decoded text identical across machines;
    # without it open() falls back to a locale-dependent default.
    with open(path, encoding=encoding) as f:
        text = f.read()

    return text

def get_sentences(text):
    """
    Lowercases a whole text, splits it on newlines and cleans every line
    with remove_special(). Each remaining line is treated as one sentence.
    Lines that are empty, contain only whitespace, or are left blank once
    the special characters are removed are dropped.
    Args:
        text: str
    returns:
        sentences: list of str
    """
    sentences = []
    for line in text.lower().split('\n'):
        line = line.strip()
        # Strip BEFORE testing for emptiness: a whitespace-only line (for example
        # a stray carriage return) must not survive as an empty sentence.
        if not line:
            continue

        cleaned = remove_special(line)
        # A line made only of special characters is blank after cleaning; skip it too.
        if cleaned.strip():
            sentences.append(cleaned)

    return sentences

# Uncomment to see the first 10 sentences

# text = get_text(testing_path)
# sentences = get_sentences(text)
# for s in sentences[:10]:
#     print(s)


# Tokenize the corpus

In [5]:
def tokenize(sentences):
    """
    Splits every sentence into tokens with nltk.word_tokenize.
    Args:
        sentences: list of str
    returns:
        tokenized_sentences: list with one list of tokens per sentence
    """
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentences.append(nltk.word_tokenize(sentence))

    return tokenized_sentences

# Uncomment to test

# text = get_text(testing_path)
# sentences = get_sentences(text)
# tokenized_sentences = tokenize(sentences[:10])
# for s in tokenized_sentences:
#     print(s)

# Create training and test datasets

In [6]:
import random

# First create a function to get the tokens
def get_tokens(path):
    """
    Runs the preprocessing pipeline on a txt file by chaining
    get_text(), get_sentences() and tokenize(), in that order.
    Args:
        path: str. Location of the txt file
    returns:
        tokenized_sentences: list, the output of tokenize()
    """
    return tokenize(get_sentences(get_text(path)))

# Read and tokenize the file at `path`; the cells below shuffle and split this list
tokenized_sentences = get_tokens(path)
    

In [7]:
# Seed the global random generator so the shuffle gives the same order on every fresh run
random.seed(10)
# Shuffles the list in place.
# NOTE(review): re-running only this cell shuffles the already-shuffled list again,
# so the order will differ from the one obtained after restarting and running all cells.
random.shuffle(tokenized_sentences)
print(f'Amount of sentences {len(tokenized_sentences)}')

Amount of sentences 60198


### Because the corpus is big enough, we can hold out just 10% of the sentences for testing

In [10]:
# The first 90% of the sentences go to training; the remaining ones are kept for testing
size = int(0.9 * len(tokenized_sentences))
train, test = tokenized_sentences[:size], tokenized_sentences[size:]

print(f'Training size: {len(train)}')
print(f'Testing size: {len(test)}')

Training size: 54178
Testing size: 6020
