In [1]:
%cd /content/drive/MyDrive/Colab Notebooks/nlp/apps/name_entity_recognition

/content/drive/MyDrive/Colab Notebooks/nlp/apps/name_entity_recognition


In [2]:
path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/sherlock_novels.txt'
testing_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/study in scarlet.txt'
ner_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/ner_dataset.csv'
words_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/words.txt'
tags_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/tags.txt'
sentences_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/sentences.txt'
labels_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/labels.txt'
output_dir = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/name_entity_recognition/models/lstm_ner'

In [3]:
!pip install -q -U trax

[K     |████████████████████████████████| 471kB 13.7MB/s 
[K     |████████████████████████████████| 174kB 58.6MB/s 
[K     |████████████████████████████████| 2.6MB 50.4MB/s 
[K     |████████████████████████████████| 71kB 12.2MB/s 
[K     |████████████████████████████████| 348kB 22.3MB/s 
[K     |████████████████████████████████| 1.1MB 60.1MB/s 
[K     |████████████████████████████████| 3.7MB 49.1MB/s 
[K     |████████████████████████████████| 1.4MB 50.4MB/s 
[K     |████████████████████████████████| 2.9MB 64.5MB/s 
[K     |████████████████████████████████| 890kB 48.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [4]:
import trax
import random
import pandas as pd
import numpy as np




# Get the data

The columns of the dataset are:

- The sentence number
- the word
- the part of speech of the word
- the tags


In [5]:
df = pd.read_csv(ner_path, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


# Understanding the tags

The tag_map corresponds to one of the possible tags a word can have. The prepositions in the tags mean:
* I: Token is inside an entity.
* B: Token begins an entity.

# Preprocess

We are going to create two files: one with the unique tags and one with the unique words 


In [6]:
def create_vocab(df, path='/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/'):
    """
    Takes the ner dataset and creates a txt file
    with words that appear at least once.
    """
    # Lowercase and upercase are treated as different words
    counts = df.Word.value_counts()

    # add the unknown and padding tokens
    words = list(counts.index) + ['UNK', '<pad>']
    with open(path + 'words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')



In [7]:
def create_tags(df, path='/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/'):
    """
    Takes the ner dataset and create a txt file with
    the different tags
    """
    tags = list(df.Tag.value_counts().index)
    with open(path + 'tags.txt', 'w') as f:
        for tag in tags:
            if tag == 'O':
                continue
            f.write(tag + '\n')

        


In [8]:
def create_sentences_labels(df, path='/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/'):
    """
    Takes the ner dataset and extract the sentences
    and the labels to their respective txt file
    """
    df_copy = df.copy()
    # Fill the na values with the sentence #
    df_copy.fillna(method='ffill', inplace=True)

    # Store the unique sentences in a list
    sentences = list(df_copy['Sentence #'].unique())
    df_copy.set_index('Sentence #', drop=True, inplace=True)

    # with the Sentence # column as index is easier to iterate
    # to get the individual sentences
    str_sentences = []
    labels = []
    print(f'Amount of sentences to process: {len(sentences)}')
    for i, sentence in enumerate(sentences):
        try:
            # get the individual words
            words = df_copy.loc[sentence].Word.values

            # get the individual labels
            ind_labels = df_copy.loc[sentence].Tag.values

            # Join the words and ind_labels in their
            # respective string
            str_sentence = ' '.join(words)
            label = ' '.join(ind_labels)
            str_sentences.append(str_sentence)
            labels.append(label)

            if i % 500 == 0:
                print(f'{i} sentences processed')
        except Exception as e:
            print(e)
            print(f'Error in sentence {i}')


    # Save the str_sentences and labels into individual
    # txt files
    print('----Saving sentences----')
    with open(path + 'sentences.txt', 'w') as f:
        for sentence in str_sentences:
            f.write(sentence + '\n')
    print('----Saving labels----')
    with open(path + 'labels.txt', 'w') as f:
        for label in labels:
            f.write(label + '\n')




In [9]:
def words_tags2index(words_path, tags_path):
    """
    Takes the words.txt and tags.txt files and 
    returns a dict mapping each word and token to
    a number
    """
    word_map = dict()
    tag_map = dict()
    with open(words_path) as f:
        for i, word in enumerate(f.readlines(), 1):
            word_map[word.strip()] = i
    
    with open(tags_path) as f:
        for i, tag in enumerate(f.readlines(), 1):
            tag_map[tag.strip()] = i
    
    # set the O tag to 0
    tag_map['O'] = 0
    
    return word_map, tag_map


# Split the data into train, val, test


In [10]:
def split_sentences(sentences_path, labels_path, ratio=0.9):
    """
    """
    with open(sentences_path) as f:
        sentences = f.readlines()

    with open(labels_path) as f:
        labels = f.readlines()

    # 90% train, 5% val and 5% test
    sen_len = len(sentences)
    train_split = int(sen_len * ratio)
    x_train = sentences[:train_split]
    y_train = labels[:train_split]

    val_split = sen_len - train_split
    val_split = int(val_split / 2)
    val_split = train_split + val_split
    
    x_val = sentences[train_split:val_split]
    y_val = labels[train_split:val_split]

    x_test = sentences[val_split:]
    y_test = labels[val_split:]

    return x_train, y_train, x_val, y_val, x_test, y_test
    



# Transform each sentence and labels to numbers


In [11]:
def transform2numbers(word_map, tag_map, sentences, tags):
    """
    """
    data = []
    labels = []
    for sentence, tag in zip(sentences, tags):
        # replace each token by its index
        # if it is in the word_map else
        # use the UNK token
        tokens = [word_map[token] if token in word_map else word_map['UNK'] for token in sentence.strip().split(' ')]
        label = [tag_map[token] for token in tag.strip().split(' ')]
        data.append(tokens)
        labels.append(label)

    return data, labels, len(data)
    


# Data generator

In [12]:
def data_generator(batch_size, x, y, pad, shuffle=False):
    # count the number of sentences
    num_sentences = len(x)

    # create an array with the indexes of the sentences that can be shuffle
    sentences_index = [*range(num_sentences)]
    if shuffle:
        random.shuffle(sentences_index)

    # track current location of x and y
    index = 0
    while True:
        # Temporal array to store the raw x data for this batch
        buffer_x = [0] * batch_size
        
        # Temporal array to store the raw y data for this batch
        buffer_y = [0] * batch_size

        # create the batches
        max_len = 0
        for i in range(batch_size):
            if index >= num_sentences:
                # reset index to 0
                index = 0
                if shuffle:
                    random.shuffle(sentences_index)

            buffer_x[i] = x[sentences_index[index]]
            buffer_y[i] = y[sentences_index[index]]

            # lenght of current x
            lenx = len(buffer_x[i])
            if lenx > max_len:
                max_len = lenx
            
            index += 1
        # create X,Y, NumPy arrays of size (batch_size, max_len) 'full' of pad value
        X = np.full((batch_size, max_len), pad)
        Y = np.full((batch_size, max_len), pad)
        
        # copy values from lists to NumPy arrays. Use the buffered values
        for i in range(batch_size):
            # get the example (sentence as a tensor)
            # in buffer_x at the i index
            x_i = buffer_x[i]

            # similarly, get the example's labels
            # in buffer_y at the i index
            y_i = buffer_y[i]

            # Walk through each word in x_i
            for j in range(len(x_i)):
                # store the word in x_i at position j into X
                X[i, j] = x_i[j]
                
                # store the label in y_i at position j into Y
                Y[i, j] = y_i[j]
        
        yield((X,Y))



# Create model

In [13]:
def create_model(tags=None, vocab_size=35181, emb_dim=100):
    model = trax.layers.Serial(
        trax.layers.Embedding(vocab_size, emb_dim),
        trax.layers.LSTM(emb_dim),
        trax.layers.Dense(len(tags)),
        trax.layers.LogSoftmax()
    )

    return model

# Train model

In [17]:
def create_data_streams(batch_size, train_data, train_labels, val_data, val_labels, word_map):

    # create training data mask pad id=35180 for training.
    train_generator = trax.data.inputs.add_loss_weights(
        data_generator(batch_size, train_data, train_labels, word_map['<pad>'], shuffle=True),
        id_to_mask=word_map['<pad>']
    )

    # create validation data
    val_generator = trax.data.inputs.add_loss_weights(
        data_generator(batch_size, val_data, val_labels, word_map['<pad>']),
        id_to_mask=word_map['pad']
    )

    return train_generator, val_generator

def train_model(model, train_generator, val_generator, n_steps, learning_rate=0.001, output_dir='/model'):
    print(f'This is the amount of steps needed to end traning: {n_steps}')

    train_task = trax.supervised.training.TrainTask(
        train_generator,
        loss_layer=trax.layers.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(learning_rate),
        n_steps_per_checkpoint=500
    )

    val_task = trax.supervised.training.EvalTask(
        labeled_data=val_generator,
        metrics=[trax.layers.CrossEntropyLoss(), trax.layers.Accuracy(),],
        n_eval_batches=10
    )

    training_loop = trax.supervised.training.Loop(
        model, 
        train_task,
        eval_tasks=[val_task],
        output_dir=output_dir
    )

    training_loop.run(n_steps)

    return training_loop

In [18]:
#create_vocab(df)
#create_tags(df)
#create_sentences_labels(df)

# Dicts word to index and tag to index
word_map, tag_map = words_tags2index(words_path, tags_path)

# with the sentences and labels created split the data
x_train, y_train, x_val, y_val, x_test, y_test = split_sentences(sentences_path, labels_path)

# Transform the splits into numbers
train_data, train_labels, train_size = transform2numbers(word_map, tag_map, x_train, y_train)
val_data, val_labels, val_size = transform2numbers(word_map, tag_map, x_val, y_val)
test_data, test_labels, test_size = transform2numbers(word_map, tag_map, x_test, y_test)




In [None]:
batch_size = 64
emb_dim = 50
epochs = 20
n_steps = int(len(train_data) / batch_size) * epochs

model = create_model(tags=tag_map)
train_generator, val_generator = create_data_streams(batch_size, train_data, train_labels, val_data, val_labels, word_map)
training_loop = train_model(model, train_generator, val_generator, n_steps=n_steps)

This is the amount of steps needed to end traning: 13480

Step   1000: Ran 500 train steps in 83.79 secs
Step   1000: train CrossEntropyLoss |  0.13515127
Step   1000: eval  CrossEntropyLoss |  0.06577097
Step   1000: eval          Accuracy |  0.49716882

Step   1500: Ran 500 train steps in 9.44 secs
Step   1500: train CrossEntropyLoss |  0.10438364
Step   1500: eval  CrossEntropyLoss |  0.06128471
Step   1500: eval          Accuracy |  0.46283979

Step   2000: Ran 500 train steps in 8.14 secs
Step   2000: train CrossEntropyLoss |  0.09457259
Step   2000: eval  CrossEntropyLoss |  0.05736920
Step   2000: eval          Accuracy |  0.51485153

Step   2500: Ran 500 train steps in 8.15 secs
Step   2500: train CrossEntropyLoss |  0.08399539
Step   2500: eval  CrossEntropyLoss |  0.06878502
Step   2500: eval          Accuracy |  0.51982425

Step   3000: Ran 500 train steps in 8.17 secs
Step   3000: train CrossEntropyLoss |  0.07628500
Step   3000: eval  CrossEntropyLoss |  0.06083496
Step   