In [61]:
# Change the notebook's working directory to the folder given below.
%cd /content/drive/MyDrive/Colab Notebooks/nlp/apps/name_entity_recognition

/content/drive/MyDrive/Colab Notebooks/nlp/apps/name_entity_recognition


In [62]:
# All data files live in the same folder; build every path from DATA_DIR so
# relocating the data only requires changing this one constant.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data'

path = f'{DATA_DIR}/sherlock_novels.txt'
testing_path = f'{DATA_DIR}/study in scarlet.txt'
# CSV with the NER dataset (loaded into `df` below)
ner_path = f'{DATA_DIR}/ner_dataset.csv'
# Vocabulary files read by words_tags2index
words_path = f'{DATA_DIR}/words.txt'
tags_path = f'{DATA_DIR}/tags.txt'
# Sentence / label files read by split_sentences
sentences_path = f'{DATA_DIR}/sentences.txt'
labels_path = f'{DATA_DIR}/labels.txt'

In [63]:
# Install (or upgrade to) the latest trax release, quietly.
# NOTE(review): the version is not pinned, so a re-run may pull a different
# release; consider pinning once a known-good version is identified.
!pip install -q -U trax

In [64]:
import trax
import random
import pandas as pd
import numpy as np


# Get the data

The columns of the dataset are:

- `Sentence #`: the sentence number
- `Word`: the word
- `POS`: the part of speech of the word
- `Tag`: the named-entity tag of the word


In [65]:
# Load the NER dataset; the CSV is decoded as ISO-8859-1
df = pd.read_csv(ner_path, encoding='ISO-8859-1')
# Preview the first rows
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


# Understanding the tags

The `tag_map` maps each possible tag a word can have to an integer index. The prefixes in the tags mean:
* I: Token is inside an entity.
* B: Token begins an entity.
* O (no prefix): Token is outside any entity.

# Preprocess

We are going to create two vocabulary files: one with the unique tags and one with the unique words. We then write the sentences and their labels to two more files, one sentence per line.


In [66]:
def create_vocab(df, path='/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/'):
    """
    Write the vocabulary of the ner dataset to ``words.txt``.

    Every distinct value of ``df.Word`` is written on its own line, in the
    order ``value_counts()`` returns them (most frequent first), followed
    by the special tokens ``UNK`` and ``<pad>``. Casing is preserved, so
    lowercase and uppercase spellings are different entries.

    Args:
        df: DataFrame with a ``Word`` column.
        path: Prefix prepended to ``words.txt``; when passing a directory
            it must end with the path separator.
    """
    # value_counts() skips missing values, so NaN never reaches the file.
    # str() keeps a non-string entry from breaking the concatenation below.
    vocabulary = [str(word) for word in df.Word.value_counts().index]

    # add the unknown and padding tokens
    vocabulary += ['UNK', '<pad>']

    # Explicit UTF-8: the words may contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to be able to store them.
    with open(path + 'words.txt', 'w', encoding='utf-8') as f:
        f.writelines(word + '\n' for word in vocabulary)



In [67]:
def create_tags(df, path='/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/'):
    """
    Write the distinct tags of the ner dataset to ``tags.txt``,
    one per line, leaving out the ``O`` tag.

    Tags are written in the order ``value_counts()`` returns them.
    ``path`` is prepended as-is to the file name.
    """
    # Every tag except 'O' goes to the file
    entity_tags = (name for name in df.Tag.value_counts().index if name != 'O')
    with open(path + 'tags.txt', 'w') as out:
        out.writelines(name + '\n' for name in entity_tags)

        


In [68]:
def create_sentences_labels(df, path='/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/'):
    """
    Takes the ner dataset and writes every sentence to ``sentences.txt``
    and its tags to ``labels.txt``.

    Line ``k`` of ``sentences.txt`` holds the words of one sentence joined
    by single spaces; line ``k`` of ``labels.txt`` holds the tags of those
    same words, also joined by single spaces. Sentences appear in the
    order their ``Sentence #`` value first occurs in ``df``.

    Args:
        df: DataFrame with ``Sentence #``, ``Word`` and ``Tag`` columns.
            It is not modified.
        path: Prefix prepended to both file names; when passing a
            directory it must end with the path separator.
    """
    # Only the first row of a sentence carries its "Sentence #" value;
    # forward-fill so every row knows which sentence it belongs to.
    # ffill() returns a new frame, so the caller's df stays untouched.
    # NOTE(review): this forward-fills every column, so a missing Word or
    # Tag is replaced by the previous row's value - confirm that is wanted.
    filled = df.ffill()

    # A single groupby pass replaces one index lookup per sentence, and it
    # also handles one-word sentences, which a label lookup would return
    # as a single row instead of a frame. sort=False keeps the sentences
    # in order of first appearance.
    grouped = filled.groupby('Sentence #', sort=False)
    print(f'Amount of sentences to process: {grouped.ngroups}')

    str_sentences = []
    labels = []
    for i, (_, rows) in enumerate(grouped):
        try:
            # Join the words and the tags in their respective string
            str_sentence = ' '.join(rows['Word'].tolist())
            label = ' '.join(rows['Tag'].tolist())
        except TypeError as e:
            # A non-string word/tag cannot be joined: report the sentence
            # and skip it so both files stay aligned line by line.
            print(e)
            print(f'Error in sentence {i}')
            continue

        str_sentences.append(str_sentence)
        labels.append(label)

        if i % 500 == 0:
            print(f'{i} sentences processed')

    # Save the str_sentences and labels into individual txt files.
    # Explicit UTF-8 because the words may contain non-ASCII characters.
    print('----Saving sentences----')
    with open(path + 'sentences.txt', 'w', encoding='utf-8') as f:
        f.writelines(sentence + '\n' for sentence in str_sentences)
    print('----Saving labels----')
    with open(path + 'labels.txt', 'w', encoding='utf-8') as f:
        f.writelines(label + '\n' for label in labels)




In [69]:
def words_tags2index(words_path, tags_path):
    """
    Takes the words.txt and tags.txt files and returns two dicts,
    one mapping each word and one mapping each tag to a number.

    Entries are numbered from 1 in file order (one entry per line). The
    ``O`` tag is added to the tag dict with index 0.

    Args:
        words_path: Path of the file with one word per line.
        tags_path: Path of the file with one tag per line.

    Returns:
        ``(word_map, tag_map)``.
    """
    def _read_entries(file_path):
        # Drop only the line terminator. A bare strip() would also eat
        # entries made of whitespace-like characters (e.g. a non-breaking
        # space), collapsing them all onto the '' key.
        with open(file_path, encoding='utf-8') as f:
            return [line.rstrip('\n') for line in f]

    word_map = {word: i for i, word in enumerate(_read_entries(words_path), 1)}
    tag_map = {tag: i for i, tag in enumerate(_read_entries(tags_path), 1)}

    # set the O tag to 0
    tag_map['O'] = 0

    return word_map, tag_map


# Split the data into train, val, test


In [70]:
def split_sentences(sentences_path, labels_path, ratio=0.9):
    """
    Read the sentence and label files and cut them, in file order,
    into train, validation and test portions.

    The first ``int(n * ratio)`` lines form the training portion; the
    remainder is halved, the first half (rounded down) going to
    validation and the rest to test. Lines are returned exactly as read,
    i.e. still carrying their trailing newline.

    Returns:
        ``x_train, y_train, x_val, y_val, x_test, y_test``
    """
    with open(sentences_path) as source:
        all_sentences = source.readlines()
    with open(labels_path) as source:
        all_labels = source.readlines()

    # With the default ratio: 90% train, 5% val and 5% test
    total = len(all_sentences)
    train_end = int(total * ratio)
    val_end = train_end + int((total - train_end) / 2)

    # The same three windows are applied to sentences and labels
    windows = (
        slice(None, train_end),
        slice(train_end, val_end),
        slice(val_end, None),
    )
    portions = []
    for window in windows:
        portions.append(all_sentences[window])
        portions.append(all_labels[window])

    return tuple(portions)
    



# Transform each sentence and labels to numbers


In [71]:
def transform2numbers(word_map, tag_map, sentences, tags):
    """
    Replace every word and every tag by its index.

    Args:
        word_map: dict word -> index; must contain ``'UNK'``, whose index
            is used for any word that is not in the dict.
        tag_map: dict tag -> index; every tag found must be a key.
        sentences: iterable of strings, words separated by single spaces.
        tags: iterable of strings, tags separated by single spaces,
            parallel to ``sentences``.

    Returns:
        ``(data, labels, size)``: the list of index lists for the
        sentences, the list of index lists for the tags, and the number
        of sentences processed.
    """
    data = []
    labels = []
    for sentence, tag in zip(sentences, tags):
        # Remove only the line terminator before splitting. A bare strip()
        # would also delete a leading/trailing token made of whitespace-like
        # characters (e.g. a non-breaking space), leaving fewer tokens than
        # labels for that sentence.
        words = sentence.rstrip('\r\n').split(' ')

        # replace each token by its index if it is in the word_map,
        # else use the UNK token
        tokens = [word_map[token] if token in word_map else word_map['UNK'] for token in words]
        label = [tag_map[token] for token in tag.strip().split(' ')]
        data.append(tokens)
        labels.append(label)

    return data, labels, len(data)
    


In [72]:
# One-off preprocessing: uncomment to (re)generate words.txt, tags.txt,
# sentences.txt and labels.txt from df. The calls are disabled here, so the
# statements below read whatever files already exist at the configured paths.
#create_vocab(df)
#create_tags(df)
#create_sentences_labels(df)

# Dicts word to index and tag to index
word_map, tag_map = words_tags2index(words_path, tags_path)

# With the sentences and labels files in place, split the data into
# train, validation and test portions (raw text lines)
x_train, y_train, x_val, y_val, x_test, y_test = split_sentences(sentences_path, labels_path)

# Transform the splits into numbers: each call returns the index lists for
# the sentences, the index lists for the tags, and the number of sentences
train_data, train_labels, train_size = transform2numbers(word_map, tag_map, x_train, y_train)
val_data, val_labels, val_size = transform2numbers(word_map, tag_map, x_val, y_val)
test_data, test_labels, test_size = transform2numbers(word_map, tag_map, x_test, y_test)