In [4]:
%cd /content/drive/MyDrive/Colab Notebooks/nlp/apps/name_entity_recognition

/content/drive/MyDrive/Colab Notebooks/nlp/apps/name_entity_recognition


In [5]:
path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/sherlock_novels.txt'
testing_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/study in scarlet.txt'
ner_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/ner_dataset.csv'
words_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/words.txt'
tags_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/tags.txt'

In [6]:
!pip install -q -U trax

In [7]:
import trax
import random
import pandas as pd
import numpy as np




# Get the data

The columns of the dataset are:

- The sentence number
- the word
- the part of speech of the word
- the tags


In [8]:
df = pd.read_csv(ner_path, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


# Preprocess

We are going to create two files: one with the unique tags and one with the unique words 


In [9]:
def create_vocab(df, path='/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/'):
    """
    Takes the ner dataset and creates a txt file
    with words that appear at least once.
    """
    # Lowercase and upercase are treated as different words
    counts = df.Word.value_counts()

    # add the unknown and padding tokens
    words = list(counts.index) + ['UNK', '<pad>']
    with open(path + 'words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')

#create_vocab(df)

In [10]:
def create_tags(df, path='/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/'):
    """
    Takes the ner dataset and create a txt file with
    the different tags
    """
    tags = list(df.Tag.value_counts().index)
    with open(path + 'tags.txt', 'w') as f:
        for tag in tags:
            if tag == 'O':
                continue
            f.write(tag + '\n')

#create_tags(df)
        


In [11]:
def get_sentences_labels(df, path='/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/'):
    """
    Takes the ner dataset and extract the sentences
    and the labels to their respective txt file
    """
    df_copy = df.copy()
    # Fill the na values with the sentence #
    df_copy.fillna(method='ffill', inplace=True)

    # Store the unique sentences in a list
    sentences = list(df_copy['Sentence #'].unique())
    df_copy.set_index('Sentence #', drop=True, inplace=True)

    # with the Sentence # column as index is easier to iterate
    # to get the individual sentences
    str_sentences = []
    labels = []
    print(f'Amount of sentences to process: {len(sentences)}')
    for i, sentence in enumerate(sentences):
        try:
            # get the individual words
            words = df_copy.loc[sentence].Word.values

            
            # get the individual labels
            ind_labels = df_copy.loc[sentence].Tag.values

            # Join the words and ind_labels in their
            # respective string
            str_sentence = ' '.join(words)
            label = ' '.join(ind_labels)
            str_sentences.append(str_sentence)
            labels.append(label)

            if i % 500 == 0:
                print(f'{i} sentences processed')
        except Exception as e:
            print(e)
            print(f'Error in sentence {i}')


    # Save the str_sentences and labels into individual
    # txt files
    print('----Saving sentences----')
    with open(path + 'sentences.txt', 'w') as f:
        for sentence in str_sentences:
            f.write(sentence + '\n')
    print('----Saving labels----')
    with open(path + 'labels.txt', 'w') as f:
        for label in labels:
            f.write(label + '\n')


In [12]:
get_sentences_labels(df)

Amount of sentences to process: 47959
0 sentences processed
500 sentences processed
1000 sentences processed
1500 sentences processed
2000 sentences processed
2500 sentences processed
3000 sentences processed
3500 sentences processed
4000 sentences processed
4500 sentences processed
5000 sentences processed
5500 sentences processed
6000 sentences processed
6500 sentences processed
7000 sentences processed
7500 sentences processed
8000 sentences processed
'str' object has no attribute 'values'
Error in sentence 8411
8500 sentences processed
9000 sentences processed
9500 sentences processed
10000 sentences processed
10500 sentences processed
11000 sentences processed
11500 sentences processed
12000 sentences processed
12500 sentences processed
13000 sentences processed
13500 sentences processed
14000 sentences processed
14500 sentences processed
15000 sentences processed
15500 sentences processed
16000 sentences processed
16500 sentences processed
17000 sentences processed
17500 sentence

once we have our tags and words, we need to map them into numbers that we can fit in our model.


In [13]:
def words_tags2index(words_path, tags_path):
    """
    Takes the words.txt and tags.txt files and 
    returns a dict mapping each word and token to
    a number
    """
    word_map = dict()
    tag_map = dict()
    with open(words_path) as f:
        for i, word in enumerate(f.readlines(), 1):
            word_map[word.strip()] = i
    
    with open(tags_path) as f:
        for i, tag in enumerate(f.readlines(), 1):
            tag_map[tag.strip()] = i
    
    # set the O tag to 0
    tag_map['O'] = 0
    
    return word_map, tag_map


In [14]:
word_map, tag_map = words_tags2index(words_path, tags_path)

# Understanding the tags

The tag_map corresponds to one of the possible tags a word can have. The prepositions in the tags mean:
* I: Token is inside an entity.
* B: Token begins an entity.