In [1]:
import pandas as pd
import numpy as np

In [2]:
# Forward-fill every column so a row with a missing cell inherits the value of
# the row above it (presumably 'Sentence #' is only populated on the first
# token of each sentence -- TODO confirm against the CSV).
# DataFrame.ffill() is the supported spelling of fillna(method='ffill').
df = pd.read_csv('data/ner_datasetreference.csv', encoding='latin').ffill()

In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

dataset = df.copy()


def _get_word_tokens():
        """
        Private method to generate word tokens.

        Fits a Keras ``Tokenizer`` on the unique sentences of the module-level
        ``dataset``. A sentence is the ``Word`` values of one ``Sentence #``
        group joined with single spaces. Words are lower-cased and no
        characters are filtered out.

        @return
        A dictionary mapping each lower-cased word (plus the ``'<UNK>'``
        out-of-vocabulary token) to its integer token.
        """

        # Build one string per sentence instead of one per token row.
        # sort=False keeps the sentences in order of first appearance.
        sentences = (
            dataset.groupby(['Sentence #'], sort=False)['Word']
            .agg(' '.join)
            .drop_duplicates()
        )

        tokenizer = Tokenizer(filters="", lower=True, oov_token='<UNK>', char_level=False)
        tokenizer.fit_on_texts(list(sentences))

        return tokenizer.word_index


def get_tokenized_sentences(max_sentence_len):
        """
        Public method to turn every sentence of the module-level ``dataset``
        into a fixed-length row of word tokens.

        @param max_sentence_len
        Number of tokens per output row. Shorter sentences are padded at the
        end with the ``'<PAD>'`` token (0); longer ones are cut at the end.

        @return
        A tuple ``(padded_tokenized_sentences, word_to_idx)``: the array
        produced by ``pad_sequences`` with one row per ``Sentence #`` group,
        and the word -> token dictionary including the ``'<PAD>'`` entry.
        """

        pad_value = 0

        word_to_idx          = _get_word_tokens()
        word_to_idx['<PAD>'] = pad_value
        unk_value            = word_to_idx['<UNK>']

        temp_dataset = dataset.copy()
        # A word missing from the vocabulary maps to NaN. Send it to <UNK> and
        # restore an integer dtype so pad_sequences never has to cast NaN.
        temp_dataset['word_token'] = (
            temp_dataset['Word']
            .str.lower()
            .map(word_to_idx)
            .fillna(unk_value)
            .astype(int)
        )

        # groupby sorts its keys by default, so rows follow the sorted
        # 'Sentence #' labels rather than the order of the rows in the file.
        tokenized_sentences        = temp_dataset.groupby(['Sentence #'])['word_token'].apply(np.array)
        padded_tokenized_sentences = pad_sequences(tokenized_sentences, maxlen=max_sentence_len, value=pad_value, padding='post', truncating='post')

        return padded_tokenized_sentences, word_to_idx

In [38]:
sentence_inputs, word_to_idx = get_tokenized_sentences(30)

In [39]:
idx_to_word = {idx: word for word, idx in word_to_idx.items() }

In [40]:
# Decode the first padded sentence back into text. Every token, including the
# last one, is followed by a single space.
string = "".join(idx_to_word[i] + " " for i in sentence_inputs[0])

string

'thousands of demonstrators have marched through london to protest the war in iraq and demand the withdrawal of british troops from that country . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> '

In [41]:
sentence_inputs.shape

(47959, 30)

In [42]:
sentence_inputs[0]

array([ 259,    6,  974,   16, 1791,  237,  467,    7,  522,    2,  129,
          5,   61,    9,  575,    2,  832,    6,  185,   90,   22,   15,
         56,    3,    0,    0,    0,    0,    0,    0])

In [31]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

dataset = df.copy()


def _get_char_tokens():
        """
        Private method to build the character vocabulary.

        Fits a case-sensitive, character-level Keras ``Tokenizer`` on every
        ``Word`` entry of the module-level ``dataset``.

        @return
        A dictionary mapping each character (plus the ``'<UNK>'``
        out-of-vocabulary token) to its integer token.
        """

        char_tokenizer = Tokenizer(lower=False, oov_token='<UNK>', char_level=True)
        char_tokenizer.fit_on_texts(dataset['Word'].tolist())

        return char_tokenizer.word_index



def get_tokenized_words(max_word_len, max_sentence_len):
        """
        Public method to turn every sentence of the module-level ``dataset``
        into a fixed-size grid of character tokens.

        @param max_word_len
        Number of character tokens kept per word. Shorter words are padded at
        the end with the ``'<PAD>'`` token (0); longer ones are cut at the end.

        @param max_sentence_len
        Number of words kept per sentence. Shorter sentences are padded at the
        end with all-``'<PAD>'`` words; longer ones are cut at the end.

        @return
        A tuple ``(padded_tokenized_words, char_to_idx)``: an array of shape
        ``(number of sentences, max_sentence_len, max_word_len)`` and the
        character -> token dictionary including the ``'<PAD>'`` entry.
        """

        pad_value = 0

        char_to_idx          = _get_char_tokens()
        char_to_idx['<PAD>'] = pad_value

        temp_dataset = dataset.copy()
        # Iterating a string yields its characters directly, so no empty-pattern
        # split (whose result depends on the Python version) is needed.
        temp_dataset['char_tokens'] = temp_dataset['Word'].apply(
            lambda word: [char_to_idx[char] for char in word]
        )

        # First pad/cut every word of a sentence to max_word_len characters.
        # groupby sorts its keys by default, so sentences follow the sorted
        # 'Sentence #' labels rather than the order of the rows in the file.
        words_per_sentence     = temp_dataset.groupby(['Sentence #'])['char_tokens'].apply(np.array)
        padded_tokenized_words = words_per_sentence.apply(pad_sequences, maxlen=max_word_len, value=pad_value , padding='post', truncating='post')

        # Then pad/cut every sentence to max_sentence_len words, using an
        # all-<PAD> word as the filler.
        padded_tokenized_words = pad_sequences(padded_tokenized_words, maxlen=max_sentence_len, value=[pad_value]*max_word_len , padding='post', truncating='post')

        return padded_tokenized_words, char_to_idx

In [32]:
pop, char_to_idx = get_tokenized_words(5, 10)

In [33]:
pop.shape

(47959, 10, 5)

In [36]:
pop[9999] 

array([[27, 10,  2,  0,  0],
       [37,  9,  5,  4,  5],
       [ 3,  6, 12,  0,  0],
       [30,  9,  5,  8, 10],
       [17,  9,  5, 15,  2],
       [15,  5,  6,  5,  8],
       [ 8,  3, 19,  0,  0],
       [15, 14, 13, 10,  0],
       [10,  3,  8,  0,  0],
       [22,  2,  2,  6,  0]])

In [45]:
model_word_len = 10
model_sentence_len = 30

# All-zero placeholder word used to fill a sentence up to model_sentence_len.
empty_word = [0] * model_word_len

padded_sentences = []

for sentence in pop[:2]:
    # Pad/cut every word (row) of the sentence to model_word_len tokens.
    sentence = list(pad_sequences(sentence, maxlen=model_word_len, padding='post', truncating='post'))

    # Add the missing placeholder words in one step; a non-positive count
    # adds nothing, so sentences that are already long enough are unchanged.
    sentence.extend([empty_word] * (model_sentence_len - len(sentence)))

    padded_sentences.append(sentence)

        



<class 'list'>
<class 'list'>


In [48]:
np.array(padded_sentences[0])

array([[27, 10,  7, 14,  8,  3,  6, 12,  8,  0],
       [ 7, 16,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  2, 15,  7,  6,  8,  4,  9,  3,  4],
       [10,  3, 23,  2,  0,  0,  0,  0,  0,  0],
       [15,  3,  9, 13, 10,  2, 12,  0,  0,  0],
       [ 4, 10,  9,  7, 14, 18, 10,  0,  0,  0],
       [55,  7,  6, 12,  7,  6,  0,  0,  0,  0],
       [ 4,  7,  0,  0,  0,  0,  0,  0,  0,  0],
       [17,  9,  7,  4,  2,  8,  4,  0,  0,  0],
       [ 4, 10,  2,  0,  0,  0,  0,  0,  0,  0],
       [21,  3,  9,  0,  0,  0,  0,  0,  0,  0],
       [ 5,  6,  0,  0,  0,  0,  0,  0,  0,  0],
       [30,  9,  3, 44,  0,  0,  0,  0,  0,  0],
       [ 3,  6, 12,  0,  0,  0,  0,  0,  0,  0],
       [12,  2, 15,  3,  6, 12,  0,  0,  0,  0],
       [ 4, 10,  2,  0,  0,  0,  0,  0,  0,  0],
       [21,  5,  4, 10, 12,  9,  3, 21,  3, 11],
       [ 7, 16,  0,  0,  0,  0,  0,  0,  0,  0],
       [37,  9,  5,  4,  5,  8, 10,  0,  0,  0],
       [ 4,  9,  7,  7, 17,  8,  0,  0,  0,  0],
       [16,  9,  7, 

In [58]:
dataset = df.copy()

def _get_unique_tags():
        """
        Private method to collect the distinct tags.

        Reads the module-level ``dataset`` (the same frame ``get_tags``
        tokenizes) so the tag vocabulary always comes from the data being
        encoded.

        @return
        Array of the unique ``Tag`` values, in order of first appearance.
        """

        return dataset['Tag'].unique()

def get_tags(max_sentence_len):
        """
        Public method to turn the tags of every sentence of the module-level
        ``dataset`` into a fixed-length row of tag tokens.

        @param max_sentence_len
        Number of tag tokens per output row. Shorter sentences are padded at
        the end with 0; longer ones are cut at the end.

        @return
        A tuple ``(padded_tokenized_tags, tag_to_idx)``: the array produced by
        ``pad_sequences`` with one row per ``Sentence #`` group, and the
        tag -> token dictionary.
        """

        # Tokens start at 1; 0 is kept for "no tag" and is also the pad value.
        tag_to_idx = {tag: token for token, tag in enumerate(_get_unique_tags(), start=1)}

        # assign() returns a new frame, so the module-level dataset is untouched.
        tagged            = dataset.assign(tag_token=dataset['Tag'].map(tag_to_idx))
        tags_per_sentence = tagged.groupby(['Sentence #'])['tag_token'].apply(np.array)

        return (
            pad_sequences(tags_per_sentence, maxlen=max_sentence_len, value=0, padding='post', truncating='post'),
            tag_to_idx,
        )


In [60]:
tags, tag_to_idx = get_tags(15)

In [62]:
tags[0]

array([1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1])

In [63]:
tag_to_idx

{'O': 1,
 'B-geo': 2,
 'B-gpe': 3,
 'B-per': 4,
 'I-geo': 5,
 'B-org': 6,
 'I-org': 7,
 'B-tim': 8,
 'B-art': 9,
 'I-art': 10,
 'I-per': 11,
 'I-gpe': 12,
 'I-tim': 13,
 'B-nat': 14,
 'B-eve': 15,
 'I-eve': 16,
 'I-nat': 17}