In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the NER corpus. Missing cells are forward-filled from the row above
# (this is what propagates a 'Sentence #' label onto every word row of that
# sentence). `.ffill()` replaces the deprecated `fillna(method='ffill')`.
df = pd.read_csv('data/ner_datasetreference.csv', encoding='latin').ffill()

In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

# Independent deep copy of the raw frame; the word-tokenization helpers in
# this cell read this module-level `dataset`.
dataset = df.copy(deep=True)


def _get_word_tokens():
        """
        Private method to generate word tokens.

        Joins the words of every sentence in the module-level `dataset` into
        one space-separated string, drops repeated sentences and fits a Keras
        `Tokenizer` on what is left.

        @return
        A single dictionary mapping each (lower-cased) word to its integer
        token. The inverse mapping is NOT returned; callers build it
        themselves when they need it.
        """

        # One joined string per sentence. `agg` produces one row per group,
        # whereas `transform` would repeat the joined sentence on every word
        # row before de-duplicating. `sort=False` keeps sentences in order of
        # first appearance, i.e. the order the tokenizer saw them in before.
        sentences = (
                dataset.groupby('Sentence #', sort=False)['Word']
                       .agg(' '.join)
                       .drop_duplicates()
        )

        # filters="" disables the tokenizer's character filtering, so
        # punctuation tokens such as '.' stay in the vocabulary.
        tokenizer = Tokenizer(filters="", lower=True, oov_token='<UNK>', char_level=False)
        tokenizer.fit_on_texts(list(sentences))

        return tokenizer.word_index


def get_tokenized_sentences(max_sentence_len):
        """
        Public method that turns every sentence of the module-level `dataset`
        into a fixed-length row of integer word tokens.

        Words are lower-cased and looked up in the vocabulary built by
        `_get_word_tokens()`. A word that is missing from that vocabulary is
        mapped to the '<UNK>' token instead of being left as NaN, because NaN
        becomes an arbitrary integer once the sequences are cast to integers
        during padding.

        @param max_sentence_len
        Number of tokens per output row. Shorter sentences are padded at the
        end with 0 ('<PAD>'); longer sentences are truncated at the end.

        @return
        A tuple `(padded_tokenized_sentences, word_to_idx)`: a 2-D integer
        array with one row per sentence and `max_sentence_len` columns, and
        the word -> token dictionary including the '<PAD>': 0 entry. Rows
        follow the groupby ordering of the 'Sentence #' labels.
        """

        pad_value = 0

        word_to_idx          = _get_word_tokens()
        word_to_idx['<PAD>'] = pad_value

        temp_dataset = dataset.copy()
        temp_dataset['word_token'] = (
                temp_dataset['Word'].str.lower()
                                    .map(word_to_idx)
                                    # unknown words -> '<UNK>' rather than NaN
                                    .fillna(word_to_idx['<UNK>'])
                                    .astype(int)
        )

        tokenized_sentences        = temp_dataset.groupby(['Sentence #'])['word_token'].apply(np.array)
        padded_tokenized_sentences = pad_sequences(
                tokenized_sentences,
                maxlen=max_sentence_len,
                value=pad_value,
                padding='post',
                truncating='post',
        )

        return padded_tokenized_sentences, word_to_idx

In [38]:
# Every sentence is padded/truncated to this many word tokens.
MAX_SENTENCE_LEN = 30

sentence_inputs, word_to_idx = get_tokenized_sentences(MAX_SENTENCE_LEN)

In [39]:
# Inverse vocabulary: integer token -> word (includes 0 -> '<PAD>').
idx_to_word = dict(zip(word_to_idx.values(), word_to_idx.keys()))

In [40]:
# Decode the first padded sentence back into text; every token (padding
# included) is followed by a single space.
string = "".join(idx_to_word[token] + " " for token in sentence_inputs[0])

string

'thousands of demonstrators have marched through london to protest the war in iraq and demand the withdrawal of british troops from that country . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> '

In [41]:
# Sanity check: (number of sentences, max_sentence_len).
sentence_inputs.shape

(47959, 30)

In [42]:
# Token ids of the first sentence; trailing zeros are the '<PAD>' value.
sentence_inputs[0]

array([ 259,    6,  974,   16, 1791,  237,  467,    7,  522,    2,  129,
          5,   61,    9,  575,    2,  832,    6,  185,   90,   22,   15,
         56,    3,    0,    0,    0,    0,    0,    0])

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

# Independent deep copy of the raw frame; the character-tokenization helpers
# in this cell read this module-level `dataset`.
dataset = df.copy(deep=True)


def _get_char_tokens():
        """
        Private method to generate character tokens.

        Fits a case-sensitive, character-level Keras `Tokenizer` on every
        entry of the 'Word' column of the module-level `dataset`.

        @return
        A single dictionary mapping each character to its integer token.
        The inverse mapping is not returned.
        """

        char_tokenizer = Tokenizer(lower=False, oov_token='<UNK>', char_level=True)
        char_tokenizer.fit_on_texts(list(dataset['Word'].values))

        return char_tokenizer.word_index



def get_tokenized_words(max_word_len):
        """
        Public method that converts every word of the module-level `dataset`
        into a list of integer character tokens, grouped per sentence.

        @param max_word_len
        Intended number of character tokens per word. NOTE: currently unused,
        because the padding step is not implemented yet (see the TODO below);
        every word keeps its natural length.

        @return
        A tuple `(tokenized_words, char_to_idx)`: a Series indexed by
        'Sentence #' whose values are object arrays holding one list of
        character tokens per word, and the character -> token dictionary
        including the '<PAD>': 0 entry.
        """

        pad_value = 0

        char_to_idx          = _get_char_tokens()
        char_to_idx['<PAD>'] = pad_value

        temp_dataset = dataset.copy()

        # Iterating a string yields its characters directly; this replaces the
        # `str.split("").str[1:-1]` trick, which relied on how an empty regex
        # pattern happens to split a string.
        temp_dataset['char_tokens'] = temp_dataset['Word'].apply(
                lambda word: [char_to_idx[char] for char in word]
        )
        tokenized_words = temp_dataset.groupby(['Sentence #'])['char_tokens'].apply(np.array)

        # TODO: pad/truncate every word to `max_word_len` (e.g. pad_sequences
        # with value=pad_value, padding='post', truncating='post') once the
        # target output shape has been decided.

        return tokenized_words, char_to_idx

In [4]:
# `pop` holds the per-sentence character tokens. NOTE(review): the argument
# (10) has no effect at the moment, because get_tokenized_words() never uses
# `max_word_len` in executed code.
pop, char_to_idx = get_tokenized_words(10)

In [30]:
# NOTE(review): the recorded output is a flat, integer-indexed Series with one
# entry per word, which does not match the per-sentence Series that
# get_tokenized_words() returns; it presumably comes from an earlier kernel
# state (execution count 30). Confirm by re-running from a fresh kernel.
pop['char_tokens']

0                    [27, 10, 7, 14, 8, 3, 6, 12, 8]
1                                            [7, 16]
2          [12, 2, 15, 7, 6, 8, 4, 9, 3, 4, 7, 9, 8]
3                                     [10, 3, 23, 2]
4                          [15, 3, 9, 13, 10, 2, 12]
                             ...                    
1048570                               [4, 10, 2, 19]
1048571               [9, 2, 8, 17, 7, 6, 12, 2, 12]
1048572                                       [4, 7]
1048573                                   [4, 10, 2]
1048574                         [3, 4, 4, 3, 13, 25]
Name: char_tokens, Length: 1048575, dtype: object

In [15]:
# Number of words in the first sentence. `pop` is labelled by 'Sentence #'
# strings, so select the first row by position explicitly with `.iloc`;
# an integer key in `pop[0]` relies on the deprecated positional fallback.
len(pop.iloc[0])

24

In [13]:
# NOTE(review): this cell raises the ValueError recorded in its output.
# np.pad needs one (before, after) pair per array dimension; three pairs are
# passed here, but the error reports that a single pair was expected, i.e.
# `pop[:5]` is seen as a 1-D (ragged, object) sequence rather than a 3-D array.
# The per-word token lists would have to be padded to a rectangular shape first.
data1=np.pad(pop[:5], ((0, 0), (10,11), (20, 21)), 'constant')


ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (3,2) and requested shape (1,2)

In [167]:
# NOTE(review): the recorded output is a DataFrame with 'Sentence #' and
# 'listvalues' columns, which no visible cell creates; it presumably comes
# from a removed cell (execution count 167). Confirm after Restart & Run All.
pop

Unnamed: 0,Sentence #,listvalues
0,Sentence: 1,"[[27, 10, 7, 14, 8, 3, 6, 12, 8], [7, 16], [12..."
1,Sentence: 10,"[[30, 9, 3, 6, 5, 3, 6], [7, 16, 16, 5, 13, 5,..."
2,Sentence: 100,"[[41, 2, 11, 5, 13, 7, 17, 4, 2, 9], [18, 14, ..."
3,Sentence: 1000,"[[27, 10, 2, 19], [11, 2, 16, 4], [3, 16, 4, 2..."
4,Sentence: 10000,"[[36, 20, 40, 20], [9, 2, 11, 5, 2, 16], [13, ..."
...,...,...
47954,Sentence: 9995,"[[50, 17, 17, 7, 8, 5, 4, 5, 7, 6], [11, 2, 3,..."
47955,Sentence: 9996,"[[50, 6], [27, 10, 14, 9, 8, 12, 3, 19], [24],..."
47956,Sentence: 9997,"[[45, 7, 11, 11, 7, 21, 5, 6, 18], [30, 9, 3, ..."
47957,Sentence: 9998,"[[26, 5, 6, 13, 2], [4, 10, 2, 6], [24], [3, 1..."


In [142]:
# The labels are ordered as strings ('Sentence: 1', 'Sentence: 10',
# 'Sentence: 100', ...), not by sentence number.
pop.index


Index(['Sentence: 1', 'Sentence: 10', 'Sentence: 100', 'Sentence: 1000',
       'Sentence: 10000', 'Sentence: 10001', 'Sentence: 10002',
       'Sentence: 10003', 'Sentence: 10004', 'Sentence: 10005',
       ...
       'Sentence: 9990', 'Sentence: 9991', 'Sentence: 9992', 'Sentence: 9993',
       'Sentence: 9994', 'Sentence: 9995', 'Sentence: 9996', 'Sentence: 9997',
       'Sentence: 9998', 'Sentence: 9999'],
      dtype='object', name='Sentence #', length=47959)

In [126]:
# Minimal pad_sequences experiment: two length-2 sequences padded to length 3
# with the default settings.
a = [[1, 2], [2, 4]]

b = pad_sequences(a, maxlen=3)

In [127]:
# With the defaults the padding value 0 is inserted at the front of the row.
b[0]

array([0, 1, 2])