In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np
from seqeval.metrics import classification_report,accuracy_score,f1_score
from tqdm import tqdm,trange
from keras.preprocessing.sequence import pad_sequences
from transformers import TF2_WEIGHTS_NAME, BertConfig, BertTokenizer, TFBertForTokenClassification, create_optimizer

In [7]:
# dataset https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/home
# read dataset
data = pd.read_csv("ner_dataset.csv",sep=",",encoding="latin1")
data.shape

(1048575, 4)

In [8]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [9]:
# filling sentence # using forward fill
data['Sentence #'] = data['Sentence #'].fillna(method='ffill')

In [10]:
data.sample(3)

Unnamed: 0,Sentence #,Word,POS,Tag
865692,Sentence: 39558,that,IN,O
451188,Sentence: 20623,nearly,RB,O
353462,Sentence: 16171,ruled,VBN,O


In [11]:
data['Sentence_No'] = data['Sentence #'].str.split(':').str[1].astype(int)
data.head(3)

Unnamed: 0,Sentence #,Word,POS,Tag,Sentence_No
0,Sentence: 1,Thousands,NNS,O,1
1,Sentence: 1,of,IN,O,1
2,Sentence: 1,demonstrators,NNS,O,1


In [12]:
data['Sentence_No'].describe()

count    1.048575e+06
mean     2.396473e+04
std      1.383697e+04
min      1.000000e+00
25%      1.199000e+04
50%      2.397800e+04
75%      3.595100e+04
max      4.795900e+04
Name: Sentence_No, dtype: float64

In [13]:
# selecting small sample of dataset to decrease traning time
SAMPLE_PROP = 0.2
sub_sample_sent = int(data['Sentence_No'].nunique()*SAMPLE_PROP)

# unique sentence number
unique_sent_no = list(set(data['Sentence_No'].values))
select_sent = np.random.choice(unique_sent_no, sub_sample_sent)

# select train and test from selected sentences
TRAIN_SIZE = 0.8
train_sent_count = int(TRAIN_SIZE*len(select_sent))
train_sent = np.random.choice(select_sent, train_sent_count).tolist()
test_sent = list(set(select_sent)-set(train_sent))

# sample dataset
train_df = data[data['Sentence_No'].isin(train_sent)].copy()
test_df = data[data['Sentence_No'].isin(test_sent)].copy()

In [14]:
test_df['Sentence_No'].nunique(),train_df['Sentence_No'].nunique()

(3747, 4948)

In [15]:
train_df.head(2)

Unnamed: 0,Sentence #,Word,POS,Tag,Sentence_No
24,Sentence: 2,Families,NNS,O,2
25,Sentence: 2,of,IN,O,2


In [16]:
test_df.head(2)

Unnamed: 0,Sentence #,Word,POS,Tag,Sentence_No
221,Sentence: 11,The,DT,O,11
222,Sentence: 11,step,NN,O,11


In [17]:
train_df.shape,test_df.shape

((108116, 5), (81827, 5))

In [18]:
# grouping by sentence #
agg_func = lambda s: [ [w,t] for w,t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]
train_grp = train_df.groupby("Sentence #").apply(agg_func)
test_grp =  test_df.groupby("Sentence #").apply(agg_func)

In [19]:
train_grp.head()

Sentence #
Sentence: 10       [[Iranian, B-gpe], [officials, O], [say, O], [...
Sentence: 10002    [[He, O], [said, O], [last, O], [week, O], ['s...
Sentence: 1002     [[Three, O], [major, O], [banks, O], [have, O]...
Sentence: 10048    [[For, O], [the, O], [last, O], [four, B-tim],...
Sentence: 10050    [[A, O], [recent, O], [report, O], [by, O], [t...
dtype: object

In [20]:
# get sentence
train_sent = [[s[0] for s in sent] for sent in train_grp.values]
test_sent = [[s[0] for s in sent] for sent in test_grp.values]

In [21]:
# get tags
train_tags = [[t[1] for t in tag] for tag in train_grp.values]
test_tags = [[t[1] for t in tag] for tag in test_grp.values]

In [22]:
len(train_tags),len(test_tags)

(4948, 3747)

In [23]:
tag_list= list(set(train_df['Tag'].values))
label_map = {label: i for i, label in enumerate(tag_list)}

In [26]:
label_map

{'B-art': 4,
 'B-eve': 5,
 'B-geo': 14,
 'B-gpe': 16,
 'B-nat': 12,
 'B-org': 3,
 'B-per': 1,
 'B-tim': 13,
 'I-art': 15,
 'I-eve': 6,
 'I-geo': 7,
 'I-gpe': 11,
 'I-nat': 2,
 'I-org': 8,
 'I-per': 10,
 'I-tim': 9,
 'O': 0}

In [27]:
num_labels = len(tag_list) + 1
num_labels

18

In [28]:
MAX_LENGTH=128
BERT_MODEL="bert-base-multilingual-cased"
BATCH_SIZE=4
pad_token=0,
pad_token_segment_id=0,
sequence_a_segment_id=0,

In [29]:
# BERT  
MODEL_CLASSES = {"bert": (BertConfig, TFBertForTokenClassification, BertTokenizer)}

In [30]:
pad_token_label_id = 0
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
config = config_class.from_pretrained(BERT_MODEL,num_labels=num_labels)
tokenizer = tokenizer_class.from_pretrained(BERT_MODEL,do_lower_case=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




In [None]:
# Load pretrained bert model
model = model_class.from_pretrained(BERT_MODEL,from_pt=bool(".bin" in BERT_MODEL),config=config)

In [32]:
 model.layers[-1].activation = tf.keras.activations.softmax

In [80]:
# prepare input and output for bert model
max_seq_length =128

def convert_to_input(sentences,tags):
  input_id_list,attention_mask_list,token_type_id_list=[],[],[]
  label_id_list=[]
  
  for x,y in tqdm(zip(sentences,tags),total=len(tags)):
  
    tokens = []
    label_ids = []

    for word, label in zip(x, y):
      word_tokens = tokenizer.tokenize(word)
      tokens.extend(word_tokens)
      # Use the real label id for the first token of the word, and padding ids for the remaining tokens
      label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

  

    special_tokens_count =  2          # CLS & SEP token
    if len(tokens) > max_seq_length - special_tokens_count:
      tokens = tokens[: (max_seq_length - special_tokens_count)]
      label_ids = label_ids[: (max_seq_length - special_tokens_count)]

    # begining and end CLS SEP token will no label 0 for 'O'
    label_ids = [pad_token_label_id]+label_ids+[pad_token_label_id]
    
    inputs = tokenizer.encode_plus(tokens,add_special_tokens=True, max_length=max_seq_length,truncation=True)

    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_masks = [1] * len(input_ids)

    attention_mask_list.append(attention_masks)
    input_id_list.append(input_ids)
    token_type_id_list.append(token_type_ids)

    label_id_list.append(label_ids)

  # padding input sequences
  input_id_list = pad_sequences(input_id_list,maxlen=max_seq_length,dtype="long",truncating="post",padding="post")
  token_type_id_list = pad_sequences(token_type_id_list,maxlen=max_seq_length,dtype="long",truncating="post",padding="post")
  attention_mask_list = pad_sequences(attention_mask_list,maxlen=max_seq_length,dtype="long",truncating="post",padding="post")
  label_id_list = pad_sequences(label_id_list,maxlen=max_seq_length,dtype="long",truncating="post",padding="post")

  return input_id_list,token_type_id_list,attention_mask_list,label_id_list

In [81]:
input_ids_train,token_ids_train,attention_masks_train,label_ids_train=convert_to_input(train_sent,train_tags)

100%|██████████| 4948/4948 [00:06<00:00, 822.85it/s]


In [82]:
input_ids_test,token_ids_test,attention_masks_test,label_ids_test=convert_to_input(test_sent,test_tags)

100%|██████████| 3747/3747 [00:04<00:00, 829.39it/s]


In [83]:
np.shape(input_ids_train),np.shape(token_ids_train),np.shape(attention_masks_train),np.shape(label_ids_train),

((4948, 128), (4948, 128), (4948, 128), (4948, 128))

In [84]:
np.shape(input_ids_test),np.shape(token_ids_test),np.shape(attention_masks_test),np.shape(label_ids_test),

((3747, 128), (3747, 128), (3747, 128), (3747, 128))

In [89]:
N_BATCH = 4
N_EPOCH = 3

# train generator
def example_to_features(input_ids,attention_masks,token_type_ids,y):
  return {"input_ids": input_ids,"attention_mask": attention_masks,"token_type_ids": token_type_ids},y

train_ds = tf.data.Dataset.from_tensor_slices((input_ids_train,attention_masks_train,token_ids_train,label_ids_train)).map(example_to_features).shuffle(buffer_size=100).batch(N_BATCH).repeat(N_EPOCH)

# test generator
test_ds=tf.data.Dataset.from_tensor_slices((input_ids_test,attention_masks_test,token_ids_test,label_ids_test)).map(example_to_features).batch(1)


In [90]:
model.summary()

Model: "tf_bert_for_token_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  177853440 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  13842     
Total params: 177,867,282
Trainable params: 177,867,282
Non-trainable params: 0
_________________________________________________________________


In [91]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

In [92]:
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [93]:
history = model.fit(train_ds, epochs=N_EPOCH, validation_data=test_ds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [94]:
predict = model.predict(test_ds)

In [99]:
predict[0].shape

(3747, 128, 18)