In [1]:
import pandas as pd
import json
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

from transformers import (TFBertForSequenceClassification,
                          BertTokenizer, TFBertForTokenClassification, BertConfig)
from tqdm import tqdm
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [39]:
MAX_LEN = 180

In [40]:
tags = {'O':0, 'I-ORG':1, 'I-LOC':2, 'I-MISC':3, 'I-PER':4, 'B-MISC':5, 'B-LOC': 6, 'B-ORG':7, 'PAD':8}

In [41]:
indices_to_tags = {0:'O', 1:'I-ORG', 2:'I-LOC', 3:'I-MISC', 4:'I-PER', 5:'B-MISC', 6:'B-LOC', 7:'B-ORG', 8:'PAD'}

### Preprocessing of the CONLL-2003 dataset. This function is called only once at the beginning and then the dataset is saved to a json file.

In [None]:
def parse_dataset(file_name, save_file=False):
    with open(file_name, "r") as file:
        content = file.read()
    find_sentences = re.compile(r'(.*?)(?:\n{2})', re.MULTILINE | re.DOTALL)
    sentences = find_sentences.findall(content)
    parse_sentence = re.compile(r"(\S+)\s(\S+)\s\S+\s(\S+)")
    df = pd.DataFrame(columns=["sentence_idx", "sentences", "tags"])
    comb_sentences = []
    all_tags = []
    idxs = []
    for i, s in enumerate(sentences):
        tokens = parse_sentence.findall(s)
        curr_sentence = []
        curr_tags = []
        for token in tokens:
            word, pos, tag = token
            curr_sentence.append(word)
            curr_tags.append(tags[tag])
        idxs.append(i)
        comb_sentences.append(curr_sentence)
        all_tags.append(curr_tags)
    df["sentence_idx"] = idxs
    df["sentences"] = comb_sentences
    
    df["tags"] = all_tags
    if save_file:
        df.to_json("parsed_sl_{}.json".format(file_name.split(".")[-1]), orient='records')
    return df

In [None]:
# df = parse_dataset("../eng.train", True)
# df.head()
# df = pd.read_csv("parsed_train.csv")
# df.tag.unique()

In [None]:
pad_token = 0
pad_token_segment_id = 0
max_length = 180
def convert_to_input(reviews):
    input_ids, attention_masks, token_type_ids = [], [], []

    for x in tqdm(reviews, position=0, leave=True):
        inputs = bert_tokenizer.encode_plus(x, max_length=max_length)

        i, t = inputs["input_ids"], inputs["token_type_ids"]
        m = [1] * len(i)

        padding_length = max_length - len(i)
#         print("##########################X", x)
#         print("### I ", i)
#         print("### M ", m)
#         print("### T ", t)
        i = i + ([pad_token] * padding_length)
        m = m + ([0] * padding_length)
        t = t + ([pad_token_segment_id] * padding_length)
        
        input_ids.append(i)
        attention_masks.append(m)
        token_type_ids.append(t)

    return [np.asarray(input_ids),
            np.asarray(attention_masks)]

### Helper functions:

In [42]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = bert_tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the corresponding tag to the first token of the word and "PAD" tag to the rest of the tokens
        labels.append(label)
        labels.extend([tags["PAD"]] * (n_subwords - 1))
#         labels.extend([label] * n_subwords)

    return tokenized_sentence, labels


In [43]:
def example_to_features(input_ids, attention_masks, y):
    return {"input_ids": input_ids,
            "attention_mask": attention_masks}, y

def example_to_features_list(input_ids, attention_masks, y):
    return [input_ids, attention_masks], y

In [44]:
def example_to_features_new(input_ids,attention_masks,token_type_ids,y):
    return {"input_ids": input_ids,
          "attention_mask": attention_masks,
          "token_type_ids": token_type_ids},y


In [45]:
def convert_predictions(y_pred, indices_to_tags):
    conv_pred = []
    for pred in y_pred:
        conv_tmp = []
        for val in pred:
            val_arg_max = np.argmax(val)
            conv_tmp.append(val_arg_max)
        conv_pred.append(conv_tmp)
    return np.array(conv_pred)

### Load the pretrained Bert model and the Bert tokenizer.

In [46]:
config =BertConfig.from_pretrained("bert-base-cased",num_labels=9)
bert_model = TFBertForTokenClassification.from_pretrained("bert-base-cased", config=config)
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)

In [10]:
bert_model.layers[-1].activation = tf.keras.activations.softmax

### Read the train and test dataset from file

In [47]:
with open("parsed_sl_testa.json") as test, open("parsed_sl_train.json") as train:
    test_df = json.load(test)
    data = json.load(train)

### Split the train dataset into train and validation sets.

In [48]:
# train_inputs, val_inputs, train_tags, val_tags = train_test_split(train_ids, train_tags,
#                                                             random_state=2018, test_size=0.1)
# train_masks, val_masks, _, _ = train_test_split(attention_masks_train, train_ids,
#                                              random_state=2018, test_size=0.1)


data_train, data_val = train_test_split(data, random_state=69, test_size=0.2)


### Tokenize the three datasets, assign the appropriate labels to each token 

In [49]:
tokenized_texts_train = []
tokenized_labels_train = []
for sent_dict in tqdm(data, position=0, leave=True):
    sen, lab = tokenize_and_preserve_labels(sent_dict['sentences'], sent_dict['tags'])
    tokenized_texts_train.append(sen)
    tokenized_labels_train.append(lab)

100%|██████████| 14987/14987 [00:10<00:00, 1432.52it/s]


In [50]:
tokenized_texts_test = []
tokenized_labels_test = []
for sent_dict in tqdm(test_df, position=0, leave=True):
    sen, lab = tokenize_and_preserve_labels(sent_dict['sentences'], sent_dict['tags'])
    tokenized_texts_test.append(sen)
    tokenized_labels_test.append(lab)

100%|██████████| 3466/3466 [00:02<00:00, 1311.60it/s]


In [51]:
tokenized_texts_val = []
tokenized_labels_val = []
for sent_dict in tqdm(data_val, position=0, leave=True):
    sen, lab = tokenize_and_preserve_labels(sent_dict['sentences'], sent_dict['tags'])
    tokenized_texts_val.append(sen)
    tokenized_labels_val.append(lab)

100%|██████████| 2998/2998 [00:02<00:00, 1447.54it/s]


### Pad each sequence up to the previously defined MAX_LEN = 180. Create the lists with attention masks. Convert the datasets to tensorflow Dataset type.

In [52]:
train_ids = pad_sequences([bert_tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_train],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")
test_ids = pad_sequences([bert_tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_test],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")
val_ids = pad_sequences([bert_tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_val],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

train_tags = pad_sequences([label for label in tokenized_labels_train],
                          maxlen=MAX_LEN, dtype="long", value=tags["PAD"],
                          truncating="post", padding="post")
test_tags = pad_sequences([label for label in tokenized_labels_test],
                          maxlen=MAX_LEN, dtype="long", value=tags["PAD"],
                          truncating="post", padding="post")
val_tags = pad_sequences([label for label in tokenized_labels_val],
                          maxlen=MAX_LEN, dtype="long", value=tags["PAD"],
                          truncating="post", padding="post")

In [54]:
attention_masks_train = [[float(i > 0 and i < 8) for i in ii] for ii in train_ids]
attention_masks_test = [[float(i > 0 and i < 8) for i in ii] for ii in test_ids]
attention_masks_val = [[float(i > 0 and i < 8) for i in ii] for ii in val_ids]

In [55]:
train_ds = tf.data.Dataset.from_tensor_slices((tf.constant(train_ids), tf.constant(attention_masks_train), tf.constant(train_tags))).map(
        example_to_features).shuffle(100).batch(12).repeat(5)
val_ds = tf.data.Dataset.from_tensor_slices((tf.constant(val_ids), tf.constant(attention_masks_val), tf.constant(val_tags))).map(
        example_to_features).batch(12)
test_ds = tf.data.Dataset.from_tensor_slices((tf.constant(test_ids), tf.constant(attention_masks_test), tf.constant(test_tags))).map(
        example_to_features).batch(12)

### Configure the model and start training.

In [56]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

In [57]:
bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
bert_model.summary()

In [58]:
print("Fine-tuning BERT on CONLL2003 dataset")
bert_history = bert_model.fit(train_ds, epochs=2, validation_data=val_ds)

Fine-tuning BERT on CONLL2003 dataset
Train for 6245 steps, validate for 250 steps
Epoch 1/2
Epoch 2/2


### Evaluate and save the model.

In [59]:
results_true = test_ds.unbatch()
results_true = np.asarray([element[1].numpy() for element in results_true])
print(results_true)

[[0 8 8 ... 8 8 8]
 [0 8 8 ... 8 8 8]
 [2 8 8 ... 8 8 8]
 ...
 [0 0 0 ... 8 8 8]
 [0 1 8 ... 8 8 8]
 [0 8 1 ... 8 8 8]]


In [60]:
results = bert_model.predict(test_ds)

In [61]:
results_predicted = convert_predictions(results, indices_to_tags)

In [62]:
print(results[350])
print(results_true[9])

[[  0.07211247  -2.9912407   -3.0754018  ... -10.632847    -9.909456
    2.988423  ]
 [  0.07211247  -2.9912407   -3.0754018  ... -10.632847    -9.909456
    2.988423  ]
 [  0.07211247  -2.9912407   -3.0754018  ... -10.632847    -9.909456
    2.988423  ]
 ...
 [  0.07211247  -2.9912407   -3.0754018  ... -10.632847    -9.909456
    2.988423  ]
 [  0.07211247  -2.9912407   -3.0754018  ... -10.632847    -9.909456
    2.988423  ]
 [  0.07211247  -2.9912407   -3.0754018  ... -10.632847    -9.909456
    2.988423  ]]
[0 0 0 1 0 0 0 0 0 0 8 8 0 0 0 8 8 8 4 0 0 8 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8]


In [63]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

m = MultiLabelBinarizer().fit(results_true)

f1_score(m.transform(results_true),
         m.transform(results_predicted), average='weighted')
# 1.0

0.3154924449299108

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

print(f"F1 score: {f1_score(results_true, results_predicted)}")
print(f"Accuracy score: {accuracy_score(results_true, results_predicted)}")

In [30]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

print(f"F1 score: {f1_score(results_true, results_predicted)}")
print(f"Accuracy score: {accuracy_score(results_true, results_predicted)}")

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U21'), dtype('<U21')) -> dtype('<U21')

In [36]:
bert_model.save_pretrained('./')
bert_model = TFBertForTokenClassification.from_pretrained('./')

In [37]:
def annot_confusion_matrix(valid_tags, pred_tags):

    """
    Create an annotated confusion matrix by adding label
    annotations and formatting to sklearn's `confusion_matrix`.
    """

    # Create header from unique tags
    header = sorted(list(set(valid_tags + pred_tags)))

    # Calculate the actual confusion matrix
    matrix = confusion_matrix(valid_tags, pred_tags, labels=header)

    # Final formatting touches for the string output
    mat_formatted = [header[i] + "\t" + str(row) for i, row in enumerate(matrix)]
    content = "\t" + " ".join(header) + "\n" + "\n".join(mat_formatted)

    return content


In [38]:
annot_confusion_matrix(list(results_true), list(results_predicted))

TypeError: unhashable type: 'numpy.ndarray'