## Importing libraries

In [1]:
import nltk
from nltk.tag import pos_tag
from sklearn_crfsuite import CRF
from sklearn.metrics import confusion_matrix
import warnings
import pandas as pd
from seqeval.metrics import f1_score
import datasets
warnings.filterwarnings('ignore')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\саня\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Loading the data

In [2]:
dataset = ['gusevski/factrueval2016',
           'RCC-MSU/collection3',
           'conll2003',
           'ghadeermobasher/BC5CDR-Chemical-Disease']

In [3]:
def extract_sentences_and_labels(data, model):
    sentences, labels = [], []
    if model == 'gusevski/factrueval2016':
        data = data['data'][0]
    for item in data:
        words = item['tokens']
        tags = item['ner_tags']
        sentences.append(words)
        labels.append(tags)
    return sentences, labels

In [4]:
def convert_labels(labels, model):
    if model == 'gusevski/factrueval2016' or model == 'RCC-MSU/collection3':
        label_map = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    elif model == 'conll2003':
        label_map = {0: 'O', 1: 'B-MISC', 2: 'I-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}
    else:
        label_map = {0: 'O', 1: 'CH-DIS', 2: 'DIS', 3: 'A-CH', 4: 'CH'}
    return [[label_map[tag] for tag in sent] for sent in labels]

In [5]:
def usage_model(model):
    data = datasets.load_dataset(model, trust_remote_code=True)
    train_path = data['train']
    test_path = data['test']
    val_path = data['validation']
    return train_path, test_path, val_path

In [6]:
def convert(model):
    train_path, test_path, val_path = usage_model(model)
    
    train_sentences, train_labels = extract_sentences_and_labels(train_path, model)
    train_labels = convert_labels(train_labels, model)
    
    test_sentences, test_labels = extract_sentences_and_labels(test_path, model)
    test_labels = convert_labels(test_labels, model)
    
    val_sentences, val_labels = extract_sentences_and_labels(val_path, model)
    val_labels = convert_labels(val_labels, model)
    return train_sentences, train_labels, test_sentences, test_labels

## Extracting features

In [7]:
def sent2feats(sentence):
    feats = []
    sen_tags = pos_tag(sentence)
    for i in range(0, len(sentence)):
        word = sentence[i]
        word_feats = {'word': word}
        if i == 0:
            word_feats['prevWord'] = word_feats['prevSecondWord'] = "<S>"
        elif i == 1:
            word_feats['prevWord'] = sentence[0]
            word_feats['prevSecondWord'] = '</S>'
        else:
            word_feats['prevWord'] = sentence[i-1]
            word_feats['prevSecondWord'] = sentence[i-2]
        if i == len(sentence)-2:
            word_feats['nextWord'] = sentence[i+1]
            word_feats['nextNextWord'] = '</S>'
        elif i==len(sentence)-1:
            word_feats['nextWord'] = '</S>'
            word_feats['nextNextWord'] = '</S>'
        else:
            word_feats['nextWord'] = sentence[i+1]
            word_feats['nextNextWord'] = sentence[i+2]
        word_feats['tag'] = sen_tags[i][1]
        if i == 0:
            word_feats["prevTag"] = word_feats["prevSecondTag"] = "<S>"
        elif i == 1:
            word_feats["prevTag"] = sen_tags[0][1]
            word_feats["prevSecondTag"] = "</S>"
        else:
            word_feats["prevTag"] = sen_tags[i - 1][1]
            word_feats["prevSecondTag"] = sen_tags[i - 2][1]
        if i == len(sentence) - 2:
            word_feats["nextTag"] = sen_tags[i + 1][1]
            word_feats["nextNextTag"] = "</S>"
        elif i == len(sentence) - 1:
            word_feats["nextTag"] = "</S>"
            word_feats["nextNextTag"] = "</S>"
        else:
            word_feats["nextTag"] = sen_tags[i + 1][1]
            word_feats["nextNextTag"] = sen_tags[i + 2][1]
        feats.append(word_feats)
    return feats

In [8]:
def get_feats_conll(sentences, labels):
    feats = [sent2feats(sentence) for sentence in sentences]
    return feats, labels

In [9]:
def prepare_df(test_data, y_pred):
    data = []
    for item, pred in zip(test_data, y_pred):
        text = item[0]
        gt = item[1]
        data.append({'text': text, 'ground_truth': gt, 'pred': pred})
    df = pd.DataFrame(data)
    return df


### Training a model

In [10]:
def print_cm(cm, labels):
    print("\n")
    column_width = max([len(x) for x in labels] + [5])
    empty_cell = " " * column_width
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(column_width) % label, end=" ")
    print()
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(column_width) % label1, end=" ")
        sum = 0
        for j in range(len(labels)):
            cell = "%{0}.0f".format(column_width) % cm[i, j]
            sum = sum + int(cell)
            print(cell, end=" ")
        print(sum)

In [11]:
def get_confusion_matrix(y_true,y_pred,labels):
    trues, preds = [], []
    for yseq_true, yseq_pred in zip(y_true, y_pred):
        trues.extend(yseq_true)
        preds.extend(yseq_pred)
    print_cm(confusion_matrix(trues,preds,labels=labels),labels)

In [12]:
def train_seq(X_train, Y_train, X_dev, Y_dev, raw_test_data):
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)
    crf.fit(X_train, Y_train)
    y_pred = crf.predict(X_dev)
    score = f1_score(Y_dev, y_pred)
    print(f"F1 Score: {score}")
    df = prepare_df(raw_test_data, y_pred)
    return crf, df, score

## Visualize results

In [13]:
def get_score(model):
    train_sentences, train_labels, test_sentences, test_labels = convert(model)
    X_train, Y_train = get_feats_conll(train_sentences, train_labels)
    X_dev, Y_dev = get_feats_conll(test_sentences, test_labels)
    crf, df, score = train_seq(X_train, Y_train, X_dev, Y_dev, list(zip(test_sentences, test_labels)))

# Factrueval 2016

In [14]:
get_score(dataset[0])

Repo card metadata block was not found. Setting CardData to empty.


F1 Score: 0.41427571728481455


# Collection3

In [15]:
get_score(dataset[1])

F1 Score: 0.5405405405405406


# Conll2003

In [16]:
get_score(dataset[2])

F1 Score: 0.6655570966250578


# BC5CDR

In [17]:
get_score(dataset[3])

Repo card metadata block was not found. Setting CardData to empty.


F1 Score: 0.44982099549601573
