In [2]:
import pandas as pd
import numpy as np
import torch

# Investigating WNUT- and synthetic data

In [3]:
# dataset WNUT 2016
# data from https://autonlp.ai/datasets/wnut-2016
wnut_2016_train = pd.read_csv("./NER_data/WNUT 2016 train.txt", sep="\t", header=None, names=["words", "labels"])
wnut_2016_test = pd.read_csv("./NER_data/WNUT 2016 test.txt", sep="\t", header=None, names=["words", "labels"])

In [4]:
wnut_2016 = pd.concat([wnut_2016_train, wnut_2016_test], axis=0)
wnut_2016.head()

Unnamed: 0,words,labels
0,@SammieLynnsMom,O
1,@tg10781,O
2,they,O
3,will,O
4,be,O


In [5]:
wnut_2016.head(20)

Unnamed: 0,words,labels
0,@SammieLynnsMom,O
1,@tg10781,O
2,they,O
3,will,O
4,be,O
5,all,O
6,done,O
7,by,O
8,Sunday,O
9,trust,O


In [6]:
len(wnut_2016)

86401

In [7]:
wnut_2016["labels"].unique()

array(['O', 'B-geo-loc', 'B-facility', 'I-facility', 'B-movie', 'I-movie',
       'B-company', 'B-product', 'B-person', 'B-sportsteam',
       'I-sportsteam', 'I-product', 'B-other', 'I-other', 'I-company',
       'I-person', 'I-geo-loc', 'B-tvshow', 'B-musicartist',
       'I-musicartist', 'I-tvshow'], dtype=object)

In [8]:
# adding a sentence_id-column
m = wnut_2016["words"].str.contains("@").cumsum()
wnut_2016["sentence_id"] = wnut_2016.groupby(m).ngroup() + 0

wnut_2016.head()

Unnamed: 0,words,labels,sentence_id
0,@SammieLynnsMom,O,0.0
1,@tg10781,O,1.0
2,they,O,1.0
3,will,O,1.0
4,be,O,1.0


In [9]:
labels_2016 = wnut_2016["labels"].unique()
print(labels_2016)

['O' 'B-geo-loc' 'B-facility' 'I-facility' 'B-movie' 'I-movie' 'B-company'
 'B-product' 'B-person' 'B-sportsteam' 'I-sportsteam' 'I-product'
 'B-other' 'I-other' 'I-company' 'I-person' 'I-geo-loc' 'B-tvshow'
 'B-musicartist' 'I-musicartist' 'I-tvshow']


In [10]:
#add sentence-ID column, increment at every "@-symbol"
# ner_csv.loc[0, "sentence-ID"] = 1
# for row in ner_csv.iterrows():
#     if "@" in ner_csv.loc[row, "words"]:
#         ner_csv.loc[row, "sentence-ID"] = ner_csv.loc[row-1, "words"] +1
    #else:
      #  ner_csv["words"][row] = ner_csv["words"][row-1]

In [11]:
# loading WNUT 2017 dataset
# https://github.com/juand-r/entity-recognition-datasets/blob/master/data/WNUT17/CONLL-format/data/train/wnut17train.conll
#wnut_2017 = pd.read_csv("./wnut17train.conll")

def read_conll(filename):
    df = pd.read_csv(filename,
                    sep = '\t', header = None, keep_default_na = False,
                    names = ['TOKEN', 'POS', 'CHUNK', 'NE'],
                    quoting = 3, skip_blank_lines = False)
    df['SENTENCE'] = (df.TOKEN == '').cumsum()
    return df[df.TOKEN != '']

wnut_2017_train = read_conll("./NER_data/wnut17train.conll")
wnut_2017_test = read_conll("./NER_data/wnut17test.conll")

In [12]:
wnut_2017 = pd.concat([wnut_2017_train, wnut_2017_test], axis=0)
wnut_2017 = wnut_2017[["TOKEN", "POS", "SENTENCE"]]
wnut_2017 = wnut_2017.rename(columns={"TOKEN" : "words", "POS" : "labels", "SENTENCE" : "sentence_id"})

In [13]:
labels_2017 = wnut_2017["labels"].unique()
print(labels_2017)

['O' 'B-location' 'I-location' 'B-group' 'B-corporation' 'B-person'
 'B-creative-work' 'B-product' 'I-person' 'I-creative-work'
 'I-corporation' 'I-group' 'I-product']


In [14]:
wnut_2017.head()

Unnamed: 0,words,labels,sentence_id
0,@paulwalk,O,0
1,It,O,0
2,'s,O,0
3,the,O,0
4,view,O,0


In [15]:
wnut_all = pd.concat([wnut_2016, wnut_2017], axis=0)
wnut_all.head()

Unnamed: 0,words,labels,sentence_id
0,@SammieLynnsMom,O,0.0
1,@tg10781,O,1.0
2,they,O,1.0
3,will,O,1.0
4,be,O,1.0


In [16]:
wnut_all = wnut_all.dropna()

In [17]:
len(wnut_all)

172503

In [18]:
wnut_all["labels"].value_counts()

labels
O                  160976
B-person             1815
B-geo-loc             933
I-person              868
I-product             786
B-location            698
I-other               663
B-company             651
B-other               626
B-product             534
B-group               429
I-creative-work       424
I-facility            367
I-location            339
B-corporation         287
B-creative-work       282
B-facility            280
I-company             242
I-group               220
I-geo-loc             215
B-musicartist         194
I-musicartist         162
B-sportsteam          149
I-movie                72
I-corporation          68
I-tvshow               59
B-tvshow               57
B-movie                54
I-sportsteam           53
Name: count, dtype: int64

In [None]:
# read in the synthetically created data
#df_syn = pd.read_csv("./NER_data/synthetic_tweet_data3.csv")
df_syn = pd.read_csv("/content/synthetic_tweet_data3.csv")
df_syn.head()

Unnamed: 0,token,label,sentence_id
0,Is,O,0
1,it,O,0
2,just,O,0
3,me,O,0
4,or,O,0


In [None]:
df_syn["label"].unique()

array(['O', 'B-PRODUCT', 'B-COMPANY', 'B-PERSON'], dtype=object)

In [None]:
df_syn = df_syn.rename(columns={"token" : "words", "label" : "labels"})
df_syn["labels"] = df_syn["labels"].replace({"B-PRODUCT" : "B-product", "B-COMPANY" : "B-company", "B-PERSON" : "B-person"})

In [None]:
df_syn.head()

Unnamed: 0,words,labels,sentence_id
0,Is,O,0
1,it,O,0
2,just,O,0
3,me,O,0
4,or,O,0


## bert

In [None]:
# tutorial for finetuning with bert
# https://github.com/karndeepsingh/Named-Entity-Recognition/blob/main/NAMED%20ENTITY%20RECOGNITION.ipynb

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
df_syn["sentence_id"] = LabelEncoder().fit_transform(df_syn["sentence_id"])

In [None]:
X = df_syn[["sentence_id","words"]]
Y = df_syn["labels"]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [None]:
x_train.head()

Unnamed: 0,sentence_id,words
14220,1014,Honestly
91125,6504,be
426021,30483,I
104037,7420,new
349990,25017,great


In [None]:
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [None]:
train_data

Unnamed: 0,sentence_id,words,labels
14220,1014,Honestly,O
91125,6504,be,O
426021,30483,I,O
104037,7420,new,O
349990,25017,great,O
...,...,...,...
301789,21562,with,O
175253,12528,be,O
464050,33210,overrated,O
161667,11543,Jobs,I-person


In [None]:
from simpletransformers.ner import NERModel,NERArgs

In [None]:
label = df_syn["labels"].unique().tolist()
label

['O', 'B-product', 'B-company', 'B-person', 'I-person', 'I-product']

In [None]:
args = NERArgs()
# epochs = times the training data is ran through the network
args.num_train_epochs = 2
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32

In [None]:
model = NERModel('bert', 'bert-base-cased', labels=label, args=args, use_cuda=False)
# set use_cuda to true if gpu is available

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [None]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1054 [00:00<?, ?it/s]

In [None]:
result

{'eval_loss': 3.4305521306537448,
 'precision': 0.0015437152079337607,
 'recall': 0.017818574514038878,
 'f1_score': 0.002841275991217874}

In [None]:
prediction, model_output = model.predict(["Tesla just came out with a new model, the Tesla-7b, which is electric. I love that!"])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
prediction

[[{'Tesla': 'B-product'},
  {'just': 'B-product'},
  {'came': 'I-product'},
  {'out': 'I-product'},
  {'with': 'B-product'},
  {'a': 'B-person'},
  {'new': 'I-product'},
  {'model,': 'B-company'},
  {'the': 'B-person'},
  {'Tesla-7b,': 'O'},
  {'which': 'I-product'},
  {'is': 'I-product'},
  {'electric.': 'B-product'},
  {'I': 'B-person'},
  {'love': 'I-product'},
  {'that!': 'I-product'}]]

# RoBERTa-TweetNER

In [23]:
# trying out model that is RoBERTa version trained with tweetner (includes the wanted entities)
# tner/roberta-base-tweetner7-all

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("tner/roberta-base-tweetner7-all")

model = AutoModelForSequenceClassification.from_pretrained("tner/roberta-base-tweetner7-all")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at tner/roberta-base-tweetner7-all and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name = "tner/roberta-base-tweetner7-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

text = "Apple just launched the new iPhone, and Microsoft released a new version of Windows."

entities = ner_pipeline(text)
print(entities)

Device set to use cpu


[{'entity': 'B-corporation', 'score': 0.5353071, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5}, {'entity': 'B-product', 'score': 0.656318, 'index': 6, 'word': 'ĠiPhone', 'start': 28, 'end': 34}, {'entity': 'B-corporation', 'score': 0.70640016, 'index': 9, 'word': 'ĠMicrosoft', 'start': 40, 'end': 49}, {'entity': 'B-product', 'score': 0.6205899, 'index': 15, 'word': 'ĠWindows', 'start': 76, 'end': 83}]


In [5]:
filtered_entities = [{"word": entity["word"], "entity": entity["entity"]} for entity in entities]
print(filtered_entities)

[{'word': 'Apple', 'entity': 'B-corporation'}, {'word': 'ĠiPhone', 'entity': 'B-product'}, {'word': 'ĠMicrosoft', 'entity': 'B-corporation'}, {'word': 'ĠWindows', 'entity': 'B-product'}]


In [6]:
# testing NER on Brand Sentiment Analysis data
import pandas as pd
df_brd_sa = pd.read_csv("./analysis_data/Brand Sentiment Analysis Dataset/Dataset - Train.csv")
df_brd_sa.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [25]:
# applying pre-processing function before model application from https://huggingface.co/tner/roberta-base-tweetner7-all
import re
from urlextract import URLExtract

extractor = URLExtract()

def format_tweet(tweet):
    # mask web urls
    urls = extractor.find_urls(tweet)
    for url in urls:
        tweet = tweet.replace(url, "{{URL}}")
    # format twitter account
    tweet = re.sub(r"\b(\s*)(@[\S]+)\b", r'\1{\2@}', tweet)
    return tweet

In [8]:
df_brd_sa_sm = df_brd_sa[:100]

df_brd_sa_sm.dropna()

df_brd_sa_sm["tweet_text"] = df_brd_sa_sm.apply(lambda x: format_tweet(str(x["tweet_text"])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brd_sa_sm["tweet_text"] = df_brd_sa_sm.apply(lambda x: format_tweet(str(x["tweet_text"])), axis=1)


In [26]:
def ner(txt):
    entities = ner_pipeline(txt)
    entities = [{"word": entity["word"], "entity": entity["entity"]} for entity in entities]
    return entities

In [14]:
df_brd_sa["entities"] = df_brd_sa.apply(lambda x: ner(str(x["tweet_text"])), axis=1)

In [None]:
df_brd_sa.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,entities
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[{'word': '.', 'entity': 'B-person'}, {'word':..."
1,@jessedee Know about {@fludapp@} ? Awesome iPa...,iPad or iPhone App,Positive emotion,"[{'word': '@', 'entity': 'B-person'}, {'word':..."
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[{'word': '@', 'entity': 'B-person'}, {'word':..."
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[{'word': 'Ġfestival', 'entity': 'B-event'}, {..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[{'word': 'xt', 'entity': 'I-event'}, {'word':..."
...,...,...,...,...
95,GSD&amp;M &amp; Google's Industry Party Tonigh...,,No emotion toward brand or product,"[{'word': 'ĠGoogle', 'entity': 'B-corporation'..."
96,New buzz? &quot;@mention Google to Launch Majo...,,No emotion toward brand or product,"[{'word': 'ment', 'entity': 'I-corporation'}, ..."
97,Headline: &quot;#iPad 2 is the Must-Have Gadge...,iPad,Positive emotion,"[{'word': 'i', 'entity': 'B-product'}, {'word'..."
98,.@mention &quot;Google launched checkins a mon...,Google,Positive emotion,"[{'word': '.', 'entity': 'B-person'}, {'word':..."


In [None]:
reslist = list(df_brd_sa[["tweet_text", "entities"]].values)
print(reslist)

[array(['.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.',
       list([{'word': '.', 'entity': 'B-person'}, {'word': '@', 'entity': 'I-person'}, {'word': 'w', 'entity': 'I-person'}, {'word': 'esley', 'entity': 'I-person'}, {'word': '83', 'entity': 'I-person'}, {'word': 'Ġ3', 'entity': 'B-product'}, {'word': 'G', 'entity': 'I-product'}, {'word': 'ĠiPhone', 'entity': 'B-product'}, {'word': 'Ġ#', 'entity': 'B-location'}, {'word': 'ISE', 'entity': 'I-location'}, {'word': '_', 'entity': 'I-location'}, {'word': 'Austin', 'entity': 'I-location'}, {'word': 'Ġ#', 'entity': 'B-location'}, {'word': 'S', 'entity': 'I-location'}, {'word': 'X', 'entity': 'I-location'}, {'word': 'SW', 'entity': 'I-location'}])],
      dtype=object), array(["@jessedee Know about {@fludapp@} ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",
       list([{'word': '@', 'entity': '

In [None]:
# json allows entities-column to be stored in the CSV
import json
df_brd_sa["entities"] = df_brd_sa["entities"].apply(json.dumps)

In [16]:
# saving the df containing the prediction results as CSV to be used further
df_brd_sa.to_csv("./data after NER.csv")

# Predicting and evaluating on TweetNER7-data

In [15]:
# loading tweetner17 dataset from huggingface
# https://huggingface.co/datasets/tner/tweetner7
from datasets import load_dataset

ds = load_dataset("tner/tweetner7")

In [16]:
tweetner7_train = ds["train_2020"].to_pandas()
tweetner7_test = ds["test_2020"].to_pandas()

In [17]:
tweetner7_train.head()

Unnamed: 0,tokens,tags,id,date
0,"[Morning, 5km, run, with, {{USERNAME}}, for, b...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 2, 14...",1183344337016381440,2019-10-13
1,"[President, Trump, Arrives, at, UFC, 244, in, ...","[5, 12, 14, 14, 0, 7, 14, 4, 11, 14, 14, 14, 1...",1190961319538765824,2019-11-03
2,"["", I, 've, been, in, law, enforcement, for, 2...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1267032593339486209,2020-05-31
3,"[I, got, mine, yesterday, !, ****, Doctors, sa...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1180717545935925248,2019-10-06
4,"[Mayo, Breast, Cancer, Vaccine, Could, Be, Ava...","[6, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14...",1183251744601587712,2019-10-13


In [18]:
len(tweetner7_train)

4616

In [19]:
# converting the tags-column to a label-column to be comparable with the model-output

entity_dict = {
    0: "B-corporation", 
    1: "B-creative_work",
    2: "B-event",
    3: "B-group",
    4: "B-location",
    5: "B-person",
    6: "B-product",
    7: "I-corporation",
    8: "I-creative_work",
    9: "I-event",
    10: "I-group",
    11: "I-location",
    12: "I-person",
    13: "I-product",
    14: "O"
}

def tags_to_labels(col):
    result = []
    for i in col:
        label = entity_dict[i]
        result.append(label)
        
    return result
        
tweetner7_train["true_labels"] = tweetner7_train["tags"].apply(tags_to_labels)
tweetner7_train.head()

Unnamed: 0,tokens,tags,id,date,true_labels
0,"[Morning, 5km, run, with, {{USERNAME}}, for, b...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 2, 14...",1183344337016381440,2019-10-13,"[O, O, O, O, O, O, O, O, O, O, B-event, O, B-e..."
1,"[President, Trump, Arrives, at, UFC, 244, in, ...","[5, 12, 14, 14, 0, 7, 14, 4, 11, 14, 14, 14, 1...",1190961319538765824,2019-11-03,"[B-person, I-person, O, O, B-corporation, I-co..."
2,"["", I, 've, been, in, law, enforcement, for, 2...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1267032593339486209,2020-05-31,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[I, got, mine, yesterday, !, ****, Doctors, sa...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1180717545935925248,2019-10-06,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[Mayo, Breast, Cancer, Vaccine, Could, Be, Ava...","[6, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14...",1183251744601587712,2019-10-13,"[B-product, I-product, I-product, I-product, O..."


In [None]:
tweetner7_train["ner_pipeline"] = tweetner7_train.apply(lambda x: ner_pipeline(str(x["tokens"])), axis=1)

In [None]:
# example output of model
#test_example = [{'entity': 'B-corporation', 'score': 0.5353071, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5}, 
#{'entity': 'B-product', 'score': 0.656318, 'index': 6, 'word': 'ĠiPhone', 'start': 28, 'end': 34}, 
#{'entity': 'B-corporation', 'score': 0.70640016, 'index': 9, 'word': 'ĠMicrosoft', 'start': 40, 'end': 49}, 
#{'entity': 'B-product', 'score': 0.6205899, 'index': 15, 'word': 'ĠWindows', 'start': 76, 'end': 83}]

# creating a function transforming the predicted labels to a list to be better comparable to the actual entity labels
def output_to_labellist(row):
    labellist = []
    words_labels = {}

    for dic in row["ner_pipeline"]:
        words_labels[dic["word"]] = dic["entity"]

    for i in row["tokens"]:
        if i in words_labels.keys():
            labellist.append(words_labels[i])
        else:
            labellist.append("O")

    return labellist

#res = output_to_labellist(test_example)
#print(res)

In [91]:
tweetner7_train["predicted_labels"] = tweetner7_train.apply(output_to_labellist, axis=1)
tweetner7_train.head()

Unnamed: 0,tokens,tags,id,date,entities,labels,entity_types,true_labels,predicted_entity_types,ner_pipeline,predicted_label_list,predicted_labels
0,"[Morning, 5km, run, with, {{USERNAME}}, for, b...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 2, 14...",1183344337016381440,2019-10-13,"[{'word': 'city', 'entity': 'I-location'}]","[event, location]",[location],"[O, O, O, O, O, O, O, O, O, O, B-event, O, B-e...",[location],"[{'entity': 'I-location', 'score': 0.42394614,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[President, Trump, Arrives, at, UFC, 244, in, ...","[5, 12, 14, 14, 0, 7, 14, 4, 11, 14, 14, 14, 1...",1190961319538765824,2019-11-03,"[{'word': 'President', 'entity': 'B-person'}, ...","[person, corporation, location]","[person, location, group]","[B-person, I-person, O, O, B-corporation, I-co...","[person, location, group]","[{'entity': 'B-person', 'score': 0.575353, 'in...","[B-person, I-person, O, O, B-group, O, O, B-lo...","[B-person, I-person, O, O, B-group, O, O, B-lo..."
2,"["", I, 've, been, in, law, enforcement, for, 2...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1267032593339486209,2020-05-31,"[{'word': 'Buff', 'entity': 'B-location'}, {'w...","[corporation, person, event]","[person, location]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[person, location]","[{'entity': 'B-location', 'score': 0.6734836, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[I, got, mine, yesterday, !, ****, Doctors, sa...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1180717545935925248,2019-10-06,"[{'word': 'W', 'entity': 'B-corporation'}, {'w...","[corporation, product]","[product, corporation]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[product, corporation]","[{'entity': 'B-corporation', 'score': 0.469018...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[Mayo, Breast, Cancer, Vaccine, Could, Be, Ava...","[6, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14...",1183251744601587712,2019-10-13,"[{'word': '{', 'entity': 'B-product'}, {'word'...",[product],[product],"[B-product, I-product, I-product, I-product, O...",[product],"[{'entity': 'B-product', 'score': 0.9699516, '...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [104]:
# using the package "nervaluate" to evaluate the performance
from nervaluate import Evaluator

true = tweetner7_train["true_labels"].values.tolist()
pred = tweetner7_train["predicted_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "creative_work", "event", "group", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 1741, 'incorrect': 437, 'partial': 0, 'missed': 13442, 'spurious': 910, 'possible': 15620, 'actual': 3088, 'precision': 0.5637953367875648, 'recall': 0.11145966709346991, 'f1': 0.18612358349369254}, 'partial': {'correct': 944, 'incorrect': 0, 'partial': 1234, 'missed': 13442, 'spurious': 910, 'possible': 15620, 'actual': 3088, 'precision': 0.5055051813471503, 'recall': 0.09993597951344431, 'f1': 0.16688047893949112}, 'strict': {'correct': 788, 'incorrect': 1390, 'partial': 0, 'missed': 13442, 'spurious': 910, 'possible': 15620, 'actual': 3088, 'precision': 0.2551813471502591, 'recall': 0.050448143405889885, 'f1': 0.08424203549283729}, 'exact': {'correct': 944, 'incorrect': 1234, 'partial': 0, 'missed': 13442, 'spurious': 910, 'possible': 15620, 'actual': 3088, 'precision': 0.30569948186528495, 'recall': 0.060435339308578744, 'f1': 0.10091939277314518}}


In [103]:
print(results_by_tag["person"])

{'ent_type': {'correct': 758, 'incorrect': 33, 'partial': 0, 'missed': 3875, 'spurious': 473, 'possible': 4666, 'actual': 1264, 'precision': 0.5996835443037974, 'recall': 0.1624517788255465, 'f1': 0.25564924114671167}, 'partial': {'correct': 266, 'incorrect': 0, 'partial': 525, 'missed': 3875, 'spurious': 473, 'possible': 4666, 'actual': 1264, 'precision': 0.41811708860759494, 'recall': 0.11326618088298328, 'f1': 0.17824620573355818}, 'strict': {'correct': 251, 'incorrect': 540, 'partial': 0, 'missed': 3875, 'spurious': 473, 'possible': 4666, 'actual': 1264, 'precision': 0.1985759493670886, 'recall': 0.05379339905700815, 'f1': 0.08465430016863407}, 'exact': {'correct': 266, 'incorrect': 525, 'partial': 0, 'missed': 3875, 'spurious': 473, 'possible': 4666, 'actual': 1264, 'precision': 0.21044303797468356, 'recall': 0.057008144020574365, 'f1': 0.08971332209106239}}


# Finetuning covid twitter bert with TweetNER7-data

In [20]:
# fine-tuning the covid-twitter-bert model, since all other bert-based models
# have already been fine-tuned on this data with most likely better resources
# https://huggingface.co/datasets/tner/tweetner7#main-models
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name_covid = "digitalepidemiologylab/covid-twitter-bert-v2"
tokenizer_covid = AutoTokenizer.from_pretrained(model_name_covid)

#model_covid = AutoModelForTokenClassification.from_pretrained(model_name_covid)
#ner_pipeline_covid = pipeline("ner", model=model_covid, tokenizer=tokenizer_covid)

In [24]:
from datasets import Dataset
dataset = Dataset.from_pandas(tweetner7_train)

In [25]:
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [26]:
label_list = list(entity_dict.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

def encode_labels(ds):
    ds['new_tags'] = [label_to_id[label] for label in ds['true_labels']]
    return ds

train_dataset = train_dataset.map(encode_labels)
test_dataset = test_dataset.map(encode_labels)

Map:   0%|          | 0/3692 [00:00<?, ? examples/s]

Map:   0%|          | 0/924 [00:00<?, ? examples/s]

In [27]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer_covid(
        examples["tokens"],
        truncation=True,            
        padding="max_length",
        max_length=128,       
        is_split_into_words=True    
    )

    labels = []
    for i, label in enumerate(examples["new_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                label_ids.append(-100)
            elif word_id != previous_word_id:  # First token of a word
                label_ids.append(label[word_id])
            else:  # Subword tokens
                label_ids.append(-100)
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/3692 [00:00<?, ? examples/s]

Map:   0%|          | 0/924 [00:00<?, ? examples/s]

In [29]:
from transformers import AutoModelForTokenClassification

model_covid = AutoModelForTokenClassification.from_pretrained(
    model_name_covid,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
from transformers import DataCollatorForTokenClassification

# using a data collator for dynamic padding to solve error during tensor creation
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer_covid)

In [31]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # flattening predictions and labels
    true_labels = [id_to_label[label] for label in labels.flatten() if label != -100]
    true_predictions = [id_to_label[pred] for (pred, label) in zip(predictions.flatten(), labels.flatten()) if label != -100]

    # calculating metrics
    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

training_args = TrainingArguments(
    output_dir="./ner-covid-twitter-bert", 
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4, # adjust batch size depending on RAM
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",          
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=model_covid,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer_covid,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [32]:
trainer.train()

wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: hausbichler-georg (hausbichler-georg-wirtschaftsuniversit-t-wien). Use `wandb login --relogin` to force relogin


  0%|          | 0/1846 [00:00<?, ?it/s]

KeyboardInterrupt: 