In [1]:
#!pip install transformers accelerate torch huggingface_hub datasets nervaluate

In [None]:
import pandas as pd
import numpy as np
import torch

# RoBERTa-TweetNER pipeline attempts (brd-sa-data)

In [None]:
# trying out RoBERTa version trained with tweetner
# tner/roberta-base-tweetner7-all

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name = "tner/roberta-base-tweetner7-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

text = "Apple just launched the new iPhone, and Microsoft released a new version of Windows."

entities = ner_pipeline(text)
print(entities)

Device set to use cpu


[{'entity': 'B-corporation', 'score': 0.5353071, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5}, {'entity': 'B-product', 'score': 0.656318, 'index': 6, 'word': 'ĠiPhone', 'start': 28, 'end': 34}, {'entity': 'B-corporation', 'score': 0.70640016, 'index': 9, 'word': 'ĠMicrosoft', 'start': 40, 'end': 49}, {'entity': 'B-product', 'score': 0.6205899, 'index': 15, 'word': 'ĠWindows', 'start': 76, 'end': 83}]


In [None]:
filtered_entities = [{"word": entity["word"], "entity": entity["entity"]} for entity in entities]
print(filtered_entities)

[{'word': 'Apple', 'entity': 'B-corporation'}, {'word': 'ĠiPhone', 'entity': 'B-product'}, {'word': 'ĠMicrosoft', 'entity': 'B-corporation'}, {'word': 'ĠWindows', 'entity': 'B-product'}]


In [None]:
# testing NER on Brand Sentiment Analysis data
import pandas as pd
df_brd_sa = pd.read_csv("./analysis_data/Brand Sentiment Analysis Dataset/Dataset - Train.csv")
df_brd_sa.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [None]:
# applying pre-processing function before model application from https://huggingface.co/tner/roberta-base-tweetner7-all
import re
from urlextract import URLExtract

extractor = URLExtract()

def format_tweet(tweet):
    # mask web urls
    urls = extractor.find_urls(tweet)
    for url in urls:
        tweet = tweet.replace(url, "{{URL}}")
    # format twitter account
    tweet = re.sub(r"\b(\s*)(@[\S]+)\b", r'\1{\2@}', tweet)
    return tweet

In [None]:
df_brd_sa_sm = df_brd_sa[:100]

df_brd_sa_sm.dropna()

df_brd_sa_sm["tweet_text"] = df_brd_sa_sm.apply(lambda x: format_tweet(str(x["tweet_text"])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brd_sa_sm["tweet_text"] = df_brd_sa_sm.apply(lambda x: format_tweet(str(x["tweet_text"])), axis=1)


In [None]:
def ner(txt):
    entities = ner_pipeline(txt)
    entities = [{"word": entity["word"], "entity": entity["entity"]} for entity in entities]
    return entities

In [None]:
df_brd_sa["entities"] = df_brd_sa.apply(lambda x: ner(str(x["tweet_text"])), axis=1)

In [None]:
df_brd_sa.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,entities
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[{'word': '.', 'entity': 'B-person'}, {'word':..."
1,@jessedee Know about {@fludapp@} ? Awesome iPa...,iPad or iPhone App,Positive emotion,"[{'word': '@', 'entity': 'B-person'}, {'word':..."
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[{'word': '@', 'entity': 'B-person'}, {'word':..."
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[{'word': 'Ġfestival', 'entity': 'B-event'}, {..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[{'word': 'xt', 'entity': 'I-event'}, {'word':..."
...,...,...,...,...
95,GSD&amp;M &amp; Google's Industry Party Tonigh...,,No emotion toward brand or product,"[{'word': 'ĠGoogle', 'entity': 'B-corporation'..."
96,New buzz? &quot;@mention Google to Launch Majo...,,No emotion toward brand or product,"[{'word': 'ment', 'entity': 'I-corporation'}, ..."
97,Headline: &quot;#iPad 2 is the Must-Have Gadge...,iPad,Positive emotion,"[{'word': 'i', 'entity': 'B-product'}, {'word'..."
98,.@mention &quot;Google launched checkins a mon...,Google,Positive emotion,"[{'word': '.', 'entity': 'B-person'}, {'word':..."


In [None]:
reslist = list(df_brd_sa[["tweet_text", "entities"]].values)
#print(reslist)

[array(['.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.',
       list([{'word': '.', 'entity': 'B-person'}, {'word': '@', 'entity': 'I-person'}, {'word': 'w', 'entity': 'I-person'}, {'word': 'esley', 'entity': 'I-person'}, {'word': '83', 'entity': 'I-person'}, {'word': 'Ġ3', 'entity': 'B-product'}, {'word': 'G', 'entity': 'I-product'}, {'word': 'ĠiPhone', 'entity': 'B-product'}, {'word': 'Ġ#', 'entity': 'B-location'}, {'word': 'ISE', 'entity': 'I-location'}, {'word': '_', 'entity': 'I-location'}, {'word': 'Austin', 'entity': 'I-location'}, {'word': 'Ġ#', 'entity': 'B-location'}, {'word': 'S', 'entity': 'I-location'}, {'word': 'X', 'entity': 'I-location'}, {'word': 'SW', 'entity': 'I-location'}])],
      dtype=object), array(["@jessedee Know about {@fludapp@} ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",
       list([{'word': '@', 'entity': '

In [None]:
# json allows entities-column to be stored in the CSV
import json
df_brd_sa["entities"] = df_brd_sa["entities"].apply(json.dumps)

In [None]:
# saving the df containing the prediction results as CSV to be used further
df_brd_sa.to_csv("./data after NER.csv")

# loading TweetNER7-data and applying roberta-tweetner-pipeline

In [None]:
# loading tweetner17 dataset from huggingface https://huggingface.co/datasets/tner/tweetner7
from datasets import load_dataset

ds = load_dataset("tner/tweetner7")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
tweetner7_train = ds["train_all"].to_pandas()
tweetner7_test = ds["test_2021"].to_pandas()
tweetner7_test20 = ds["test_2020"].to_pandas()
tweetner7_train.dropna(inplace=True)
tweetner7_test.dropna(inplace=True)
tweetner7_test20.dropna(inplace=True)

In [None]:
tweetner7_big = pd.concat([tweetner7_train, tweetner7_test])

In [None]:
len(tweetner7_big)

9918

In [None]:
# converting the tags-column to a label-column to be comparable with the model-output
entity_dict = {
    0: "B-corporation",
    1: "B-creative_work",
    2: "B-event",
    3: "B-group",
    4: "B-location",
    5: "B-person",
    6: "B-product",
    7: "I-corporation",
    8: "I-creative_work",
    9: "I-event",
    10: "I-group",
    11: "I-location",
    12: "I-person",
    13: "I-product",
    14: "O"
}

label_list = list(entity_dict.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

def tags_to_labels(col):
    result = []
    for i in col:
        label = entity_dict[i]
        result.append(label)
    return result

In [None]:
tweetner7_test["true_labels"] = tweetner7_test["tags"].apply(tags_to_labels)
tweetner7_test.head()

Unnamed: 0,tokens,tags,id,date,true_labels
0,"[New, music, coming, soon, via, {@Columbia Rec...","[14, 14, 14, 14, 14, 0, 14, 14, 14, 14, 14, 14...",1378645447510695937,2021-04-04,"[O, O, O, O, O, B-corporation, O, O, O, O, O, ..."
1,"[The, real, faces, of, #, SheikhJarrah, ., Ple...","[14, 14, 14, 14, 14, 5, 14, 14, 14, 14, 14, 14...",1403500440738844672,2021-06-11,"[O, O, O, O, O, B-person, O, O, O, O, O, O, O,..."
2,"[Sister, to, Sister, Meets, Shop, Talk, with, ...","[1, 8, 8, 14, 14, 14, 14, 5, 12, 14, 14, 6, 14...",1310700199036284929,2020-09-28,"[B-creative_work, I-creative_work, I-creative_..."
3,"[-, Albin, Ekdal, is, the, 1st, player, from, ...","[14, 5, 12, 14, 14, 14, 14, 14, 4, 14, 14, 14,...",1335702864509988866,2020-12-06,"[O, B-person, I-person, O, O, O, O, O, B-locat..."
4,"[Time, for, the, main, event, ., {@Israel Ades...","[14, 14, 14, 14, 14, 14, 5, 14, 5, 14, 2]",1368437378851934209,2021-03-07,"[O, O, O, O, O, O, B-person, O, B-person, O, B..."


In [None]:
tweetner7_train["ner_pipeline"] = tweetner7_train.apply(lambda x: ner_pipeline(str(x["tokens"])), axis=1)

In [None]:
# example output of the pipeline
#test_example = [{'entity': 'B-corporation', 'score': 0.5353071, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5},
#{'entity': 'B-product', 'score': 0.656318, 'index': 6, 'word': 'ĠiPhone', 'start': 28, 'end': 34},
#{'entity': 'B-corporation', 'score': 0.70640016, 'index': 9, 'word': 'ĠMicrosoft', 'start': 40, 'end': 49},
#{'entity': 'B-product', 'score': 0.6205899, 'index': 15, 'word': 'ĠWindows', 'start': 76, 'end': 83}]

# creating a function transforming the predicted labels to a list to be better comparable to the actual entity labels
def output_to_labellist(row, output_col):
    labellist = []
    words_labels = {}

    # adjust column-name depending on name of prediction-column
    for dic in row[output_col]:
        words_labels[dic["word"]] = dic["entity"]

    for i in row["tokens"]:
        if i in words_labels.keys():
            labellist.append(words_labels[i])
        else:
            labellist.append("O")

    return labellist

In [None]:
tweetner7_train["predicted_labels"] = tweetner7_train.apply(output_to_labellist, axis=1)
tweetner7_train.head()

NameError: name 'output_to_labellist' is not defined

In [None]:
# using the package "nervaluate" to evaluate the performance
from nervaluate import Evaluator

true = tweetner7_train["true_labels"].values.tolist()
pred = tweetner7_train["predicted_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "creative_work", "event", "group", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 1741, 'incorrect': 437, 'partial': 0, 'missed': 13442, 'spurious': 910, 'possible': 15620, 'actual': 3088, 'precision': 0.5637953367875648, 'recall': 0.11145966709346991, 'f1': 0.18612358349369254}, 'partial': {'correct': 944, 'incorrect': 0, 'partial': 1234, 'missed': 13442, 'spurious': 910, 'possible': 15620, 'actual': 3088, 'precision': 0.5055051813471503, 'recall': 0.09993597951344431, 'f1': 0.16688047893949112}, 'strict': {'correct': 788, 'incorrect': 1390, 'partial': 0, 'missed': 13442, 'spurious': 910, 'possible': 15620, 'actual': 3088, 'precision': 0.2551813471502591, 'recall': 0.050448143405889885, 'f1': 0.08424203549283729}, 'exact': {'correct': 944, 'incorrect': 1234, 'partial': 0, 'missed': 13442, 'spurious': 910, 'possible': 15620, 'actual': 3088, 'precision': 0.30569948186528495, 'recall': 0.060435339308578744, 'f1': 0.10091939277314518}}


In [None]:
print(results_by_tag["person"])

{'ent_type': {'correct': 758, 'incorrect': 33, 'partial': 0, 'missed': 3875, 'spurious': 473, 'possible': 4666, 'actual': 1264, 'precision': 0.5996835443037974, 'recall': 0.1624517788255465, 'f1': 0.25564924114671167}, 'partial': {'correct': 266, 'incorrect': 0, 'partial': 525, 'missed': 3875, 'spurious': 473, 'possible': 4666, 'actual': 1264, 'precision': 0.41811708860759494, 'recall': 0.11326618088298328, 'f1': 0.17824620573355818}, 'strict': {'correct': 251, 'incorrect': 540, 'partial': 0, 'missed': 3875, 'spurious': 473, 'possible': 4666, 'actual': 1264, 'precision': 0.1985759493670886, 'recall': 0.05379339905700815, 'f1': 0.08465430016863407}, 'exact': {'correct': 266, 'incorrect': 525, 'partial': 0, 'missed': 3875, 'spurious': 473, 'possible': 4666, 'actual': 1264, 'precision': 0.21044303797468356, 'recall': 0.057008144020574365, 'f1': 0.08971332209106239}}


# predicting with function similar to llama-approach

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# add_prefix_space necessary fixes issue with prediction function
roberta_tweetner_t = AutoTokenizer.from_pretrained("tner/roberta-base-tweetner7-all", add_prefix_space=True)

roberta_tweetner = AutoModelForTokenClassification.from_pretrained(
    "tner/roberta-base-tweetner7-all",
    device_map="auto",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
# prediction function explicitly handles numpy arrays, not needed for llama
# (tweetner7-tokens are saved as such) and encodes immediately
# function should work for all BERT-/RoBERTa-models
import numpy as np

def bert_pred(model, tokenizer, df, text_col, output_col):

    model.to(model.device)
    model.eval()
    all_predictions = []

    for i, row in df.iterrows():
        text = row[text_col]

        if isinstance(text, np.ndarray):
            if text.dtype.kind == 'U' or text.dtype.kind == 'S':  # if array contains strings
                text_list = text.tolist()
            else:
                text_list = [str(x) for x in text]

            encoded = tokenizer(text_list, is_split_into_words=True, return_tensors="pt")
        elif isinstance(text, str):
            encoded = tokenizer(text, return_tensors="pt")
        elif isinstance(text, list):
            encoded = tokenizer(text, is_split_into_words=True, return_tensors="pt")
        else:
            encoded = tokenizer(str(text), return_tensors="pt")

        # moving encoded text to device
        encoded = {k: v.to(model.device) for k, v in encoded.items()}

        with torch.no_grad():
            outputs = model(**encoded)
            preds = torch.argmax(outputs.logits, dim=-1)[0].cpu().tolist()

            # mapping predictions back to original tokens for token classification
            if isinstance(text, (list, np.ndarray)):
                if isinstance(text, np.ndarray):
                    word_ids = tokenizer(text.tolist(), is_split_into_words=True).word_ids()
                else:
                    word_ids = tokenizer(text, is_split_into_words=True).word_ids()

                processed_preds = []
                prev_word_id = None

                for word_id, pred in zip(word_ids, preds):
                    if word_id is None:
                        continue
                    elif word_id != prev_word_id:
                        processed_preds.append(pred)
                    prev_word_id = word_id

                all_predictions.append(processed_preds)
            else:
                all_predictions.append(preds)

    df[output_col] = all_predictions
    return df

In [None]:
bert_pred(roberta_tweetner, roberta_tweetner_t, tweetner7_test, "tokens", "roberta_pred")

Unnamed: 0,tokens,tags,id,date,true_labels,bert_pred,roberta_prediction_labels,roberta_pred
0,"[New, music, coming, soon, via, {@Columbia Rec...","[14, 14, 14, 14, 14, 0, 14, 14, 14, 14, 14, 14...",1378645447510695937,2021-04-04,"[O, O, O, O, O, B-corporation, O, O, O, O, O, ...","[14, 14, 14, 14, 14, 0, 14, 14, 14, 14, 14, 14...","[O, O, O, O, O, B-corporation, O, O, O, O, O, ...","[14, 14, 14, 14, 14, 0, 14, 14, 14, 14, 14, 14..."
1,"[The, real, faces, of, #, SheikhJarrah, ., Ple...","[14, 14, 14, 14, 14, 5, 14, 14, 14, 14, 14, 14...",1403500440738844672,2021-06-11,"[O, O, O, O, O, B-person, O, O, O, O, O, O, O,...","[14, 14, 14, 14, 14, 5, 14, 14, 14, 14, 14, 14...","[O, O, O, O, O, B-person, O, O, O, O, O, O, O,...","[14, 14, 14, 14, 14, 5, 14, 14, 14, 14, 14, 14..."
2,"[Sister, to, Sister, Meets, Shop, Talk, with, ...","[1, 8, 8, 14, 14, 14, 14, 5, 12, 14, 14, 6, 14...",1310700199036284929,2020-09-28,"[B-creative_work, I-creative_work, I-creative_...","[14, 14, 14, 14, 14, 14, 14, 5, 12, 14, 14, 6,...","[O, O, O, O, O, O, O, B-person, I-person, O, O...","[14, 14, 14, 14, 14, 14, 14, 5, 12, 14, 14, 6,..."
3,"[-, Albin, Ekdal, is, the, 1st, player, from, ...","[14, 5, 12, 14, 14, 14, 14, 14, 4, 14, 14, 14,...",1335702864509988866,2020-12-06,"[O, B-person, I-person, O, O, O, O, O, B-locat...","[14, 5, 12, 14, 14, 14, 14, 14, 4, 14, 14, 14,...","[O, B-person, I-person, O, O, O, O, O, B-locat...","[14, 5, 12, 14, 14, 14, 14, 14, 4, 14, 14, 14,..."
4,"[Time, for, the, main, event, ., {@Israel Ades...","[14, 14, 14, 14, 14, 14, 5, 14, 5, 14, 2]",1368437378851934209,2021-03-07,"[O, O, O, O, O, O, B-person, O, B-person, O, B...","[14, 14, 14, 14, 14, 14, 5, 14, 14, 14, 3]","[O, O, O, O, O, O, B-person, O, O, O, B-group]","[14, 14, 14, 14, 14, 14, 5, 14, 14, 14, 3]"
...,...,...,...,...,...,...,...,...
2802,"[I, know, watching, your, favorite, fighter, l...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1317702078559522817,2020-10-18,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
2803,"[Someone, please, tell, {{USERNAME}}, that, fe...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]",1376262016998989827,2021-03-28,"[O, O, O, O, O, O, O, O, O, O, O]","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]","[O, O, O, O, O, O, O, O, O, O, O]","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]"
2804,"[A, mountaintop, cable, car, plunged, to, the,...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 4, 14...",1396495893101154304,2021-05-23,"[O, O, O, O, O, O, O, O, O, O, B-location, O, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 4, 4, 14,...","[O, O, O, O, O, O, O, O, O, B-location, B-loca...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 4, 4, 14,..."
2805,"[Tjo, guys, ?, A, whole, car, for, her, mom, ?...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1386197571652173824,2021-04-25,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-grou...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."


In [None]:
tweetner7_test["roberta_prediction_labels"] = tweetner7_test["roberta_pred"].apply(lambda x: [id_to_label[i] for i in x])

In [None]:
from nervaluate import Evaluator

true = tweetner7_test["true_labels"].values.tolist()
pred = tweetner7_test["roberta_prediction_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "creative_work", "event", "group", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 5906, 'incorrect': 1462, 'partial': 0, 'missed': 1280, 'spurious': 1253, 'possible': 8648, 'actual': 8621, 'precision': 0.6850713374318524, 'recall': 0.6829324699352451, 'f1': 0.6840002316289305}, 'partial': {'correct': 6618, 'incorrect': 0, 'partial': 750, 'missed': 1280, 'spurious': 1253, 'possible': 8648, 'actual': 8621, 'precision': 0.8111587982832618, 'recall': 0.8086262719703978, 'f1': 0.8098905553303608}, 'strict': {'correct': 5459, 'incorrect': 1909, 'partial': 0, 'missed': 1280, 'spurious': 1253, 'possible': 8648, 'actual': 8621, 'precision': 0.6332212040366547, 'recall': 0.6312442183163737, 'f1': 0.6322311656725925}, 'exact': {'correct': 6618, 'incorrect': 750, 'partial': 0, 'missed': 1280, 'spurious': 1253, 'possible': 8648, 'actual': 8621, 'precision': 0.7676603642268879, 'recall': 0.765263644773358, 'f1': 0.7664601308703457}}


In [22]:
from sklearn.metrics import precision_score, recall_score, classification_report

# function returns dictionary containing precision, recall and f1-score
# should only be used for comparing columns with numeric tags, not string-labels
def evaluate_ner_predictions(df, true_col, pred_col):
    y_true_flat = []
    y_pred_flat = []

    for true_seq, pred_seq in zip(df[true_col], df[pred_col]):
       # skipping padding/special tokens (marked as -100 in ground truth)
       for true_label, pred_label in zip(true_seq, pred_seq):
            if true_label != -100:
               y_true_flat.append(true_label)
               y_pred_flat.append(pred_label)

    precision = precision_score(y_true_flat, y_pred_flat, average='weighted')
    recall = recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0)
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(classification_report(y_true_flat, y_pred_flat))

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
metrics = evaluate_ner_predictions(tweetner7_test, "tags", "bert_pred")

# Testing RoBERTa-tweetner7 with WNUT

In [None]:
import pandas as pd
df_wnut = pd.read_csv("./wnut_complete_processed.csv")
df_wnut.head()

Unnamed: 0,tokens,label_list
0,"['@SammieLynnsMom', '@tg10781', 'they', 'will'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,"['@ls_n', 'perhaps', ',', 'but', 'folks', 'may...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,"['@Carr0t', 'aye', 'been', 'tonight', '-', 'ex...","['O', 'O', 'O', 'O', 'O', 'O']"
3,"['RT', '@LilTwist', ':', 'RT', 'this', 'if', '...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,"['@Hollly_', '16', 'b', '17', 'in', 'feb']","['O', 'O', 'O', 'O', 'O', 'O']"


In [None]:
import ast

for col in ["tokens", "label_list"]:
    df_wnut[col] = df_wnut[col].apply(ast.literal_eval)

In [None]:
type(df_wnut["tokens"][0])

list

In [None]:
df_wnut.head()

Unnamed: 0,tokens,label_list
0,"[@SammieLynnsMom, @tg10781, they, will, be, al...","[O, O, O, O, O, O, O, O, O, O, O, O]"
1,"[@ls_n, perhaps, ,, but, folks, may, find, som...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[@Carr0t, aye, been, tonight, -, excellent]","[O, O, O, O, O, O]"
3,"[RT, @LilTwist, :, RT, this, if, you, want, me...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-c..."
4,"[@Hollly_, 16, b, 17, in, feb]","[O, O, O, O, O, O]"


In [None]:
df_wnut["tags"] = df_wnut["label_list"].apply(lambda x: [label_to_id[label] for label in x])

In [None]:
import numpy as np
import torch

# reworked prediction function with max_length and error exceptions to handle cuda runtime error
def bert_pred(model, tokenizer, df, text_col, output_col, max_length=128, batch_size=32):
    device = model.device
    print(f"Using device: {device}")

    # printing model config info
    if hasattr(model, 'config'):
        print(f"Model has {model.config.num_labels} labels")

    model.eval()
    all_predictions = []

    for i, row in df.iterrows():
        try:
            text = row[text_col]

            if isinstance(text, np.ndarray):
                if text.dtype.kind == 'U' or text.dtype.kind == 'S':
                    text_list = text.tolist()
                else:
                    text_list = [str(x) for x in text]

                if not text_list:
                    all_predictions.append([])
                    continue

                encoded = tokenizer(
                    text_list,
                    is_split_into_words=True,
                    return_tensors="pt",
                    max_length=max_length,
                    truncation=True,
                    padding="max_length"
                )
            elif isinstance(text, str):
                if not text.strip():
                    all_predictions.append([])
                    continue

                encoded = tokenizer(
                    text,
                    return_tensors="pt",
                    max_length=max_length,
                    truncation=True,
                    padding="max_length"
                )
            elif isinstance(text, list):
                if not text:
                    all_predictions.append([])
                    continue

                encoded = tokenizer(
                    text,
                    is_split_into_words=True,
                    return_tensors="pt",
                    max_length=max_length,
                    truncation=True,
                    padding="max_length"
                )
            else:
                encoded = tokenizer(
                    str(text),
                    return_tensors="pt",
                    max_length=max_length,
                    truncation=True,
                    padding="max_length"
                )

            encoded = {k: v.to(device) for k, v in encoded.items()}

            if i == 0:
                print(f"Input shape: {encoded['input_ids'].shape}")

            with torch.no_grad():
                outputs = model(**encoded)
                preds = torch.argmax(outputs.logits, dim=-1)[0].cpu().tolist()

                if isinstance(text, (list, np.ndarray)):
                    if isinstance(text, np.ndarray):
                        word_ids = tokenizer(
                            text.tolist(),
                            is_split_into_words=True,
                            max_length=max_length,
                            truncation=True,
                            padding="max_length"
                        ).word_ids()
                    else:
                        word_ids = tokenizer(
                            text,
                            is_split_into_words=True,
                            max_length=max_length,
                            truncation=True,
                            padding="max_length"
                        ).word_ids()

                    processed_preds = []
                    prev_word_id = None

                    for word_id, pred in zip(word_ids, preds):
                        if word_id is None:
                            continue
                        elif word_id != prev_word_id:
                            processed_preds.append(pred)
                        prev_word_id = word_id

                    all_predictions.append(processed_preds)
                else:
                    all_predictions.append(preds)

        except Exception as e:
            print(f"Error in row {i}: {e}")
            # adding empty predictions for failed rows
            all_predictions.append([])
            continue

    df[output_col] = all_predictions
    return df

In [None]:
bert_pred(roberta_tweetner, roberta_tweetner_t, df_wnut, "tokens", "roberta_pred")

Using device: cuda:0
Model has 15 labels
Input shape: torch.Size([1, 512])


Unnamed: 0,tokens,label_list,tags,roberta_pred
0,"[@SammieLynnsMom, @tg10781, they, will, be, al...","[O, O, O, O, O, O, O, O, O, O, O, O]","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]","[5, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]"
1,"[@ls_n, perhaps, ,, but, folks, may, find, som...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
2,"[@Carr0t, aye, been, tonight, -, excellent]","[O, O, O, O, O, O]","[14, 14, 14, 14, 14, 14]","[5, 14, 14, 14, 14, 14]"
3,"[RT, @LilTwist, :, RT, this, if, you, want, me...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-c...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[14, 5, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
4,"[@Hollly_, 16, b, 17, in, feb]","[O, O, O, O, O, O]","[14, 14, 14, 14, 14, 14]","[5, 14, 14, 14, 14, 14]"
...,...,...,...,...
6865,"[@null, http://t.co/3O3Z3xyNC0, December, 25, ...","[O, O, O, O, O, O, O, O, O, O, O, O]","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]"
6866,"[On, January, 22nd, ,, from, 11-, 1.30, ,, TVP...","[O, O, O, O, O, O, O, O, B-location, O, O, O, ...","[14, 14, 14, 14, 14, 14, 14, 14, 4, 14, 14, 14...","[14, 14, 14, 14, 14, 14, 14, 14, 0, 14, 14, 14..."
6867,"[Found, this, cool, photo, ,, not, mine, Love,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
6868,"[Conditioning, plan, all, made, up, for, tomor...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."


In [None]:
metrics = evaluate_ner_predictions(df_wnut, "tags", "roberta_pred")

              precision    recall  f1-score   support

           0       0.48      0.65      0.55       797
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.74      0.71      0.73      1230
           5       0.30      0.73      0.43      1312
           6       0.45      0.40      0.42       436
           7       0.47      0.60      0.53       265
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.46      0.71      0.56       433
          12       0.55      0.81      0.66       583
          13       0.57      0.36      0.44       684
          14       0.99      0.95      0.97    129765

    accuracy                           0.93    135505
   macro avg       0.33      0.39      0.35    135505
weighted avg       0.97   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Testing and Finetuning RoBERTa-tweetner7 with synthetic data

In [None]:
import pandas as pd
df_syn = pd.read_csv("./NER_data/synthetic_tweet_data_grouped2.csv")
df_syn_sm = df_syn[:25000]
df_syn_sm.head()

Unnamed: 0,tokens,labels
0,"[I, ', m, amazed, at, how, Smith, transformed,...","[O, O, O, O, O, O, B-person, O, B-company, O, ..."
1,"[Davids, just, announced, the, new, GPT, -, 3,...","[B-person, O, O, O, O, O, O, O, O, O, O, B-com..."
2,"[Spending, my, weekend, tinkering, with, Samsu...","[O, O, O, O, O, B-company, O, O, B-product, O,..."
3,"[I, love, my, new, Echo, ,, thanks, Sony, !]","[O, O, O, O, B-product, O, O, B-company, O]"
4,"[Less, than, 2, hours, until, we, announce, th...","[O, O, O, O, O, O, O, O, O, O, O, B-product, O..."


In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_syn_sm)

In [None]:
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
entity_dict_syn = {
    0: "B-corporation",
    1: "B-person",
    2: "B-product",
    3: "I-corporation",
    4: "I-person",
    5: "I-product",
    6: "O"
}

label_list = list(entity_dict_syn.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

# def encode_labels(ds):
#      ds['new_tags'] = [label_to_id[label] for label in ds['labels']]
#      return ds

# def encode_labels(ds):
#     ds['new_tags'] = [
#         [label_to_id[label] for label in labels] if isinstance(labels, list) else []
#         for labels in ds['labels']
#     ]
#     return ds

# train_dataset = train_dataset.map(encode_labels)
# test_dataset = test_dataset.map(encode_labels)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

KeyError: '['

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = roberta_tweetner_t(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    all_labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None

        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id != previous_word_id:
                try:
                    label_ids.append(label[word_id])
                except IndexError:
                    label_ids.append(-100)
            else:
                label_ids.append(-100)
            previous_word_id = word_id

        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

## new syn data gen approach

In [None]:
persons = [
        "John Smith", "Emily Chen", "Michael Johnson", "Sarah Williams",
        "David Lee", "Maria Rodriguez", "James Brown", "Davis",
        "Robert Kim", "Jennifer Lopez", "Thomas Wilson", "Jessica Taylor", "Cook",
        "Carlos Vega", "Aisha Patel", "Daniel Park", "Olivia Nguyen", "Musk", "Smith"
    ]

corporations = [
        "Google", "Microsoft", "Apple", "Amazon", "Meta", "BlackBerry",
        "IBM", "Tesla", "Netflix", "Walmart", "JP Morgan",
        "Acme Corp", "TechSolutions", "Global Systems", "DataWorks",
        "Quantum Industries", "NexGen", "FutureSpace", "EcoSystems", "Nokia", "Motorola"
    ]

products = [
        "iPhone 13", "Galaxy S22", "Surface Pro", "PlayStation 5", "Xbox Series X",
        "MacBook Air", "Echo Dot", "AirPods Pro", "Tesla Model 3", "iPad Mini",
        "Dyson V11", "Fitbit Charge", "Nintendo Switch", "Kindle Paperwhite",
        "Roomba i7", "GoPro Hero", "Bose QuietComfort", "Instant Pot", "Echo", "AirTag", "ThinkPad"
    ]

events = [
        "CES 2023", "Web Summit", "SXSW", "TechCrunch Disrupt", "E3 Expo",
        "Google I/O", "WWDC", "Consumer Electronics Show", "Mobile World Congress",
        "Black Hat Conference", "DEF CON", "AWS re:Invent", "Game Developers Conference",
        "Dreamforce", "Comic-Con", "Coachella", "New York Fashion Week", "GamesCom", "AI-Con"
    ]

locations = [
        "New York", "San Francisco", "London", "Tokyo", "Berlin",
        "Paris", "Sydney", "Toronto", "Chicago", "Seattle",
        "Los Angeles", "Miami", "Singapore", "Hong Kong", "Milwaukee",
        "Dubai", "Barcelona", "Austin", "Stockholm", "Seoul", "Vienna"
    ]

templates = [
        "{person} from {corporation} announced that {product} will be showcased at {event}.",
        "At {event}, {person} demonstrated how {product} is revolutionizing {corporation}'s approach in {location}.",
        "{corporation} has selected {location} as the venue for {event}, where {person} will launch {product}.",
        "The new {product} developed by {corporation} will be presented by {person} during {event} in {location}.",
        "{person} confirmed that {corporation} will be expanding its {product} line.",
        "According to {person}, {corporation}'s latest {product} has been well-received at {event} in {location}.",
        "Reviews from {event} suggest that {person} made a strong case for {corporation}'s new {product} in the {location} market.",
        "{corporation} is planning to open a {product} store in {location}, announced {person} at {event}.",
        "The collaboration between {corporation} and {person} resulted in {product}, which will finally debut at {event} in {location}.",
        "Attendees at {event} in {location} were very impressed when {person} revealed {corporation}'s innovative {product}! The clapping didnt stop",
        "{person} traveled to {location} to promote {product} at {event} on behalf of {corporation}.",
        "The {product} team from {corporation}, led by {person}, won first prize at {event} in {location}. Let's go!",
        "Consumers in {location} can now purchase {product} after {corporation}'s expansion announcement by {person} at {event}.",
        "{product} is the must-have gadget of the year.",
        "I hate the new {product}, the older ones are much better.",
        "Less than 2 hours until they announce the details on the {product} giveaway!",
        "All eyes are on {corporation} after the announcement of their new {product}.",
        "It's time for {person} to leave {corporation}. What is he even doing.",
        "{corporation} has been selected as the top AI startup in {location}, wow!",
        "I am having so many issues with the {product}. {corporation} needs to fix this!",
        "Can not wait for {product} also. They should sell them down at {event}.",
        "Whats happening at {corporation}? {person} really needs to step up.",
        "{corporation} is giving free {product} to open source coders who are attending this meet-up.",
        "{person} was right! The {product} from {corporation} is revolutionary!",
        "Less than 2 hours until we announce the details on the {product} giveaway!",
        "{corporation} CEO {person}: Newest {product} rollout will begin next month!",
        "{corporation} has a temporary Retail Store in {location} for the {product} release today. Opens at 5pm.",
        "{person} said that {corporation} is working on something big.",
        "It's time for {person} to leave {corporation}. What is he even doing?",
        "{corporation} just keeps raising the bar with every {product} they launch. Crazy!",
        "{person} just hinted at new features in {corporation}'s upcoming {product}. I am hyped!",
        "Rumors say {corporation} is releasing {product} soon.",
        "Just watched the {corporation} keynote. {product} looks impressive.",
        "Is it just me, or does {corporation}'s {product} feel rushed and unfinished?",
        "The {product} is making me rethink my loyalty to {corporation}. Its not good.",
        "Who else is gonna get the new {product} next month?",
        "{corporation}'s industry party tonight was great for the launch of {product}.",
        "Attending {event} this week! Can't wait to see what {corporation} unveils about their upcoming {product}. Anyone else going?",
        "Is anyone else experiencing issues with the new {product} update? {corporation}'s support hasn't been helpful. #TechSupport",
        "Just switched from {product} to {product} and the difference is incredible. {corporation} really cooked with this one!",
        "Hot take: {corporation}'s approach to development is outdated. They need to focus more on usability if they want to compete with {corporation}",
        "The new update to {product} completely revolutionized my workflow. Thanks {corporation} for fixing the issue! #ProductivityTech",
        "Arrived at {event} in {location}! The {corporation} booth is already packed with people trying the new {product}. #TechConference",
        "Just spotted {person} from {corporation} at a restaurant in {location} right after {event}. Tried to ask about {product} rumors but no comment!",
        "I snuck into the VIP section at {event} in {location} and got a selfie with {person}! Check my Insta! #Winning",
        "PSA: Free {product} giveaways at {corporation}'s booth at {event} in {location}! Run don't walk, peeps! I got the last blue one",
        "This {product} launch line at {corporation}'s store in {location} is ridiculous. Been here 3hrs and moved like 10 feet, But I NEED it today! #TechAddict",
        "Shoutout to the nice {corporation} rep at {event} in {location} who gave me an extra {product} for my kid! Some tech people are actually decent humans",
        "My {product} just updated itself and now I can't find ANYTHING. Hey {corporation}, stop 'fixing' stuff that ain't broken! {person} needs to chill with these changes",
        "Omg {person} just liked my tweet criticizing {corporation}'s {product}! Screenshot this before they realize and unlike!",
        "The way {person} casually uses {product} in interviews makes it seem so cool, but when I bought it from {corporation} it's just... meh. Marketing wins again",
        "new CEO {person} has really not done much yet at {corporation}, hasnt he?"
    ]

In [None]:
len(templates)

52

In [None]:
import random
import pandas as pd

# reworked new function
def generate_ner_dataset(num_examples, templates=templates,persons=persons, corporations=corporations,
                         products=products, events=events, locations=locations):
    data = []

    for _ in range(num_examples):
        template = random.choice(templates)

        person = random.choice(persons)
        corporation = random.choice(corporations)
        product = random.choice(products)
        event = random.choice(events)
        location = random.choice(locations)

        sentence = template.format(
            person=person,
            corporation=corporation,
            product=product,
            event=event,
            location=location
        )

        tokens = []
        labels = []

        raw_words = []
        current_word = ""
        for char in sentence:
            if char.isalnum() or char in "-'":
                current_word += char
            else:
                if current_word:
                    raw_words.append(current_word)
                    current_word = ""
                if not char.isspace():
                    raw_words.append(char)
        if current_word:
            raw_words.append(current_word)

        i = 0
        while i < len(raw_words):
            token = raw_words[i]

            found_entity = False

            if i < len(raw_words) - 1 and f"{raw_words[i]} {raw_words[i+1]}" in persons:
                tokens.append(raw_words[i])
                tokens.append(raw_words[i+1])
                labels.append("B-person")
                labels.append("I-person")
                i += 2
                found_entity = True

            elif i < len(raw_words) - 2 and f"{raw_words[i]} {raw_words[i+1]} {raw_words[i+2]}" in products:
                tokens.append(raw_words[i])
                tokens.append(raw_words[i+1])
                tokens.append(raw_words[i+2])
                labels.append("B-product")
                labels.append("I-product")
                labels.append("I-product")
                i += 3
                found_entity = True
            elif i < len(raw_words) - 1 and f"{raw_words[i]} {raw_words[i+1]}" in products:
                tokens.append(raw_words[i])
                tokens.append(raw_words[i+1])
                labels.append("B-product")
                labels.append("I-product")
                i += 2
                found_entity = True

            elif i < len(raw_words) - 2 and f"{raw_words[i]} {raw_words[i+1]} {raw_words[i+2]}" in events:
                tokens.append(raw_words[i])
                tokens.append(raw_words[i+1])
                tokens.append(raw_words[i+2])
                labels.append("B-event")
                labels.append("I-event")
                labels.append("I-event")
                i += 3
                found_entity = True
            elif i < len(raw_words) - 1 and f"{raw_words[i]} {raw_words[i+1]}" in events:
                tokens.append(raw_words[i])
                tokens.append(raw_words[i+1])
                labels.append("B-event")
                labels.append("I-event")
                i += 2
                found_entity = True

            elif i < len(raw_words) - 1 and f"{raw_words[i]} {raw_words[i+1]}" in locations:
                tokens.append(raw_words[i])
                tokens.append(raw_words[i+1])
                labels.append("B-location")
                labels.append("I-location")
                i += 2
                found_entity = True

            if not found_entity:
                if token in [name.split()[0] for name in persons]:
                    tokens.append(token)
                    labels.append("B-person")
                    i += 1
                elif token in corporations:
                    tokens.append(token)
                    labels.append("B-corporation")
                    i += 1
                elif token in products:
                    tokens.append(token)
                    labels.append("B-product")
                    i += 1
                elif token in events:
                    tokens.append(token)
                    labels.append("B-event")
                    i += 1
                elif token in locations:
                    tokens.append(token)
                    labels.append("B-location")
                    i += 1
                else:
                    tokens.append(token)
                    labels.append("O")
                    i += 1

        data.append({"tokens": tokens, "labels": labels, "sentence": sentence})

    df = pd.DataFrame(data)

    return df

In [None]:
df_syn_new3 = generate_ner_dataset(10000)
df_syn_new3.head()

Unnamed: 0,tokens,labels,sentence
0,"[Just, spotted, Aisha, Patel, from, Microsoft,...","[O, O, B-person, I-person, O, B-corporation, O...",Just spotted Aisha Patel from Microsoft at a r...
1,"[Jessica, Taylor, traveled, to, San, Francisco...","[B-person, I-person, O, O, B-location, I-locat...",Jessica Taylor traveled to San Francisco to pr...
2,"[Less, than, 2, hours, until, they, announce, ...","[O, O, O, O, O, O, O, O, O, O, O, B-product, I...",Less than 2 hours until they announce the deta...
3,"[At, WWDC, ,, Aisha, Patel, demonstrated, how,...","[O, B-event, O, B-person, I-person, O, O, B-pr...","At WWDC, Aisha Patel demonstrated how Nintendo..."
4,"[The, Roomba, i7, is, making, me, rethink, my,...","[O, B-product, I-product, O, O, O, O, O, O, O,...",The Roomba i7 is making me rethink my loyalty ...


In [None]:
# converting named-tags to numeric tags for comparison with model output
def convert_to_numeric(label_list):
    return [label_to_id.get(label, -100) for label in label_list]

df_syn_new3["tags"] = df_syn_new3["labels"].apply(convert_to_numeric)

## predicting with RoBERTa-TweetNER7 on syn data

In [None]:
bert_pred(roberta_tweetner, roberta_tweetner_t, df_syn_new3, "tokens", "roberta_pred")

Unnamed: 0,tokens,labels,sentence,tags,roberta_pred
0,"[Just, spotted, Aisha, Patel, from, Microsoft,...","[O, O, B-person, I-person, O, B-corporation, O...",Just spotted Aisha Patel from Microsoft at a r...,"[14, 14, 5, 12, 14, 0, 14, 14, 14, 14, 4, 14, ...","[14, 14, 5, 12, 14, 0, 14, 14, 14, 14, 4, 14, ..."
1,"[Jessica, Taylor, traveled, to, San, Francisco...","[B-person, I-person, O, O, B-location, I-locat...",Jessica Taylor traveled to San Francisco to pr...,"[5, 12, 14, 14, 4, 11, 14, 14, 6, 13, 14, 2, 9...","[5, 12, 14, 14, 4, 11, 14, 14, 6, 13, 14, 2, 9..."
2,"[Less, than, 2, hours, until, they, announce, ...","[O, O, O, O, O, O, O, O, O, O, O, B-product, I...",Less than 2 hours until they announce the deta...,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 6...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 6..."
3,"[At, WWDC, ,, Aisha, Patel, demonstrated, how,...","[O, B-event, O, B-person, I-person, O, O, B-pr...","At WWDC, Aisha Patel demonstrated how Nintendo...","[14, 2, 14, 5, 12, 14, 14, 6, 13, 14, 14, 14, ...","[14, 2, 14, 5, 12, 14, 14, 6, 13, 14, 14, 0, 1..."
4,"[The, Roomba, i7, is, making, me, rethink, my,...","[O, B-product, I-product, O, O, O, O, O, O, O,...",The Roomba i7 is making me rethink my loyalty ...,"[14, 6, 13, 14, 14, 14, 14, 14, 14, 14, 0, 14,...","[14, 6, 13, 14, 14, 14, 14, 14, 14, 14, 0, 14,..."
...,...,...,...,...,...
9995,"[All, eyes, are, on, TechSolutions, after, the...","[O, O, O, O, B-corporation, O, O, O, O, O, O, ...",All eyes are on TechSolutions after the announ...,"[14, 14, 14, 14, 0, 14, 14, 14, 14, 14, 14, 6,...","[14, 14, 14, 14, 0, 14, 14, 14, 14, 14, 14, 6,..."
9996,"[The, way, Maria, Rodriguez, casually, uses, A...","[O, O, B-person, I-person, O, O, B-product, O,...",The way Maria Rodriguez casually uses AirTag i...,"[14, 14, 5, 12, 14, 14, 6, 14, 14, 14, 14, 14,...","[14, 14, 5, 12, 14, 14, 6, 14, 14, 14, 14, 14,..."
9997,"[Attendees, at, DEF, CON, in, Chicago, were, v...","[O, O, B-event, I-event, O, B-location, O, O, ...",Attendees at DEF CON in Chicago were very impr...,"[14, 14, 2, 9, 14, 4, 14, 14, 14, 14, 5, 14, 1...","[14, 14, 2, 9, 14, 4, 14, 14, 14, 14, 5, 14, 5..."
9998,"[I, snuck, into, the, VIP, section, at, AWS, r...","[O, O, O, O, O, O, O, O, O, O, O, O, B-locatio...",I snuck into the VIP section at AWS re:Invent ...,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[14, 14, 14, 14, 14, 14, 14, 2, 9, 14, 9, 14, ..."


In [None]:
df_syn_new3["roberta_pred_labels"] = df_syn_new3["roberta_pred"].apply(tags_to_labels)

In [None]:
from nervaluate import Evaluator

true = df_syn_new3["labels"].values.tolist()
pred = df_syn_new3["roberta_pred_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "creative_work", "event", "group", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 24502, 'incorrect': 656, 'partial': 0, 'missed': 590, 'spurious': 4472, 'possible': 25748, 'actual': 29630, 'precision': 0.8269321633479582, 'recall': 0.9516078918750971, 'f1': 0.8849001408501571}, 'partial': {'correct': 24756, 'incorrect': 0, 'partial': 402, 'missed': 590, 'spurious': 4472, 'possible': 25748, 'actual': 29630, 'precision': 0.8422882213972326, 'recall': 0.9692791673139661, 'f1': 0.9013326591787352}, 'strict': {'correct': 24408, 'incorrect': 750, 'partial': 0, 'missed': 590, 'spurious': 4472, 'possible': 25748, 'actual': 29630, 'precision': 0.8237597030037125, 'recall': 0.9479571228833308, 'f1': 0.8815052909097476}, 'exact': {'correct': 24756, 'incorrect': 402, 'partial': 0, 'missed': 590, 'spurious': 4472, 'possible': 25748, 'actual': 29630, 'precision': 0.8355045561930475, 'recall': 0.9614727357464657, 'f1': 0.8940734587742424}}


In [None]:
evaluate_ner_predictions(df_syn_new3, "tags", "roberta_pred")

              precision    recall  f1-score   support

           0       0.56      0.89      0.69      5368
           1       0.00      0.00      0.00         0
           2       0.80      0.98      0.88      2960
           3       0.00      0.00      0.00         0
           4       0.97      0.94      0.96      3837
           5       0.98      0.94      0.96      4950
           6       0.99      0.99      0.99      8633
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.68      1.00      0.81      2487
          10       0.00      0.00      0.00         0
          11       0.82      0.75      0.79       887
          12       0.97      1.00      0.98      3927
          13       1.00      1.00      1.00      8223
          14       1.00      0.95      0.97    143317

    accuracy                           0.95    184589
   macro avg       0.58      0.63      0.60    184589
weighted avg       0.97   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'precision': 0.9727139768881184,
 'recall': 0.9536104534939785,
 'f1': 0.9630674895570083}

## fine-tuning

In [None]:
from datasets import Dataset
dataset_syn = Dataset.from_pandas(df_syn_new3)

In [None]:
train_test_split = dataset_syn.train_test_split(test_size=0.2)
train_dataset_syn = train_test_split['train']
test_dataset_syn = train_test_split['test']

In [None]:
label_map = {tag: id for id, tag in enumerate(label_list)}

In [None]:
# works with synthetic data from above
def tokenize_and_align_labels(examples):
    tokenized_inputs = roberta_tweetner_t(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []

        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id != previous_word_id:
                if isinstance(label[word_id], str):
                    # converting string label to id
                    label_ids.append(label_map[label[word_id]])
                else:
                    label_ids.append(label[word_id])
            else:
                label_ids.append(-100)
            previous_word_id = word_id

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
train_dataset_syn = train_dataset_syn.map(tokenize_and_align_labels, batched=True)
test_dataset_syn = test_dataset_syn.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # flattening predictions and labels
    true_labels = [id_to_label[label] for label in labels.flatten() if label != -100]
    true_predictions = [id_to_label[pred] for (pred, label) in zip(predictions.flatten(), labels.flatten()) if label != -100]

    # calculating metrics
    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

In [None]:
training_args = TrainingArguments(
    output_dir="./roberta-tweetner-syndata",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=1e-5,
    per_device_train_batch_size=64, # adjust batch size depending on RAM
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2
)

trainer = Trainer(
    model=roberta_tweetner,
    args=training_args,
    train_dataset=train_dataset_syn,
    eval_dataset=test_dataset_syn,
    tokenizer=roberta_tweetner_t,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

KeyboardInterrupt: 

# Finetuning RoBERTa-base with corpus

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification

roberta_t = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)

entity_dict = {
    0: "B-corporation",
    1: "B-event",
    2: "B-location",
    3: "B-person",
    4: "B-product",
    5: "I-corporation",
    6: "I-event",
    7: "I-location",
    8: "I-person",
    9: "I-product",
    10: "O"
}

label_list = list(entity_dict.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

roberta = AutoModelForTokenClassification.from_pretrained(
    "FacebookAI/roberta-base",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import pandas as pd

df_corpus = pd.read_json("NER_corpus.json", orient="records")
df_corpus.rename(columns={"labels":"original_labels"}, inplace=True)
df_corpus.head()

Unnamed: 0,tokens,original_labels,tags
0,"[., {{MENTION}}, stand, tall, in, the, red, zo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
1,"[Who, else, is, gonna, get, the, new, Surface,...","[O, O, O, O, O, O, O, B-product, I-product, O,...","[10, 10, 10, 10, 10, 10, 10, 4, 9, 10, 10, 10]"
2,"[EDIE, IN, BETWEEN, comes, out, THIS, MONTH, ,...","[B-person, O, O, O, O, O, O, O, O, O, O, O, O,...","[3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
3,"[‘, Joe, Biden, may, have, won, the, keys, to,...","[O, B-person, I-person, O, O, O, O, O, O, O, B...","[10, 3, 8, 10, 10, 10, 10, 10, 10, 10, 2, 7, 1..."
4,"[Imma, hit, reload, until, I, see, additional,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."


In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_corpus)

def tokenize_and_align_labels(examples):
    tokenized_inputs = roberta_t(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                label_ids.append(-100)
            elif word_id != previous_word_id:  # First token of a word
                label_ids.append(label[word_id])
            else:  # Subword tokens
                label_ids.append(-100)
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/24835 [00:00<?, ? examples/s]

In [None]:
train_test_split1 = tokenized_datasets.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1["train"]
test_dataset = train_test_split1["test"]

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2["train"]
val_dataset = train_test_split2["test"]

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # flattening predictions and labels
    true_labels = [id_to_label[label] for label in labels.flatten() if label != -100]
    true_predictions = [id_to_label[pred] for (pred, label) in zip(predictions.flatten(), labels.flatten()) if label != -100]

    # calculating metrics
    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

training_args = TrainingArguments(
    output_dir="./ner-roberta-base",
    evaluation_strategy="epoch", # evaluating at every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust batch size depending on RAM
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=roberta,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=roberta_t,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhausbichler-georg[0m ([33mhausbichler-georg-wirtschaftsuniversit-t-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1524,0.154643,0.952661,0.954559,0.952034
2,0.1522,0.14783,0.953895,0.955819,0.954123
3,0.129,0.151375,0.954521,0.956487,0.95507


TrainOutput(global_step=1683, training_loss=0.15927138216284437, metrics={'train_runtime': 571.6023, 'train_samples_per_second': 94.167, 'train_steps_per_second': 2.944, 'total_flos': 3516425909862912.0, 'train_loss': 0.15927138216284437, 'epoch': 3.0})

In [None]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.15595071017742157,
 'eval_precision': 0.9532918441670758,
 'eval_recall': 0.9553523858235833,
 'eval_f1': 0.9539312435811959,
 'eval_runtime': 12.5476,
 'eval_samples_per_second': 296.948,
 'eval_steps_per_second': 9.324,
 'epoch': 3.0}

In [None]:
trainer.save_model("./roberta-base-after-corpus")

## applying the model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

roberta_corpus_t = AutoTokenizer.from_pretrained("./roberta-base-after-corpus")
roberta_corpus = AutoModelForTokenClassification.from_pretrained(
    "./roberta-base-after-corpus",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id)

In [None]:
df_corpus_test = test_dataset.to_pandas()
df_corpus_test.head()

Unnamed: 0,tokens,original_labels,tags,input_ids,attention_mask,labels
0,"[There, has, been, no, electricity, for, last,...","[O, O, O, O, O, O, O, O, O, O, B-location, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 2, 10...","[0, 345, 34, 57, 117, 4382, 13, 94, 158, 722, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,..."
1,"[2-0, to, West, Ham, at, half-time, ., Think, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 132, 12, 288, 7, 580, 3600, 23, 457, 12, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, -100, -100, 10, 10, 10, 10, 10, -10..."
2,"[I, do, n't, know, why, this, {{MENTION}}, tea...","[O, O, O, O, O, O, B-corporation, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 0, 10, 10, 10, 10, 10...","[0, 38, 109, 295, 75, 216, 596, 42, 47517, 126...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, -100, 10, 10, 10, 0, -100, ..."
3,"[I, guess, it, follows, that, President, Trump...","[O, O, O, O, O, B-person, I-person, O, O, O, O...","[10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10, 10,...","[0, 38, 4443, 24, 3905, 14, 270, 140, 222, 295...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 3, 8, 10, 10, -100,..."
4,"[Suddenly, getting, Classic, WoW, Naxx, tips, ...","[O, O, O, O, B-product, O, O, B-person, O, O, ...","[10, 10, 10, 10, 4, 10, 10, 3, 10, 10, 10, 10,...","[0, 26825, 562, 7175, 20963, 771, 234, 3631, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, -100, 4, -100, -100, 10..."


In [None]:
bert_pred(roberta_corpus, roberta_corpus_t, df_corpus_test, "tokens", "roberta_pred")

Using device: cuda
Model has 11 labels
Input shape: torch.Size([1, 128])


Unnamed: 0,tokens,original_labels,tags,input_ids,attention_mask,labels,roberta_pred
0,"[There, has, been, no, electricity, for, last,...","[O, O, O, O, O, O, O, O, O, O, B-location, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 2, 10...","[0, 345, 34, 57, 117, 4382, 13, 94, 158, 722, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 2, 10..."
1,"[2-0, to, West, Ham, at, half-time, ., Think, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 132, 12, 288, 7, 580, 3600, 23, 457, 12, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, -100, -100, 10, 10, 10, 10, 10, -10...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
2,"[I, do, n't, know, why, this, {{MENTION}}, tea...","[O, O, O, O, O, O, B-corporation, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 0, 10, 10, 10, 10, 10...","[0, 38, 109, 295, 75, 216, 596, 42, 47517, 126...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, -100, 10, 10, 10, 0, -100, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
3,"[I, guess, it, follows, that, President, Trump...","[O, O, O, O, O, B-person, I-person, O, O, O, O...","[10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10, 10,...","[0, 38, 4443, 24, 3905, 14, 270, 140, 222, 295...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 3, 8, 10, 10, -100,...","[10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10, 10,..."
4,"[Suddenly, getting, Classic, WoW, Naxx, tips, ...","[O, O, O, O, B-product, O, O, B-person, O, O, ...","[10, 10, 10, 10, 4, 10, 10, 3, 10, 10, 10, 10,...","[0, 26825, 562, 7175, 20963, 771, 234, 3631, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, -100, 4, -100, -100, 10...","[10, 10, 10, 10, 10, 10, 10, 3, 10, 10, 10, 10..."
...,...,...,...,...,...,...,...
3721,"[Rumors, say, IBM, is, releasing, AirTag, soon...","[O, O, B-corporation, O, O, B-product, O, O]","[10, 10, 0, 10, 10, 4, 10, 10]","[0, 13772, 994, 224, 11510, 16, 8704, 1754, 45...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[-100, 10, -100, 10, 0, 10, 10, 4, -100, 10, 1...","[10, 10, 0, 10, 10, 4, 10, 10]"
3722,"[New, Music, +Video, :, {{MENTION}}, -, John, ...","[O, O, O, O, O, O, B-person, I-person, O, O, O]","[10, 10, 10, 10, 10, 10, 3, 8, 10, 10, 10]","[0, 188, 3920, 2055, 17967, 4832, 47517, 12613...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, -100, 10, 10, -100, -100, -...","[10, 10, 10, 10, 3, 10, 3, 8, 10, 10, 10]"
3723,"[The, American, people, are, taking, this, pan...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 20, 470, 82, 32, 602, 42, 23387, 14414, 36...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 10, 10, -100, 10, 1...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
3724,"[Happy, Mother, ’s, Day, !, Truly, the, most, ...","[B-event, I-event, I-event, I-event, O, O, O, ...","[1, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 9899, 8133, 44, 27, 29, 1053, 27785, 39003...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, 6, 6, -100, -100, 6, 10, 10, 10, 10,...","[1, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, 1..."


In [None]:
from nervaluate import Evaluator

df_corpus_test["pred_labels"] = df_corpus_test["roberta_pred"].apply(lambda x: [id_to_label[i] for i in x])

true = df_corpus_test["original_labels"].values.tolist()
pred = df_corpus_test["pred_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 5310, 'incorrect': 580, 'partial': 0, 'missed': 1148, 'spurious': 942, 'possible': 7038, 'actual': 6832, 'precision': 0.7772248243559718, 'recall': 0.7544757033248082, 'f1': 0.7656813266041816}, 'partial': {'correct': 5544, 'incorrect': 0, 'partial': 346, 'missed': 1148, 'spurious': 942, 'possible': 7038, 'actual': 6832, 'precision': 0.8367974238875878, 'recall': 0.8123046319977266, 'f1': 0.8243691420331651}, 'strict': {'correct': 5037, 'incorrect': 853, 'partial': 0, 'missed': 1148, 'spurious': 942, 'possible': 7038, 'actual': 6832, 'precision': 0.7372658079625293, 'recall': 0.7156862745098039, 'f1': 0.7263157894736842}, 'exact': {'correct': 5544, 'incorrect': 346, 'partial': 0, 'missed': 1148, 'spurious': 942, 'possible': 7038, 'actual': 6832, 'precision': 0.8114754098360656, 'recall': 0.7877237851662404, 'f1': 0.7994232155731796}}


In [None]:
print(results_by_tag)

{'corporation': {'ent_type': {'correct': 708, 'incorrect': 255, 'partial': 0, 'missed': 188, 'spurious': 129, 'possible': 1151, 'actual': 1092, 'precision': 0.6483516483516484, 'recall': 0.6151172893136403, 'f1': 0.6312973695942934}, 'partial': {'correct': 915, 'incorrect': 0, 'partial': 48, 'missed': 188, 'spurious': 129, 'possible': 1151, 'actual': 1092, 'precision': 0.8598901098901099, 'recall': 0.8158123370981755, 'f1': 0.8372715113687027}, 'strict': {'correct': 686, 'incorrect': 277, 'partial': 0, 'missed': 188, 'spurious': 129, 'possible': 1151, 'actual': 1092, 'precision': 0.6282051282051282, 'recall': 0.5960034752389227, 'f1': 0.6116807846633973}, 'exact': {'correct': 915, 'incorrect': 48, 'partial': 0, 'missed': 188, 'spurious': 129, 'possible': 1151, 'actual': 1092, 'precision': 0.8379120879120879, 'recall': 0.7949609035621199, 'f1': 0.8158716005349977}}, 'event': {'ent_type': {'correct': 612, 'incorrect': 51, 'partial': 0, 'missed': 307, 'spurious': 173, 'possible': 970, 'ac

In [None]:
metrics = evaluate_ner_predictions(df_corpus_test, "tags", "roberta_pred")

              precision    recall  f1-score   support

           0       0.74      0.60      0.66      1151
           1       0.71      0.58      0.64       970
           2       0.81      0.81      0.81      1089
           3       0.77      0.82      0.80      2426
           4       0.82      0.74      0.78      1402
           5       0.51      0.30      0.38       232
           6       0.77      0.61      0.68       871
           7       0.68      0.60      0.64       359
           8       0.88      0.87      0.87      1082
           9       0.82      0.75      0.79       961
          10       0.98      0.98      0.98     83303

    accuracy                           0.96     93846
   macro avg       0.77      0.70      0.73     93846
weighted avg       0.95      0.96      0.95     93846

Precision: 0.9533
Recall: 0.9554
F1 Score: 0.9543


# Finetuning covid twitter bert

## with TweetNER7-data

In [2]:
# fine-tuning the covid-twitter-bert model, since most other bert-based models
# have already been fine-tuned on this data with most likely better resources
# https://huggingface.co/datasets/tner/tweetner7#main-models
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name_covid = "digitalepidemiologylab/covid-twitter-bert-v2"
tokenizer_covid = AutoTokenizer.from_pretrained(model_name_covid)

#model_covid = AutoModelForTokenClassification.from_pretrained(model_name_covid)
#ner_pipeline_covid = pipeline("ner", model=model_covid, tokenizer=tokenizer_covid)

KeyboardInterrupt: 

In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(tweetner7_big)

In [None]:
print(dataset["true_labels"][0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-event', 'O', 'B-event', 'O', 'O', 'O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location', 'I-location', 'I-location', 'O']


In [None]:
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
label_list = list(entity_dict.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

def encode_labels(ds):
    ds['new_tags'] = [label_to_id[label] for label in ds['true_labels']]
    return ds

train_dataset = train_dataset.map(encode_labels)
test_dataset = test_dataset.map(encode_labels)

Map:   0%|          | 0/9104 [00:00<?, ? examples/s]

KeyError: 'true_labels'

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer_covid(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["new_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                label_ids.append(-100)
            elif word_id != previous_word_id:  # First token of a word
                label_ids.append(label[word_id])
            else:  # Subword tokens
                label_ids.append(-100)
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/7934 [00:00<?, ? examples/s]

NameError: name 'tokenizer_covid' is not defined

In [None]:
from transformers import AutoModelForTokenClassification

model_covid = AutoModelForTokenClassification.from_pretrained(
    model_name_covid,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import DataCollatorForTokenClassification

# using a data collator for dynamic padding to solve error during tensor creation
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer_covid)

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # flattening predictions and labels
    true_labels = [id_to_label[label] for label in labels.flatten() if label != -100]
    true_predictions = [id_to_label[pred] for (pred, label) in zip(predictions.flatten(), labels.flatten()) if label != -100]

    # calculating metrics
    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

training_args = TrainingArguments(
    output_dir="./ner-covid-twitter-bert",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4, # adjust batch size depending on RAM
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=model_covid,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer_covid,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2903,0.304659,0.912737,0.917854,0.914056
2,0.2128,0.331941,0.909902,0.91888,0.912325
3,0.1797,0.392808,0.909601,0.915961,0.911201
4,0.0859,0.454004,0.908302,0.91341,0.910407


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2903,0.304659,0.912737,0.917854,0.914056
2,0.2128,0.331941,0.909902,0.91888,0.912325
3,0.1797,0.392808,0.909601,0.915961,0.911201
4,0.0859,0.454004,0.908302,0.91341,0.910407
5,0.0749,0.547471,0.90816,0.91402,0.910589


TrainOutput(global_step=9920, training_loss=0.17053519577133439, metrics={'train_runtime': 4614.7466, 'train_samples_per_second': 8.596, 'train_steps_per_second': 2.15, 'total_flos': 9210857561341440.0, 'train_loss': 0.17053519577133439, 'epoch': 5.0})

In [None]:
# saving the fine-tuned model
model_covid.save_pretrained("./models/ner-covid-twitter-bert_after_first_finetuning")
tokenizer_covid.save_pretrained("./models/ner-covid-twitter-bert_after_first_finetuning")

('./models/ner-covid-twitter-bert_after_first_finetuning/tokenizer_config.json',
 './models/ner-covid-twitter-bert_after_first_finetuning/special_tokens_map.json',
 './models/ner-covid-twitter-bert_after_first_finetuning/vocab.txt',
 './models/ner-covid-twitter-bert_after_first_finetuning/added_tokens.json',
 './models/ner-covid-twitter-bert_after_first_finetuning/tokenizer.json')

In [None]:
# loading the saved model and applying it
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer_covid_f = AutoTokenizer.from_pretrained("./models/ner-covid-twitter-bert_after_first_finetuning")
model_covid_f = AutoModelForTokenClassification.from_pretrained("./models/ner-covid-twitter-bert_after_first_finetuning")

In [None]:
from transformers import pipeline

fine_tuned_model_path = "./models/covid-twitter-bert_after_first_finetuning"
pipeline_covid_f = pipeline("ner", model=fine_tuned_model_path, tokenizer=fine_tuned_model_path)

Device set to use cpu


In [None]:
tweetner7_big["pred_covidtwitterbert_f"] = tweetner7_big.apply(lambda x: pipeline_covid_f(str(x["tokens"])), axis=1)

In [None]:
tweetner7_big.head()

Unnamed: 0,tokens,tags,id,date,true_labels,pred_covidtwitterbert_f
0,"[Morning, 5km, run, with, {{USERNAME}}, for, b...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 2, 14...",1183344337016381440,2019-10-13,"[O, O, O, O, O, O, O, O, O, O, B-event, O, B-e...","[{'entity': 'B-event', 'score': 0.96457744, 'i..."
1,"[President, Trump, Arrives, at, UFC, 244, in, ...","[5, 12, 14, 14, 0, 7, 14, 4, 11, 14, 14, 14, 1...",1190961319538765824,2019-11-03,"[B-person, I-person, O, O, B-corporation, I-co...","[{'entity': 'B-person', 'score': 0.9804436, 'i..."
2,"["", I, 've, been, in, law, enforcement, for, 2...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1267032593339486209,2020-05-31,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[{'entity': 'B-group', 'score': 0.565347, 'ind..."
3,"[I, got, mine, yesterday, !, ****, Doctors, sa...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1180717545935925248,2019-10-06,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[{'entity': 'B-corporation', 'score': 0.931802..."
4,"[Mayo, Breast, Cancer, Vaccine, Could, Be, Ava...","[6, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14...",1183251744601587712,2019-10-13,"[B-product, I-product, I-product, I-product, O...","[{'entity': 'B-product', 'score': 0.9158095, '..."


In [None]:
tweetner7_big["pred_labels_covidtwitterbert_f"] = tweetner7_big.apply(output_to_labellist, axis=1)

In [None]:
from nervaluate import Evaluator

true = tweetner7_big["true_labels"].values.tolist()
pred = tweetner7_big["pred_labels_covidtwitterbert_f"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "creative_work", "event", "group", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 1150, 'incorrect': 192, 'partial': 0, 'missed': 30784, 'spurious': 2136, 'possible': 32126, 'actual': 3478, 'precision': 0.33064979873490513, 'recall': 0.03579655108012202, 'f1': 0.06459948320413436}, 'partial': {'correct': 646, 'incorrect': 0, 'partial': 696, 'missed': 30784, 'spurious': 2136, 'possible': 32126, 'actual': 3478, 'precision': 0.28579643473260496, 'recall': 0.030940671107514164, 'f1': 0.055836422873834404}, 'strict': {'correct': 559, 'incorrect': 783, 'partial': 0, 'missed': 30784, 'spurious': 2136, 'possible': 32126, 'actual': 3478, 'precision': 0.16072455434157562, 'recall': 0.017400236568511487, 'f1': 0.03140096618357489}, 'exact': {'correct': 646, 'incorrect': 696, 'partial': 0, 'missed': 30784, 'spurious': 2136, 'possible': 32126, 'actual': 3478, 'precision': 0.18573893041978148, 'recall': 0.020108323476312022, 'f1': 0.03628805752162678}}


In [None]:
print(results_by_tag["product"])

{'ent_type': {'correct': 59, 'incorrect': 25, 'partial': 0, 'missed': 1766, 'spurious': 71, 'possible': 1850, 'actual': 155, 'precision': 0.38064516129032255, 'recall': 0.03189189189189189, 'f1': 0.05885286783042394}, 'partial': {'correct': 43, 'incorrect': 0, 'partial': 41, 'missed': 1766, 'spurious': 71, 'possible': 1850, 'actual': 155, 'precision': 0.4096774193548387, 'recall': 0.03432432432432433, 'f1': 0.06334164588528679}, 'strict': {'correct': 32, 'incorrect': 52, 'partial': 0, 'missed': 1766, 'spurious': 71, 'possible': 1850, 'actual': 155, 'precision': 0.2064516129032258, 'recall': 0.017297297297297298, 'f1': 0.03192019950124689}, 'exact': {'correct': 43, 'incorrect': 41, 'partial': 0, 'missed': 1766, 'spurious': 71, 'possible': 1850, 'actual': 155, 'precision': 0.27741935483870966, 'recall': 0.023243243243243242, 'f1': 0.0428927680798005}}


## with NER-corpus

In [2]:
from transformers import AutoTokenizer

tokenizer_covid = AutoTokenizer.from_pretrained("digitalepidemiologylab/covid-twitter-bert-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/421 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [3]:
from transformers import AutoModelForTokenClassification

entity_dict = {
    0: "B-corporation",
    1: "B-event",
    2: "B-location",
    3: "B-person",
    4: "B-product",
    5: "I-corporation",
    6: "I-event",
    7: "I-location",
    8: "I-person",
    9: "I-product",
    10: "O"
}

label_list = list(entity_dict.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

model_covid = AutoModelForTokenClassification.from_pretrained(
    "digitalepidemiologylab/covid-twitter-bert-v2",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import pandas as pd

df_corpus = pd.read_json("NER_corpus.json", orient="records")
df_corpus.rename(columns={"labels":"original_labels"}, inplace=True)
df_corpus.head()

Unnamed: 0,tokens,original_labels,tags
0,"[., {{MENTION}}, stand, tall, in, the, red, zo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
1,"[Who, else, is, gonna, get, the, new, Surface,...","[O, O, O, O, O, O, O, B-product, I-product, O,...","[10, 10, 10, 10, 10, 10, 10, 4, 9, 10, 10, 10]"
2,"[EDIE, IN, BETWEEN, comes, out, THIS, MONTH, ,...","[B-person, O, O, O, O, O, O, O, O, O, O, O, O,...","[3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
3,"[‘, Joe, Biden, may, have, won, the, keys, to,...","[O, B-person, I-person, O, O, O, O, O, O, O, B...","[10, 3, 8, 10, 10, 10, 10, 10, 10, 10, 2, 7, 1..."
4,"[Imma, hit, reload, until, I, see, additional,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."


In [5]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_corpus)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer_covid(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                label_ids.append(-100)
            elif word_id != previous_word_id:  # First token of a word
                label_ids.append(label[word_id])
            else:  # Subword tokens
                label_ids.append(-100)
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Map:   0%|          | 0/24835 [00:00<?, ? examples/s]

In [6]:
train_test_split1 = tokenized_datasets.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1["train"]
test_dataset = train_test_split1["test"]

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2["train"]
val_dataset = train_test_split2["test"]

In [7]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # flattening predictions and labels
    true_labels = [id_to_label[label] for label in labels.flatten() if label != -100]
    true_predictions = [id_to_label[pred] for (pred, label) in zip(predictions.flatten(), labels.flatten()) if label != -100]

    # calculating metrics
    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

training_args = TrainingArguments(
    output_dir="./ner-covid-twitter-bert",
    evaluation_strategy="epoch", # evaluating at every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust batch size depending on RAM
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=model_covid,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer_covid,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [8]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhausbichler-georg[0m ([33mhausbichler-georg-wirtschaftsuniversit-t-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1458,0.151762,0.953083,0.9547,0.953155
2,0.1383,0.145724,0.955023,0.957195,0.955489


TrainOutput(global_step=1122, training_loss=0.15975551841093258, metrics={'train_runtime': 1170.8309, 'train_samples_per_second': 30.648, 'train_steps_per_second': 0.958, 'total_flos': 8331684657343488.0, 'train_loss': 0.15975551841093258, 'epoch': 2.0})

In [9]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.15194624662399292,
 'eval_precision': 0.9543592730355339,
 'eval_recall': 0.9566065282025598,
 'eval_f1': 0.9549182587954612,
 'eval_runtime': 38.3897,
 'eval_samples_per_second': 97.057,
 'eval_steps_per_second': 3.048,
 'epoch': 2.0}

In [10]:
trainer.save_model("./ner-covid-twitter-bert_after_corpus")

In [11]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

bert_covid_corpus_t = AutoTokenizer.from_pretrained("./ner-covid-twitter-bert_after_corpus")
bert_covid_corpus = AutoModelForTokenClassification.from_pretrained(
    "./ner-covid-twitter-bert_after_corpus",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id)

In [12]:
corpus_test_df = test_dataset.to_pandas()
corpus_test_df.head()

Unnamed: 0,tokens,original_labels,tags,input_ids,token_type_ids,attention_mask,labels
0,"[There, has, been, no, electricity, for, last,...","[O, O, O, O, O, O, O, O, O, O, B-location, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 2, 10...","[101, 2045, 2038, 2042, 2053, 6451, 2005, 2197...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,..."
1,"[2-0, to, West, Ham, at, half-time, ., Think, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[101, 1016, 1011, 1014, 2000, 2225, 10654, 201...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, -100, -100, 10, 10, 10, 10, 10, -10..."
2,"[I, do, n't, know, why, this, {{MENTION}}, tea...","[O, O, O, O, O, O, B-corporation, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 0, 10, 10, 10, 10, 10...","[101, 1045, 2079, 1050, 1005, 1056, 2113, 2339...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, -100, -100, 10, 10, 10, 0, ..."
3,"[I, guess, it, follows, that, President, Trump...","[O, O, O, O, O, B-person, I-person, O, O, O, O...","[10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10, 10,...","[101, 1045, 3984, 2009, 4076, 2008, 2343, 8398...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 3, 8, 10, 10, -100,..."
4,"[Suddenly, getting, Classic, WoW, Naxx, tips, ...","[O, O, O, O, B-product, O, O, B-person, O, O, ...","[10, 10, 10, 10, 4, 10, 10, 3, 10, 10, 10, 10,...","[101, 3402, 2893, 4438, 10166, 6583, 20348, 10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 4, -100, 10, 10, 3, -10..."


In [16]:
import numpy as np
import torch

param_device = next(bert_covid_corpus.parameters()).device

# prediction function with max_length and error exceptions to handle cuda runtime error but predicting on GPU
def bert_pred(model, tokenizer, df, text_col, output_col, max_length=128, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    #device = model.device
    print(f"Using device: {device}")

    # printing model config info
    if hasattr(model, 'config'):
        print(f"Model has {model.config.num_labels} labels")

    model.eval()
    all_predictions = []

    for i, row in df.iterrows():
        try:
            text = row[text_col]

            if isinstance(text, np.ndarray):
                if text.dtype.kind == 'U' or text.dtype.kind == 'S':
                    text_list = text.tolist()
                else:
                    text_list = [str(x) for x in text]

                if not text_list:
                    all_predictions.append([])
                    continue

                encoded = tokenizer(
                    text_list,
                    is_split_into_words=True,
                    return_tensors="pt",
                    max_length=max_length,
                    truncation=True,
                    padding="max_length"
                )
            elif isinstance(text, str):
                if not text.strip():
                    all_predictions.append([])
                    continue

                encoded = tokenizer(
                    text,
                    return_tensors="pt",
                    max_length=max_length,
                    truncation=True,
                    padding="max_length"
                )
            elif isinstance(text, list):
                if not text:
                    all_predictions.append([])
                    continue

                encoded = tokenizer(
                    text,
                    is_split_into_words=True,
                    return_tensors="pt",
                    max_length=max_length,
                    truncation=True,
                    padding="max_length"
                )
            else:
                encoded = tokenizer(
                    str(text),
                    return_tensors="pt",
                    max_length=max_length,
                    truncation=True,
                    padding="max_length"
                )

            encoded = {k: v.to(device) for k, v in encoded.items()}

            if i == 0:
                print(f"Input shape: {encoded['input_ids'].shape}")

            with torch.no_grad():
                outputs = model(**encoded)
                preds = torch.argmax(outputs.logits, dim=-1)[0].cpu().tolist()

                if isinstance(text, (list, np.ndarray)):
                    if isinstance(text, np.ndarray):
                        word_ids = tokenizer(
                            text.tolist(),
                            is_split_into_words=True,
                            max_length=max_length,
                            truncation=True,
                            padding="max_length"
                        ).word_ids()
                    else:
                        word_ids = tokenizer(
                            text,
                            is_split_into_words=True,
                            max_length=max_length,
                            truncation=True,
                            padding="max_length"
                        ).word_ids()

                    processed_preds = []
                    prev_word_id = None

                    for word_id, pred in zip(word_ids, preds):
                        if word_id is None:
                            continue
                        elif word_id != prev_word_id:
                            processed_preds.append(pred)
                        prev_word_id = word_id

                    all_predictions.append(processed_preds)
                else:
                    all_predictions.append(preds)

        except Exception as e:
            print(f"Error in row {i}: {e}")
            # adding empty predictions for failed rows
            all_predictions.append([])
            continue

    df[output_col] = all_predictions
    return df

In [17]:
bert_pred(bert_covid_corpus, bert_covid_corpus_t, corpus_test_df, "tokens", "covid_bert_pred")

Using device: cuda
Model has 11 labels
Input shape: torch.Size([1, 128])


Unnamed: 0,tokens,original_labels,tags,input_ids,token_type_ids,attention_mask,labels,covid_bert_pred
0,"[There, has, been, no, electricity, for, last,...","[O, O, O, O, O, O, O, O, O, O, B-location, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 2, 10...","[101, 2045, 2038, 2042, 2053, 6451, 2005, 2197...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 2, 7,..."
1,"[2-0, to, West, Ham, at, half-time, ., Think, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[101, 1016, 1011, 1014, 2000, 2225, 10654, 201...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, -100, -100, 10, 10, 10, 10, 10, -10...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
2,"[I, do, n't, know, why, this, {{MENTION}}, tea...","[O, O, O, O, O, O, B-corporation, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 0, 10, 10, 10, 10, 10...","[101, 1045, 2079, 1050, 1005, 1056, 2113, 2339...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, -100, -100, 10, 10, 10, 0, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
3,"[I, guess, it, follows, that, President, Trump...","[O, O, O, O, O, B-person, I-person, O, O, O, O...","[10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10, 10,...","[101, 1045, 3984, 2009, 4076, 2008, 2343, 8398...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 3, 8, 10, 10, -100,...","[10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10, 10,..."
4,"[Suddenly, getting, Classic, WoW, Naxx, tips, ...","[O, O, O, O, B-product, O, O, B-person, O, O, ...","[10, 10, 10, 10, 4, 10, 10, 3, 10, 10, 10, 10,...","[101, 3402, 2893, 4438, 10166, 6583, 20348, 10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 4, -100, 10, 10, 3, -10...","[10, 10, 10, 10, 10, 10, 10, 3, 10, 10, 10, 10..."
...,...,...,...,...,...,...,...,...
3721,"[Rumors, say, IBM, is, releasing, AirTag, soon...","[O, O, B-corporation, O, O, B-product, O, O]","[10, 10, 0, 10, 10, 4, 10, 10]","[101, 11256, 2360, 9980, 2003, 8287, 2250, 159...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[-100, 10, 10, 0, 10, 10, 4, -100, 10, 10, -10...","[10, 10, 0, 10, 10, 4, 10, 10]"
3722,"[New, Music, +Video, :, {{MENTION}}, -, John, ...","[O, O, O, O, O, O, B-person, I-person, O, O, O]","[10, 10, 10, 10, 10, 10, 3, 8, 10, 10, 10]","[101, 2047, 2189, 1009, 2678, 1024, 1063, 1063...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, -100, 10, 10, -100, -100, -...","[10, 10, 10, 10, 10, 10, 3, 8, 10, 10, 10]"
3723,"[The, American, people, are, taking, this, pan...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[101, 1996, 2137, 2111, 2024, 2635, 2023, 6090...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 10, 10, -100, -100,...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
3724,"[Happy, Mother, ’s, Day, !, Truly, the, most, ...","[B-event, I-event, I-event, I-event, O, O, O, ...","[1, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[101, 3407, 2388, 1521, 1055, 2154, 999, 5621,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, 6, 6, -100, 6, 10, 10, 10, 10, 10, 1...","[10, 1, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, ..."


In [18]:
corpus_test_df["covid_bert_pred"].explode().value_counts()

Unnamed: 0_level_0,count
covid_bert_pred,Unnamed: 1_level_1
10,84223
3,2571
4,1275
2,1072
8,1063
0,947
9,869
1,737
6,661
7,310


In [19]:
from nervaluate import Evaluator

corpus_test_df["pred_labels"] = corpus_test_df["covid_bert_pred"].apply(lambda x: [id_to_label[i] for i in x])

true = corpus_test_df["original_labels"].values.tolist()
pred = corpus_test_df["pred_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 5372, 'incorrect': 516, 'partial': 0, 'missed': 1150, 'spurious': 911, 'possible': 7038, 'actual': 6799, 'precision': 0.7901161935578762, 'recall': 0.7632850241545893, 'f1': 0.7764688877646889}, 'partial': {'correct': 5535, 'incorrect': 0, 'partial': 353, 'missed': 1150, 'spurious': 911, 'possible': 7038, 'actual': 6799, 'precision': 0.8400500073540227, 'recall': 0.8115231599886331, 'f1': 0.8255402182554022}, 'strict': {'correct': 5089, 'incorrect': 799, 'partial': 0, 'missed': 1150, 'spurious': 911, 'possible': 7038, 'actual': 6799, 'precision': 0.74849242535667, 'recall': 0.7230747371412333, 'f1': 0.7355640673556405}, 'exact': {'correct': 5535, 'incorrect': 353, 'partial': 0, 'missed': 1150, 'spurious': 911, 'possible': 7038, 'actual': 6799, 'precision': 0.8140903073981468, 'recall': 0.7864450127877238, 'f1': 0.8000289080002891}}


In [20]:
print(results_by_tag)

{'corporation': {'ent_type': {'correct': 724, 'incorrect': 242, 'partial': 0, 'missed': 185, 'spurious': 118, 'possible': 1151, 'actual': 1084, 'precision': 0.6678966789667896, 'recall': 0.629018245004344, 'f1': 0.6478747203579418}, 'partial': {'correct': 918, 'incorrect': 0, 'partial': 48, 'missed': 185, 'spurious': 118, 'possible': 1151, 'actual': 1084, 'precision': 0.8690036900369004, 'recall': 0.8184187662901824, 'f1': 0.8429530201342281}, 'strict': {'correct': 701, 'incorrect': 265, 'partial': 0, 'missed': 185, 'spurious': 118, 'possible': 1151, 'actual': 1084, 'precision': 0.6466789667896679, 'recall': 0.6090356211989574, 'f1': 0.6272930648769575}, 'exact': {'correct': 918, 'incorrect': 48, 'partial': 0, 'missed': 185, 'spurious': 118, 'possible': 1151, 'actual': 1084, 'precision': 0.8468634686346863, 'recall': 0.7975673327541268, 'f1': 0.821476510067114}}, 'event': {'ent_type': {'correct': 599, 'incorrect': 44, 'partial': 0, 'missed': 327, 'spurious': 159, 'possible': 970, 'actu

In [23]:
metrics = evaluate_ner_predictions(corpus_test_df, "tags", "covid_bert_pred")

              precision    recall  f1-score   support

           0       0.76      0.62      0.68      1151
           1       0.74      0.56      0.64       970
           2       0.82      0.81      0.81      1089
           3       0.79      0.84      0.81      2426
           4       0.83      0.76      0.79      1402
           5       0.59      0.28      0.38       232
           6       0.77      0.58      0.66       871
           7       0.71      0.62      0.66       359
           8       0.88      0.86      0.87      1082
           9       0.83      0.75      0.79       961
          10       0.97      0.99      0.98     83296

    accuracy                           0.96     93839
   macro avg       0.79      0.70      0.74     93839
weighted avg       0.95      0.96      0.95     93839

Precision: 0.9544
Recall: 0.9566
F1 Score: 0.9555


### applying model to case study data

In [24]:
import pandas as pd

df_dell = pd.read_json("dell_cs_processed.json", orient="records")
df_dell.head()

Unnamed: 0,Datetime,text,tokens
0,2022-09-30 23:29:15+00:00,Logitech Apple Google Microsoft Dell Lenovo #W...,"[Logitech, Apple, Google, Microsoft, Dell, Len..."
1,2022-09-30 21:46:35+00:00,{{MENTION}} {{MENTION}} {{MENTION}} {{MENTION}...,"[{{MENTION}}, {{MENTION}}, {{MENTION}}, {{MENT..."
2,2022-09-30 21:18:02+00:00,As {{MENTION}} celebrates its 40th anniversary...,"[As, {{MENTION}}, celebrates, its, 40th, anniv..."
3,2022-09-30 20:05:24+00:00,Dell your customer service is horrible especia...,"[Dell, your, customer, service, is, horrible, ..."
4,2022-09-30 20:03:17+00:00,{{MENTION}} Dell Dellcares Dell give the man w...,"[{{MENTION}}, Dell, Dellcares, Dell, give, the..."


In [25]:
bert_pred(bert_covid_corpus, bert_covid_corpus_t, df_dell, "tokens", "covid_bert_pred")

Using device: cuda
Model has 11 labels
Input shape: torch.Size([1, 128])


Unnamed: 0,Datetime,text,tokens,covid_bert_pred
0,2022-09-30 23:29:15+00:00,Logitech Apple Google Microsoft Dell Lenovo #W...,"[Logitech, Apple, Google, Microsoft, Dell, Len...","[0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10,..."
1,2022-09-30 21:46:35+00:00,{{MENTION}} {{MENTION}} {{MENTION}} {{MENTION}...,"[{{MENTION}}, {{MENTION}}, {{MENTION}}, {{MENT...","[10, 10, 10, 10, 10, 10, 10, 10, 4, 10, 10, 10..."
2,2022-09-30 21:18:02+00:00,As {{MENTION}} celebrates its 40th anniversary...,"[As, {{MENTION}}, celebrates, its, 40th, anniv...","[10, 0, 10, 10, 10, 10, 3, 8, 10, 3, 10, 10, 1..."
3,2022-09-30 20:05:24+00:00,Dell your customer service is horrible especia...,"[Dell, your, customer, service, is, horrible, ...","[0, 10, 10, 10, 10, 10, 10, 10, 3, 10, 10, 10,..."
4,2022-09-30 20:03:17+00:00,{{MENTION}} Dell Dellcares Dell give the man w...,"[{{MENTION}}, Dell, Dellcares, Dell, give, the...","[10, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10]"
...,...,...,...,...
24965,2022-01-01 02:02:04+00:00,{{MENTION}} {{MENTION}} Dell I wouldn't even k...,"[{{MENTION}}, {{MENTION}}, Dell, I, wouldn't, ...","[10, 10, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
24966,2022-01-01 01:57:34+00:00,{{MENTION}} {{MENTION}} Dell I didn't really l...,"[{{MENTION}}, {{MENTION}}, Dell, I, didn't, re...","[10, 10, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
24967,2022-01-01 01:36:36+00:00,Hey {{MENTION}} here it is....27 4K UHD USB-C ...,"[Hey, {{MENTION}}, here, it, is....27, 4K, UHD...","[10, 10, 10, 10, 10, 9, 9, 9, 10, 0, 10]"
24968,2022-01-01 01:31:30+00:00,{{MENTION}} {{MENTION}} Alienware Intel {{MENT...,"[{{MENTION}}, {{MENTION}}, Alienware, Intel, {...","[10, 10, 0, 5, 10, 10, 10, 0, 5, 10, 10, 10, 1..."


In [26]:
df_dell.to_json("dell_cs_after_ner.json", orient="records")

# fine-tuning BERT-base with tweetner7-data

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

#using cased BERT for NER because it may be advantageous (e.g. for persons, companies)
# google-bert/bert-base-cased
bert_base_cased_t = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

bert_base_cased = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# steps between executed from loading and applying and syn data sections (until train- and test-dataset)
def tokenize_and_align_labels(examples):
    tokenized_inputs = bert_base_cased_t(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["tags"]): # or "new_tags"
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                label_ids.append(-100)
            elif word_id != previous_word_id:  # First token of a word
                label_ids.append(label[word_id])
            else:  # Subword tokens
                label_ids.append(-100)
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # flattening predictions and labels
    true_labels = [id_to_label[label] for label in labels.flatten() if label != -100]
    true_predictions = [id_to_label[pred] for (pred, label) in zip(predictions.flatten(), labels.flatten()) if label != -100]

    # calculating metrics
    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

In [None]:
training_args = TrainingArguments(
    output_dir="./ner-bert-base-cased",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=64, # adjust batch size depending on RAM
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2
)

trainer = Trainer(
    model=bert_base_cased,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=bert_base_cased_t,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhausbichler-georg[0m ([33mhausbichler-georg-wirtschaftsuniversit-t-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.316216,0.901891,0.912124,0.904049
2,No log,0.301958,0.908153,0.914398,0.910465
3,No log,0.306279,0.908833,0.915082,0.911317


TrainOutput(global_step=372, training_loss=0.33267023230111725, metrics={'train_runtime': 541.2529, 'train_samples_per_second': 43.976, 'train_steps_per_second': 0.687, 'total_flos': 1555029095477760.0, 'train_loss': 0.33267023230111725, 'epoch': 3.0})

In [None]:
bert_base_cased.save_pretrained("bert-base-cased-ner-tweetner7")
bert_base_cased_t.save_pretrained("bert-base-cased-ner-tweetner7")

('bert-base-cased-ner-tweetner7/tokenizer_config.json',
 'bert-base-cased-ner-tweetner7/special_tokens_map.json',
 'bert-base-cased-ner-tweetner7/vocab.txt',
 'bert-base-cased-ner-tweetner7/added_tokens.json',
 'bert-base-cased-ner-tweetner7/tokenizer.json')

In [None]:
bert_c_ft_t = AutoTokenizer.from_pretrained("bert-base-cased-ner-tweetner7")

bert_c_ft = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased-ner-tweetner7",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
bert_pred(bert_c_ft, bert_c_ft_t, tweetner7_test20, "tokens", "bert_pred")

Unnamed: 0,tokens,tags,id,date,bert_pred
0,"[Hanging, out, with, the, #, Popinboyz, and, i...","[14, 14, 14, 14, 14, 3, 14, 14, 14, 14, 14, 14...",1224056835298250758,2020-02-02,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
1,"["", It, 's, not, a, question, of, knowing, rig...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1273251723398623232,2020-06-17,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
2,"[{{URL}}, presents, {{USERNAME}}, at, the, ven...","[14, 14, 14, 14, 14, 14, 14, 2, 9, 9, 9, 9, 14...",1224029146206167044,2020-02-02,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
3,"[., {@Air Force Football@}, posted, its, first...","[14, 3, 14, 14, 14, 14, 14, 14, 14, 14, 14, 5,...",1201017635313291266,2019-12-01,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
4,"[Lifeprint, 2x3, Instant, Printer, for, iPhone...","[6, 13, 13, 13, 14, 6, 14, 14, 14, 6, 14, 14, ...",1246739207046995969,2020-04-05,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
...,...,...,...,...,...
571,"[I, have, a, strong, feeling, {@Ebuka Obi-Uche...","[14, 14, 14, 14, 14, 5, 14, 14, 14, 14, 14, 14...",1292521012567977985,2020-08-09,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
572,"[., {@The Rewatchables@}, do, a, Rewatchables,...","[14, 1, 14, 14, 14, 14, 5, 12, 14, 14, 14]",1249209193736871936,2020-04-12,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 5]"
573,"[HI, For, SF9, Comeback, if, you, want, to, he...","[14, 14, 3, 14, 14, 14, 14, 14, 14, 14, 14, 14...",1277693251982635008,2020-06-29,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
574,"[#, OICSilent, _, On, _, Kashmir, {{USERNAME}}...","[14, 14, 14, 14, 14, 4, 14, 14, 14, 14, 14, 14...",1297469989205745665,2020-08-23,"[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."


In [None]:
metrics = evaluate_ner_predictions(tweetner7_test20, "tags", "bert_pred")
print(metrics)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       191
           1       0.00      0.00      0.00       179
           2       0.00      0.00      0.00       265
           3       0.00      0.00      0.00       311
           4       0.00      0.00      0.00       165
           5       0.04      0.01      0.01       596
           6       0.00      0.00      0.00       220
           7       0.00      0.00      0.00        63
           8       0.00      0.00      0.00       223
           9       0.00      0.00      0.00       262
          10       0.00      0.00      0.00       107
          11       0.00      0.00      0.00        62
          12       0.02      0.00      0.01       266
          13       0.00      0.00      0.00        84
          14       0.83      0.98      0.90     14739

    accuracy                           0.82     17733
   macro avg       0.06      0.07      0.06     17733
weighted avg       0.69   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 2nd fine-tuning with tweetner7-bigger

In [None]:
tweetner7_validation20 = ds["validation_2020"].to_pandas()
tweetner7_validation21 = ds["validation_2021"].to_pandas()

# leaving "test_2020"-part out of training data to test
tweetner7_test20 = ds["test_2020"].to_pandas()

tweetner7_bigger = pd.concat([tweetner7_big, tweetner7_validation20, tweetner7_validation21, tweetner7_test20])
tweetner7_bigger.reset_index(drop=True, inplace=True)
tweetner7_bigger.dropna(inplace=True)

#tweetner7_test20.reset_index(drop=True, inplace=True)
#tweetner7_test20.dropna(inplace=True)

len(tweetner7_bigger)

11380

In [None]:
from datasets import Dataset

#train_dataset_bigger = Dataset.from_pandas(tweetner7_bigger)
#test_dataset_bigger = Dataset.from_pandas(tweetner7_test20)

dataset = Dataset.from_pandas(tweetner7_bigger)

# setting seed so that validation set is deterministic
train_test_split = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset_bigger = train_test_split['train']
test_dataset_bigger = train_test_split['test']

In [None]:
train_dataset_bigger = train_dataset_bigger.map(tokenize_and_align_labels, batched=True)
test_dataset_bigger = test_dataset_bigger.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/10811 [00:00<?, ? examples/s]

Map:   0%|          | 0/569 [00:00<?, ? examples/s]

In [None]:
training_args = TrainingArguments(
    output_dir="./ner-bert-base-cased",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=1e-5,
    per_device_train_batch_size=64, # adjust batch size depending on RAM
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2
)

trainer = Trainer(
    model=bert_base_cased,
    args=training_args,
    train_dataset=train_dataset_bigger,
    eval_dataset=test_dataset_bigger,
    tokenizer=bert_base_cased_t,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhausbichler-georg[0m ([33mhausbichler-georg-wirtschaftsuniversit-t-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.410115,0.865609,0.892231,0.869347
2,No log,0.352286,0.886682,0.903552,0.892503
3,0.457100,0.342545,0.892437,0.906104,0.896841


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=507, training_loss=0.4554884287967719, metrics={'train_runtime': 337.8294, 'train_samples_per_second': 96.004, 'train_steps_per_second': 1.501, 'total_flos': 2118908438519040.0, 'train_loss': 0.4554884287967719, 'epoch': 3.0})

In [None]:
bert_base_cased.save_pretrained("bert-base-cased-ner-tweetner7-bigger")
bert_base_cased_t.save_pretrained("bert-base-cased-ner-tweetner7-bigger")

('bert-base-cased-ner-tweetner7-bigger/tokenizer_config.json',
 'bert-base-cased-ner-tweetner7-bigger/special_tokens_map.json',
 'bert-base-cased-ner-tweetner7-bigger/vocab.txt',
 'bert-base-cased-ner-tweetner7-bigger/added_tokens.json',
 'bert-base-cased-ner-tweetner7-bigger/tokenizer.json')

In [None]:
bert_c_ft_bigger_t = AutoTokenizer.from_pretrained("bert-base-cased-ner-tweetner7-bigger")

bert_c_ft_bigger = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased-ner-tweetner7-bigger",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
test_dataset_bigger_df = test_dataset_bigger.to_pandas()

In [None]:
bert_pred(bert_c_ft_bigger, bert_c_ft_bigger_t, test_dataset_bigger_df, "tokens", "bert_pred")

Unnamed: 0,tokens,tags,id,date,input_ids,token_type_ids,attention_mask,labels,bert_pred
0,"[#, NewFlorida, #, Duval, #, Jacksonville, Sed...","[14, 4, 14, 4, 14, 3, 10, 10, 10, 10, 10, 10, ...",1234145123027619846,2020-03-01,"[101, 108, 1203, 2271, 10885, 6859, 108, 27281...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 14, 4, -100, -100, -100, 14, 4, 14, 3, ...","[14, 14, 14, 5, 14, 5, 12, 12, 14, 5, 14, 14, ..."
1,"[“, Notorious, ”, with, {@Conor McGregor@}, “,...","[14, 1, 14, 14, 5, 14, 1, 8, 14, 14, 14, 14, 1...",1218740537437626368,2020-01-19,"[101, 789, 1753, 19402, 790, 1114, 196, 137, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 14, 1, -100, 14, 14, 5, -100, -100, -10...","[14, 14, 14, 14, 5, 14, 1, 8, 14, 14, 14, 14, ..."
2,"[Juwan, Howard, ,, who, protested, the, NCAA, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1373419447705276424,2021-03-20,"[101, 23915, 5491, 4115, 117, 1150, 11472, 110...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 14, -100, 14, 14, 14, 14, 14, 14, 14, -...","[5, 12, 14, 14, 14, 14, 3, 14, 14, 14, 14, 3, ..."
3,"[I, need, a, red, concept, hanbin, {@BELIFT LA...","[14, 14, 14, 14, 14, 14, 3, 14, 14, 14, 14, 14...",1323199940089114624,2020-11-02,"[101, 146, 1444, 170, 1894, 3400, 5871, 1179, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 14, 14, 14, 14, 14, 14, -100, -100, 3, ...","[14, 14, 14, 14, 14, 14, 6, 14, 14, 14, 14, 14..."
4,"[I, wanna, go, to, {@gabi demartino@}, ‘, s, f...","[14, 14, 14, 14, 5, 14, 14, 2, 9, 14, 14, 14, ...",1193556099493961730,2019-11-10,"[101, 146, 16445, 1301, 1106, 196, 137, 176, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 14, 14, 14, 14, 5, -100, -100, -100, -1...","[14, 14, 14, 14, 5, 14, 14, 14, 14, 14, 14, 14..."
...,...,...,...,...,...,...,...,...,...
564,"[Kimberly, Potter, ,, the, Cop, Who, Killed, D...","[5, 12, 14, 3, 10, 14, 14, 5, 12, 14, 14, 14, ...",1383751427688861699,2021-04-18,"[101, 26564, 11434, 117, 1103, 3291, 1643, 262...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 5, 12, 14, 3, 10, -100, 14, 14, -100, 5...","[5, 12, 14, 14, 14, 14, 14, 5, 12, 14, 14, 14,..."
565,"[Great, weekend, .., ., Friday, :, {@Castlefor...","[14, 14, 14, 14, 14, 14, 3, 14, 14, 4, 11, 14,...",1426966663824613381,2021-08-15,"[101, 2038, 5138, 119, 119, 119, 5286, 131, 19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 14, 14, 14, -100, 14, 14, 14, 3, -100, ...","[14, 14, 14, 14, 14, 14, 3, 14, 14, 3, 10, 10,..."
566,"[Why, are, people, this, way, .., ., even, if,...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1318099262677319684,2020-10-19,"[101, 2009, 1132, 1234, 1142, 1236, 119, 119, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 14, 14, 14, 14, 14, 14, -100, 14, 14, 1...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
567,"[President, Trump, To, Release, New, List, Of,...","[5, 12, 14, 14, 14, 14, 14, 14, 3, 10, 10, 14,...",1273942175278039041,2020-06-19,"[101, 1697, 8499, 1706, 17443, 1203, 5619, 209...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 5, 12, 14, 14, 14, 14, 14, 14, -100, -1...","[5, 12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."


In [None]:
metrics = evaluate_ner_predictions(test_dataset_bigger_df, "tags", "bert_pred")
print(metrics)

              precision    recall  f1-score   support

           0       0.37      0.32      0.34       185
           1       0.46      0.36      0.40       168
           2       0.59      0.33      0.42       252
           3       0.63      0.45      0.53       325
           4       0.71      0.69      0.70       192
           5       0.77      0.82      0.80       513
           6       0.68      0.48      0.56       209
           7       0.00      0.00      0.00        67
           8       0.50      0.52      0.51       234
           9       0.55      0.50      0.52       226
          10       0.45      0.20      0.27       146
          11       0.63      0.53      0.57        91
          12       0.78      0.85      0.81       201
          13       0.59      0.15      0.23        89
          14       0.95      0.98      0.96     15132

    accuracy                           0.91     18030
   macro avg       0.58      0.48      0.51     18030
weighted avg       0.89   

## applying model on unlabeled data and analyzing results

In [None]:
# predicting on already processed Tweets-Bigtech-data
df_bigtech = pd.read_csv("tweets_bigtech_10ksample.csv")
df_bigtech.head()

Unnamed: 0.1,Unnamed: 0,created_at,file_name,followers,friends,group_name,location,retweet_count,screenname,search_query,text,twitter_id,username,polarity,partition_0,partition_1,sentiment
0,0,2020-07-12 09:24:26,AMD,25,114,AMD,United Kingdom,0.0,moffphcgaming,#AMD,Been on holiday so back now. Gonna try get som...,1.282244e+18,🕹MoffPHC Gaming🕹,-0.3102,Technology,AMD,0
1,1,2020-07-12 08:16:45,AMD,1719,1,AMD,digitalocean,1.0,LinuxDreams,#AMD,RT @LinuxReviews: #Linux architect Linus Torva...,1.282227e+18,LinuxDreams,-0.3612,Technology,AMD,0
2,2,2020-07-12 08:11:41,AMD,69,135,AMD,Amsterdam,1.0,LinuxReviews,#AMD,"#Linux architect Linus Torvalds: AVX512 Is ""A ...",1.282226e+18,LinuxReviews,-0.3612,Technology,AMD,0
3,3,2020-07-12 02:22:50,AMD,34,155,AMD,San Francisco,0.0,NdrewGarcia,#AMD,#AMD stuck in a range box chart https://t.co/5...,1.282138e+18,Encino_Man,-0.25,Technology,AMD,0
4,4,2020-07-11 23:58:44,AMD,802,730,AMD,"New Jersey, USA",0.0,Roger_Clinton1,#AMD,$AMD Epyc Milan Leak – Three early Genesis sam...,1.282102e+18,Roger Ocasio-Clinton,-0.34,Technology,AMD,0


In [None]:
df_bigtech_sm = df_bigtech.sample(n=1000, random_state=42)
df_bigtech_sm.reset_index(drop=True, inplace=True)

In [None]:
df_bigtech_sm["tokens"] = df_bigtech_sm.apply(lambda x: x["text"].split(), axis=1)

In [None]:
bert_pred(bert_c_ft_bigger, bert_c_ft_bigger_t, df_bigtech_sm, "tokens", "bert_pred")

Unnamed: 0.1,Unnamed: 0,created_at,file_name,followers,friends,group_name,location,retweet_count,screenname,search_query,text,twitter_id,username,polarity,partition_0,partition_1,sentiment,tokens,bert_pred
0,952,2020-07-20 19:26:40,Microsoft,253,59,Microsoft,"Dallas, TX",0.0,Sc0ttMcC,#Microsoft,My theory on why there has been no #Xbox revea...,1.285295e+18,Scott McCollum,-0.5122,Technology,Microsoft,0,"[My, theory, on, why, there, has, been, no, #X...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
1,6599,2020-07-28 09:50:24,AMD,204,516,AMD,"England, United Kingdom",0.0,harikanesh,#AMD,Mixed feelings about #Durysta. Is this really ...,1.288049e+18,Hari,0.7814,Technology,AMD,2,"[Mixed, feelings, about, #Durysta., Is, this, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
2,4220,2020-07-11 18:54:19,Apple,14,158,Apple,ประเทศไทย,9408.0,CyrusDamnN,#Apple OR #Iphone,RT @GFRDofficial: [🎞 #G_TODAY] #여자친구 #GFRIEN...,1.282025e+18,จยย 🍎🍎,0.0000,Technology,Apple,1,"[RT, @GFRDofficial:, [🎞, #G_TODAY], #여자친구, #GF...","[14, 14, 14, 14, 14, 14, 14, 8, 8, 8, 14, 14, ..."
3,2554,2020-09-03 22:52:32,Apple,40508,35032,Apple,Planet Earth,6.0,UserExperienceU,#Apple OR #Iphone,@kymalainenkimmo @Huawei #Apple say it’s the s...,1.301654e+18,Karl A Smith,-0.2755,Technology,Apple,0,"[@kymalainenkimmo, @Huawei, #Apple, say, it’s,...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
4,4519,2020-07-14 20:43:38,Twitch,61,169,Twitch,Portland Or,0.0,humanx2live,#Twitch,OP Raum! In Paladins with Nanaboomsickle of Hu...,1.283140e+18,HUMANx2,0.0000,Technology,Twitch,1,"[OP, Raum!, In, Paladins, with, Nanaboomsickle...","[14, 14, 14, 1, 14, 5, 14, 0, 14, 14, 14, 14, 14]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3248,2020-07-23 17:45:27,Apple,13793,14002,Apple,michigan,1.0,pezoutlaw,#Apple OR #Iphone,"RT @pezoutlaw: Pez Outlaw, ""Notes From The Asy...",1.286357e+18,"Pez Outlaw, sj glew",-0.2732,Technology,Apple,0,"[RT, @pezoutlaw:, Pez, Outlaw,, ""Notes, From, ...","[14, 14, 1, 8, 14, 8, 8, 8, 14, 14, 14, 14, 1,..."
996,2698,2020-09-01 05:01:12,Apple,5870,161,Apple,"Atlanta, GA",3.0,NcsVentures,#Apple OR #Iphone,#mobilesecurity | #android | #iphone | POCO X3...,1.300660e+18,National Cyber Security,-0.6633,Technology,Apple,0,"[#mobilesecurity, |, #android, |, #iphone, |, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
997,7955,2020-07-15 05:11:52,Twitch,1299,798,Twitch,Yo Mommas House,9.0,Ex_Factor618,#Twitch,"RT @IdcWhyShudU: 🚨Stream Alert 🚨 🗓July 15, 2...",1.283268e+18,Static Shock,0.6124,Technology,Twitch,2,"[RT, @IdcWhyShudU:, 🚨Stream, Alert, 🚨, 🗓July, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
998,647,2020-07-08 22:31:14,Netflix,560,540,Netflix,"Newcastle Upon Tyne, England",0.0,itsalexfisher,#Netflix,I can’t recommend #stateless enough. A must wa...,1.280993e+18,alex fisher,-0.9186,Technology,Netflix,0,"[I, can’t, recommend, #stateless, enough., A, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."


In [None]:
df_bigtech_sm["pred_labels"] = df_bigtech_sm["bert_pred"].apply(tags_to_labels)

In [None]:
df_bigtech_sm.head()

Unnamed: 0.1,Unnamed: 0,created_at,file_name,followers,friends,group_name,location,retweet_count,screenname,search_query,text,twitter_id,username,polarity,partition_0,partition_1,sentiment,tokens,bert_pred,pred_labels
0,952,2020-07-20 19:26:40,Microsoft,253,59,Microsoft,"Dallas, TX",0.0,Sc0ttMcC,#Microsoft,My theory on why there has been no #Xbox revea...,1.285295e+18,Scott McCollum,-0.5122,Technology,Microsoft,0,"[My, theory, on, why, there, has, been, no, #X...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, I-creativ..."
1,6599,2020-07-28 09:50:24,AMD,204,516,AMD,"England, United Kingdom",0.0,harikanesh,#AMD,Mixed feelings about #Durysta. Is this really ...,1.288049e+18,Hari,0.7814,Technology,AMD,2,"[Mixed, feelings, about, #Durysta., Is, this, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,4220,2020-07-11 18:54:19,Apple,14,158,Apple,ประเทศไทย,9408.0,CyrusDamnN,#Apple OR #Iphone,RT @GFRDofficial: [🎞 #G_TODAY] #여자친구 #GFRIEN...,1.282025e+18,จยย 🍎🍎,0.0,Technology,Apple,1,"[RT, @GFRDofficial:, [🎞, #G_TODAY], #여자친구, #GF...","[14, 14, 14, 14, 14, 14, 14, 8, 8, 8, 14, 14, ...","[O, O, O, O, O, O, O, I-creative_work, I-creat..."
3,2554,2020-09-03 22:52:32,Apple,40508,35032,Apple,Planet Earth,6.0,UserExperienceU,#Apple OR #Iphone,@kymalainenkimmo @Huawei #Apple say it’s the s...,1.301654e+18,Karl A Smith,-0.2755,Technology,Apple,0,"[@kymalainenkimmo, @Huawei, #Apple, say, it’s,...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,4519,2020-07-14 20:43:38,Twitch,61,169,Twitch,Portland Or,0.0,humanx2live,#Twitch,OP Raum! In Paladins with Nanaboomsickle of Hu...,1.28314e+18,HUMANx2,0.0,Technology,Twitch,1,"[OP, Raum!, In, Paladins, with, Nanaboomsickle...","[14, 14, 14, 1, 14, 5, 14, 0, 14, 14, 14, 14, 14]","[O, O, O, B-creative_work, O, B-person, O, B-c..."


In [None]:
for row in df_bigtech_sm[900:].iterrows():
    print(row[1]["text"])
    print(row[1]["pred_labels"])

RT @FullyChargedShw: The Polestar 2 - here's the Fully Charged review  https://t.co/aBnS4rC6SD  @bobbyllew @PolestarCars   #Polestar2 #Pole…
['O', 'O', 'B-creative_work', 'I-creative_work', 'I-creative_work', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
RT @CarlKoinberg: I never heard about "lemon laundering" before.  It is apparently a way for companies to trick their customers into buying…
['B-person', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
RT @OC3D: AMD Zen 3 EPYC "Genesis" A0 sample specs leak - Retail Zen 3 CPUs are due to release this year.  #Zen3 #AMD  https://t.co/4QJn174…
['O', 'O', 'B-product', 'B-product', 'I-product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-product', 'I-product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Get Godfall™ when you purchase a SAPPHIRE PULSE RX 5500 XT GPU from participating retailers! .  . #Godfall #gamebundle #AMD #Radeon #RX5500XT #RX5500series #Navi #SAPPHIR

# fine-tuning BERT-base with NER-corpus

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# reduced entity dict fitting the corpus
entity_dict_sm = {
    0: "B-corporation",
    1: "B-event",
    2: "B-location",
    3: "B-person",
    4: "B-product",
    5: "I-corporation",
    6: "I-event",
    7: "I-location",
    8: "I-person",
    9: "I-product",
    10: "O"
}

label_list = list(entity_dict_sm.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

bert_base_cased_t = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

bert_base_cased = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import pandas as pd

df_nerc = pd.read_json("NER_corpus.json", orient="records")
df_nerc.head()

Unnamed: 0,tokens,labels,tags
0,"[., {{MENTION}}, stand, tall, in, the, red, zo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
1,"[Who, else, is, gonna, get, the, new, Surface,...","[O, O, O, O, O, O, O, B-product, I-product, O,...","[10, 10, 10, 10, 10, 10, 10, 4, 9, 10, 10, 10]"
2,"[EDIE, IN, BETWEEN, comes, out, THIS, MONTH, ,...","[B-person, O, O, O, O, O, O, O, O, O, O, O, O,...","[3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
3,"[‘, Joe, Biden, may, have, won, the, keys, to,...","[O, B-person, I-person, O, O, O, O, O, O, O, B...","[10, 3, 8, 10, 10, 10, 10, 10, 10, 10, 2, 7, 1..."
4,"[Imma, hit, reload, until, I, see, additional,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."


In [None]:
len(df_nerc)

24835

In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_nerc)

In [None]:
#label_map = {tag: id for id, tag in enumerate(label_list)}

In [None]:
# version of function that worked with synthetic data
def tokenize_and_align_labels(examples):
    tokenized_inputs = bert_base_cased_t(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []

        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id != previous_word_id:
                if isinstance(label[word_id], str):
                    # converting string label to id
                    label_ids.append(label_map[label[word_id]])
                else:
                    label_ids.append(label[word_id])
            else:
                label_ids.append(-100)
            previous_word_id = word_id

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/24835 [00:00<?, ? examples/s]

In [None]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # flattening predictions and labels
    true_labels = [id_to_label[label] for label in labels.flatten() if label != -100]
    true_predictions = [id_to_label[pred] for (pred, label) in zip(predictions.flatten(), labels.flatten()) if label != -100]

    # calculating metrics
    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

In [None]:
training_args = TrainingArguments(
    output_dir="./bert-base-cased-ner-corpus",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust batch size depending on RAM
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=bert_base_cased,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bert_base_cased_t,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhausbichler-georg[0m ([33mhausbichler-georg-wirtschaftsuniversit-t-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1596,0.162027,0.949463,0.951808,0.949893
2,0.1503,0.15789,0.951001,0.953006,0.951336


TrainOutput(global_step=1122, training_loss=0.17906336733364167, metrics={'train_runtime': 371.1213, 'train_samples_per_second': 96.691, 'train_steps_per_second': 3.023, 'total_flos': 2344283939908608.0, 'train_loss': 0.17906336733364167, 'epoch': 2.0})

In [None]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.1643805056810379,
 'eval_precision': 0.9503697486045067,
 'eval_recall': 0.9528348152412631,
 'eval_f1': 0.9509609661966004,
 'eval_runtime': 12.9036,
 'eval_samples_per_second': 288.757,
 'eval_steps_per_second': 9.067,
 'epoch': 2.0}

In [None]:
bert_base_cased.save_pretrained("./bert-base-cased-ner-corpus")
bert_base_cased_t.save_pretrained("./bert-base-cased-ner-corpus")

('./bert-base-cased-ner-corpus/tokenizer_config.json',
 './bert-base-cased-ner-corpus/special_tokens_map.json',
 './bert-base-cased-ner-corpus/vocab.txt',
 './bert-base-cased-ner-corpus/added_tokens.json',
 './bert-base-cased-ner-corpus/tokenizer.json')

## applying model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

bert_corpus_t = AutoTokenizer.from_pretrained("./bert-base-cased-ner-corpus")

bert_corpus = AutoModelForTokenClassification.from_pretrained(
    "./bert-base-cased-ner-corpus",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
df_corpus_test = test_dataset.to_pandas()
df_corpus_test.head()

Unnamed: 0,tokens,original_labels,tags,input_ids,attention_mask,labels
0,"[There, has, been, no, electricity, for, last,...","[O, O, O, O, O, O, O, O, O, O, B-location, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 2, 10...","[0, 345, 34, 57, 117, 4382, 13, 94, 158, 722, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,..."
1,"[2-0, to, West, Ham, at, half-time, ., Think, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 132, 12, 288, 7, 580, 3600, 23, 457, 12, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, -100, -100, 10, 10, 10, 10, 10, -10..."
2,"[I, do, n't, know, why, this, {{MENTION}}, tea...","[O, O, O, O, O, O, B-corporation, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 0, 10, 10, 10, 10, 10...","[0, 38, 109, 295, 75, 216, 596, 42, 47517, 126...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, -100, 10, 10, 10, 0, -100, ..."
3,"[I, guess, it, follows, that, President, Trump...","[O, O, O, O, O, B-person, I-person, O, O, O, O...","[10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10, 10,...","[0, 38, 4443, 24, 3905, 14, 270, 140, 222, 295...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 3, 8, 10, 10, -100,..."
4,"[Suddenly, getting, Classic, WoW, Naxx, tips, ...","[O, O, O, O, B-product, O, O, B-person, O, O, ...","[10, 10, 10, 10, 4, 10, 10, 3, 10, 10, 10, 10,...","[0, 26825, 562, 7175, 20963, 771, 234, 3631, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, -100, 4, -100, -100, 10..."


In [None]:
df_corpus_test["true_labels"] = df_corpus_test["tags"].apply(lambda x: [id_to_label[tag] for tag in x])

In [None]:
# version of function from WNUT data
bert_pred(bert_corpus, bert_corpus_t, df_corpus_test, "tokens", "bert_pred")

Using device: cpu
Model has 11 labels
Input shape: torch.Size([1, 128])


Unnamed: 0,tokens,original_labels,tags,input_ids,attention_mask,labels,true_labels,bert_pred
0,"[There, has, been, no, electricity, for, last,...","[O, O, O, O, O, O, O, O, O, O, B-location, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 2, 10...","[0, 345, 34, 57, 117, 4382, 13, 94, 158, 722, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,...","[O, O, O, O, O, O, O, O, O, O, B-location, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 2, 10..."
1,"[2-0, to, West, Ham, at, half-time, ., Think, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 132, 12, 288, 7, 580, 3600, 23, 457, 12, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, -100, -100, 10, 10, 10, 10, 10, -10...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
2,"[I, do, n't, know, why, this, {{MENTION}}, tea...","[O, O, O, O, O, O, B-corporation, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 0, 10, 10, 10, 10, 10...","[0, 38, 109, 295, 75, 216, 596, 42, 47517, 126...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, -100, 10, 10, 10, 0, -100, ...","[O, O, O, O, O, O, B-corporation, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
3,"[I, guess, it, follows, that, President, Trump...","[O, O, O, O, O, B-person, I-person, O, O, O, O...","[10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10, 10,...","[0, 38, 4443, 24, 3905, 14, 270, 140, 222, 295...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 3, 8, 10, 10, -100,...","[O, O, O, O, O, B-person, I-person, O, O, O, O...","[10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10, 10,..."
4,"[Suddenly, getting, Classic, WoW, Naxx, tips, ...","[O, O, O, O, B-product, O, O, B-person, O, O, ...","[10, 10, 10, 10, 4, 10, 10, 3, 10, 10, 10, 10,...","[0, 26825, 562, 7175, 20963, 771, 234, 3631, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, -100, 4, -100, -100, 10...","[O, O, O, O, B-product, O, O, B-person, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 3, 10, 10, 10, 10..."
...,...,...,...,...,...,...,...,...
3721,"[Rumors, say, IBM, is, releasing, AirTag, soon...","[O, O, B-corporation, O, O, B-product, O, O]","[10, 10, 0, 10, 10, 4, 10, 10]","[0, 13772, 994, 224, 11510, 16, 8704, 1754, 45...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[-100, 10, -100, 10, 0, 10, 10, 4, -100, 10, 1...","[O, O, B-corporation, O, O, B-product, O, O]","[10, 10, 0, 10, 10, 4, 10, 10]"
3722,"[New, Music, +Video, :, {{MENTION}}, -, John, ...","[O, O, O, O, O, O, B-person, I-person, O, O, O]","[10, 10, 10, 10, 10, 10, 3, 8, 10, 10, 10]","[0, 188, 3920, 2055, 17967, 4832, 47517, 12613...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, -100, 10, 10, -100, -100, -...","[O, O, O, O, O, O, B-person, I-person, O, O, O]","[10, 10, 10, 10, 10, 10, 3, 8, 10, 10, 10]"
3723,"[The, American, people, are, taking, this, pan...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 20, 470, 82, 32, 602, 42, 23387, 14414, 36...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 10, 10, 10, 10, 10, 10, 10, -100, 10, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
3724,"[Happy, Mother, ’s, Day, !, Truly, the, most, ...","[B-event, I-event, I-event, I-event, O, O, O, ...","[1, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 9899, 8133, 44, 27, 29, 1053, 27785, 39003...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, 6, 6, -100, -100, 6, 10, 10, 10, 10,...","[B-event, I-event, I-event, I-event, O, O, O, ...","[1, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, 1..."


In [None]:
from nervaluate import Evaluator

df_corpus_test["bert_pred_labels"] = df_corpus_test["bert_pred"].apply(lambda x: [id_to_label[i] for i in x])

true = df_corpus_test["true_labels"].values.tolist()
pred = df_corpus_test["bert_pred_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 5159, 'incorrect': 578, 'partial': 0, 'missed': 1301, 'spurious': 990, 'possible': 7038, 'actual': 6727, 'precision': 0.7669094693028096, 'recall': 0.7330207445296959, 'f1': 0.7495822738830366}, 'partial': {'correct': 5384, 'incorrect': 0, 'partial': 353, 'missed': 1301, 'spurious': 990, 'possible': 7038, 'actual': 6727, 'precision': 0.8265943213914078, 'recall': 0.7900682011935208, 'f1': 0.8079186342172177}, 'strict': {'correct': 4885, 'incorrect': 852, 'partial': 0, 'missed': 1301, 'spurious': 990, 'possible': 7038, 'actual': 6727, 'precision': 0.726178088300877, 'recall': 0.6940892298948564, 'f1': 0.7097711587359243}, 'exact': {'correct': 5384, 'incorrect': 353, 'partial': 0, 'missed': 1301, 'spurious': 990, 'possible': 7038, 'actual': 6727, 'precision': 0.8003567712204549, 'recall': 0.7649900539926116, 'f1': 0.7822738830366872}}


In [None]:
print(results_by_tag)

{'corporation': {'ent_type': {'correct': 659, 'incorrect': 272, 'partial': 0, 'missed': 220, 'spurious': 106, 'possible': 1151, 'actual': 1037, 'precision': 0.6354869816779171, 'recall': 0.5725456125108601, 'f1': 0.6023765996343694}, 'partial': {'correct': 887, 'incorrect': 0, 'partial': 44, 'missed': 220, 'spurious': 106, 'possible': 1151, 'actual': 1037, 'precision': 0.8765670202507232, 'recall': 0.789748045178106, 'f1': 0.8308957952468007}, 'strict': {'correct': 641, 'incorrect': 290, 'partial': 0, 'missed': 220, 'spurious': 106, 'possible': 1151, 'actual': 1037, 'precision': 0.6181292189006751, 'recall': 0.5569070373588184, 'f1': 0.5859232175502742}, 'exact': {'correct': 887, 'incorrect': 44, 'partial': 0, 'missed': 220, 'spurious': 106, 'possible': 1151, 'actual': 1037, 'precision': 0.8553519768563163, 'recall': 0.7706342311033884, 'f1': 0.8107861060329067}}, 'event': {'ent_type': {'correct': 584, 'incorrect': 53, 'partial': 0, 'missed': 333, 'spurious': 194, 'possible': 970, 'act

In [None]:
metrics = evaluate_ner_predictions(df_corpus_test, "tags", "bert_pred")

              precision    recall  f1-score   support

           0       0.77      0.57      0.65      1151
           1       0.70      0.55      0.62       970
           2       0.82      0.77      0.79      1089
           3       0.75      0.82      0.78      2426
           4       0.82      0.73      0.77      1402
           5       0.51      0.21      0.30       232
           6       0.72      0.60      0.65       871
           7       0.73      0.58      0.65       359
           8       0.87      0.86      0.86      1082
           9       0.83      0.75      0.79       961
          10       0.97      0.98      0.98     83255

    accuracy                           0.95     93798
   macro avg       0.77      0.67      0.71     93798
weighted avg       0.95      0.95      0.95     93798

Precision: 0.9504
Recall: 0.9528
F1 Score: 0.9516


In [None]:
df_tbt_app = pd.read_json("tweets_bigtech_10k_application.json", orient="records")
df_tbt_app.head()

Unnamed: 0,text,tokens,labels
0,Microsoft Build 2020: Empowering developers to...,"[Microsoft, Build, 2020:, Empowering, develope...",1
1,I will do photoshop editing retouching documen...,"[I, will, do, photoshop, editing, retouching, ...",1
2,#Laney #Amps 🎶 Will Steal Your #Face Right Off...,"[#Laney, #Amps, 🎶, Will, Steal, Your, #Face, R...",0
3,RT @ophierian_vp: Geralt &amp; Aerondight Silv...,"[RT, @ophierian_vp:, Geralt, &amp;, Aerondight...",1
4,"#Google, Amazon funnel at least $25 million to...","[#Google,, Amazon, funnel, at, least, $25, mil...",0


In [None]:
# applying model to unlabeled data for 2nd run through visualizations
bert_pred(bert_corpus, bert_corpus_t, df_tbt_app, "tokens", "bert_pred")

Unnamed: 0,text,tokens,labels,bert_pred
0,Microsoft Build 2020: Empowering developers to...,"[Microsoft, Build, 2020:, Empowering, develope...",1,"[0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
1,I will do photoshop editing retouching documen...,"[I, will, do, photoshop, editing, retouching, ...",1,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
2,#Laney #Amps 🎶 Will Steal Your #Face Right Off...,"[#Laney, #Amps, 🎶, Will, Steal, Your, #Face, R...",0,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
3,RT @ophierian_vp: Geralt &amp; Aerondight Silv...,"[RT, @ophierian_vp:, Geralt, &amp;, Aerondight...",1,"[10, 10, 3, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
4,"#Google, Amazon funnel at least $25 million to...","[#Google,, Amazon, funnel, at, least, $25, mil...",0,"[10, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
...,...,...,...,...
9995,Check out our newest episode of Thanks for Sha...,"[Check, out, our, newest, episode, of, Thanks,...",2,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0, 3,..."
9996,RT @USTechFin: The 'fantastic four' of Big #US...,"[RT, @USTechFin:, The, 'fantastic, four', of, ...",2,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
9997,RT @MisheruSeeto: The King : Eternal Monarch (...,"[RT, @MisheruSeeto:, The, King, :, Eternal, Mo...",1,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
9998,Have you been affected by #Google Dance or Goo...,"[Have, you, been, affected, by, #Google, Dance...",2,"[10, 10, 10, 10, 10, 10, 9, 10, 4, 10, 10, 10,..."


In [None]:
df_tbt_app.to_json("tweets_bigtech_10k_application_afterNER.json", orient="records")

In [None]:
for index, row in df_bigtech_sm[:100].iterrows():
    print(row["text"])
    print(row["bert_pred"])

My theory on why there has been no #Xbox reveal for #Microsoft Flight Simulator: Flying with a standard Xbox controller is almost impossible and the HOTAS accessories aren't ready for consoles yet https://t.co/4OLUZg9k04
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 10, 10, 10, 10, 4, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
Mixed feelings about #Durysta. Is this really innovative in the age of selective laser trabeculoplasty which can be a great 1st line treatment &amp; non invasive? If we're not careful we could overrun already stretched #glaucoma services a la #AMD #AntiVEGF.  https://t.co/aS2YsmaRpr
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
RT @GFRDofficial: [🎞 #G_TODAY]   #여자친구 #GFRIEND 回:Song of the Sirens   🍎 #Apple #OOTD https://t.co/iQSXyoxJM8
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
@kymalainenkimmo @Huawei #Appl

In [None]:
# applying model to dell data to create time plot
df_dell = pd.read_json("sentiment_dell_processed.json", orient="records")
df_dell.head()

Unnamed: 0,Datetime,text,tokens,sentiment
0,2022-09-30 23:29:15+00:00,Logitech Apple Google Microsoft Dell Lenovo #W...,"[Logitech, Apple, Google, Microsoft, Dell, Len...",neutral
1,2022-09-30 21:46:35+00:00,{{MENTION}} {{MENTION}} {{MENTION}} {{MENTION}...,"[{{MENTION}}, {{MENTION}}, {{MENTION}}, {{MENT...",neutral
2,2022-09-30 21:18:02+00:00,As {{MENTION}} celebrates its 40th anniversary...,"[As, {{MENTION}}, celebrates, its, 40th, anniv...",positive
3,2022-09-30 20:05:24+00:00,Dell your customer service is horrible especia...,"[Dell, your, customer, service, is, horrible, ...",negative
4,2022-09-30 20:03:17+00:00,{{MENTION}} Dell Dellcares Dell give the man w...,"[{{MENTION}}, Dell, Dellcares, Dell, give, the...",neutral


In [None]:
bert_pred(bert_corpus, bert_corpus_t, df_dell, "tokens", "ner_tags")

Unnamed: 0,Datetime,text,tokens,sentiment,ner_tags
0,2022-09-30 23:29:15+00:00,Logitech Apple Google Microsoft Dell Lenovo #W...,"[Logitech, Apple, Google, Microsoft, Dell, Len...",neutral,"[4, 9, 0, 0, 9, 4, 10, 10, 10, 10, 10, 10, 10,..."
1,2022-09-30 21:46:35+00:00,{{MENTION}} {{MENTION}} {{MENTION}} {{MENTION}...,"[{{MENTION}}, {{MENTION}}, {{MENTION}}, {{MENT...",neutral,"[10, 10, 10, 10, 10, 10, 10, 10, 4, 9, 10, 10,..."
2,2022-09-30 21:18:02+00:00,As {{MENTION}} celebrates its 40th anniversary...,"[As, {{MENTION}}, celebrates, its, 40th, anniv...",positive,"[10, 10, 10, 10, 10, 10, 3, 8, 10, 10, 10, 10,..."
3,2022-09-30 20:05:24+00:00,Dell your customer service is horrible especia...,"[Dell, your, customer, service, is, horrible, ...",negative,"[0, 10, 10, 10, 10, 10, 10, 10, 3, 10, 10, 10,..."
4,2022-09-30 20:03:17+00:00,{{MENTION}} Dell Dellcares Dell give the man w...,"[{{MENTION}}, Dell, Dellcares, Dell, give, the...",neutral,"[10, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10]"
...,...,...,...,...,...
24965,2022-01-01 02:02:04+00:00,{{MENTION}} {{MENTION}} Dell I wouldn't even k...,"[{{MENTION}}, {{MENTION}}, Dell, I, wouldn't, ...",negative,"[10, 10, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
24966,2022-01-01 01:57:34+00:00,{{MENTION}} {{MENTION}} Dell I didn't really l...,"[{{MENTION}}, {{MENTION}}, Dell, I, didn't, re...",positive,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
24967,2022-01-01 01:36:36+00:00,Hey {{MENTION}} here it is....27 4K UHD USB-C ...,"[Hey, {{MENTION}}, here, it, is....27, 4K, UHD...",neutral,"[10, 10, 10, 10, 10, 10, 4, 4, 10, 0, 10]"
24968,2022-01-01 01:31:30+00:00,{{MENTION}} {{MENTION}} Alienware Intel {{MENT...,"[{{MENTION}}, {{MENTION}}, Alienware, Intel, {...",neutral,"[10, 10, 4, 10, 10, 10, 10, 0, 10, 10, 10, 10,..."


In [None]:
df_dell["ner_labels"] = df_dell["ner_tags"].apply(lambda x: [id_to_label[tag] for tag in x])

In [None]:
df_dell.to_json("dell_afterNER.json", orient="records")