In [1]:
#!pip install transformers accelerate torch huggingface_hub datasets nervaluate

In [None]:
import pandas as pd
import numpy as np
import torch

# RoBERTa-TweetNER pipeline attempts (brd-sa-data)

In [None]:
# trying out RoBERTa version trained with tweetner
# tner/roberta-base-tweetner7-all

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the TweetNER7-finetuned RoBERTa checkpoint together with its tokenizer.
model_name = "tner/roberta-base-tweetner7-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Token-classification pipeline; no aggregation option is passed, so the pipeline's
# default output format is used.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Smoke test on a sentence with obvious corporation / product mentions.
text = "Apple just launched the new iPhone, and Microsoft released a new version of Windows."

entities = ner_pipeline(text)
print(entities)

In [None]:
# keep only the surface form and the predicted tag of every pipeline hit
filtered_entities = [
    dict(word=hit["word"], entity=hit["entity"])
    for hit in entities
]
print(filtered_entities)

In [None]:
# testing NER on Brand Sentiment Analysis data
import pandas as pd
# Brand-sentiment training split; the path is relative to the notebook's working directory.
df_brd_sa = pd.read_csv("./analysis_data/Brand Sentiment Analysis Dataset/Dataset - Train.csv")
df_brd_sa.head()

In [None]:
# applying pre-processing function before model application from https://huggingface.co/tner/roberta-base-tweetner7-all
import re
from urlextract import URLExtract

extractor = URLExtract()

def format_tweet(tweet):
    """Apply the model card's pre-processing: mask URLs and wrap @-mentions.

    Every URL found in the incoming text is replaced by the literal "{{URL}}";
    afterwards account handles matched by the regex are rewritten to "{@handle@}".
    """
    masked = tweet
    # the URL list is computed once on the untouched input text
    for found_url in extractor.find_urls(tweet):
        masked = masked.replace(found_url, "{{URL}}")
    # format twitter account
    return re.sub(r"\b(\s*)(@[\S]+)\b", r'\1{\2@}', masked)

In [None]:
# Work on an independent copy of the first 100 rows, so the column assignment below
# neither triggers SettingWithCopyWarning nor depends on df_brd_sa's internals.
# Only rows without tweet text are dropped: the previous bare `dropna()` discarded its
# result (nothing was dropped), and an unrestricted dropna would also remove rows that
# merely miss a value in an unrelated column.
df_brd_sa_sm = df_brd_sa[:100].dropna(subset=["tweet_text"]).copy()

# mask URLs / wrap @-mentions before the text is handed to the model
df_brd_sa_sm["tweet_text"] = df_brd_sa_sm["tweet_text"].astype(str).map(format_tweet)

In [None]:
def ner(txt):
    """Run the NER pipeline on `txt` and return one {"word", "entity"} dict per hit."""
    return [
        {"word": hit["word"], "entity": hit["entity"]}
        for hit in ner_pipeline(txt)
    ]

In [None]:
# Tag every tweet of the full frame; str() turns missing tweets into the literal "nan".
# NOTE(review): this reads the raw df_brd_sa, not the format_tweet-preprocessed
# df_brd_sa_sm built in the previous cells — confirm whether the preprocessing was
# meant to be applied before prediction.
df_brd_sa["entities"] = df_brd_sa.apply(lambda x: ner(str(x["tweet_text"])), axis=1)

In [None]:
df_brd_sa.head()

In [None]:
# one [tweet_text, entities] array per row, handy for eyeballing the predictions
reslist = [pair for pair in df_brd_sa[["tweet_text", "entities"]].values]
#print(reslist)

In [None]:
# json allows entities-column to be stored in the CSV
import json
# Serialise the list-of-dicts column so it survives the CSV round trip.
# Cells that are already strings are left untouched, so re-running this cell does not
# JSON-encode the column a second time.
df_brd_sa["entities"] = df_brd_sa["entities"].apply(
    lambda ents: ents if isinstance(ents, str) else json.dumps(ents)
)

In [None]:
# saving the df containing the prediction results as CSV to be used further
# NOTE(review): with pandas' default index=True the row index is written as an extra
# unnamed column — pass index=False if that is not wanted downstream.
df_brd_sa.to_csv("./data after NER.csv")

# loading TweetNER7-data and applying roberta-tweetner-pipeline

In [None]:
# loading the TweetNER7 dataset from huggingface https://huggingface.co/datasets/tner/tweetner7
from datasets import load_dataset

ds = load_dataset("tner/tweetner7")

In [None]:
# materialise the three splits as DataFrames and discard rows with missing values
tweetner7_train = ds["train_all"].to_pandas().dropna()
tweetner7_test = ds["test_2021"].to_pandas().dropna()
tweetner7_test20 = ds["test_2020"].to_pandas().dropna()

In [None]:
# Stack the train and 2021-test rows into one frame.
# NOTE(review): the original row indices are kept (no ignore_index), so index labels
# may repeat — confirm this is fine for any later use beyond the size check.
tweetner7_big = pd.concat([tweetner7_train, tweetner7_test])

In [None]:
len(tweetner7_big)

In [None]:
# converting the tags-column to a label-column to be comparable with the model-output
# tag id -> BIO label, in the order of the numeric ids used by the `tags` column
entity_dict = dict(enumerate([
    "B-corporation",
    "B-creative_work",
    "B-event",
    "B-group",
    "B-location",
    "B-person",
    "B-product",
    "I-corporation",
    "I-creative_work",
    "I-event",
    "I-group",
    "I-location",
    "I-person",
    "I-product",
    "O",
]))

# derived lookups: ordered label names, label -> id, id -> label
label_list = [*entity_dict.values()]
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

def tags_to_labels(col):
    """Translate a sequence of numeric tag ids into their string labels via entity_dict."""
    return [entity_dict[tag_id] for tag_id in col]

In [None]:
# attach the gold tags as string labels next to the numeric ones
tweetner7_test["true_labels"] = tweetner7_test["tags"].map(tags_to_labels)
tweetner7_test.head()

In [None]:
# The pipeline expects raw text, so the pre-split tokens are re-joined with spaces.
# (str() on the token array would feed the array's repr — brackets and quotes
# included — to the model.)
tweetner7_train["ner_pipeline"] = tweetner7_train["tokens"].map(
    lambda toks: ner_pipeline(" ".join(toks))
)

In [None]:
# example output of the pipeline
#test_example = [{'entity': 'B-corporation', 'score': 0.5353071, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5},
#{'entity': 'B-product', 'score': 0.656318, 'index': 6, 'word': 'ĠiPhone', 'start': 28, 'end': 34},
#{'entity': 'B-corporation', 'score': 0.70640016, 'index': 9, 'word': 'ĠMicrosoft', 'start': 40, 'end': 49},
#{'entity': 'B-product', 'score': 0.6205899, 'index': 15, 'word': 'ĠWindows', 'start': 76, 'end': 83}]

# creating a function transforming the predicted labels to a list to be better comparable to the actual entity labels
def output_to_labellist(row, output_col="ner_pipeline"):
    """Project raw pipeline output onto the row's tokens as a flat label list.

    Args:
        row: mapping / DataFrame row holding a "tokens" sequence and the pipeline
            output (list of dicts with "word" and "entity") under `output_col`.
        output_col: name of the column with the pipeline output. Defaults to
            "ner_pipeline" so the function can be passed directly to DataFrame.apply.

    Returns:
        One label per token; tokens the pipeline did not tag get "O". If a word
        occurs several times in the pipeline output, its last label wins.
    """
    # The pipeline reports words with RoBERTa's leading-space marker (e.g. 'ĠiPhone' in
    # the example output); strip it, otherwise only sentence-initial words can match.
    words_labels = {
        dic["word"].lstrip("Ġ"): dic["entity"]
        for dic in row[output_col]
    }
    return [words_labels.get(token, "O") for token in row["tokens"]]

In [None]:
# output_to_labellist must be told which column holds the pipeline output;
# DataFrame.apply forwards extra keyword arguments to the applied function.
tweetner7_train["predicted_labels"] = tweetner7_train.apply(
    output_to_labellist, axis=1, output_col="ner_pipeline"
)
tweetner7_train.head()

In [None]:
# using the package "nervaluate" to evaluate the performance
from nervaluate import Evaluator

# A "true_labels" column was only added to tweetner7_test, so the gold labels of the
# training split are derived here directly from its numeric tags.
true = tweetner7_train["tags"].apply(tags_to_labels).tolist()
pred = tweetner7_train["predicted_labels"].tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "creative_work", "event", "group", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [None]:
print(results_by_tag["person"])

# predicting with function similar to llama-approach

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# add_prefix_space=True is necessary: it fixes an issue with the prediction function,
# which passes pre-split tokens (is_split_into_words=True) to this tokenizer
roberta_tweetner_t = AutoTokenizer.from_pretrained("tner/roberta-base-tweetner7-all", add_prefix_space=True)

# Same checkpoint as above, loaded a second time with the notebook's own label mappings
# (label_list / id_to_label / label_to_id as defined when this cell runs).
roberta_tweetner = AutoModelForTokenClassification.from_pretrained(
    "tner/roberta-base-tweetner7-all",
    device_map="auto",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
# prediction function explicitly handles numpy arrays, not needed for llama
# (tweetner7-tokens are saved as such) and encodes immediately
# function should work for all BERT-/RoBERTa-models
import numpy as np

def bert_pred(model, tokenizer, df, text_col, output_col):
    """Predict tag ids for every row of `df` and store them in `df[output_col]`.

    Args:
        model: token-classification model; must expose `.device` and `.eval()` and
            return an object with `.logits` when called with the encoded inputs.
        tokenizer: tokenizer whose encodings provide `word_ids()`.
        df: DataFrame to annotate; it is modified in place.
        text_col: column holding pre-split tokens (list / numpy array) or raw text.
        output_col: column that receives the predictions.

    Returns:
        The same `df` object. For pre-split input every row holds one id per word
        (the prediction of the word's first sub-token); for raw text it holds one id
        per sub-token, special tokens included.
    """
    model.eval()
    device = model.device
    all_predictions = []

    for _, row in df.iterrows():
        text = row[text_col]

        # normalise the cell to either a list of words or a single string
        if isinstance(text, np.ndarray):
            if text.dtype.kind in ("U", "S"):  # array already contains strings
                words = text.tolist()
            else:
                words = [str(x) for x in text]
        elif isinstance(text, list):
            words = text
        else:
            words = None

        if words is not None:
            encoded = tokenizer(words, is_split_into_words=True, return_tensors="pt")
            # read the alignment from this encoding instead of tokenizing a second time
            word_ids = encoded.word_ids()
        else:
            encoded = tokenizer(text if isinstance(text, str) else str(text), return_tensors="pt")
            word_ids = None

        # moving encoded text to device
        inputs = {k: v.to(device) for k, v in encoded.items()}

        with torch.no_grad():
            outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=-1)[0].cpu().tolist()

        if word_ids is None:
            all_predictions.append(preds)
            continue

        # mapping predictions back to original tokens: keep the first sub-token of each word
        processed_preds = []
        prev_word_id = None
        for word_id, pred in zip(word_ids, preds):
            if word_id is None:
                continue
            if word_id != prev_word_id:
                processed_preds.append(pred)
            prev_word_id = word_id
        all_predictions.append(processed_preds)

    df[output_col] = all_predictions
    return df

In [None]:
# Annotate the 2021 test split: bert_pred writes the predicted ids into
# tweetner7_test["roberta_pred"] in place and returns the frame (shown as cell output).
bert_pred(roberta_tweetner, roberta_tweetner_t, tweetner7_test, "tokens", "roberta_pred")

In [None]:
# translate the predicted ids into string labels with the active id_to_label mapping
tweetner7_test["roberta_prediction_labels"] = tweetner7_test["roberta_pred"].map(
    lambda tag_ids: [id_to_label[tag_id] for tag_id in tag_ids]
)

In [None]:
from nervaluate import Evaluator

# entity-level evaluation of the word-aligned predictions on the 2021 test split
true = tweetner7_test["true_labels"].tolist()
pred = tweetner7_test["roberta_prediction_labels"].tolist()

evaluator = Evaluator(
    true,
    pred,
    tags=["corporation", "creative_work", "event", "group", "location", "person", "product"],
    loader="list",
)

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [22]:
from sklearn.metrics import precision_score, recall_score, classification_report

# function returns dictionary containing precision, recall and f1-score
# should only be used for comparing columns with numeric tags, not string-labels
def evaluate_ner_predictions(df, true_col, pred_col):
    """Print a token-level classification report and return weighted precision/recall/F1.

    Args:
        df: DataFrame holding one sequence of gold tags and one of predicted tags per row.
        true_col: name of the gold-tag column; entries equal to -100 are ignored.
        pred_col: name of the prediction column.

    Returns:
        dict with "precision", "recall" (both weighted averages) and "f1", computed
        as the harmonic mean of those two values.

    Sequences are paired position by position; if a prediction is shorter than its
    gold sequence, the surplus gold tags are not scored.
    """
    y_true_flat = []
    y_pred_flat = []

    for true_seq, pred_seq in zip(df[true_col], df[pred_col]):
        # skipping padding/special tokens (marked as -100 in ground truth)
        for true_label, pred_label in zip(true_seq, pred_seq):
            if true_label != -100:
                y_true_flat.append(true_label)
                y_pred_flat.append(pred_label)

    # nothing to score (empty frame or only ignored positions): report zeros instead
    # of handing empty input to the metric functions
    if not y_true_flat:
        print("No labelled tokens to evaluate.")
        return {"precision": 0, "recall": 0, "f1": 0}

    # zero_division=0 everywhere, so classes that are never predicted count as 0
    # consistently and do not raise warnings
    precision = precision_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0)
    recall = recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0)
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(classification_report(y_true_flat, y_pred_flat, zero_division=0))

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
# the predictions were stored under "roberta_pred"; there is no "bert_pred" column
metrics = evaluate_ner_predictions(tweetner7_test, "tags", "roberta_pred")

# Testing RoBERTa-tweetner7 with WNUT

In [None]:
import pandas as pd
# pre-processed WNUT data; list-valued columns arrive as strings and are parsed in the next cell
df_wnut = pd.read_csv("./wnut_complete_processed.csv")
df_wnut.head()

In [None]:
import ast

# The list columns come back from the CSV as their string repr; parse them into lists.
# Cells that are no longer strings are passed through, so re-running this cell does
# not fail on already-parsed data.
for col in ["tokens", "label_list"]:
    df_wnut[col] = df_wnut[col].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

In [None]:
type(df_wnut["tokens"][0])

In [None]:
df_wnut.head()

In [None]:
# encode the WNUT string labels with the currently active label_to_id mapping
df_wnut["tags"] = df_wnut["label_list"].map(
    lambda names: [label_to_id[name] for name in names]
)

In [None]:
import numpy as np
import torch

# reworked prediction function with max_length and error exceptions to handle cuda runtime error
def bert_pred(model, tokenizer, df, text_col, output_col, max_length=128, batch_size=32):
    """Predict tag ids row by row with truncation/padding and per-row error handling.

    Args:
        model: token-classification model; must expose `.device` and `.eval()` and
            return an object with `.logits` when called with the encoded inputs.
        tokenizer: tokenizer whose encodings provide `word_ids()`.
        df: DataFrame to annotate; it is modified in place.
        text_col: column holding pre-split tokens (list / numpy array) or raw text.
        output_col: column that receives the predictions.
        max_length: every input is truncated and padded to this many sub-tokens.
        batch_size: currently unused (rows are processed one at a time); kept so
            existing calls keep working.

    Returns:
        The same `df` object. Pre-split rows get one id per word (first sub-token),
        raw-text rows one id per position. Empty inputs and rows that raise get [].
    """
    device = model.device
    print(f"Using device: {device}")

    # printing model config info
    if hasattr(model, 'config'):
        print(f"Model has {model.config.num_labels} labels")

    model.eval()
    all_predictions = []
    # identical tokenizer settings for every kind of input
    tok_kwargs = dict(
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    for i, row in df.iterrows():
        try:
            text = row[text_col]

            # normalise the cell to either a list of words or a single string
            if isinstance(text, np.ndarray):
                if text.dtype.kind in ("U", "S"):
                    words = text.tolist()
                else:
                    words = [str(x) for x in text]
            elif isinstance(text, list):
                words = text
            else:
                words = None

            if words is not None:
                if not words:
                    all_predictions.append([])
                    continue
                encoded = tokenizer(words, is_split_into_words=True, **tok_kwargs)
                # read the alignment from this encoding instead of tokenizing a second time
                word_ids = encoded.word_ids()
            else:
                if isinstance(text, str):
                    if not text.strip():
                        all_predictions.append([])
                        continue
                else:
                    text = str(text)
                encoded = tokenizer(text, **tok_kwargs)
                word_ids = None

            inputs = {k: v.to(device) for k, v in encoded.items()}

            if i == 0:
                print(f"Input shape: {inputs['input_ids'].shape}")

            with torch.no_grad():
                outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=-1)[0].cpu().tolist()

            if word_ids is None:
                all_predictions.append(preds)
                continue

            # keep the prediction of the first sub-token of every word;
            # special and padding positions have word id None and are skipped
            processed_preds = []
            prev_word_id = None
            for word_id, pred in zip(word_ids, preds):
                if word_id is None:
                    continue
                if word_id != prev_word_id:
                    processed_preds.append(pred)
                prev_word_id = word_id
            all_predictions.append(processed_preds)

        except Exception as e:
            print(f"Error in row {i}: {e}")
            # adding empty predictions for failed rows
            all_predictions.append([])
            continue

    df[output_col] = all_predictions
    return df

In [None]:
# Annotate WNUT: predictions are written into df_wnut["roberta_pred"] in place and the
# frame is returned (shown as cell output); max_length and batch_size keep their defaults.
bert_pred(roberta_tweetner, roberta_tweetner_t, df_wnut, "tokens", "roberta_pred")

In [None]:
# Token-level precision / recall / F1 of the model on WNUT. Rows for which bert_pred
# stored an empty prediction list contribute no tokens, because gold and predicted
# sequences are paired position by position.
metrics = evaluate_ner_predictions(df_wnut, "tags", "roberta_pred")

# Testing and Finetuning RoBERTa-tweetner7 with synthetic data

In [None]:
import pandas as pd
# synthetic tweet data; only the first 25,000 rows are used from here on
df_syn = pd.read_csv("./NER_data/synthetic_tweet_data_grouped2.csv")
df_syn_sm = df_syn[:25000]
df_syn_sm.head()

In [None]:
from datasets import Dataset
# wrap the 25k-row sample in a Hugging Face Dataset
dataset = Dataset.from_pandas(df_syn_sm)

In [None]:
# 80/20 split with a fixed seed, so the split (and everything evaluated on it) is
# reproducible across kernel restarts
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
# reduced tag set of the synthetic data: tag id -> BIO label
entity_dict_syn = dict(enumerate([
    "B-corporation",
    "B-person",
    "B-product",
    "I-corporation",
    "I-person",
    "I-product",
    "O",
]))

# Rebinds the notebook-wide lookups to the synthetic tag set; every helper that reads
# label_list / label_to_id / id_to_label at call time uses these values from here on.
label_list = [*entity_dict_syn.values()]
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

# def encode_labels(ds):
#      ds['new_tags'] = [label_to_id[label] for label in ds['labels']]
#      return ds

# def encode_labels(ds):
#     ds['new_tags'] = [
#         [label_to_id[label] for label in labels] if isinstance(labels, list) else []
#         for labels in ds['labels']
#     ]
#     return ds

# train_dataset = train_dataset.map(encode_labels)
# test_dataset = test_dataset.map(encode_labels)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and align word-level tags to sub-tokens.

    Only the first sub-token of each word carries the word's tag; special tokens,
    padding, continuation sub-tokens and words without a tag receive -100.
    The aligned tags are stored under "labels" in the returned encoding.
    """
    encodings = roberta_tweetner_t(
        examples["tokens"],
        is_split_into_words=True,
        max_length=128,
        padding="max_length",
        truncation=True
    )

    aligned = []
    for batch_idx, word_tags in enumerate(examples["tags"]):
        row_labels = []
        last_word = None

        for current_word in encodings.word_ids(batch_index=batch_idx):
            starts_new_word = current_word is not None and current_word != last_word
            if starts_new_word:
                try:
                    row_labels.append(word_tags[current_word])
                except IndexError:
                    # fewer tags than words: leave this position unlabelled
                    row_labels.append(-100)
            else:
                row_labels.append(-100)
            last_word = current_word

        aligned.append(row_labels)

    encodings["labels"] = aligned
    return encodings

## new syn data gen approach

In [None]:
# Candidate person names for the {person} placeholder; mixes two-word names with
# single-token surnames ("Davis", "Cook", "Musk", "Smith").
persons = [
        "John Smith", "Emily Chen", "Michael Johnson", "Sarah Williams",
        "David Lee", "Maria Rodriguez", "James Brown", "Davis",
        "Robert Kim", "Jennifer Lopez", "Thomas Wilson", "Jessica Taylor", "Cook",
        "Carlos Vega", "Aisha Patel", "Daniel Park", "Olivia Nguyen", "Musk", "Smith"
    ]

# Candidate company names for {corporation}; several consist of two words
# (e.g. "JP Morgan", "Acme Corp").
corporations = [
        "Google", "Microsoft", "Apple", "Amazon", "Meta", "BlackBerry",
        "IBM", "Tesla", "Netflix", "Walmart", "JP Morgan",
        "Acme Corp", "TechSolutions", "Global Systems", "DataWorks",
        "Quantum Industries", "NexGen", "FutureSpace", "EcoSystems", "Nokia", "Motorola"
    ]

# Candidate product names for {product}; one to three words long.
products = [
        "iPhone 13", "Galaxy S22", "Surface Pro", "PlayStation 5", "Xbox Series X",
        "MacBook Air", "Echo Dot", "AirPods Pro", "Tesla Model 3", "iPad Mini",
        "Dyson V11", "Fitbit Charge", "Nintendo Switch", "Kindle Paperwhite",
        "Roomba i7", "GoPro Hero", "Bose QuietComfort", "Instant Pot", "Echo", "AirTag", "ThinkPad"
    ]

# Candidate event names for {event}; some contain punctuation ("Google I/O",
# "AWS re:Invent") and one has four words ("New York Fashion Week").
events = [
        "CES 2023", "Web Summit", "SXSW", "TechCrunch Disrupt", "E3 Expo",
        "Google I/O", "WWDC", "Consumer Electronics Show", "Mobile World Congress",
        "Black Hat Conference", "DEF CON", "AWS re:Invent", "Game Developers Conference",
        "Dreamforce", "Comic-Con", "Coachella", "New York Fashion Week", "GamesCom", "AI-Con"
    ]

# Candidate city names for {location}; one or two words long.
locations = [
        "New York", "San Francisco", "London", "Tokyo", "Berlin",
        "Paris", "Sydney", "Toronto", "Chicago", "Seattle",
        "Los Angeles", "Miami", "Singapore", "Hong Kong", "Milwaukee",
        "Dubai", "Barcelona", "Austin", "Stockholm", "Seoul", "Vienna"
    ]

# Tweet-like sentence templates with the placeholders {person}, {corporation},
# {product}, {event} and {location}. They are filled via str.format, so a placeholder
# that appears twice in one template receives the same value both times.
templates = [
        "{person} from {corporation} announced that {product} will be showcased at {event}.",
        "At {event}, {person} demonstrated how {product} is revolutionizing {corporation}'s approach in {location}.",
        "{corporation} has selected {location} as the venue for {event}, where {person} will launch {product}.",
        "The new {product} developed by {corporation} will be presented by {person} during {event} in {location}.",
        "{person} confirmed that {corporation} will be expanding its {product} line.",
        "According to {person}, {corporation}'s latest {product} has been well-received at {event} in {location}.",
        "Reviews from {event} suggest that {person} made a strong case for {corporation}'s new {product} in the {location} market.",
        "{corporation} is planning to open a {product} store in {location}, announced {person} at {event}.",
        "The collaboration between {corporation} and {person} resulted in {product}, which will finally debut at {event} in {location}.",
        "Attendees at {event} in {location} were very impressed when {person} revealed {corporation}'s innovative {product}! The clapping didnt stop",
        "{person} traveled to {location} to promote {product} at {event} on behalf of {corporation}.",
        "The {product} team from {corporation}, led by {person}, won first prize at {event} in {location}. Let's go!",
        "Consumers in {location} can now purchase {product} after {corporation}'s expansion announcement by {person} at {event}.",
        "{product} is the must-have gadget of the year.",
        "I hate the new {product}, the older ones are much better.",
        "Less than 2 hours until they announce the details on the {product} giveaway!",
        "All eyes are on {corporation} after the announcement of their new {product}.",
        "It's time for {person} to leave {corporation}. What is he even doing.",
        "{corporation} has been selected as the top AI startup in {location}, wow!",
        "I am having so many issues with the {product}. {corporation} needs to fix this!",
        "Can not wait for {product} also. They should sell them down at {event}.",
        "Whats happening at {corporation}? {person} really needs to step up.",
        "{corporation} is giving free {product} to open source coders who are attending this meet-up.",
        "{person} was right! The {product} from {corporation} is revolutionary!",
        "Less than 2 hours until we announce the details on the {product} giveaway!",
        "{corporation} CEO {person}: Newest {product} rollout will begin next month!",
        "{corporation} has a temporary Retail Store in {location} for the {product} release today. Opens at 5pm.",
        "{person} said that {corporation} is working on something big.",
        "It's time for {person} to leave {corporation}. What is he even doing?",
        "{corporation} just keeps raising the bar with every {product} they launch. Crazy!",
        "{person} just hinted at new features in {corporation}'s upcoming {product}. I am hyped!",
        "Rumors say {corporation} is releasing {product} soon.",
        "Just watched the {corporation} keynote. {product} looks impressive.",
        "Is it just me, or does {corporation}'s {product} feel rushed and unfinished?",
        "The {product} is making me rethink my loyalty to {corporation}. Its not good.",
        "Who else is gonna get the new {product} next month?",
        "{corporation}'s industry party tonight was great for the launch of {product}.",
        "Attending {event} this week! Can't wait to see what {corporation} unveils about their upcoming {product}. Anyone else going?",
        "Is anyone else experiencing issues with the new {product} update? {corporation}'s support hasn't been helpful. #TechSupport",
        "Just switched from {product} to {product} and the difference is incredible. {corporation} really cooked with this one!",
        "Hot take: {corporation}'s approach to development is outdated. They need to focus more on usability if they want to compete with {corporation}",
        "The new update to {product} completely revolutionized my workflow. Thanks {corporation} for fixing the issue! #ProductivityTech",
        "Arrived at {event} in {location}! The {corporation} booth is already packed with people trying the new {product}. #TechConference",
        "Just spotted {person} from {corporation} at a restaurant in {location} right after {event}. Tried to ask about {product} rumors but no comment!",
        "I snuck into the VIP section at {event} in {location} and got a selfie with {person}! Check my Insta! #Winning",
        "PSA: Free {product} giveaways at {corporation}'s booth at {event} in {location}! Run don't walk, peeps! I got the last blue one",
        "This {product} launch line at {corporation}'s store in {location} is ridiculous. Been here 3hrs and moved like 10 feet, But I NEED it today! #TechAddict",
        "Shoutout to the nice {corporation} rep at {event} in {location} who gave me an extra {product} for my kid! Some tech people are actually decent humans",
        "My {product} just updated itself and now I can't find ANYTHING. Hey {corporation}, stop 'fixing' stuff that ain't broken! {person} needs to chill with these changes",
        "Omg {person} just liked my tweet criticizing {corporation}'s {product}! Screenshot this before they realize and unlike!",
        "The way {person} casually uses {product} in interviews makes it seem so cool, but when I bought it from {corporation} it's just... meh. Marketing wins again",
        "new CEO {person} has really not done much yet at {corporation}, hasnt he?"
    ]

In [None]:
len(templates)

In [None]:
import random
import pandas as pd

# reworked new function
def generate_ner_dataset(num_examples, templates=templates,persons=persons, corporations=corporations,
                         products=products, events=events, locations=locations, seed=None):
    """Generate synthetic, BIO-tagged tweets by filling entity placeholders in templates.

    Each example draws one template and one entity per category, formats the sentence,
    splits it into tokens and labels every token by longest-match lookup against the
    supplied entity lists.

    Args:
        num_examples: number of sentences to generate.
        templates: format strings using {person}, {corporation}, {product}, {event}
            and {location}.
        persons, corporations, products, events, locations: candidate entity names.
        seed: optional seed for a private random generator, for reproducible output.
            With None the module-level `random` state is used.

    Returns:
        DataFrame with the columns "tokens", "labels" (BIO strings, same length as
        "tokens") and "sentence".

    Labelling rules:
        * Entity names are split with the same tokenizer as the sentences, so names of
          any length and names containing punctuation ("Google I/O") are recognised.
        * The longest matching name wins ("New York Fashion Week" is an event, not the
          location "New York" followed by two unlabelled words).
        * A possessive "'s" on the last token of a name ("Apple's") does not hide the
          entity; the token keeps its text and receives the entity label.
        * If the same name occurs in several lists, the first list in the order
          person, corporation, product, event, location decides.
        * The first word of every person name is also accepted on its own as a person.
    """
    rng = random if seed is None else random.Random(seed)

    def split_words(text):
        """Split text into words (alphanumerics, '-' and "'") and single punctuation tokens."""
        words = []
        current_word = ""
        for char in text:
            if char.isalnum() or char in "-'":
                current_word += char
            else:
                if current_word:
                    words.append(current_word)
                    current_word = ""
                if not char.isspace():
                    words.append(char)
        if current_word:
            words.append(current_word)
        return words

    # token tuple -> category, built once per call instead of once per token
    entity_index = {}
    categories = (
        ("person", persons),
        ("corporation", corporations),
        ("product", products),
        ("event", events),
        ("location", locations),
    )
    for category, names in categories:
        for name in names:
            name_tokens = tuple(split_words(name))
            if not name_tokens:
                continue
            # setdefault keeps the category that was registered first (priority order)
            entity_index.setdefault(name_tokens, category)
            if category == "person":
                entity_index.setdefault(name_tokens[:1], category)
    max_len = max(map(len, entity_index), default=0)

    def match_at(words, start):
        """Return (length, category) of the longest entity starting at `start`, else (1, None)."""
        longest = min(max_len, len(words) - start)
        for length in range(longest, 0, -1):
            candidate = words[start:start + length]
            category = entity_index.get(tuple(candidate))
            if category is None and candidate[-1].endswith("'s"):
                # retry without the possessive suffix on the final token
                candidate[-1] = candidate[-1][:-2]
                category = entity_index.get(tuple(candidate))
            if category is not None:
                return length, category
        return 1, None

    data = []

    for _ in range(num_examples):
        # draw order (template, person, corporation, product, event, location) is fixed
        # so that a seeded run always produces the same sentences
        template = rng.choice(templates)
        sentence = template.format(
            person=rng.choice(persons),
            corporation=rng.choice(corporations),
            product=rng.choice(products),
            event=rng.choice(events),
            location=rng.choice(locations)
        )

        raw_words = split_words(sentence)
        tokens = []
        labels = []

        i = 0
        while i < len(raw_words):
            length, category = match_at(raw_words, i)
            tokens.extend(raw_words[i:i + length])
            if category is None:
                labels.append("O")
            else:
                labels.append(f"B-{category}")
                labels.extend([f"I-{category}"] * (length - 1))
            i += length

        data.append({"tokens": tokens, "labels": labels, "sentence": sentence})

    df = pd.DataFrame(data)

    return df

In [None]:
# 10,000 synthetic examples drawn with the default templates and entity lists
df_syn_new3 = generate_ner_dataset(10000)
df_syn_new3.head()

In [None]:
# converting named-tags to numeric tags for comparison with model output
def convert_to_numeric(label_list):
    """Encode string labels with the global label_to_id; unknown labels become -100."""
    numeric = []
    for name in label_list:
        numeric.append(label_to_id.get(name, -100))
    return numeric

# NOTE(review): the ids depend on whichever label_to_id is active when this cell
# runs — confirm it is the id space the predictions are compared against.
df_syn_new3["tags"] = df_syn_new3["labels"].map(convert_to_numeric)

## predicting with RoBERTa-TweetNER7 on syn data

In [None]:
# Annotate the synthetic examples: predictions are written into
# df_syn_new3["roberta_pred"] in place and the frame is returned (shown as cell output).
bert_pred(roberta_tweetner, roberta_tweetner_t, df_syn_new3, "tokens", "roberta_pred")

In [None]:
# Translate the predicted ids into string labels. tags_to_labels reads the global
# entity_dict at call time, so this relies on entity_dict still holding the mapping
# that matches the model's output ids.
df_syn_new3["roberta_pred_labels"] = df_syn_new3["roberta_pred"].apply(tags_to_labels)

In [None]:
from nervaluate import Evaluator

# entity-level evaluation of the string predictions against the generated labels
true = df_syn_new3["labels"].tolist()
pred = df_syn_new3["roberta_pred_labels"].tolist()

evaluator = Evaluator(
    true,
    pred,
    tags=["corporation", "creative_work", "event", "group", "location", "person", "product"],
    loader="list",
)

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [None]:
# "roberta_pred" holds ids in the model's label space, while the "tags" column was
# encoded with the notebook-level label_to_id, which is redefined for the synthetic tag
# set earlier in the notebook. To compare like with like, the gold string labels are
# encoded here with the model's own label2id (labels unknown to the model become -100
# and are skipped). assign() builds a temporary frame, so df_syn_new3 gains no column.
evaluate_ner_predictions(
    df_syn_new3.assign(
        tags_model_space=df_syn_new3["labels"].map(
            lambda names: [roberta_tweetner.config.label2id.get(name, -100) for name in names]
        )
    ),
    "tags_model_space",
    "roberta_pred",
)

## fine-tuning

In [None]:
from datasets import Dataset
# wrap the generated examples (including the prediction columns added above) in a Dataset
dataset_syn = Dataset.from_pandas(df_syn_new3)

In [None]:
# 80/20 split with a fixed seed, so training and evaluation sets are reproducible
# across kernel restarts
train_test_split = dataset_syn.train_test_split(test_size=0.2, seed=42)
train_dataset_syn = train_test_split['train']
test_dataset_syn = train_test_split['test']

In [None]:
# label name -> position in the currently active label_list
label_map = dict(zip(label_list, range(len(label_list))))

In [None]:
# works with synthetic data from above
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and align word-level tags to sub-tokens.

    Args:
        examples: batch with "tokens" (list of word lists) and "tags" (list of tag
            lists; entries may be numeric ids or string labels).

    Returns:
        The tokenizer output with an added "labels" entry. Only the first sub-token
        of each word carries the word's tag. Special tokens, padding, continuation
        sub-tokens, words without a tag and string labels missing from label_map
        all receive -100.
    """
    tokenized_inputs = roberta_tweetner_t(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []

        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id or word_id >= len(label):
                # special/padding token, continuation sub-token, or a word without a
                # tag (tag list shorter than the token list)
                label_ids.append(-100)
            else:
                tag = label[word_id]
                if isinstance(tag, str):
                    # converting string label to id; unknown labels are ignored (-100),
                    # mirroring convert_to_numeric
                    label_ids.append(label_map.get(tag, -100))
                else:
                    label_ids.append(tag)
            previous_word_id = word_id

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# batched=True hands the function whole batches, which is what
# tokenize_and_align_labels expects (it enumerates examples["tags"])
train_dataset_syn = train_dataset_syn.map(tokenize_and_align_labels, batched=True)
test_dataset_syn = test_dataset_syn.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Token-level weighted precision / recall / F1 for one evaluation pass.

    `p` unpacks into (model scores, label ids). Positions whose label id is
    -100 are dropped; the remaining ids are mapped through `id_to_label` and
    scored with sklearn's `classification_report`.
    """
    scores, gold = p
    pred_ids = np.argmax(scores, axis=2).flatten()
    gold_ids = gold.flatten()

    # keep only positions that carry a real label
    true_labels = [id_to_label[g] for g in gold_ids if g != -100]
    true_predictions = [id_to_label[q] for q, g in zip(pred_ids, gold_ids) if g != -100]

    weighted = classification_report(true_labels, true_predictions, output_dict=True)["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

In [None]:
training_args = TrainingArguments(
    output_dir="./roberta-tweetner-syndata",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=1e-5,
    per_device_train_batch_size=64, # adjust batch size depending on RAM
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2
)

trainer = Trainer(
    model=roberta_tweetner,
    args=training_args,
    train_dataset=train_dataset_syn,
    eval_dataset=test_dataset_syn,
    tokenizer=roberta_tweetner_t,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

# Finetuning RoBERTa-base with corpus

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification

roberta_t = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)

entity_dict = {
    0: "B-corporation",
    1: "B-event",
    2: "B-location",
    3: "B-person",
    4: "B-product",
    5: "I-corporation",
    6: "I-event",
    7: "I-location",
    8: "I-person",
    9: "I-product",
    10: "O"
}

label_list = list(entity_dict.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

roberta = AutoModelForTokenClassification.from_pretrained(
    "FacebookAI/roberta-base",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
import pandas as pd

df_corpus = pd.read_json("NER_corpus.json", orient="records")
df_corpus.rename(columns={"labels":"original_labels"}, inplace=True)
df_corpus.head()

In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_corpus)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words with `roberta_t` and align word tags to tokens.

    The first sub-token of each word receives that word's entry from
    `examples["tags"]`; special tokens and continuation sub-tokens get -100.

    Returns the tokenizer output with an additional "labels" entry.
    """
    encoding = roberta_t(
        examples["tokens"],
        is_split_into_words=True,
        max_length=128,
        padding="max_length",
        truncation=True
    )

    aligned = []
    for row_idx, word_tags in enumerate(examples["tags"]):
        row_labels = []
        last_word = None
        for current_word in encoding.word_ids(batch_index=row_idx):
            # a label is only emitted on the first token of a real word
            starts_word = current_word is not None and current_word != last_word
            row_labels.append(word_tags[current_word] if starts_word else -100)
            last_word = current_word
        aligned.append(row_labels)

    encoding["labels"] = aligned
    return encoding

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
train_test_split1 = tokenized_datasets.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1["train"]
test_dataset = train_test_split1["test"]

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2["train"]
val_dataset = train_test_split2["test"]

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Token-level weighted precision / recall / F1 for one evaluation pass.

    `p` unpacks into (model scores, label ids). Positions whose label id is
    -100 are dropped; the remaining ids are mapped through `id_to_label` and
    scored with sklearn's `classification_report`.
    """
    scores, gold = p
    pred_ids = np.argmax(scores, axis=2).flatten()
    gold_ids = gold.flatten()

    # keep only positions that carry a real label
    true_labels = [id_to_label[g] for g in gold_ids if g != -100]
    true_predictions = [id_to_label[q] for q, g in zip(pred_ids, gold_ids) if g != -100]

    weighted = classification_report(true_labels, true_predictions, output_dict=True)["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

training_args = TrainingArguments(
    output_dir="./ner-roberta-base",
    evaluation_strategy="epoch", # evaluating at every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust batch size depending on RAM
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=roberta,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=roberta_t,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate(test_dataset)

In [None]:
trainer.save_model("./roberta-base-after-corpus")

## applying the model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

roberta_corpus_t = AutoTokenizer.from_pretrained("./roberta-base-after-corpus")
roberta_corpus = AutoModelForTokenClassification.from_pretrained(
    "./roberta-base-after-corpus",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id)

In [None]:
df_corpus_test = test_dataset.to_pandas()
df_corpus_test.head()

In [None]:
bert_pred(roberta_corpus, roberta_corpus_t, df_corpus_test, "tokens", "roberta_pred")

In [None]:
from nervaluate import Evaluator

df_corpus_test["pred_labels"] = df_corpus_test["roberta_pred"].apply(lambda x: [id_to_label[i] for i in x])

true = df_corpus_test["original_labels"].values.tolist()
pred = df_corpus_test["pred_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [None]:
print(results_by_tag)

In [None]:
metrics = evaluate_ner_predictions(df_corpus_test, "tags", "roberta_pred")

# Finetuning covid twitter bert

## with TweetNER7-data

In [2]:
# fine-tuning the covid-twitter-bert model, since most other bert-based models
# have already been fine-tuned on this data with most likely better resources
# https://huggingface.co/datasets/tner/tweetner7#main-models
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name_covid = "digitalepidemiologylab/covid-twitter-bert-v2"
tokenizer_covid = AutoTokenizer.from_pretrained(model_name_covid)

#model_covid = AutoModelForTokenClassification.from_pretrained(model_name_covid)
#ner_pipeline_covid = pipeline("ner", model=model_covid, tokenizer=tokenizer_covid)

In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(tweetner7_big)

In [None]:
print(dataset["true_labels"][0])

In [None]:
# fixed seed so the train/test partition is the same on every run
# (matches the seeded splits used for the corpus experiments in this notebook)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
label_list = list(entity_dict.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

def encode_labels(ds):
    """Store the `label_to_id` id of every entry of ds['true_labels'] under ds['new_tags'] and return ds."""
    ds['new_tags'] = list(map(label_to_id.__getitem__, ds['true_labels']))
    return ds

train_dataset = train_dataset.map(encode_labels)
test_dataset = test_dataset.map(encode_labels)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words with `tokenizer_covid` and align word tags to tokens.

    The first sub-token of each word receives that word's entry from
    `examples["new_tags"]`; special tokens and continuation sub-tokens get -100.

    Returns the tokenizer output with an additional "labels" entry.
    """
    encoding = tokenizer_covid(
        examples["tokens"],
        is_split_into_words=True,
        max_length=128,
        padding="max_length",
        truncation=True
    )

    aligned = []
    for row_idx, word_tags in enumerate(examples["new_tags"]):
        row_labels = []
        last_word = None
        for current_word in encoding.word_ids(batch_index=row_idx):
            # a label is only emitted on the first token of a real word
            starts_word = current_word is not None and current_word != last_word
            row_labels.append(word_tags[current_word] if starts_word else -100)
            last_word = current_word
        aligned.append(row_labels)

    encoding["labels"] = aligned
    return encoding

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import AutoModelForTokenClassification

model_covid = AutoModelForTokenClassification.from_pretrained(
    model_name_covid,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
from transformers import DataCollatorForTokenClassification

# using a data collator for dynamic padding to solve error during tensor creation
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer_covid)

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Token-level weighted precision / recall / F1 for one evaluation pass.

    `p` unpacks into (model scores, label ids). Positions whose label id is
    -100 are dropped; the remaining ids are mapped through `id_to_label` and
    scored with sklearn's `classification_report`.
    """
    scores, gold = p
    pred_ids = np.argmax(scores, axis=2).flatten()
    gold_ids = gold.flatten()

    # keep only positions that carry a real label
    true_labels = [id_to_label[g] for g in gold_ids if g != -100]
    true_predictions = [id_to_label[q] for q, g in zip(pred_ids, gold_ids) if g != -100]

    weighted = classification_report(true_labels, true_predictions, output_dict=True)["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

training_args = TrainingArguments(
    output_dir="./ner-covid-twitter-bert",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4, # adjust batch size depending on RAM
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=model_covid,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer_covid,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
# saving the fine-tuned model
model_covid.save_pretrained("./models/ner-covid-twitter-bert_after_first_finetuning")
tokenizer_covid.save_pretrained("./models/ner-covid-twitter-bert_after_first_finetuning")

In [None]:
# loading the saved model and applying it
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer_covid_f = AutoTokenizer.from_pretrained("./models/ner-covid-twitter-bert_after_first_finetuning")
model_covid_f = AutoModelForTokenClassification.from_pretrained("./models/ner-covid-twitter-bert_after_first_finetuning")

In [None]:
from transformers import pipeline

# must be the directory the fine-tuned model and tokenizer were saved to above
# (the previous value lacked the "ner-" prefix and pointed at a path nothing was written to)
fine_tuned_model_path = "./models/ner-covid-twitter-bert_after_first_finetuning"
pipeline_covid_f = pipeline("ner", model=fine_tuned_model_path, tokenizer=fine_tuned_model_path)

In [None]:
tweetner7_big["pred_covidtwitterbert_f"] = tweetner7_big.apply(lambda x: pipeline_covid_f(str(x["tokens"])), axis=1)

In [None]:
tweetner7_big.head()

In [None]:
tweetner7_big["pred_labels_covidtwitterbert_f"] = tweetner7_big.apply(output_to_labellist, axis=1)

In [None]:
from nervaluate import Evaluator

true = tweetner7_big["true_labels"].values.tolist()
pred = tweetner7_big["pred_labels_covidtwitterbert_f"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "creative_work", "event", "group", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [None]:
print(results_by_tag["product"])

## with NER-corpus

In [2]:
from transformers import AutoTokenizer

tokenizer_covid = AutoTokenizer.from_pretrained("digitalepidemiologylab/covid-twitter-bert-v2")

In [3]:
from transformers import AutoModelForTokenClassification

entity_dict = {
    0: "B-corporation",
    1: "B-event",
    2: "B-location",
    3: "B-person",
    4: "B-product",
    5: "I-corporation",
    6: "I-event",
    7: "I-location",
    8: "I-person",
    9: "I-product",
    10: "O"
}

label_list = list(entity_dict.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

model_covid = AutoModelForTokenClassification.from_pretrained(
    "digitalepidemiologylab/covid-twitter-bert-v2",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [4]:
import pandas as pd

df_corpus = pd.read_json("NER_corpus.json", orient="records")
df_corpus.rename(columns={"labels":"original_labels"}, inplace=True)
df_corpus.head()

In [5]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_corpus)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words with `tokenizer_covid` and align word tags to tokens.

    The first sub-token of each word receives that word's entry from
    `examples["tags"]`; special tokens and continuation sub-tokens get -100.

    Returns the tokenizer output with an additional "labels" entry.
    """
    encoding = tokenizer_covid(
        examples["tokens"],
        is_split_into_words=True,
        max_length=128,
        padding="max_length",
        truncation=True
    )

    aligned = []
    for row_idx, word_tags in enumerate(examples["tags"]):
        row_labels = []
        last_word = None
        for current_word in encoding.word_ids(batch_index=row_idx):
            # a label is only emitted on the first token of a real word
            starts_word = current_word is not None and current_word != last_word
            row_labels.append(word_tags[current_word] if starts_word else -100)
            last_word = current_word
        aligned.append(row_labels)

    encoding["labels"] = aligned
    return encoding

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [6]:
train_test_split1 = tokenized_datasets.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1["train"]
test_dataset = train_test_split1["test"]

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2["train"]
val_dataset = train_test_split2["test"]

In [7]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Token-level weighted precision / recall / F1 for one evaluation pass.

    `p` unpacks into (model scores, label ids). Positions whose label id is
    -100 are dropped; the remaining ids are mapped through `id_to_label` and
    scored with sklearn's `classification_report`.
    """
    scores, gold = p
    pred_ids = np.argmax(scores, axis=2).flatten()
    gold_ids = gold.flatten()

    # keep only positions that carry a real label
    true_labels = [id_to_label[g] for g in gold_ids if g != -100]
    true_predictions = [id_to_label[q] for q, g in zip(pred_ids, gold_ids) if g != -100]

    weighted = classification_report(true_labels, true_predictions, output_dict=True)["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

training_args = TrainingArguments(
    output_dir="./ner-covid-twitter-bert",
    evaluation_strategy="epoch", # evaluating at every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust batch size depending on RAM
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=model_covid,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer_covid,
    compute_metrics=compute_metrics
)

In [8]:
trainer.train()

In [9]:
trainer.evaluate(test_dataset)

In [10]:
trainer.save_model("./ner-covid-twitter-bert_after_corpus")

In [11]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

bert_covid_corpus_t = AutoTokenizer.from_pretrained("./ner-covid-twitter-bert_after_corpus")
bert_covid_corpus = AutoModelForTokenClassification.from_pretrained(
    "./ner-covid-twitter-bert_after_corpus",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id)

In [12]:
corpus_test_df = test_dataset.to_pandas()
corpus_test_df.head()

In [16]:
import numpy as np
import torch

param_device = next(bert_covid_corpus.parameters()).device

# prediction function with max_length and error exceptions to handle cuda runtime error but predicting on GPU
def _prepare_row_input(text):
    """Normalise one cell value to (tokenizer_input, is_pre_split).

    tokenizer_input is None when the row is empty (empty array/list or a
    blank string) and should receive an empty prediction list.
    """
    if isinstance(text, np.ndarray):
        if text.dtype.kind == 'U' or text.dtype.kind == 'S':
            words = text.tolist()
        else:
            words = [str(x) for x in text]
        return (words if words else None), True
    if isinstance(text, list):
        return (text if text else None), True
    if isinstance(text, str):
        return (text if text.strip() else None), False
    # any other scalar is predicted on its string representation
    return str(text), False


def bert_pred(model, tokenizer, df, text_col, output_col, max_length=128, batch_size=32):
    """Run a token-classification model row by row and store predicted label ids in `df`.

    Parameters
    ----------
    model : token-classification model; moved to CUDA when available, else CPU.
    tokenizer : tokenizer matching `model`; its output must provide `word_ids()`
        for word-level (list / ndarray) rows.
    df : DataFrame to read from; it is modified in place (column `output_col` is set).
    text_col : column holding either a list / ndarray of words or a raw string.
    output_col : name of the column that receives the per-row prediction lists.
    max_length : every row is truncated / padded to this many tokens.
    batch_size : accepted for interface compatibility; rows are processed one at a time.

    Returns
    -------
    The same `df`. For word-level rows the list holds one label id per word
    (prediction of the word's first sub-token); for string rows it holds one
    id per token position, including special and padding positions. Empty
    rows and rows that raise get an empty list.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Using device: {device}")

    # printing model config info
    if hasattr(model, 'config'):
        print(f"Model has {model.config.num_labels} labels")

    model.eval()
    all_predictions = []

    for i, row in df.iterrows():
        try:
            tok_input, pre_split = _prepare_row_input(row[text_col])
            if tok_input is None:
                all_predictions.append([])
                continue

            tok_kwargs = {
                "return_tensors": "pt",
                "max_length": max_length,
                "truncation": True,
                "padding": "max_length",
            }
            if pre_split:
                tok_kwargs["is_split_into_words"] = True
            encoded = tokenizer(tok_input, **tok_kwargs)

            # Take word ids from the very encoding that is fed to the model, so the
            # alignment always matches the input (no second tokenization pass).
            word_ids = encoded.word_ids() if pre_split else None

            model_inputs = {k: v.to(device) for k, v in encoded.items()}

            if i == 0:
                print(f"Input shape: {model_inputs['input_ids'].shape}")

            with torch.no_grad():
                outputs = model(**model_inputs)
            preds = torch.argmax(outputs.logits, dim=-1)[0].cpu().tolist()

            if word_ids is None:
                row_preds = preds
            else:
                # keep the prediction of the first sub-token of every word
                row_preds = []
                prev_word_id = None
                for word_id, pred in zip(word_ids, preds):
                    if word_id is None:
                        continue
                    if word_id != prev_word_id:
                        row_preds.append(pred)
                    prev_word_id = word_id

            all_predictions.append(row_preds)

        except Exception as e:
            print(f"Error in row {i}: {e}")
            # adding empty predictions for failed rows so the column stays row-aligned
            all_predictions.append([])
            continue

    df[output_col] = all_predictions
    return df

In [17]:
bert_pred(bert_covid_corpus, bert_covid_corpus_t, corpus_test_df, "tokens", "covid_bert_pred")

In [18]:
corpus_test_df["covid_bert_pred"].explode().value_counts()

In [19]:
from nervaluate import Evaluator

corpus_test_df["pred_labels"] = corpus_test_df["covid_bert_pred"].apply(lambda x: [id_to_label[i] for i in x])

true = corpus_test_df["original_labels"].values.tolist()
pred = corpus_test_df["pred_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [20]:
print(results_by_tag)

In [23]:
metrics = evaluate_ner_predictions(corpus_test_df, "tags", "covid_bert_pred")

### applying model to case study data

In [24]:
import pandas as pd

df_dell = pd.read_json("dell_cs_processed.json", orient="records")
df_dell.head()

In [25]:
bert_pred(bert_covid_corpus, bert_covid_corpus_t, df_dell, "tokens", "covid_bert_pred")

In [26]:
df_dell.to_json("dell_cs_after_ner.json", orient="records")

# fine-tuning BERT-base with tweetner7-data

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

#using cased BERT for NER because it may be advantageous (e.g. for persons, companies)
# google-bert/bert-base-cased
bert_base_cased_t = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

bert_base_cased = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
# prerequisite: the cells from the "loading and applying" and synthetic-data sections must have been run first (up to the creation of train_dataset and test_dataset)
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words with `bert_base_cased_t` and align word tags to tokens.

    The first sub-token of each word receives that word's entry from
    `examples["tags"]`; special tokens and continuation sub-tokens get -100.

    Returns the tokenizer output with an additional "labels" entry.
    """
    encoding = bert_base_cased_t(
        examples["tokens"],
        is_split_into_words=True,
        max_length=128,
        padding="max_length",
        truncation=True
    )

    aligned = []
    for row_idx, word_tags in enumerate(examples["tags"]): # or "new_tags"
        row_labels = []
        last_word = None
        for current_word in encoding.word_ids(batch_index=row_idx):
            # a label is only emitted on the first token of a real word
            starts_word = current_word is not None and current_word != last_word
            row_labels.append(word_tags[current_word] if starts_word else -100)
            last_word = current_word
        aligned.append(row_labels)

    encoding["labels"] = aligned
    return encoding

In [None]:
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Token-level weighted precision / recall / F1 for one evaluation pass.

    `p` unpacks into (model scores, label ids). Positions whose label id is
    -100 are dropped; the remaining ids are mapped through `id_to_label` and
    scored with sklearn's `classification_report`.
    """
    scores, gold = p
    pred_ids = np.argmax(scores, axis=2).flatten()
    gold_ids = gold.flatten()

    # keep only positions that carry a real label
    true_labels = [id_to_label[g] for g in gold_ids if g != -100]
    true_predictions = [id_to_label[q] for q, g in zip(pred_ids, gold_ids) if g != -100]

    weighted = classification_report(true_labels, true_predictions, output_dict=True)["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

In [None]:
training_args = TrainingArguments(
    output_dir="./ner-bert-base-cased",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=64, # adjust batch size depending on RAM
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2
)

trainer = Trainer(
    model=bert_base_cased,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=bert_base_cased_t,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
bert_base_cased.save_pretrained("bert-base-cased-ner-tweetner7")
bert_base_cased_t.save_pretrained("bert-base-cased-ner-tweetner7")

In [None]:
bert_c_ft_t = AutoTokenizer.from_pretrained("bert-base-cased-ner-tweetner7")

bert_c_ft = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased-ner-tweetner7",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
bert_pred(bert_c_ft, bert_c_ft_t, tweetner7_test20, "tokens", "bert_pred")

In [None]:
metrics = evaluate_ner_predictions(tweetner7_test20, "tags", "bert_pred")
print(metrics)

## 2nd fine-tuning with tweetner7-bigger

In [None]:
# building a larger training pool: tweetner7_big plus the two validation splits
tweetner7_validation20 = ds["validation_2020"].to_pandas()
tweetner7_validation21 = ds["validation_2021"].to_pandas()

# leaving "test_2020"-part out of training data to test
# NOTE(review): tweetner7_test20 IS part of the concat below, so it is not actually
# held out of tweetner7_bigger — confirm which behaviour is intended
tweetner7_test20 = ds["test_2020"].to_pandas()

tweetner7_bigger = pd.concat([tweetner7_big, tweetner7_validation20, tweetner7_validation21, tweetner7_test20])
tweetner7_bigger.reset_index(drop=True, inplace=True)
tweetner7_bigger.dropna(inplace=True)

#tweetner7_test20.reset_index(drop=True, inplace=True)
#tweetner7_test20.dropna(inplace=True)

# number of rows left after dropping rows with missing values
len(tweetner7_bigger)

In [None]:
from datasets import Dataset

#train_dataset_bigger = Dataset.from_pandas(tweetner7_bigger)
#test_dataset_bigger = Dataset.from_pandas(tweetner7_test20)

dataset = Dataset.from_pandas(tweetner7_bigger)

# setting seed so that validation set is deterministic
train_test_split = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset_bigger = train_test_split['train']
test_dataset_bigger = train_test_split['test']

In [None]:
train_dataset_bigger = train_dataset_bigger.map(tokenize_and_align_labels, batched=True)
test_dataset_bigger = test_dataset_bigger.map(tokenize_and_align_labels, batched=True)

In [None]:
training_args = TrainingArguments(
    output_dir="./ner-bert-base-cased",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=1e-5,
    per_device_train_batch_size=64, # adjust batch size depending on RAM
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2
)

trainer = Trainer(
    model=bert_base_cased,
    args=training_args,
    train_dataset=train_dataset_bigger,
    eval_dataset=test_dataset_bigger,
    tokenizer=bert_base_cased_t,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
bert_base_cased.save_pretrained("bert-base-cased-ner-tweetner7-bigger")
bert_base_cased_t.save_pretrained("bert-base-cased-ner-tweetner7-bigger")

In [None]:
bert_c_ft_bigger_t = AutoTokenizer.from_pretrained("bert-base-cased-ner-tweetner7-bigger")

bert_c_ft_bigger = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased-ner-tweetner7-bigger",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
test_dataset_bigger_df = test_dataset_bigger.to_pandas()

In [None]:
bert_pred(bert_c_ft_bigger, bert_c_ft_bigger_t, test_dataset_bigger_df, "tokens", "bert_pred")

In [None]:
metrics = evaluate_ner_predictions(test_dataset_bigger_df, "tags", "bert_pred")
print(metrics)

## applying model on unlabeled data and analyzing results

In [None]:
# predicting on already processed Tweets-Bigtech-data
df_bigtech = pd.read_csv("tweets_bigtech_10ksample.csv")
df_bigtech.head()

In [None]:
df_bigtech_sm = df_bigtech.sample(n=1000, random_state=42)
df_bigtech_sm.reset_index(drop=True, inplace=True)

In [None]:
df_bigtech_sm["tokens"] = df_bigtech_sm.apply(lambda x: x["text"].split(), axis=1)

In [None]:
bert_pred(bert_c_ft_bigger, bert_c_ft_bigger_t, df_bigtech_sm, "tokens", "bert_pred")

In [None]:
df_bigtech_sm["pred_labels"] = df_bigtech_sm["bert_pred"].apply(tags_to_labels)

In [None]:
df_bigtech_sm.head()

In [None]:
# show the tweet text next to its predicted labels for the rows from position 900 onwards
for _, tweet_row in df_bigtech_sm[900:].iterrows():
    print(tweet_row["text"])
    print(tweet_row["pred_labels"])

# fine-tuning BERT-base with NER-corpus

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# reduced entity dict fitting the corpus
entity_dict_sm = {
    0: "B-corporation",
    1: "B-event",
    2: "B-location",
    3: "B-person",
    4: "B-product",
    5: "I-corporation",
    6: "I-event",
    7: "I-location",
    8: "I-person",
    9: "I-product",
    10: "O"
}

label_list = list(entity_dict_sm.values())
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

bert_base_cased_t = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

bert_base_cased = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
import pandas as pd

df_nerc = pd.read_json("NER_corpus.json", orient="records")
df_nerc.head()

In [None]:
len(df_nerc)

In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_nerc)

In [None]:
#label_map = {tag: id for id, tag in enumerate(label_list)}

In [None]:
# version of function that worked with synthetic data
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words with `bert_base_cased_t` and align word tags to tokens.

    For every example, the first sub-token of each word receives that word's
    tag from `examples["tags"]`; special tokens and continuation sub-tokens
    receive -100. String tags are converted with this section's
    `label_to_id` mapping, so the ids always match the label set the model
    in this section was configured with (instead of a `label_map` left over
    from an earlier section of the notebook).

    Returns the tokenizer output with an additional "labels" entry.
    """
    tokenized_inputs = bert_base_cased_t(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []

        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                # special token
                label_ids.append(-100)
            elif word_id != previous_word_id:
                # first token of a word carries the word's label
                tag = label[word_id]
                label_ids.append(label_to_id[tag] if isinstance(tag, str) else tag)
            else:
                # continuation sub-token
                label_ids.append(-100)
            previous_word_id = word_id

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Token-level weighted precision / recall / F1 for the Trainer.

    `p` unpacks into (raw model scores, gold label ids). The predicted class
    is the argmax over the last axis; every position whose gold label is -100
    is excluded before scoring. Ids are translated to label strings with
    `id_to_label` and scored with sklearn's `classification_report`; the
    "weighted avg" row is returned as a dict with keys "precision", "recall"
    and "f1".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # flatten once and build a mask of the positions that carry a real label
    flat_gold = gold_ids.flatten()
    flat_predicted = predicted_ids.flatten()
    keep = flat_gold != -100

    true_labels = [id_to_label[gold] for gold in flat_gold[keep]]
    true_predictions = [id_to_label[pred] for pred in flat_predicted[keep]]

    # calculating metrics
    weighted = classification_report(
        true_labels, true_predictions, output_dict=True
    )["weighted avg"]

    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"],
    }

In [None]:
# Training configuration for fine-tuning on the NER corpus: 2 epochs, batch
# size 32 for both training and evaluation, checkpoints written to
# ./bert-base-cased-ner-corpus (at most 2 kept), logs to ./logs every 10 steps.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./bert-base-cased-ner-corpus",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust batch size depending on RAM
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

# Trainer wiring: trains on train_dataset, evaluates on val_dataset, and
# reports the metrics returned by compute_metrics.
# NOTE(review): newer transformers releases replace the `tokenizer=` argument
# with `processing_class=` - confirm against the installed version.
trainer = Trainer(
    model=bert_base_cased,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bert_base_cased_t,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model; with evaluation_strategy="epoch" the validation set is
# scored at the end of every epoch.
trainer.train()

In [None]:
# Score the fine-tuned model on the held-out test split (uses compute_metrics).
trainer.evaluate(test_dataset)

In [None]:
# Persist the model weights/config and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained in the next section.
bert_base_cased.save_pretrained("./bert-base-cased-ner-corpus")
bert_base_cased_t.save_pretrained("./bert-base-cased-ner-corpus")

## applying model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Reload the fine-tuned tokenizer and model from the directory saved above.
bert_corpus_t = AutoTokenizer.from_pretrained("./bert-base-cased-ner-corpus")

# The label mappings are passed again explicitly so the loaded model uses the
# same id <-> label mapping as the one used during training.
bert_corpus = AutoModelForTokenClassification.from_pretrained(
    "./bert-base-cased-ner-corpus",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
# Convert the held-out test split to a DataFrame for row-wise prediction and
# evaluation. test_dataset was split from tokenized_dataset, so the frame also
# carries the tokenizer output and the aligned "labels" column.
df_corpus_test = test_dataset.to_pandas()
df_corpus_test.head()

In [None]:
# Translate each row's gold tag ids into label strings.
# assumes "tags" holds integer ids that are keys of id_to_label - TODO confirm
df_corpus_test["true_labels"] = df_corpus_test["tags"].apply(lambda x: [id_to_label[tag] for tag in x])

In [None]:
# version of function from WNUT data
# bert_pred is defined earlier in the notebook; presumably it runs the model
# on the "tokens" column and stores the predicted tag ids in a new "bert_pred"
# column of the frame in place (that column is read in the next cells) -
# verify against its definition.
bert_pred(bert_corpus, bert_corpus_t, df_corpus_test, "tokens", "bert_pred")

In [None]:
from nervaluate import Evaluator

# Translate predicted tag ids into label strings so they are comparable with
# the "true_labels" column.
df_corpus_test["bert_pred_labels"] = df_corpus_test["bert_pred"].apply(lambda x: [id_to_label[i] for i in x])

# nervaluate with loader="list" takes one list of label strings per example.
true = df_corpus_test["true_labels"].values.tolist()
pred = df_corpus_test["bert_pred_labels"].values.tolist()

# Entity-level evaluation restricted to the five entity types of this corpus.
evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

# NOTE(review): the four-value unpacking depends on the installed nervaluate
# version (other releases return a different number of values) - confirm.
results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [None]:
# Per-entity-type breakdown of the nervaluate results computed above.
print(results_by_tag)

In [None]:
# evaluate_ner_predictions is defined earlier in the notebook; presumably it
# compares the gold "tags" column with the "bert_pred" column - verify against
# its definition.
metrics = evaluate_ner_predictions(df_corpus_test, "tags", "bert_pred")

In [None]:
# Load the big-tech tweets the model will be applied to (records-oriented
# JSON, path relative to the working directory) and preview the first rows.
df_tbt_app = pd.read_json("tweets_bigtech_10k_application.json", orient="records")
df_tbt_app.head()

In [None]:
# applying model to unlabeled data for 2nd run through visualizations
# Predictions are requested for the "tokens" column and named "bert_pred";
# presumably bert_pred adds that column to df_tbt_app in place - verify.
bert_pred(bert_corpus, bert_corpus_t, df_tbt_app, "tokens", "bert_pred")

In [None]:
# Persist the tweets frame after the NER step as records-oriented JSON for the
# downstream visualizations.
df_tbt_app.to_json("tweets_bigtech_10k_application_afterNER.json", orient="records")

In [None]:
# Eyeball the first 100 tweets next to their predicted tags.
# NOTE(review): this reads df_bigtech_sm, which is not defined in this
# section, while the prediction just above was written to df_tbt_app. Confirm
# df_bigtech_sm is the intended frame and not leftover kernel state from an
# earlier run.
for index, row in df_bigtech_sm[:100].iterrows():
    print(row["text"])
    print(row["bert_pred"])

In [None]:
# applying model to dell data to create time plot
# Load the pre-processed Dell sentiment data (records-oriented JSON) and
# preview the first rows.
df_dell = pd.read_json("sentiment_dell_processed.json", orient="records")
df_dell.head()

In [None]:
# Run the fine-tuned model over the "tokens" column; the predictions are named
# "ner_tags" here (that column is read in the next cell). Presumably bert_pred
# adds it to df_dell in place - verify against its definition.
bert_pred(bert_corpus, bert_corpus_t, df_dell, "tokens", "ner_tags")

In [None]:
# Translate the predicted tag ids into label strings.
# assumes "ner_tags" holds ids that are keys of id_to_label - TODO confirm
df_dell["ner_labels"] = df_dell["ner_tags"].apply(lambda x: [id_to_label[tag] for tag in x])

In [None]:
# Persist the Dell frame, now including the NER columns, as records-oriented
# JSON.
df_dell.to_json("dell_afterNER.json", orient="records")