In [1]:
#!pip install huggingface_hub transformers torch accelerate datasets peft nervaluate

In [2]:
#!pip install -U bitsandbytes

In [None]:
import os

from huggingface_hub import login

# Read the Hugging Face access token from the HF_TOKEN environment variable so
# that no credential is ever pasted into (and saved with) the notebook.
# When HF_TOKEN is unset, token=None is passed and login() asks for the token
# interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from transformers import BitsAndBytesConfig

# bitsandbytes settings handed to from_pretrained() further down:
# 4-bit loading, NF4 quantization type, double quantization enabled,
# bfloat16 as compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_name = "meta-llama/Llama-2-7b-hf"
llama2_7b_t = AutoTokenizer.from_pretrained(model_name)

In [None]:
# A pad token is required for batch sizes > 1; the EOS token is reused for it.
llama2_7b_t.pad_token = llama2_7b_t.eos_token
# The matching model-side setting (model.config.pad_token_id) is applied in a
# later cell, once the model object exists.

In [None]:
import pandas as pd
from datasets import load_dataset

# Download/load the tweetner7 dataset; the splits are accessed by name below.
ds = load_dataset("tner/tweetner7")

In [None]:
import pandas as pd

# Convert the "train_all" and "test_2021" splits to pandas and stack them.
tweetner7_train = ds["train_all"].to_pandas()
tweetner7_test = ds["test_2021"].to_pandas()

# No ignore_index is passed, so each part keeps its own row index
# (presumably leading to repeated index values — confirm if the index matters).
tweetner7_big = pd.concat([tweetner7_train, tweetner7_test])
len(tweetner7_big)

In [None]:
# Add both validation splits and the 2020 test split on top of tweetner7_big.
tweetner7_validation20 = ds["validation_2020"].to_pandas()
tweetner7_validation21 = ds["validation_2021"].to_pandas()
tweetner7_test20 = ds["test_2020"].to_pandas()

# NOTE(review): after this step every official split is pooled into one frame;
# the train/test split used later is a random split of this pool.
tweetner7_bigger = pd.concat([tweetner7_big, tweetner7_validation20, tweetner7_validation21, tweetner7_test20])
len(tweetner7_bigger)

In [None]:
# Integer tag id -> BIO label string; ids follow the position in this list.
entity_dict_tweetner = dict(enumerate([
    "B-corporation",
    "B-creative_work",
    "B-event",
    "B-group",
    "B-location",
    "B-person",
    "B-product",
    "I-corporation",
    "I-creative_work",
    "I-event",
    "I-group",
    "I-location",
    "I-person",
    "I-product",
    "O",
]))


def tags_to_labels(col):
    """Translate a sequence of integer tag ids into BIO label strings.

    Args:
        col: iterable of tag ids that are keys of ``entity_dict_tweetner``.

    Returns:
        A list with the label string for every id, in input order.

    Raises:
        KeyError: if an id is not present in ``entity_dict_tweetner``.
    """
    return [entity_dict_tweetner[tag_id] for tag_id in col]

# fine-tuning with tweetner7

In [None]:
from datasets import Dataset

# Wrap the pooled pandas frame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(tweetner7_bigger)

# 80/20 random split. The seed is fixed so that the held-out set is the same on
# every run; without it, a saved adapter could later be evaluated on rows it
# was trained on in an earlier session.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
# Label vocabulary for the tweetner7 model, derived from entity_dict_tweetner:
# the ordered label strings plus the two lookup directions.
label_list = [*entity_dict_tweetner.values()]
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = {idx: name for name, idx in label_to_id.items()}

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and build token-level labels for a batch.

    Args:
        examples: batch mapping with "tokens" (list of word lists) and "tags"
            (list of per-word label-id lists).

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        "labels" entry: the word's tag on the first sub-token of each word and
        -100 on special tokens and on continuation sub-tokens.
    """
    encoded = llama2_7b_t(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    aligned = []
    for row_idx, word_tags in enumerate(examples["tags"]):
        row_labels = []
        last_word = None
        for wid in encoded.word_ids(batch_index=row_idx):
            # Only the first sub-token of a real word carries a label.
            starts_word = wid is not None and wid != last_word
            row_labels.append(word_tags[wid] if starts_word else -100)
            last_word = wid
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Tokenize both splits in batches; this adds the tokenizer outputs and the
# aligned "labels" column to each dataset.
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import AutoModelForTokenClassification

# Load the base checkpoint with a token-classification head, quantized with
# quantization_config. The head size and the id<->label maps come from the
# label vocabulary defined above.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
# Model-side counterpart of the tokenizer setting: pad with the EOS token id.
model.config.pad_token_id = model.config.eos_token_id

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings; adapters are attached to the q_proj and v_proj modules.
lora_config = LoraConfig(
    r=8,  # rank of the low-rank adaptation
    lora_alpha=32,  # scaling factor
    target_modules=["q_proj", "v_proj"],  # target modules for LoRA
    lora_dropout=0.1,  # dropout rate
    bias="none",  # bias handling
    task_type="TOKEN_CLS" # NER is a form of token classification
)

# From here on `model` is the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Token-level weighted precision/recall/F1 for the Trainer.

    Args:
        p: pair ``(predictions, labels)``; predictions are per-class scores
            (argmax is taken over axis 2), labels are integer ids where -100
            marks positions to ignore.

    Returns:
        Dict with "precision", "recall" and "f1", taken from the
        "weighted avg" row of sklearn's classification report.
    """
    scores, gold = p
    flat_pred = np.argmax(scores, axis=2).flatten()
    flat_gold = gold.flatten()

    # Keep only positions that carry a real label.
    pairs = [(g, q) for g, q in zip(flat_gold, flat_pred) if g != -100]
    true_labels = [id_to_label[g] for g, _ in pairs]
    true_predictions = [id_to_label[q] for _, q in pairs]

    weighted = classification_report(true_labels, true_predictions, output_dict=True)["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

# Training configuration for the tweetner7 run. Checkpoints go to
# ./ner-llama-2-7b and logs to ./logs.
training_args = TrainingArguments(
    output_dir="./ner-llama-2-7b",
    evaluation_strategy="epoch",   # evaluating at every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=128, # adjust batch size depending on resources,
    per_device_eval_batch_size=128,  # high batch size because NER needs less RAM than SA
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500, # higher number to save time
    save_total_limit=2
)

# Trainer for the tweetner7 run; the random 20% split serves as the eval set
# and compute_metrics supplies precision/recall/F1.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=llama2_7b_t,
    compute_metrics=compute_metrics
)

In [None]:
# with 2 epochs, tweetner7-big
# NOTE(review): the number of epochs is taken from training_args
# (num_train_epochs=4 as written in this notebook); presumably that value was
# edited between runs — confirm before relying on the "2 epochs" label.
trainer.train()

In [None]:
# Save the model (PEFT-wrapped at this point) and the tokenizer to one folder.
model.save_pretrained("ner-llama-2-7b-finetuned-tweetner7")
llama2_7b_t.save_pretrained("ner-llama-2-7b-finetuned-tweetner7")

In [None]:
# Evaluate on the random 20% hold-out split.
trainer.evaluate(test_dataset)

In [None]:
# with 3 epochs, tweetner7-big
# NOTE(review): this reuses the same `trainer`/`model` objects as the earlier
# trainer.train() cell, so when cells are run top to bottom it presumably
# continues from already fine-tuned weights; the epoch count again comes from
# training_args.num_train_epochs — confirm.
trainer.train()

In [None]:
# Save model and tokenizer of the "3 epochs" run to their own folder.
model.save_pretrained("ner-llama-2-7b-finetuned-tweetner7-3")
llama2_7b_t.save_pretrained("ner-llama-2-7b-finetuned-tweetner7-3")

In [None]:
# Evaluate the current model state on the hold-out split.
trainer.evaluate(test_dataset)

In [None]:
# with 4 epochs, tweetner7bigger
# NOTE(review): same `trainer` object as the earlier training cells; the run
# length is whatever training_args.num_train_epochs holds when this executes.
trainer.train()

In [None]:
# Save model and tokenizer of the "4 epochs" run; this folder is the one that
# is reloaded in the "applying model" section.
model.save_pretrained("ner-llama-2-7b-finetuned-tweetner7-4")
llama2_7b_t.save_pretrained("ner-llama-2-7b-finetuned-tweetner7-4")

In [None]:
# Evaluate the current model state on the hold-out split.
trainer.evaluate(test_dataset)

## applying model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from transformers import BitsAndBytesConfig
from peft import PeftModel

# first loading base model, then reloading fine-tuned model on top of it with PEFT

# Same 4-bit NF4 settings as used for training.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Fresh quantized base model with a token-classification head.
# NOTE(review): label_list / id_to_label / label_to_id are module-level names
# that are reassigned in the corpus section further down; this cell needs the
# tweetner7 versions to be the live ones when it runs.
base_model = AutoModelForTokenClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

# Attach the adapter saved after the "4 epochs" run to the base model.
model_ft = PeftModel.from_pretrained(
    base_model,
    "ner-llama-2-7b-finetuned-tweetner7-4"
)

# The tokenizer was saved into the same folder as the adapter.
tokenizer_ft = AutoTokenizer.from_pretrained("ner-llama-2-7b-finetuned-tweetner7-4")

In [None]:
# Materialize the (already tokenized) hold-out split as a pandas frame.
test_df = pd.DataFrame(test_dataset)
test_df.head()

In [16]:
def llama_pred(model, tokenizer, df, text_col, tag_col=None, max_length=128):
    """Predict one label id per word for every row of ``df``.

    Each row's pre-split words are tokenized exactly as during training
    (padding/truncation to ``max_length``); the prediction on the first
    sub-token of a word is used as that word's prediction.

    Args:
        model: token-classification model; called with the tokenizer's tensors
            and expected to return an object with ``.logits``.
        tokenizer: fast tokenizer whose output provides ``word_ids``.
        df: pandas DataFrame; it is modified in place.
        text_col: name of the column holding a list (or numpy array) of words.
        tag_col: unused; kept so existing calls keep working.
        max_length: padded/truncated sequence length in sub-tokens.

    Returns:
        The same ``df`` object with a new "llama_ft_prediction" column holding
        a list of predicted label ids per row. Words cut off by truncation get
        no prediction, so a list can be shorter than the row's word list.
    """
    # setting padding tokens
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

    # The model stays on the device(s) it was loaded to; moving it to
    # `model.device` would at best be a no-op and is not wanted for quantized
    # or device-mapped models. Only the evaluation mode is switched on.
    model.eval()

    batch_size = 32
    all_predictions = []

    for start_idx in range(0, len(df), batch_size):
        # Rows may hold numpy arrays (e.g. after Dataset.to_pandas()); the
        # tokenizer is given plain lists.
        batch_tokens = [
            tokens.tolist() if hasattr(tokens, "tolist") else tokens
            for tokens in df[text_col].iloc[start_idx:start_idx + batch_size]
        ]

        # tokenizing with the same parameters as during training
        encodings = tokenizer(
            batch_tokens,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            is_split_into_words=True,  # very important for token classification
            return_tensors="pt"
        )

        batch_inputs = {key: value.to(model.device) for key, value in encodings.items()}

        with torch.no_grad():
            outputs = model(**batch_inputs)
        batch_predictions = torch.argmax(outputs.logits, dim=-1).tolist()

        for j, preds in enumerate(batch_predictions):
            # word_ids of the padded/truncated encoding itself: position k here
            # is position k of `preds`, whichever side the padding is on.
            word_ids = encodings.word_ids(batch_index=j)
            previous_word_id = None
            word_preds = []

            for word_id, pred in zip(word_ids, preds):
                if word_id is None:  # special/padding tokens
                    continue
                if word_id != previous_word_id:  # first token of a word
                    word_preds.append(pred)
                previous_word_id = word_id

            all_predictions.append(word_preds)

    df["llama_ft_prediction"] = all_predictions
    return df

In [None]:
# Add word-level predictions ("llama_ft_prediction") to the hold-out frame.
test_df = llama_pred(model_ft, tokenizer_ft, test_df, "tokens")

In [None]:
# Convert gold and predicted tag ids to BIO label strings.
test_df["true_labels"] = test_df["tags"].apply(tags_to_labels)
test_df["pred_labels"] = test_df["llama_ft_prediction"].apply(tags_to_labels)
test_df.head()

In [None]:
from nervaluate import Evaluator

# Entity-level evaluation with nervaluate: one list of BIO strings per
# document for gold and for predictions.
true = test_df["true_labels"].values.tolist()
pred = test_df["pred_labels"].values.tolist()

# `tags` lists the entity types without their B-/I- prefix.
evaluator = Evaluator(true, pred, tags=["corporation", "creative_work", "event", "group", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [35]:
from sklearn.metrics import precision_score, recall_score, classification_report

# function returns dictionary containing precision, recall and f1-score
def evaluate_ner_predictions(df, true_col="tags", pred_col="llama_ft_prediction"):
    """Print and return token-level weighted precision, recall and F1.

    Args:
        df: DataFrame with one sequence of gold label ids and one sequence of
            predicted label ids per row.
        true_col: column holding the gold sequences.
        pred_col: column holding the predicted sequences.

    Returns:
        Dict with "precision", "recall" and "f1" (all support-weighted
        averages over the classes, as computed by scikit-learn).
    """
    # Local import: the import line of this cell does not bring in f1_score.
    from sklearn.metrics import f1_score

    y_true_flat = []
    y_pred_flat = []

    for true_seq, pred_seq in zip(df[true_col], df[pred_col]):
        # zip() pairs gold and predicted labels position by position and stops
        # at the shorter sequence; gold entries equal to -100 are skipped.
        for true_label, pred_label in zip(true_seq, pred_seq):
            if true_label != -100:
                y_true_flat.append(true_label)
                y_pred_flat.append(pred_label)

    precision = precision_score(y_true_flat, y_pred_flat, average='weighted')
    recall = recall_score(y_true_flat, y_pred_flat, average='weighted')
    # Weighted F1 is the support-weighted mean of the per-class F1 scores. It is
    # NOT the harmonic mean of weighted precision and weighted recall, so it is
    # taken from scikit-learn directly and matches the report printed below.
    f1 = f1_score(y_true_flat, y_pred_flat, average='weighted')

    print(classification_report(y_true_flat, y_pred_flat))

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
# Token-level metrics on the hold-out frame (default columns "tags" and
# "llama_ft_prediction").
metrics = evaluate_ner_predictions(test_df)

## predicting on new data

In [None]:
# Unlabelled tweets to tag; the path is relative to the notebook's working
# directory.
df_tweets_bigtech = pd.read_csv("tweets_bigtech_10ksample.csv")
df_tweets_bigtech.head()

In [None]:
# Whitespace-split each tweet into words, then predict one label id per word.
# NOTE(review): x.split() raises for non-string values (e.g. missing text) —
# assumes the "text" column has no NaN; confirm.
df_tweets_bigtech["tokens"] = df_tweets_bigtech["text"].apply(lambda x: x.split())
df_tweets_bigtech = llama_pred(model_ft, tokenizer_ft, df_tweets_bigtech, "tokens")

In [None]:
# Map predicted ids to BIO label strings (tweetner7 label set).
df_tweets_bigtech["pred_labels"] = df_tweets_bigtech["llama_ft_prediction"].apply(tags_to_labels)
df_tweets_bigtech.head()

In [None]:
# Persist the tagged tweets; list-valued columns are written as their string
# representation by to_csv.
df_tweets_bigtech.to_csv("10k_tweets_bigtech_after_llama_ner.csv")

# fine tuning with corpus

In [4]:
import pandas as pd

# Load the custom NER corpus (JSON records).
df_corpus = pd.read_json("NER_corpus.json", orient="records")
# Move the corpus' own "labels" column out of the way; the tokenization step
# later writes a new "labels" column with token-aligned ids.
df_corpus = df_corpus.rename(columns={"labels":"original_labels"})
df_corpus.head()

In [5]:
# Integer tag id -> BIO label string for the custom corpus (five entity types
# plus "O"); ids follow the position in this list.
entity_dict = dict(enumerate([
    "B-corporation",
    "B-event",
    "B-location",
    "B-person",
    "B-product",
    "I-corporation",
    "I-event",
    "I-location",
    "I-person",
    "I-product",
    "O",
]))

# Ordered label strings and both lookup directions. These rebind the
# module-level names that the tweetner7 section defined with its own label set.
label_list = [entity_dict[idx] for idx in entity_dict]
label_to_id = {name: idx for idx, name in enumerate(label_list)}
id_to_label = {idx: name for name, idx in label_to_id.items()}

In [6]:
# examining length to determine maxlength
# NOTE(review): .str.len() on this column gives the number of elements per row
# (presumably words); the tokenizer's max_length counts sub-word tokens, which
# can be more than the word count — confirm 128 is sufficient.
df_corpus["tokens"].str.len().agg(['mean','max','std'])

In [None]:
# Number of rows whose "tokens" entry has 128 or more elements.
len(df_corpus[df_corpus["tokens"].apply(len)>=128])

In [6]:
from transformers import AutoTokenizer
import torch
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization settings for the corpus fine-tuning run.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_name = "meta-llama/Llama-2-7b-hf"
llama_t = AutoTokenizer.from_pretrained(model_name)

# A pad token is needed for batching; the EOS token is reused.
llama_t.pad_token = llama_t.eos_token

In [7]:
from transformers import AutoModelForTokenClassification

# Quantized base model with a token-classification head sized for the corpus
# label set defined above.
llama = AutoModelForTokenClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

# Model-side pad id, matching the tokenizer's pad token.
llama.config.pad_token_id = llama.config.eos_token_id

In [8]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized model for k-bit training before attaching adapters.
llama = prepare_model_for_kbit_training(llama)

# Compared with the tweetner7 run: higher rank (16 vs 8), adapters on all four
# attention projections, and lower dropout (0.05 vs 0.1).
lora_config = LoraConfig(
    r=16,  # rank of the low-rank adaptation
    lora_alpha=32,  # scaling factor
    target_modules=["k_proj", "q_proj", "v_proj", "o_proj"],  # target modules for LoRA
    lora_dropout=0.05,  # dropout rate
    bias="none",  # bias handling
    task_type="TOKEN_CLS" # NER is a form of token classification
)

# From here on `llama` is the PEFT-wrapped model.
llama = get_peft_model(llama, lora_config)

In [9]:
from datasets import Dataset

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words with ``llama_t`` and align labels to tokens.

    Args:
        examples: batch mapping with "tokens" (list of word lists) and "tags"
            (list of per-word label-id lists).

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an added
        "labels" entry. The first sub-token of each word gets the word's tag;
        special tokens and continuation sub-tokens get -100.
    """
    encoded = llama_t(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    def _align(row_idx, word_tags):
        """Build the token-level label list for one example."""
        row_labels = []
        last_word = None
        for wid in encoded.word_ids(batch_index=row_idx):
            if wid is not None and wid != last_word:
                row_labels.append(word_tags[wid])
            else:
                row_labels.append(-100)
            last_word = wid
        return row_labels

    encoded["labels"] = [
        _align(row_idx, word_tags)
        for row_idx, word_tags in enumerate(examples["tags"])
    ]
    return encoded

# Wrap the corpus as a Dataset and tokenize it in batches (adds tokenizer
# outputs and the aligned "labels" column).
dataset = Dataset.from_pandas(df_corpus)
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [10]:
# First split: hold out 15% as the test set (fixed seed).
train_test_split1 = tokenized_datasets.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

# Second split: 15% of the remaining data becomes the validation set.
train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [11]:
# Show the id -> label mapping and the list of class names.
print(id_to_label)
# NOTE(review): this is the list of NER class names. The `label_names` argument
# of TrainingArguments expects something else — the keys of the model inputs
# that hold the labels (e.g. "labels").
label_names = list(entity_dict.values())
print(label_names)

In [13]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Token-level weighted precision/recall/F1 for the Trainer.

    Args:
        p: pair ``(predictions, labels)``; predictions are per-class scores
            (argmax is taken over axis 2), labels are integer ids where -100
            marks positions to ignore.

    Returns:
        Dict with "precision", "recall" and "f1", taken from the
        "weighted avg" row of sklearn's classification report.
    """
    scores, gold = p
    flat_pred = np.argmax(scores, axis=2).flatten()
    flat_gold = gold.flatten()

    true_labels = []
    true_predictions = []
    for gold_id, pred_id in zip(flat_gold, flat_pred):
        if gold_id == -100:
            # ignored position (special token, padding or sub-word continuation)
            continue
        true_labels.append(id_to_label[gold_id])
        true_predictions.append(id_to_label[pred_id])

    weighted = classification_report(true_labels, true_predictions, output_dict=True)["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

# Training configuration for the 2-epoch corpus run: evaluation and checkpoint
# saving once per epoch, bf16 mixed precision, effective batch size of
# 32 * 4 per device through gradient accumulation.
training_args = TrainingArguments(
    output_dir="./ner-llama-2-7b",
    evaluation_strategy="epoch", # evaluating at every epoch
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=32, # adjust batch size depending on resources
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    # label_names must list the keys of the model inputs that hold the targets
    # (the "labels" column written by tokenize_and_align_labels), not the NER
    # class names. With class names here the Trainer finds no labels during
    # evaluation, so no eval loss is produced and compute_metrics is skipped.
    label_names=["labels"]
)

# Trainer for the corpus run; the validation split is used for the per-epoch
# evaluation, the test split is kept for the final evaluate() call.
trainer = Trainer(
    model=llama,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=llama_t,
    compute_metrics=compute_metrics
)

In [None]:
# Run the fine-tuning on the corpus training split.
trainer.train()

In [None]:
# Final evaluation on the held-out corpus test split.
trainer.evaluate(test_dataset)

In [None]:
# Save the trained model; this folder is reloaded in the "after 2 epochs"
# section below.
trainer.save_model("./llama2-7b-after-ner-corpus")

In [12]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from tqdm import tqdm

# Training configuration for the 7-epoch corpus run: one checkpoint and one
# evaluation per epoch, all seven checkpoints kept for later comparison.
training_args = TrainingArguments(
    output_dir="./ner-llama-2-7b",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    num_train_epochs=7,
    save_total_limit=7,
    # label_names must list the keys of the model inputs that hold the targets
    # (the "labels" column written by tokenize_and_align_labels), not the NER
    # class names; otherwise the per-epoch evaluation sees no labels and
    # reports no eval loss.
    label_names=["labels"]
)

# Trainer for the 7-epoch run; no compute_metrics is passed, the checkpoints
# are scored afterwards in a separate loop.
# NOTE(review): `llama` is the same object the 2-epoch cell trains; if both
# cells run in one kernel session this presumably continues from those
# weights — confirm a fresh `llama` is intended here.
trainer = Trainer(
    model=llama,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=llama_t
)

trainer.train()

In [13]:
from transformers import BitsAndBytesConfig
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# 4-bit NF4 settings, identical to training.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Fresh quantized base model onto which the saved checkpoints' adapters are
# loaded for scoring.
base_llama = AutoModelForTokenClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [19]:
from sklearn.metrics import classification_report

# Running best over the checkpoint loop below. best_epoch is a 0-based index
# and stays 0 if no checkpoint reaches an F1 above 0.
best_f1 = 0
best_epoch = 0

def calculate_metrics(df, true_label_col, pred_label_col):
    """Token-level weighted precision/recall/F1 over all rows of ``df``.

    Args:
        df: DataFrame with one gold label sequence and one predicted label
            sequence per row.
        true_label_col: column holding the gold sequences.
        pred_label_col: column holding the predicted sequences.

    Returns:
        Dict with "precision", "recall" and "f1" from the "weighted avg" row
        of sklearn's classification report.
    """
    all_true = []
    all_pred = []

    for true, pred in zip(df[true_label_col], df[pred_label_col]):
        # Compare only the positions present in both sequences. Predictions can
        # be shorter than the gold tags when the input was truncated; cutting
        # just one side would leave the two flat lists with different lengths
        # and make classification_report fail.
        common = min(len(true), len(pred))
        all_true.extend(true[:common])
        all_pred.extend(pred[:common])

    report = classification_report(all_true, all_pred, output_dict=True)

    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

# Validation split as a pandas frame; every checkpoint is scored on a copy.
val_df = val_dataset.to_pandas()

# Step numbers of the per-epoch checkpoint folders written by the 7-epoch run.
folders = [141, 282, 423, 564, 705, 846, 980]
for epoch, foldernr in enumerate(folders):
    checkpoint_dir = f"./ner-llama-2-7b/checkpoint-{str(foldernr)}"
    print(f"Loading model from {checkpoint_dir}")

    # NOTE(review): every iteration attaches an adapter to the same base_llama
    # object — confirm that loading onto an already adapted base is intended.
    model = PeftModel.from_pretrained(base_llama, checkpoint_dir)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

    # llama_pred writes its prediction column into the frame it receives.
    val_df_copy = val_df.copy()
    val_df_with_preds = llama_pred(
        model, tokenizer, val_df_copy, text_col="tokens", max_length=128
    )

    metrics = calculate_metrics(
        val_df_with_preds, true_label_col="tags", pred_label_col="llama_ft_prediction"
    )
    f1, precision, recall = metrics["f1"], metrics["precision"], metrics["recall"]

    print(f"Epoch {epoch+1}: Precision={precision:.4f}, Recall={recall:.4f}, F1={f1:.4f}")

    if f1 > best_f1:
        best_f1, best_epoch = f1, epoch

print(f"Best epoch according to F1-score: {best_epoch+1}, F1: {best_f1:.4f}")

## applying the model

### after 2 epochs

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from transformers import BitsAndBytesConfig
from peft import PeftModel

# first loading base model, then reloading fine-tuned model on top of it with PEFT

# 4-bit NF4 settings, identical to training.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Fresh quantized base model with the corpus label set.
base_llama = AutoModelForTokenClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

# Attach the adapter saved after the 2-epoch corpus run to the base model that
# was just loaded in this cell (base_llama, built with the corpus label set).
# `base_model` is the tweetner7 base from the other section: undefined in a
# fresh session, and built with a different label set otherwise.
llama_corpus = PeftModel.from_pretrained(
    base_llama,
    "./llama2-7b-after-ner-corpus"
)

# The tokenizer was saved into the same folder by trainer.save_model().
llama_corpus_t = AutoTokenizer.from_pretrained("./llama2-7b-after-ner-corpus")

In [22]:
# Held-out corpus test split as a pandas frame.
corpus_test = test_dataset.to_pandas()
corpus_test.head()

In [18]:
# adapted version to handle numpy arrays, probably due to json storing
def llama_pred(model, tokenizer, df, text_col, max_length=128):
    """Predict one label id per word for every row of ``df`` by majority vote.

    Each row's pre-split words are tokenized with padding/truncation to
    ``max_length``. A word's prediction is the label id predicted most often
    across its sub-tokens.

    Args:
        model: token-classification model; called with the tokenizer's tensors
            and expected to return an object with ``.logits``.
        tokenizer: fast tokenizer whose output provides ``word_ids``.
        df: pandas DataFrame; it is modified in place.
        text_col: name of the column holding a list or numpy array of words.
        max_length: padded/truncated sequence length in sub-tokens.

    Returns:
        The same ``df`` object with a new "llama_ft_prediction" column holding
        a list of predicted label ids per row. Words cut off by truncation get
        no prediction, so a list can be shorter than the row's word list.
    """
    # setting padding tokens
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

    # The model stays on the device(s) it was loaded to; moving it to
    # `model.device` would at best be a no-op and is not wanted for quantized
    # or device-mapped models. Only the evaluation mode is switched on.
    model.eval()

    batch_size = 32
    all_predictions = []

    for start_idx in range(0, len(df), batch_size):
        # converting numpy-arrays to lists
        batch_tokens = [
            tokens.tolist() if hasattr(tokens, 'tolist') else tokens
            for tokens in df[text_col].iloc[start_idx:start_idx + batch_size]
        ]

        encodings = tokenizer(
            batch_tokens,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            is_split_into_words=True,
            return_tensors="pt"
        )

        batch_inputs = {key: value.to(model.device) for key, value in encodings.items()}

        with torch.no_grad():
            outputs = model(**batch_inputs)
        batch_predictions = torch.argmax(outputs.logits, dim=-1).tolist()

        for j, preds in enumerate(batch_predictions):
            # Collect the predictions of all sub-tokens per word index.
            word_pred_map = {}
            for word_id, pred in zip(encodings.word_ids(batch_index=j), preds):
                if word_id is not None:  # skip special/padding tokens
                    word_pred_map.setdefault(word_id, []).append(pred)

            # NOTE(review): training labels only the first sub-token of a word
            # (the others are -100), yet all sub-tokens vote here — confirm
            # this is intended.
            word_preds = []
            for word_idx in sorted(word_pred_map):
                votes = word_pred_map[word_idx]
                word_preds.append(max(set(votes), key=votes.count))

            all_predictions.append(word_preds)

    df["llama_ft_prediction"] = all_predictions
    return df

In [None]:
# Add word-level predictions of the 2-epoch model to the test frame.
corpus_test = llama_pred(llama_corpus, llama_corpus_t, corpus_test, "tokens")

In [None]:
from nervaluate import Evaluator

# Convert gold and predicted ids to BIO label strings (corpus label set).
corpus_test["true_labels"] = corpus_test["tags"].apply(lambda x: [id_to_label[i] for i in x])
corpus_test["pred_labels"] = corpus_test["llama_ft_prediction"].apply(lambda x: [id_to_label[i] for i in x])

true = corpus_test["true_labels"].values.tolist()
pred = corpus_test["pred_labels"].values.tolist()

# Entity-level evaluation with nervaluate over the five corpus entity types.
evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [None]:
# One line per entity type with its nervaluate result dictionary.
for ent in results_by_tag.keys():
  print(f"{ent}: {results_by_tag[ent]}")

In [None]:
# Token-level metrics for the 2-epoch model on the corpus test frame.
metrics = evaluate_ner_predictions(corpus_test)

### after new method with checkpointing

In [40]:
# checking detailed entity-level performance of model after 3, 4 and 5 epochs on test set

# first loading base model, then reloading fine-tuned model on top of it with PEFT

# 4-bit NF4 settings, identical to training.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Fresh quantized base model with the corpus label set.
base_llama = AutoModelForTokenClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

# checkpoint-423 is the third entry of the checkpoint list used in the
# validation loop, i.e. the state after epoch 3.
llama_corpus_3 = PeftModel.from_pretrained(
    base_llama,
    "./ner-llama-2-7b/checkpoint-423"
)

llama_corpus_3_t = AutoTokenizer.from_pretrained("./ner-llama-2-7b/checkpoint-423")

In [41]:
# llama_pred writes its prediction column into the frame it is given and
# returns that same object; a copy keeps corpus_test3 independent of
# corpus_test and of the frames produced for the other checkpoints.
corpus_test3 = llama_pred(llama_corpus_3, llama_corpus_3_t, corpus_test.copy(), "tokens")

In [42]:
from nervaluate import Evaluator

# Epoch-3 checkpoint: ids -> BIO label strings, then entity-level evaluation.
corpus_test3["true_labels"] = corpus_test3["tags"].apply(lambda x: [id_to_label[i] for i in x])
corpus_test3["pred_labels"] = corpus_test3["llama_ft_prediction"].apply(lambda x: [id_to_label[i] for i in x])

true = corpus_test3["true_labels"].values.tolist()
pred = corpus_test3["pred_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

# These names are rebound by every evaluation cell; they always refer to the
# most recently evaluated checkpoint.
results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [43]:
# Per-entity results of the epoch-3 checkpoint.
for ent in results_by_tag.keys():
  print(f"{ent}: {results_by_tag[ent]}")

In [44]:
# Epoch-4 checkpoint (checkpoint-564).
# NOTE(review): base_llama is the same object that already received the
# checkpoint-423 adapter — confirm that loading another adapter onto it gives
# a clean epoch-4 model.
llama_corpus_4 = PeftModel.from_pretrained(
    base_llama,
    "./ner-llama-2-7b/checkpoint-564"
)

llama_corpus_4_t = AutoTokenizer.from_pretrained("./ner-llama-2-7b/checkpoint-564")

In [45]:
# llama_pred writes its prediction column into the frame it is given and
# returns that same object; a copy keeps corpus_test4 independent of
# corpus_test and of the frames produced for the other checkpoints.
corpus_test4 = llama_pred(llama_corpus_4, llama_corpus_4_t, corpus_test.copy(), "tokens")

In [46]:
# Epoch-4 checkpoint: ids -> BIO label strings, then entity-level evaluation.
corpus_test4["true_labels"] = corpus_test4["tags"].apply(lambda x: [id_to_label[i] for i in x])
corpus_test4["pred_labels"] = corpus_test4["llama_ft_prediction"].apply(lambda x: [id_to_label[i] for i in x])

true = corpus_test4["true_labels"].values.tolist()
pred = corpus_test4["pred_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [47]:
# Per-entity results of the epoch-4 checkpoint.
for ent in results_by_tag.keys():
  print(f"{ent}: {results_by_tag[ent]}")

In [49]:
# Token-level metrics for the epoch-4 checkpoint.
metrics = evaluate_ner_predictions(corpus_test4)

In [50]:
# Epoch-5 checkpoint (checkpoint-705).
# NOTE(review): base_llama has already received the adapters of the earlier
# checkpoints in this section — confirm that stacking loads is intended.
llama_corpus_5 = PeftModel.from_pretrained(
    base_llama,
    "./ner-llama-2-7b/checkpoint-705"
)

llama_corpus_5_t = AutoTokenizer.from_pretrained("./ner-llama-2-7b/checkpoint-705")

In [51]:
# llama_pred writes its prediction column into the frame it is given and
# returns that same object; a copy keeps corpus_test5 independent of
# corpus_test and of the frames produced for the other checkpoints.
corpus_test5 = llama_pred(llama_corpus_5, llama_corpus_5_t, corpus_test.copy(), "tokens")

In [52]:
# Epoch-5 checkpoint: ids -> BIO label strings, then entity-level evaluation.
corpus_test5["true_labels"] = corpus_test5["tags"].apply(lambda x: [id_to_label[i] for i in x])
corpus_test5["pred_labels"] = corpus_test5["llama_ft_prediction"].apply(lambda x: [id_to_label[i] for i in x])

true = corpus_test5["true_labels"].values.tolist()
pred = corpus_test5["pred_labels"].values.tolist()

evaluator = Evaluator(true, pred, tags=["corporation", "event", "location", "person", "product"], loader="list")

results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
print(results)

In [53]:
# Per-entity results of the epoch-5 checkpoint.
for ent in results_by_tag.keys():
  print(f"{ent}: {results_by_tag[ent]}")