In [2]:

from datasets import load_dataset

conll = load_dataset("conll2003")

In [3]:
conll["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [4]:
label_list = conll["train"].features[f"ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-7b1", add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [6]:
label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [7]:
tokenized_dataset = conll.map(tokenize_and_align_labels, batched=True)

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [9]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [10]:
import evaluate

seqeval = evaluate.load("seqeval")

In [11]:
import numpy as np


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [12]:
label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

In [13]:
id2label = {0 : 'O', 1 : 'B-PER', 2 : 'I-PER', 3 : 'B-ORG', 4 : 'I-ORG', 5 : 'B-LOC', 6 : 'I-LOC', 7 : 'B-MISC', 8 : 'I-MISC'}

In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model,  PrefixTuningConfig, TaskType, PeftType, PrefixEncoder
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm

In [15]:
from transformers import AutoConfig
model_name="bigscience/bloomz-7b1"
config = AutoConfig.from_pretrained(model_name)
config

BloomConfig {
  "_name_or_path": "bigscience/bloomz-7b1",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropout": 0.0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "masked_softmax_fusion": true,
  "model_type": "bloom",
  "n_head": 32,
  "n_inner": null,
  "n_layer": 30,
  "offset_alibi": 100,
  "pad_token_id": 3,
  "pretraining_tp": 4,
  "seq_length": 2048,
  "skip_bias_add": true,
  "skip_bias_add_qkv": false,
  "slow_but_exact": false,
  "transformers_version": "4.36.2",
  "unk_token_id": 0,
  "use_cache": true,
  "vocab_size": 250880
}

In [16]:
config._name_or_path=model_name
config.hidden_size=4096
config.num_hidden_layers=30
config.n_head=32
config.num_labels=9

config.hidden_dropout = 0.1
config.transform=False
config.text='classify the token of the text:'

In [17]:
from falconSKT import  PrefixForTokenClassification

model = PrefixForTokenClassification.from_pretrained(
    model_name,
    config=config,

)

Some weights of PromptForTokenClassification were not initialized from the model checkpoint at bigscience/bloomz-7b1 and are newly initialized: ['transformer.h.0.mlp.dense_4h_to_h.weight', 'transformer.h.9.mlp.dense_4h_to_h.bias', 'transformer.h.14.post_attention_layernorm.weight', 'transformer.h.16.post_attention_layernorm.weight', 'transformer.h.17.self_attention.dense.weight', 'transformer.h.17.mlp.dense_4h_to_h.bias', 'transformer.h.14.input_layernorm.weight', 'transformer.h.6.post_attention_layernorm.weight', 'transformer.h.13.self_attention.dense.bias', 'transformer.h.20.input_layernorm.bias', 'transformer.h.27.input_layernorm.bias', 'transformer.h.2.post_attention_layernorm.weight', 'transformer.h.0.post_attention_layernorm.weight', 'transformer.h.11.self_attention.dense.weight', 'transformer.h.10.self_attention.query_key_value.weight', 'transformer.h.10.mlp.dense_h_to_4h.bias', 'transformer.h.5.input_layernorm.bias', 'transformer.h.20.self_attention.query_key_value.bias', 'tran

Prompt sequence length 7


In [18]:
from transformers import TrainingArguments, Trainer


training_args = TrainingArguments(
    output_dir='./r_task',
    #learning_rate=1e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=100,
    logging_steps=100,
    load_best_model_at_end=True,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,1.962,0.613682,0.321164,0.37737,0.347006,0.832037
200,0.4592,0.420407,0.490348,0.52378,0.506513,0.885804
300,0.3784,0.355434,0.549511,0.576728,0.562791,0.901568
400,0.3247,0.318993,0.588116,0.630712,0.60867,0.914739
500,0.2664,0.309581,0.600635,0.627396,0.613724,0.918163
600,0.2422,0.285857,0.60486,0.678375,0.639512,0.922604
700,0.2281,0.264139,0.635337,0.687804,0.66053,0.926926
800,0.2499,0.266324,0.641806,0.686561,0.663429,0.928299
900,0.1953,0.261654,0.639182,0.68998,0.663611,0.930147
1000,0.2122,0.248676,0.645273,0.711532,0.676785,0.932232


KeyboardInterrupt: 