## Training

Fine-tune `tner/roberta-large-fin` on the `tner/fin` dataset with AutoTrain.

In [None]:
#installation of packages:
!pip install autotrain-advanced transformers datasets spacy 



In [None]:
# Run AutoTrain's setup step with --update-torch; the captured log reports it
# installing the latest xformers and PyTorch.
!autotrain setup --update-torch

> [1mINFO    Installing latest xformers[0m
> [1mINFO    Successfully installed latest xformers[0m
> [1mINFO    Installing latest PyTorch[0m
> [1mINFO    Successfully installed latest PyTorch[0m


In [None]:
from datasets import load_dataset
import json

# Load the "tner/fin" dataset. The code below iterates over its splits and
# reads the "tokens" and "tags" columns of each example.
raw_dataset = load_dataset("tner/fin")

def concatenate_tokens(example):
    """Add a "text" field holding the example's tokens joined by single spaces.

    Args:
        example: Mapping with a "tokens" entry containing an iterable of strings.

    Returns:
        The same ``example`` object, updated in place with the new "text" key.
    """
    tokens = example["tokens"]
    example["text"] = " ".join(tokens)
    return example

# Add the space-joined "text" column to every split.
preprocessed_dataset = raw_dataset.map(concatenate_tokens)

# Convert each split into a list of plain dicts that `json` can serialize.
# Rows are iterated directly: indexing a column first (split["tokens"][i])
# can materialize the entire column on every access, which makes a per-row
# loop accidentally quadratic.
json_data = {
    split_name: [
        {"tokens": row["tokens"], "tags": row["tags"], "text": row["text"]}
        for row in split_dataset
    ]
    for split_name, split_dataset in preprocessed_dataset.items()
}

# Write only the training split; the fine-tuning command's --data-path points
# at this Drive folder.
with open("/content/drive/MyDrive/Dataset/dataset.json", "w") as json_file:
    json.dump(json_data["train"], json_file)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Fine-tune with AutoTrain's `llm` command (SFT trainer), project "Fin-NER".
# --data-path points at the Drive Dataset folder and --text_column selects "text".
# Training uses PEFT, int8 quantization and fp16 mixed precision for 5 epochs
# at lr 0.0002; stdout is redirected to training.log.
# NOTE(review): the inference cell loads the model with
# AutoModelForTokenClassification, while this command runs the LLM/SFT trainer;
# confirm this is the intended training task for a token-classification model.
!autotrain llm --train --project-name 'Fin-NER' --model tner/roberta-large-fin --data-path '/content/drive/MyDrive/Dataset' --text_column 'text' --use-peft --lr 0.0002 --auto_find_batch_size --epochs 5 --trainer sft --quantization int8 --mp fp16 --model_max_length 512 --block_size 512 > training.log

> [1mINFO    Running LLM[0m
> [1mINFO    Params: Namespace(version=False, text_column='text', rejected_text_column='rejected', prompt_text_column='prompt', model_ref=None, warmup_ratio=0.1, optimizer='adamw_torch', scheduler='linear', weight_decay=0.0, max_grad_norm=1.0, add_eos_token=False, block_size=512, peft=True, lora_r=16, lora_alpha=32, lora_dropout=0.05, logging_steps=-1, evaluation_strategy='epoch', save_total_limit=1, save_strategy='epoch', auto_find_batch_size=True, mixed_precision='fp16', quantization='int8', model_max_length=512, trainer='sft', target_modules=None, merge_adapter=False, use_flash_attention_2=False, dpo_beta=0.1, chat_template=None, padding=None, train=True, deploy=False, inference=False, username=None, backend='local-cli', token=None, repo_id=None, push_to_hub=False, model='tner/roberta-large-fin', project_name='Fin-NER', seed=42, epochs=5, gradient_accumulation=1, disable_gradient_checkpointing=False, lr=0.0002, log='none', data_path='/content/drive/MyD

## Inference

Run the NER pipeline on a sample news paragraph and render the detected entities with displaCy.

In [None]:
import json
import spacy
from spacy import displacy
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Load the adapter configuration JSON from the Fin-NER project folder.
with open(r"/content/drive/MyDrive/Fin-NER/adapter_config.json", 'r') as f:
    config = json.load(f)

# Load the tokenizer of the base checkpoint named in the adapter config.
tokenizer = AutoTokenizer.from_pretrained(config['base_model_name_or_path'])

# Load the base token-classification model.
# NOTE(review): only the base checkpoint is loaded here; no adapter weights
# from the training run are applied. Confirm whether they should be.
model = AutoModelForTokenClassification.from_pretrained(
    config['base_model_name_or_path'],  # Use the base model path
    # "revision" is optional in the JSON: .get() yields None when it is
    # absent instead of raising KeyError, so the default revision is used.
    revision=config.get('revision'),
)

# Define the pipeline. The adapter JSON dict is not a model configuration, so
# it is not passed as `config`; the pipeline reads the config from `model`.
# `use_fast` is also omitted because an instantiated tokenizer is supplied.
classifier = pipeline("ner", model=model, tokenizer=tokenizer)

# Sample news paragraph used as the single input for the demo.
text = '''UBS hopes the flexibility will boost its attractiveness as an employer in the banking sector. It has not yet set a date for employees’ return to the office. Only UBS workers in roles that require them to be in the office, such as those in supervisory positions, or in trading and branch roles, will have less flexibility, the bank said. However, an internal analysis of the 72,000 UBS employees globally showed that around two-thirds are in roles that would allow them to combine working remotely and in the office. The Swiss bank’s approach stands in contrast to some of the major Wall Street banks. Goldman Sachs, for example, asked its employees in the U.S. and U.K. to come back into the office this month. JPMorgan Chase also told its U.S. workers that it was aiming to get half of its employees rotating through the office by July. JPMorgan CEO Jamie Dimon has said he believes that by “sometime in September, October it will look just like it did before.” Morgan Stanley CEO James Gorman has also been outspoken on the matter. “If you can go into a restaurant in New York City, you can come into the office and we want you in the office,” Gorman reportedly said.'''
# The text is wrapped in a list so that visualize() can pair each result with
# its source text by position.
ner_results = classifier([text])

def visualize(pipeline_output, texts):
    """Render each text with its predicted named entities using displaCy.

    Args:
        pipeline_output (list): One list of entity dicts per text. Each dict
            must provide "start" and "end" offsets and a label under "entity"
            (or "entity_group", which is preferred when present).
        texts (list): The original texts, in the same order as
            ``pipeline_output``.

    Returns:
        None. The visualization is emitted by ``displacy.render``.

    Raises:
        IndexError: If ``texts`` has fewer items than ``pipeline_output``.
        KeyError: If an entity dict lacks "start", "end" or a label key.
    """
    # Iterate over the argument rather than the notebook-level `ner_results`,
    # so the function renders whatever the caller actually passes in.
    for i, predictions in enumerate(pipeline_output):
        entities = [
            {
                "end": ent["end"],
                "label": ent["entity_group"] if "entity_group" in ent else ent["entity"],
                "start": ent["start"],
            }
            for ent in predictions
        ]
        # manual=True tells displaCy to render the supplied dict of text and
        # entity spans directly instead of a spaCy Doc.
        displacy.render({
            "ents": entities,
            "text": texts[i]
        }, style="ent", manual=True)

# Render the pipeline predictions for the single sample text.
visualize(ner_results, [text])