<h2>Welcome to GLiNER-Studio!</h2>
<h3><a href="https://www.loom.com/share/a0b54b3509b74a5f928ae7fd9114ddef?sid=ed7df24c-25ba-4bb7-a723-798bcb08609e" target="_blank">Video Quickstart</a></h3>
<h3>With GLiNER-Studio, you can effortlessly fine-tune any GLiNER-based model on your custom dataset to handle the following tasks:</h3>
<ol>
    <li><b>Named Entity Recognition (NER):</b> Identifies and categorizes entities such as names, organizations, dates, and other specific items in the text.</li>
    <li><b>Relation Extraction:</b> Detects and classifies relationships between entities within the text.</li>
    <li><b>Summarization:</b> Extract the most important sentences that summarize the input text, capturing the essential information.</li>
    <li><b>Sentiment Extraction:</b> Identify parts of the text that signal a positive, negative, or neutral sentiment.</li>
    <li><b>Key-Phrase Extraction:</b> Identifies and extracts important phrases and keywords from the text.</li>
    <li><b>Question-answering:</b> Finding an answer in the text given a question.</li>
    <li><b>Open Information Extraction:</b> Extracts pieces of text given an open prompt from a user, for example, product description extraction.</li>
    <li><b>Text Cleaning:</b> Clear the text from unnecessary parts according to the prompt.</li>
</ol>
<h3>Remember, information extraction is not just about data; it's about insights. Let's uncover those insights together!</h3>

<!-- Links Section -->
<p>
    <a href="https://www.knowledgator.com/" target="_blank">Visit our website</a> |
    <a href="https://www.linkedin.com/company/knowledgator/" target="_blank">Follow on LinkedIn</a> |
    <a href="https://huggingface.co/knowledgator/" target="_blank">Hugging Face Profile</a> |
    <a href="https://twitter.com/knowledgator" target="_blank">Follow on X</a> |
    <a href="https://blog.knowledgator.com/" target="_blank">Follow on Medium</a> |
    <a href="https://discord.com/invite/dkyeAgs9DG" target="_blank">Join our Discord</a>
</p>

<h3>Please, cite if you have used GLiNER-Studio to finetune your model:</h3>
<pre>
'''
@misc{stepanov2024gliner,
      title={GLiNER multi-task: Generalist Lightweight Model for Various Information Extraction Tasks},
      author={Ihor Stepanov and Mykhailo Shtopko},
      year={2024},
      eprint={2406.12925},
      archivePrefix={arXiv},
      primaryClass={id='cs.LG' full_name='Machine Learning' is_active=True alt_name=None in_archive='cs' is_general=False description='Papers on all aspects of machine learning research (supervised, unsupervised, reinforcement learning, bandit problems, and so on) including also robustness, explanation, fairness, and methodology. cs.LG is also an appropriate primary category for applications of machine learning methods.'}
}
'''


In [1]:
# @title Installations
!pip install gradio gliner
!pip install accelerate -U
!pip install transformers huggingface_hub

import gradio as gr
import re
import os
import json
import pandas as pd
from typing import *
import random
import shutil
import zipfile
import torch
from gliner import GLiNER
from gliner import GLiNERConfig, GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollatorWithPadding
from gliner.utils import load_config_as_namespace
from gliner.data_processing import WordsSplitter, GLiNERDataset

if not os.path.exists("models"):
        os.makedirs("models")
if not os.path.exists("data"):
        os.makedirs("data")

# List of available models
AVAILABLE_MODELS = [
    "knowledgator/gliner-multitask-large-v0.5",
    "urchade/gliner_multi-v2.1",
    "urchade/gliner_large_bio-v0.1",
    "numind/NuNER_Zero",
    "EmergentMethods/gliner_medium_news-v2.1",
]

Collecting gradio
  Downloading gradio-5.12.0-py3-none-any.whl.metadata (16 kB)
Collecting gliner
  Downloading gliner-0.2.16-py3-none-any.whl.metadata (8.8 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.4 (from gradio)
  Downloading gradio_client-1.5.4-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Do

In [None]:
# @title #Upload your sentences examples
import gradio as gr
import os
import shutil

# Ensure the /data directory exists
os.makedirs("data", exist_ok=True)

# Function to save the uploaded file
def save_file(uploaded_file):
    if uploaded_file is None:
        return "No file uploaded."

    # Define the path where the file will be saved
    save_path = os.path.join("data")

    try:
        # Save the file with the new name
        shutil.copy(uploaded_file.name, save_path)
        return f"File saved to {save_path}"
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Gradio Interface
with gr.Blocks() as loader:
    gr.Markdown("# File Upload and Save Example")

    # File uploader component
    file_uploader = gr.File(label="Upload your file here")

    # Button to trigger the file save function
    save_button = gr.Button("Save File")

    # Output textbox to show the result
    output = gr.Textbox(label="Result")

    # Link the button to the save_file function
    save_button.click(fn=save_file, inputs=file_uploader, outputs=output)

# Launch the interface
loader.launch(share=True, inline=True)


**If you don't have the final dataset, upload sentences examples to auto annotate them, otherwise, upload your file and skip until validation.**

**Run the cell above ☝️**

**Or you can write your custom function for loading of a dataset 👇**
\
\
**🛑 If you have already annotated dataset, please scroll down, there is a way to load it directly 🛑**



In [2]:
# Example
sentences = [
        "IBM Watson defeated human champions in the game of Jeopardy!",
        "The Amazon rainforest is known as the lungs of the Earth.",
        "Sydney Opera House is an iconic symbol of Australia.",
        "The quick brown fox jumps over the lazy dog.",
        "A journey of a thousand miles begins with a single step.",
        "To be or not to be, that is the question.",
        "All that glitters is not gold.",
        "The early bird catches the worm.",
        "Google is building a new office in New York.",
        "The movie Inception was directed by Christopher Nolan."
        "Jeff Bezos founded Amazon in 1994.",
    ]

In [3]:
# @title Prepare Data for Manual Annotation
def tokenize_text(text):
    """Tokenize the input text into a list of tokens."""
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)

def prepare_data_for_manual_annotation(sentences):
  annotated_data = []
  for text in sentences:
    annotated_data.append({"tokenized_text": tokenize_text(text), "ner": [], "validated": False})
  with open("data/annotated_data.json", "wt") as file:
    json.dump(annotated_data, file)

prepare_data_for_manual_annotation(sentences)


In [4]:
# @title Auto Annotation

# Provided post-processing functions
def tokenize_text(text):
    """Tokenize the input text into a list of tokens."""
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)

def transform_data(data):
    tokens = tokenize_text(data['text'])
    spans = []

    for entity in data['entities']:
        entity_tokens = tokenize_text(entity['word'])
        entity_length = len(entity_tokens)

        # Find the start and end indices of each entity in the tokenized text
        for i in range(len(tokens) - entity_length + 1):
            if tokens[i:i + entity_length] == entity_tokens:
                spans.append([i, i + entity_length - 1, entity['entity']])
                break

    return {"tokenized_text": tokens, "ner": spans, "validated": False}

def merge_entities(entities):
    if not entities:
        return []
    merged = []
    current = entities[0]
    for next_entity in entities[1:]:
        if next_entity['entity'] == current['entity'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
            current['word'] += ' ' + next_entity['word']
            current['end'] = next_entity['end']
        else:
            merged.append(current)
            current = next_entity
    merged.append(current)
    return merged

def annotate_text(
    model, text, labels: List[str], threshold: float, nested_ner: bool
) -> Dict:
    labels = [label.strip() for label in labels]
    r = {
        "text": text,
        "entities": [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "score": 0,
            }
            for entity in model.predict_entities(
                text, labels, flat_ner=not nested_ner, threshold=threshold
            )
        ],
    }
    r["entities"] = merge_entities(r["entities"])
    return transform_data(r)

class AutoAnnotator:
    def __init__(
        self, model: int = "knowledgator/gliner-multitask-large-v0.5",
        device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
        ) -> None:

        self.model = GLiNER.from_pretrained(model).to(device)
        self.annotated_data = []
        self.stat = {
            "total": None,
            "current": -1
        }

    def auto_annotate(
            self, data: List[str], labels: List[str],
            prompt: Union[str, List[str]] = None, threshold: float = 0.5, nested_ner: bool = False
            ) -> List[Dict]:
        self.stat["total"] = len(data)
        self.stat["current"] = -1  # Reset current progress
        for text in data:
            self.stat["current"] += 1
            if isinstance(prompt, list):
                prompt_text = random.choice(prompt)
            else:
                prompt_text = prompt
            text = f"{prompt_text}\n{text}" if prompt_text else text

            annotation = annotate_text(self.model, text, labels, threshold, nested_ner)

            if not annotation["ner"]:  # If no entities identified
                annotation = {"tokenized_text": tokenize_text(text), "ner": [], "validated": False}

            self.annotated_data.append(annotation)
        return self.annotated_data

# Define a global annotator
annotator = None

# Function to annotate data
def annotate(model, labels, threshold, prompt):
    global annotator
    try:
        labels = [label.strip() for label in labels.split(",")]
        annotator = AutoAnnotator(model)
        annotated_data = annotator.auto_annotate(sentences, labels, prompt, threshold)
        with open("data/annotated_data.json", "wt") as file:
            json.dump(annotated_data, file)
        return "Successfully annotated and saved as data/annotated_data.json"
    except Exception as e:
        return str(e)


# Gradio interface
with gr.Blocks() as annotator_interface:
    labels = gr.Textbox(label="Labels", placeholder="Enter your comma-separated labels here", scale=2)
    model = gr.Dropdown(label="Choose the model which will be used for annotation", choices=AVAILABLE_MODELS)
    threshold = gr.Slider(0, 1, value=0.3, step=0.01, label="Threshold", info="Lower the threshold to increase how many entities get predicted.")
    prompt = gr.Textbox(label="Prompt", placeholder="Enter your annotation prompt here", scale=2)
    submit_btn = gr.Button("Annotate data")
    output_info = gr.Textbox(label="Processing info:")

    submit_btn.click(fn=annotate, inputs=[model, labels, threshold, prompt], outputs=output_info)

annotator_interface.launch(inline=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7a046546d9cef5bac5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




**Run the cell above ☝️ to auto-annotate the dataset with one of the available GLiNER models**

**Run the cell above ☝️ to load already annotated dataset.**

**⚡ Skip it if you auto-annotated dataset ⚡**

In [5]:
# @title Dataset Viewer

class DynamicDataset:
    def __init__(
            self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
                 ) -> None:
        self.data = data
        self.data_len = len(self.data)
        self.current = -1
        for example in self.data:
            if not "validated" in example.keys():
                example["validated"] = False

    def next_example(self):
        self.current += 1
        if self.current > self.data_len-1:
          self.current = self.data_len -1
        elif self.current < 0:
          self.current = 0

    def previous_example(self):
        self.current -= 1
        if self.current > self.data_len-1:
          self.current = self.data_len -1
        elif self.current < 0:
          self.current = 0

    def example_by_id(self, id):
        self.current = id
        if self.current > self.data_len-1:
          self.current = self.data_len -1
        elif self.current < 0:
          self.current = 0

    def validate(self):
        self.data[self.current]["validated"] = True

    def load_current_example(self):
        return self.data[self.current]


def tokenize_text(text):
    """Tokenize the input text into a list of tokens."""
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)


def join_tokens(tokens):
    # Joining tokens with space, but handling special characters correctly
    text = ""
    for token in tokens:
        if token in {",", ".", "!", "?", ":", ";", "..."}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()

def prepare_for_highlight(data):
    tokens = data["tokenized_text"]
    ner = data["ner"]

    highlighted_text = []
    current_entity = None
    entity_tokens = []
    normal_tokens = []

    for idx, token in enumerate(tokens):
        # Check if the current token is the start of a new entity
        if current_entity is None or idx > current_entity[1]:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            current_entity = next((entity for entity in ner if entity[0] == idx), None)

        # If current token is part of an entity
        if current_entity and current_entity[0] <= idx <= current_entity[1]:
            if normal_tokens:
                highlighted_text.append((" ".join(normal_tokens), None))
                normal_tokens = []
            entity_tokens.append(token + " ")
        else:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            normal_tokens.append(token + " ")

    # Append any remaining tokens
    if entity_tokens:
        highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
    if normal_tokens:
        highlighted_text.append((" ".join(normal_tokens), None))
    # Clean up spaces before punctuation
    cleaned_highlighted_text = []
    for text, label in highlighted_text:
        cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
        cleaned_highlighted_text.append((cleaned_text, label))

    return cleaned_highlighted_text

def extract_tokens_and_labels(data: List[Dict[str, Union[str, None]]]) -> Dict[str, Union[List[str], List[Tuple[int, int, str]]]]:
    tokens = []
    ner = []

    token_start_idx = 0

    for entry in data:
        char = entry['token']
        label = entry['class_or_confidence']

        # Tokenize the current text chunk
        token_list = tokenize_text(char)

        # Append tokens to the main tokens list
        tokens.extend(token_list)

        if label:
            token_end_idx = token_start_idx + len(token_list) - 1
            ner.append((token_start_idx, token_end_idx, label))

        token_start_idx += len(token_list)

    return tokens, ner

def update_example(data):
    global dynamic_dataset
    tokens, ner = extract_tokens_and_labels(data)
    dynamic_dataset.data[dynamic_dataset.current]["tokenized_text"] = tokens
    dynamic_dataset.data[dynamic_dataset.current]["ner"] = ner
    return prepare_for_highlight(dynamic_dataset.load_current_example())

def validate_example():
    global dynamic_dataset
    dynamic_dataset.data[dynamic_dataset.current]["validated"] = True
    return [("The example was validated!", None)]

def next_example():
    dynamic_dataset.next_example()
    return prepare_for_highlight(dynamic_dataset.load_current_example()), dynamic_dataset.current

def previous_example():
    dynamic_dataset.previous_example()
    return prepare_for_highlight(dynamic_dataset.load_current_example()), dynamic_dataset.current

def save_dataset(inp):
  with open("data/annotated_data.json", "wt") as file:
    json.dump(dynamic_dataset.data, file)
  return [("The validates dataset was saved as data/annotated_data.json", None)]

with open("data/annotated_data.json", 'rt') as dataset:
  ANNOTATED_DATA = json.load(dataset)
dynamic_dataset = DynamicDataset(ANNOTATED_DATA)
DATASET_LEN = len(dynamic_dataset.data)

with gr.Blocks() as dataset_viewer:
    bar = gr.Slider(minimum=0, maximum=DATASET_LEN -1, step=1, label="Progress", interactive=False)
    with gr.Row():
        previous_btn = gr.Button("Previous example")
        apply_btn = gr.Button("Apply changes")
        next_btn = gr.Button("Next example")
    validate_btn = gr.Button("Validate")
    save_btn = gr.Button("Save validated dataset")

    inp_box = gr.HighlightedText(value=None, interactive=True)
    apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
    save_btn.click(fn=save_dataset, inputs=inp_box, outputs=inp_box)
    validate_btn.click(fn=validate_example, inputs=None, outputs=inp_box)
    next_btn.click(fn=next_example, inputs=None, outputs=[inp_box,bar])
    previous_btn.click(fn=previous_example, inputs=None, outputs=[inp_box,bar])

dataset_viewer.launch(share=True, inline=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7e246b4553a26ff639.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Click `Next example` to access the first dataset item. Click `Apply changes` to save your annotation.

\

To annotate an entity just highlight a text and write an appropriate label name

\
⚡ Don't forget to `Save validated dataset`

In [10]:
# @title Train the model

os.environ["TOKENIZERS_PARALLELISM"] = "true"

def load_and_prepare_data(train_path, split_ratio):
    if not os.path.exists(train_path):
        raise FileNotFoundError(f"The file {train_path} does not exist.")

    with open(train_path, "r") as f:
        data = json.load(f)
    random.seed(42)
    random.shuffle(data)
    train_data = data[:int(len(data) * split_ratio)]
    test_data = data[int(len(data) * split_ratio):]
    return train_data, test_data

def create_models_directory():
    if not os.path.exists("models"):
        os.makedirs("models")





def train_model(model_name, custom_model_name, train_path, split_ratio, learning_rate, weight_decay, batch_size, epochs, compile_model):
    global train_data, train_data
    create_models_directory()

    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    print(f"Using device: {device}")

    print("Loading model...")
    model = GLiNER.from_pretrained(model_name)

    print("Loading and preparing data...")
    train_data, test_data = load_and_prepare_data(train_path, split_ratio)

    with open("data/test.json", "wt") as file:
      json.dump(test_data, file)
    print(f"Training data size: {len(train_data)}, Testing data size: {len(test_data)}")

    train_dataset = transform_dataset(train_data, model.config, data_processor=model.data_processor)
    test_dataset = transform_dataset(test_data, model.config, data_processor=model.data_processor)
    data_collator = DataCollatorWithPadding(model.config)

    if compile_model:
        print("Compiling model for faster training...")
        torch.set_float32_matmul_precision('high')
        model.to(device)
        model.compile_for_training()
    else:
        model.to(device)

    training_args = TrainingArguments(
        output_dir="models",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        others_lr=learning_rate,
        others_weight_decay=weight_decay,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        evaluation_strategy="epoch",
        save_steps=1000,
        save_total_limit=10,
        dataloader_num_workers=8,
        use_cpu=(device == torch.device('cpu')),
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=model.data_processor.transformer_tokenizer,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()
    model.save_pretrained(f"models/{custom_model_name}")

    return "Training completed successfully."

# Gradio interface
def gradio_train(model_name, custom_model_name, train_path, split_ratio, learning_rate, weight_decay, batch_size, epochs, compile_model):
    train_path = os.path.join("data", train_path)
    try:
        return train_model(model_name, custom_model_name, train_path, split_ratio, learning_rate, weight_decay, batch_size, epochs, compile_model)
    except Exception as e:
        return f"An error occurred: {e}"

with gr.Blocks() as trainer_interface:
    gr.Markdown("# GLiNER Training Interface")
    train_path = os.listdir("data")
    #local_models = [f"models/{local_model}" for local_model in os.listdir("models")]
    with gr.Row():
      model_name = gr.Dropdown(label="Choose the parent model", choices=AVAILABLE_MODELS, value="knowledgator/gliner-multitask-large-v0.5")
      custom_model_name = gr.Textbox(label="The name of your custom model", placeholder="Enter the name of your new model")
      train_path = gr.Dropdown(label= "Choose the dataset",choices=train_path, value="annotated_data.json")
      split_ratio = gr.Slider(label="Train/Test Split Ratio", minimum=0.1, maximum=0.9, step=0.1, value=0.9)
    with gr.Row():
      learning_rate = gr.Slider(label="Learning Rate", minimum=1e-6, maximum=1e-4, step=1e-6, value=5e-6)
      weight_decay = gr.Slider(label="Weight Decay", minimum=0, maximum=0.1, step=0.01, value=0.01)
      batch_size = gr.Slider(label="Batch Size", minimum=1, maximum=128, step=1, value=8)
      epochs = gr.Slider(label="Number of Epochs", minimum=1, maximum=10, step=1, value=1)
    compile_model = gr.Checkbox(label="Compile Model for Faster Training", value=False)
    train_btn = gr.Button("Start Training")

    output_info = gr.Textbox(label="Training Info")

    train_btn.click(fn=gradio_train, inputs=[model_name, custom_model_name, train_path, split_ratio, learning_rate, weight_decay, batch_size, epochs, compile_model], outputs=output_info)

trainer_interface.launch(inline=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://01df963c29ca6519df.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [11]:
def transform_dataset(data, model_config, data_processor):
    return GLiNERDataset(data, model_config, data_processor=data_processor)

def preprocess_gliner(model_name, custom_model_name, train_path, split_ratio):
    global train_data, train_data
    create_models_directory()

    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    print(f"Using device: {device}")

    print("Loading model...")
    model = GLiNER.from_pretrained(model_name)

    print("Loading and preparing data...")
    train_data, test_data = load_and_prepare_data(train_path, split_ratio)

    with open("data/test.json", "wt") as file:
      json.dump(test_data, file)
    print(f"Training data size: {len(train_data)}, Testing data size: {len(test_data)}")

    train_dataset = transform_dataset(train_data, model.config, data_processor=model.data_processor)
    test_dataset = transform_dataset(test_data, model.config, data_processor=model.data_processor)
    data_collator = DataCollatorWithPadding(model.config)

    return train_dataset, test_dataset, data_collator


train_dataset, test_dataset, data_collator = preprocess_gliner(
    model_name="knowledgator/gliner-multitask-large-v0.5",
    custom_model_name="my_model",
    train_path="data/annotated_data.json",
    split_ratio=0.7)

Using device: cuda:0
Loading model...


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Loading and preparing data...
Training data size: 7, Testing data size: 3
Collecting all entities...


100%|██████████| 7/7 [00:00<00:00, 1421.52it/s]


Total number of entity classes:  3
Collecting all entities...


100%|██████████| 3/3 [00:00<00:00, 24576.00it/s]

Total number of entity classes:  3





In [17]:
train_dataset.head()

AttributeError: 'GLiNERDataset' object has no attribute 'head'

**Choose a model and set training parameters for your needs**

In [None]:
# @title Fast Mertics

# Load the test.json file
with open('data/test.json', 'r') as file:
    test_data = json.load(file)

with open('data/annotated_data.json', 'r') as file:
    annotated_data = json.load(file)

# Extract all labels from each example
all_labels = []
for example in annotated_data:
    ner_data = example.get("ner", [])
    for entity in ner_data:
        label = entity[2]  # Assuming the label is the third element in the entity list
        if label not in all_labels:
            all_labels.append(label)

def evaluate_model(model_name):
    model_path = f"models/{model_name}"
    model = GLiNER.from_pretrained(model_path, load_tokenizer=True, local_files_only=True)

    def get_for_one_path(test_dataset, entity_types):
        # evaluate the model
        results, f1 = model.evaluate(test_dataset, flat_ner=True, threshold=0.5, batch_size=12, entity_types=entity_types)
        return results, f1

    results, f1 = get_for_one_path(test_data, all_labels)
    output_info = f"F1 Score: {f1:.2f}" + "\n" + results
    return output_info

with gr.Blocks() as evaluation_interface:
    gr.Markdown("# GLiNER Evaluation Interface")
    models = os.listdir("models")
    model_name = gr.Dropdown(label="Choose the model", choices=models, value=models[0])

    evaluate_btn = gr.Button("Evaluate Model")
    output_info = gr.Textbox(label="Evaluation Info")

    evaluate_btn.click(fn=evaluate_model, inputs=model_name, outputs=output_info)

# Suppress all prints
evaluation_interface.launch()


In [None]:
# @title NER Inferance

class Model:
  def __init__(self) -> None:
      self.previous_path = None
      self.path = None
      self.model = None
  def get_model(self, path):
      self.previous_path = None
      self.path = path
      if self.path != self.previous_path:
          self.model = GLiNER.from_pretrained(f"models/{self.path}", load_tokenizer=True).to("cuda" if torch.cuda.is_available() else "cpu")
      self.previous_path = self.path
      return self.model

model_generator = Model()

text1 = """
"I recently purchased the Sony WH-1000XM4 Wireless Noise-Canceling Headphones from Amazon and I must say, I'm thoroughly impressed. The package arrived in New York within 2 days, thanks to Amazon Prime's expedited shipping.

The headphones themselves are remarkable. The noise-canceling feature works like a charm in the bustling city environment, and the 30-hour battery life means I don't have to charge them every day. Connecting them to my Samsung Galaxy S21 was a breeze, and the sound quality is second to none.

I also appreciated the customer service from Amazon when I had a question about the warranty. They responded within an hour and provided all the information I needed.

However, the headphones did not come with a hard case, which was listed in the product description. I contacted Amazon, and they offered a 10% discount on my next purchase as an apology.

Overall, I'd give these headphones a 4.5/5 rating and highly recommend them to anyone looking for top-notch quality in both product and service."""


text3 = """
Several studies have reported its pharmacological activities, including anti-inflammatory, antimicrobial, and antitumoral effects.
The effect of E-anethole was studied in the osteosarcoma MG-63 cell line, and the antiproliferative activity was evaluated by an MTT assay.
It showed a GI50 value of 60.25 μM with apoptosis induction through the mitochondrial-mediated pathway. Additionally, it induced cell cycle arrest at the G0/G1 phase, up-regulated the expression of p53, caspase-3, and caspase-9, and down-regulated Bcl-xL expression.
Moreover, the antitumoral activity of anethole was assessed against oral tumor Ca9-22 cells, and the cytotoxic effects were evaluated by MTT and LDH assays.
It demonstrated a LD50 value of 8 μM, and cellular proliferation was 42.7% and 5.2% at anethole concentrations of 3 μM and 30 μM, respectively.
It was reported that it could selectively and in a dose-dependent manner decrease cell proliferation and induce apoptosis, as well as induce autophagy, decrease ROS production, and increase glutathione activity. The cytotoxic effect was mediated through NF-kB, MAP kinases, Wnt, caspase-3 and -9, and PARP1 pathways. Additionally, treatment with anethole inhibited cyclin D1 oncogene expression, increased cyclin-dependent kinase inhibitor p21WAF1, up-regulated p53 expression, and inhibited the EMT markers.
"""

text5 = """
Dr. Paul Hammond, a renowned neurologist at Johns Hopkins University, has recently published a paper in the prestigious journal "Nature Neuroscience". His research focuses on a rare genetic mutation, found in less than 0.01% of the population, that appears to prevent the development of Alzheimer's disease. Collaborating with researchers at the University of California, San Francisco, the team is now working to understand the mechanism by which this mutation confers its protective effect. Funded by the National Institutes of Health, their research could potentially open new avenues for Alzheimer's treatment.
"""

ner_examples = [
    [
        text5,
        "neurologist, scientist, gene, disease, biological process, city, journal, university",
        0.5,
        False
    ],
    [
        text1,
        "product, brand, location, features, rating",
        0.5,
        False
    ],
    [
        text3,
        "cell line, protein, metric, substance",
        0.5,
        False
    ]]

def merge_entities(entities):
    if not entities:
        return []
    merged = []
    current = entities[0]
    for next_entity in entities[1:]:
        if next_entity['entity'] == current['entity'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
            current['word'] += ' ' + next_entity['word']
            current['end'] = next_entity['end']
        else:
            merged.append(current)
            current = next_entity
    merged.append(current)
    return merged

def process(
    model_name, text, labels: str, threshold: float, nested_ner: bool
) -> Dict[str, Union[str, int, float]]:
    model = model_generator.get_model(model_name)
    labels = [label.strip() for label in labels.split(",")]
    r = {
        "text": text,
        "entities": [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "score": 0,
            }
            for entity in model.predict_entities(
                text, labels, flat_ner=not nested_ner, threshold=threshold
            )
        ],
    }
    r["entities"] =  merge_entities(r["entities"])
    return r

#model = GLiNER.from_pretrained(f"{MODEL_NAME}", load_tokenizer=True).to("cuda" if torch.cuda.is_available() else "cpu")
with gr.Blocks(title="NER Task") as ner_interface:
    models = os.listdir("models")
    if not models:
        print("No models found in the 'models' directory.")
    model_name = gr.Dropdown(label="Model Name", choices=models)
    input_text = gr.Textbox(label="Text input", placeholder="Enter your text here")
    labels = gr.Textbox(label="Labels", placeholder="Enter your labels here (comma separated)", scale=2)
    threshold = gr.Slider(0, 1, value=0.3, step=0.01, label="Threshold", info="Lower the threshold to increase how many entities get predicted.")
    nested_ner = gr.Checkbox(label="Nested NER", info="Allow for nested NER?")
    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")
    examples = gr.Examples(
        ner_examples,
        fn=process,
        inputs=[input_text, labels, threshold, nested_ner],
        outputs=output,
        cache_examples=False
    )
    theme=gr.themes.Base()

    input_text.submit(fn=process, inputs=[model_name, input_text, labels, threshold, nested_ner], outputs=output)
    labels.submit(fn=process, inputs=[model_name, input_text, labels, threshold, nested_ner], outputs=output)
    threshold.release(fn=process, inputs=[model_name, input_text, labels, threshold, nested_ner], outputs=output)
    submit_btn.click(fn=process, inputs=[model_name, input_text, labels, threshold, nested_ner], outputs=output)
    nested_ner.change(fn=process, inputs=[model_name, input_text, labels, threshold, nested_ner], outputs=output)

ner_interface.launch()

In [None]:
# @title Upload model to Google Drive
drive.mount('/content/drive')

def zip_directory(model_name):
    model_path = f"models/{model_name}"
    zip_path = f"{model_path}.zip"

    if os.path.exists(model_path):
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(model_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, start=model_path)
                    zipf.write(file_path, arcname)
        return zip_path
    else:
        return None

def upload_to_drive(zip_path, drive_folder='My Drive'):
    if zip_path and os.path.exists(zip_path):
        destination_dir = f'/content/drive/{drive_folder}'
        os.makedirs(destination_dir, exist_ok=True)
        destination = f'{destination_dir}/{os.path.basename(zip_path)}'
        shutil.move(zip_path, destination)
        return f"File uploaded to {destination}"
    else:
        return "Zip file not found."

def zip_and_upload(model_name, drive_path):
    zip_path = zip_directory(model_name)
    if zip_path:
        upload_message = upload_to_drive(zip_path, drive_folder=drive_path)
        return f"Directory '{model_name}' zipped successfully as '{zip_path}'. {upload_message}"
    else:
        return f"Directory '{model_name}' not found."

with gr.Blocks() as to_drive:
    gr.Markdown("# GLiNER Model Zipper and Uploader")

    models = os.listdir("models")
    model_name = gr.Dropdown(label="Choose the model", choices=models, value=models[0])
    drive_path = gr.Textbox(label="Google Drive Path", placeholder="Enter the path on Google Drive (e.g., 'My Drive/Models')", value='My Drive/Models')
    upload_btn = gr.Button("Zip and Upload Model")
    output_info = gr.Textbox(label="Output Info")
    upload_btn.click(fn=zip_and_upload, inputs=[model_name, drive_path], outputs=output_info)

# Launch the Gradio interface
to_drive.launch(inline=True)