In [1]:
!pip install setfit==1.0.3 transformers==4.37.0

Collecting setfit==1.0.3
  Downloading setfit-1.0.3-py3-none-any.whl.metadata (11 kB)
Collecting transformers==4.37.0
  Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.3.0 (from setfit==1.0.3)
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting sentence-transformers>=2.2.1 (from setfit==1.0.3)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting evaluate>=0.3.0 (from setfit==1.0.3)
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.0)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.3.0->setfit==1.0.3)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Col

# Intent Detection

In [2]:
from datasets import DatasetDict, Dataset
from setfit import SetFitModel, Trainer, TrainingArguments
import random
import shutil
import pandas as pd

In [3]:
path = "/content/football_sft_model"

  and should_run_async(code)


### Data Preparation

In [4]:
data = {
    'greet-hi': [
                            'Hi',
                            'Hoooo',
                            'Hello buddy',
                            'Salut',
                            'Hey',
                            'Good morning',
                            'Bonsoir',
                            'Salut mon ami'
                        ],
    'greet-who_are_you': [
                        'qui êtes-vous',
                        'who are you',
                        'how can you help me',
                        "comment pouvez vous m'aider",
                        'why do I need you',
                        "pourquoi j'ai besoin de toi",
                        'help please',
                        "Aidez-moi, s'il vous plaît",
                        'I can not understand you',
                        'je ne te comprends pas'
                    ],
    'greet-good_bye': ['good bye',
                    'au revoir',
                    'See you later',
                    'A plus tard',
                    'stop',
                    'arrêter',
                    'exit',
                    'sortir'],
    'matches-team_next_match': [
                        'when will be the matches of Liverpool',
                        'quand auront lieu les matchs de Liverpool',
                        'when will be the matches of Aston Villa',
                        'quand auront lieu les matchs de Aston Villa',
                        'man city matches',
                        'Arsenal matches',
                        'Arsenal matchs',
                        'man city matchs',
                        'what the calendar of chelsea matches',
                        'à quoi correspond le calendrier de chelsea',
                        'Will Liverpool play any matches soon',
                        'Est-ce que Liverpool jouera bientôt des matchs',
                        'Will Liverpool play any matches next days',
                        'Est-ce que Liverpool jouera des matchs les jours prochains',
                        'upcoming games for Manchester United',
                        'prochains matchs de Tottenham Hotspur',
                        'Barcelona match schedule',
                        'Real Madrid fixtures',
                        'when are the matches of Bayern Munich',
                        'calendrier des matchs pour Paris Saint-Germain',
                        'any upcoming fixtures for Juventus',
                        'próximos partidos de Atletico Madrid',
                        'which teams will AC Milan face next',
                        'quels sont les prochains matchs de Borussia Dortmund'
                    ],
    'matches-match_time': [
                            'when liverpool will play with man city',
                            'quand liverpool jouera avec man city',
                            'crystal place vs chelsea',
                            'Bournemouth vs Brentford',
                            'Bournemouth vs Brentford',
                            'Norwich City vs Newcastle United',
                            'Southampton vs West Ham United',
                            'tell me the time of Everton vs Leicester City',
                            'tell me the time of Wolverhampton vs Chelsea',
                            'will Manchester United play with chelsea',
                            'will Brentford play with Bournemouth',
                            'Dis-moi le temps de Everton contre Leicester City',
                            'dis-moi le temps de Wolverhampton vs Chelsea',
                            'est-ce que Manchester United jouera avec chelsea',
                            'est-ce que Brentford jouera avec Bournemouth'
                        ],
    'matches-match_result': [
                            'what is the score of Brentford match',
                            'what is the score of Wolverhampton match',
                            'score of Everton match',
                            'who won in Norwich City vs West Ham United match',
                            'who won in Bournemouth vs West Ham United match',
                            'who won in Liverpool vs Newcastle United match',
                            'did Liverpool defeted man city',
                            'Liverpool and West Ham result',
                            'Chelsea and Norwich final result',
                            'quel est le score du match Brentford',
                            'quel est le score du match Wolverhampton',
                            'score du match Everton',
                            'qui a gagné le match Norwich City contre Norwich City',
                            'qui a gagné le match Bournemouth contre West Ham United',
                            'qui a gagné le match Liverpool contre Newcastle United',
                            'est-ce que Liverpool a vaincu man city',
                            'Résultat de Liverpool et West Ham',
                            'Résultat final de Chelsea et Norwich'
                        ]
}

In [5]:
train_data, dev_data = [], []
intents = list(data.keys())
idx = 0

for intent, prompts in data.items():

    random.Random(1).shuffle(prompts)
    intent_id = intents.index(intent)

    train_prompts = prompts[:-3]
    dev_prompts = prompts[-3:]

    for prompt in train_prompts:
        train_data.append({
            "idx": idx,
            "prompt": prompt,
            "intent": intent_id
        })
        idx+=1

    for prompt in dev_prompts:
        dev_data.append({
            "idx": idx,
            "prompt": prompt,
            "intent": intent_id
        })
        idx+=1

random.Random(1).shuffle(train_data)
random.Random(1).shuffle(dev_data)

# Create full dataset dictionary
dataset = DatasetDict({
    "train": Dataset.from_pandas(pd.DataFrame(train_data)),
    "dev": Dataset.from_pandas(pd.DataFrame(dev_data))
})

### Train

In [6]:
# Embedding Model
embed_model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    labels = intents
)

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [7]:
# Training parameters

args = TrainingArguments(
    batch_size = 4,
    num_epochs = 4,
    logging_steps = 10,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True
)

trainer = Trainer(
    model = embed_model,
    args = args,
    train_dataset = dataset["train"],
    eval_dataset = dataset["dev"],
    metric = "accuracy",
    column_mapping = {"prompt": "text", "intent": "label"}
)

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [8]:
trainer.train()

***** Running training *****
  Num unique pairs = 3316
  Batch size = 4
  Num epochs = 4
  Total optimization steps = 3316


Epoch,Training Loss,Validation Loss,Embedding Loss,Rate
1,No log,No log,0.0093,1.7e-05
2,No log,No log,0.0071,1.1e-05
3,No log,No log,0.0071,6e-06
4,No log,No log,0.0064,0.0


  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

Loading best SentenceTransformer model from step 3316.


### Evaluation

In [9]:
metrics = trainer.evaluate(dataset["dev"])
print(metrics)

  and should_run_async(code)
Applying column mapping to the evaluation dataset
***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 1.0}


### Saving Locally

In [10]:
# save and zip locally
trainer.model.save_pretrained(path)
shutil.make_archive(path, "zip", path)

'/content/football_sft_model.zip'

In [None]:
# push to hf
trainer.model.push_to_hub(
    repo_id = "botpress_football_sft_model",
    private = True,
)

model_head.pkl:   0%|          | 0.00/19.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MuhammadHelmy/botpress_football_sft_model/commit/89d443ab6d167a29fbf942910ab932d7b9e86f28', commit_message='Push model using huggingface_hub.', commit_description='', oid='89d443ab6d167a29fbf942910ab932d7b9e86f28', pr_url=None, pr_revision=None, pr_num=None)

### Load from local

In [21]:
intent_model = SetFitModel.from_pretrained("MuhammadHelmy/botpress_football_sft_model")

### Testing Inferencing

In [22]:
preds = intent_model.predict([
    "Hi everybody!",
    "When Liverpool will play?",
    "من الذي فاز في مباراة الأهلي والزمالك؟"
])

In [23]:
preds

['greet-hi', 'matches-match_time', 'matches-match_result']

In [24]:
probs = intent_model.predict_proba([
    "Hi everybody!",
    "When Liverpool will play?",
    "من الذي فاز في مباراة الأهلي والزمالك؟"
]).tolist()

In [25]:
probs

[[0.9393362511344264,
  0.0252376577175864,
  0.008357781965532702,
  0.008715160074826986,
  0.00805322318198944,
  0.01029992592563807],
 [0.01075641756463522,
  0.009948115822205611,
  0.0096619337604315,
  0.04186757393320049,
  0.9167862472702646,
  0.010979711649262607],
 [0.0014734082482331652,
  0.0020127048162162124,
  0.001548008371522715,
  0.0014040633566333015,
  0.0011047533259049493,
  0.9924570618814896]]

# Named Entity Recognition

In [None]:
!pip install gliner==0.2.7

Collecting gliner==0.2.7
  Downloading gliner-0.2.7-py3-none-any.whl.metadata (15 kB)
Collecting transformers>=4.38.2 (from gliner==0.2.7)
  Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting onnxruntime (from gliner==0.2.7)
  Downloading onnxruntime-1.18.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers>=4.38.2->gliner==0.2.7)
  Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting coloredlogs (from onnxruntime->gliner==0.2.7)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->gliner==0.2.7)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading gliner-0.2.7-py3-none-any.whl (45 kB)
[2K

In [None]:
from gliner import GLiNER
import pprint

In [None]:
ner_model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



In [None]:
samples = [
    "who won between Manchester City and Arsenal?",
    "كيف انتهت مباراة نورويتش سيتي وتشيلسي"
]

labels = ["team_name"]

for text in samples:
    entities = ner_model.predict_entities(text, labels)
    pprint.pprint(entities)
    print("-------------------------")

[{'end': 31,
  'label': 'team_name',
  'score': 0.7424863576889038,
  'start': 16,
  'text': 'Manchester City'},
 {'end': 43,
  'label': 'team_name',
  'score': 0.517137885093689,
  'start': 36,
  'text': 'Arsenal'}]
-------------------------
[{'end': 29,
  'label': 'team_name',
  'score': 0.7897310256958008,
  'start': 17,
  'text': 'نورويتش سيتي'}]
-------------------------


In [None]:
txt = "Mohamed Mohamed Mohamed Aboutrika is an Egyptian retired professional footballer who played as an attacking midfielder and a forward for Al-ahly. He was voted second place in the African Footballer of the Year award in 2008, and was one of five nominees for the 2006 award, and one of the ten nominated for the 2013 award.".strip()
labels = ["team_name", "person_name", "event"]

entities = ner_model.predict_entities(txt, labels)
pprint.pprint(entities)

[{'end': 33,
  'label': 'person_name',
  'score': 0.8084983229637146,
  'start': 0,
  'text': 'Mohamed Mohamed Mohamed Aboutrika'},
 {'end': 144,
  'label': 'team_name',
  'score': 0.8981382846832275,
  'start': 137,
  'text': 'Al-ahly'},
 {'end': 215,
  'label': 'event',
  'score': 0.8969795107841492,
  'start': 179,
  'text': 'African Footballer of the Year award'},
 {'end': 272,
  'label': 'event',
  'score': 0.8427568078041077,
  'start': 262,
  'text': '2006 award'},
 {'end': 321,
  'label': 'event',
  'score': 0.8318695425987244,
  'start': 311,
  'text': '2013 award'}]
