# Syntethic Data

## Generation

In [None]:
!pip install langchain_openai==0.2.12

In [None]:
import pandas as pd

In [None]:
!wget https://raw.githubusercontent.com/MasrourTawfik/Textra_Insights/main/Files/PCG_Refined.csv

--2024-12-15 23:05:19--  https://raw.githubusercontent.com/MasrourTawfik/Textra_Insights/main/Files/PCG_Refined.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 120894 (118K) [text/plain]
Saving to: ‘PCG_Refined.csv’


2024-12-15 23:05:19 (6.96 MB/s) - ‘PCG_Refined.csv’ saved [120894/120894]



In [None]:
PCG_Refined = pd.read_csv("PCG_Refined.csv")
PCG_Refined.head()

Unnamed: 0,Id,Title,Definition
0,2111,Frais de constitution,Les frais de constitution sont les dépenses en...
1,2112,Frais préalables au démarrage,Les frais préalables au démarrage sont les dép...
2,2113,Frais d'augmentation du capital,Les frais d'augmentation du capital sont les c...
3,2114,"Frais sur opérations de fusions, scissions et ...","Les frais sur opérations de fusions, scissions..."
4,2116,Frais de prospection,Les frais de prospection sont les coûts engagé...


In [None]:
print("Number of accounts :",len(PCG_Refined["Id"].unique()))

Number of accounts : 83


In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
import time
from tqdm import tqdm
import random
import pandas as pd
import json

GITHUB_TOKEN = "xxxxxxxxxxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxxxxx"

In [None]:
class SyntheticData(BaseModel):
    generated_scenarios: str = Field("", description="Les 30 nouveaux scénarios générés en format json")



System_Prompt = """
### Système Prompt

Tu es un expert en comptabilité marocaine et en application du **Plan Comptable Marocain (PCM)**. Ton rôle est de générer 30 scénarios réalistes et concis pour chaque définition de compte que je te fournirai. Ces scénarios doivent décrire des transactions spécifiques effectuées par une entreprise.

**Exigences pour chaque scénario :**
- Chaque scénario doit être précis, réaliste et s'appliquer à des entreprises marocaines ou des environnements similaires.
- Donner des 30 descriptions un petit peut longues .
- Respecte les règles et l’esprit du Plan Comptable Marocain.
- Les réponses doivent être formatées en **JSON** avec cette structure :

 "id": 1,
 "prompt": "Scénario réaliste 1"

 "id": 2,
 "prompt": "Scénario réaliste 2"

Les valeurs pour `id` doivent être numériques croissantes, et `prompt` doit contenir la description concise du scénario.
"""


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", System_Prompt),
        ("human", """
       La definition depuis le Plan Comptable Marocain est la suivante :
        {Definition}
        """),
    ]
)

LLM = ChatOpenAI(model="gpt-4o-mini",
    #temperature=0,
    api_key=GITHUB_TOKEN,
    base_url="https://models.inference.ai.azure.com",
)

LLM_Structered = LLM.with_structured_output(SyntheticData)
LLM_Generator = prompt | LLM_Structered

In [None]:
def Run_Generator(definition):
  Input = {"Definition": definition}
  response = LLM_Generator.invoke(Input)
  return response

def Run_And_Parse(AccountID, PCG_Refined):
    Definition = PCG_Refined[PCG_Refined["Id"] == AccountID]["Definition"].values[0]

    df = pd.DataFrame(columns=["anchor", "positive", "negative"])

    for i in range(3):
        print(f"Iteration {i+1}")
        try:
            response = Run_Generator(Definition)
            parsed_json = json.loads(response.generated_scenarios)
            break
        except Exception as e:
            print(f"Error: {e}")
            time.sleep(1)
            continue

    for item in parsed_json:
        anchor_prompt = item["prompt"]
        positive_definition = Definition
        negative_definitions = PCG_Refined[PCG_Refined["Id"] != AccountID]["Definition"].tolist()
        negative_definition = random.choice(negative_definitions)

        while negative_definition == Definition:
            negative_definition = random.choice(negative_definitions)

        new_row = pd.DataFrame([{
            "anchor": anchor_prompt,
            "positive": positive_definition,
            "negative": negative_definition
        }])

        df = pd.concat([df, new_row], ignore_index=True)

    return df

def Run_ALL(PCG_Refined):
    Final = pd.DataFrame(columns=["anchor", "positive", "negative"])
    counter = 0
    for account_id in tqdm(PCG_Refined["Id"].unique(), desc="Generating Data..."):
      try:
        result_df = Run_And_Parse(account_id, PCG_Refined)
        Final = pd.concat([Final, result_df], ignore_index=True)
        if counter == 10:
          print("Sleeping for 60 seconds...")
          time.sleep(60)
          counter = 0
          # Save your csv in case a crush occured you will not start from the bigening again
          print("Saving CSV...")
          Final.to_csv("Final.csv", index=False)
        else :
          counter += 1
          time.sleep(2)

      except Exception:
        continue

    Final.to_csv("Final.csv", index=False)
    return Final

## Test Code with a definition

In [None]:
example = PCG_Refined.iloc[53]["Definition"]
response = Run_Generator(example)
parsed_json = json.loads(response.generated_scenarios)

print(example)
print(json.dumps(parsed_json, indent=4, ensure_ascii=False))

Ce compte enregistre les paiements effectués pour des services d'intermédiaires et d'honoraires. Cela inclut les commissions versées à des agents commerciaux, les honoraires d'avocats ou d'experts-comptables, ainsi que les frais liés aux actes juridiques et aux contentieux.

1. Une entreprise paie une commission de 5% à un agent immobilier pour la vente d'un bien. 2. Un avocat reçoit un honoraire de 150 euros pour une consultation juridique. 3. Un expert-comptable facture 1 000 euros pour la préparation des états financiers d'une entreprise. 4. Un notaire est rémunéré pour 300 euros pour la rédaction d'un acte de vente. 5. Une société paie des frais de 200 euros à un huissier pour l'exécution d'un jugement. 6. Un consultant reçoit 800 euros pour un audit de l'entreprise. 7. Une agence de publicité perçoit une commission de 10% sur les ventes générées par une campagne. 8. Un courtier en assurance est rémunéré pour une prime de 2 500 euros. 9. Un architecte facture 5 000 euros pour la co

In [None]:
Run_ALL(PCG_Refined)

Generating Data...:   0%|          | 0/84 [00:00<?, ?it/s]

Iteration 1


Generating Data...:   1%|          | 1/84 [00:16<22:56, 16.58s/it]

Iteration 1


Generating Data...:   2%|▏         | 2/84 [00:25<16:37, 12.17s/it]

Iteration 1


Generating Data...:   4%|▎         | 3/84 [00:33<13:42, 10.15s/it]

Iteration 1
Error: Extra data: line 1 column 5935 (char 5934)
Iteration 2


Generating Data...:   5%|▍         | 4/84 [01:11<28:04, 21.06s/it]

Iteration 1
Error: Extra data: line 1 column 6917 (char 6916)
Iteration 2


Generating Data...:   6%|▌         | 5/84 [01:45<33:51, 25.71s/it]

Iteration 1


Generating Data...:   7%|▋         | 6/84 [01:56<27:05, 20.84s/it]

Iteration 1
Error: Expecting ',' delimiter: line 1 column 37 (char 36)
Iteration 2


Generating Data...:   8%|▊         | 7/84 [02:28<31:21, 24.44s/it]

Iteration 1


Generating Data...:  10%|▉         | 8/84 [02:49<29:38, 23.40s/it]

Iteration 1
Error: Extra data: line 1 column 5406 (char 5405)
Iteration 2


Generating Data...:  11%|█         | 9/84 [03:14<29:58, 23.98s/it]

Iteration 1
Error: Extra data: line 1 column 4332 (char 4331)
Iteration 2
Error: Extra data: line 1 column 6656 (char 6655)
Iteration 3


Generating Data...:  12%|█▏        | 10/84 [03:58<37:09, 30.13s/it]

Iteration 1
Sleeping for 60 seconds...


Generating Data...:  13%|█▎        | 11/84 [05:11<52:25, 43.09s/it]

Saving CSV...
Iteration 1


Generating Data...:  14%|█▍        | 12/84 [05:21<39:51, 33.22s/it]

Iteration 1


Generating Data...:  15%|█▌        | 13/84 [05:36<32:35, 27.54s/it]

Iteration 1
Error: Extra data: line 1 column 3442 (char 3441)
Iteration 2


Generating Data...:  17%|█▋        | 14/84 [06:02<31:34, 27.06s/it]

Iteration 1
Error: Extra data: line 1 column 3724 (char 3723)
Iteration 2
Error: Invalid control character at: line 1 column 359 (char 358)
Iteration 3
Error: Extra data: line 1 column 4244 (char 4243)


Generating Data...:  18%|█▊        | 15/84 [06:37<34:00, 29.57s/it]

Iteration 1
Error: Extra data: line 1 column 7073 (char 7072)
Iteration 2


Generating Data...:  19%|█▉        | 16/84 [07:13<35:37, 31.44s/it]

Iteration 1


Generating Data...:  20%|██        | 17/84 [07:25<28:30, 25.53s/it]

Iteration 1


Generating Data...:  21%|██▏       | 18/84 [07:40<24:48, 22.56s/it]

Iteration 1


Generating Data...:  23%|██▎       | 19/84 [07:51<20:36, 19.03s/it]

Iteration 1


Generating Data...:  24%|██▍       | 20/84 [08:01<17:29, 16.40s/it]

Iteration 1


Generating Data...:  25%|██▌       | 21/84 [08:14<16:10, 15.40s/it]

Iteration 1


Generating Data...:  26%|██▌       | 22/84 [08:25<14:32, 14.07s/it]

Iteration 1
Error: Extra data: line 1 column 4055 (char 4054)
Iteration 2
Error: Extra data: line 1 column 6307 (char 6306)
Iteration 3
Sleeping for 60 seconds...


Generating Data...:  27%|██▋       | 23/84 [10:00<38:55, 38.29s/it]

Saving CSV...
Iteration 1
Error: Extra data: line 1 column 5501 (char 5500)
Iteration 2


Generating Data...:  29%|██▊       | 24/84 [10:24<34:02, 34.04s/it]

Iteration 1


Generating Data...:  30%|██▉       | 25/84 [10:42<28:32, 29.02s/it]

Iteration 1


Generating Data...:  31%|███       | 26/84 [10:52<22:34, 23.35s/it]

Iteration 1


Generating Data...:  32%|███▏      | 27/84 [11:06<19:41, 20.72s/it]

Iteration 1
Error: Extra data: line 1 column 7295 (char 7294)
Iteration 2
Error: Extra data: line 1 column 3696 (char 3695)
Iteration 3


Generating Data...:  33%|███▎      | 28/84 [11:39<22:47, 24.42s/it]

Iteration 1


Generating Data...:  35%|███▍      | 29/84 [11:57<20:22, 22.23s/it]

Iteration 1
Error: Extra data: line 1 column 5119 (char 5118)
Iteration 2
Error: Extra data: line 1 column 6871 (char 6870)
Iteration 3
Error: Extra data: line 1 column 6733 (char 6732)


Generating Data...:  36%|███▌      | 30/84 [13:21<36:54, 41.01s/it]

Iteration 1
Error: Extra data: line 1 column 4374 (char 4373)
Iteration 2


Generating Data...:  37%|███▋      | 31/84 [13:50<32:49, 37.16s/it]

Iteration 1


Generating Data...:  38%|███▊      | 32/84 [14:14<28:51, 33.30s/it]

Iteration 1


Generating Data...:  39%|███▉      | 33/84 [14:23<22:06, 26.01s/it]

Iteration 1


Generating Data...:  40%|████      | 34/84 [14:33<17:35, 21.10s/it]

Iteration 1
Sleeping for 60 seconds...


Generating Data...:  42%|████▏     | 35/84 [15:48<30:31, 37.37s/it]

Saving CSV...
Iteration 1


Generating Data...:  43%|████▎     | 36/84 [16:10<26:15, 32.82s/it]

Iteration 1


Generating Data...:  44%|████▍     | 37/84 [16:20<20:18, 25.93s/it]

Iteration 1
Error: Extra data: line 1 column 3800 (char 3799)
Iteration 2


Generating Data...:  45%|████▌     | 38/84 [16:43<19:18, 25.20s/it]

Iteration 1
Error: Extra data: line 1 column 6392 (char 6391)
Iteration 2


Generating Data...:  46%|████▋     | 39/84 [17:24<22:25, 29.91s/it]

Iteration 1


Generating Data...:  48%|████▊     | 40/84 [17:44<19:39, 26.80s/it]

Iteration 1


Generating Data...:  49%|████▉     | 41/84 [18:00<17:01, 23.75s/it]

Iteration 1
Error: Extra data: line 1 column 5890 (char 5889)
Iteration 2
Error: Extra data: line 1 column 7988 (char 7987)
Iteration 3
Error: Extra data: line 1 column 7543 (char 7542)


Generating Data...:  50%|█████     | 42/84 [19:04<25:05, 35.83s/it]

Iteration 1
Error: Extra data: line 1 column 3406 (char 3405)
Iteration 2


Generating Data...:  51%|█████     | 43/84 [19:23<20:59, 30.71s/it]

Iteration 1
Error: Extra data: line 1 column 6795 (char 6794)
Iteration 2


Generating Data...:  52%|█████▏    | 44/84 [20:15<24:39, 36.98s/it]

Iteration 1


Generating Data...:  54%|█████▎    | 45/84 [20:26<18:58, 29.20s/it]

Iteration 1


Generating Data...:  55%|█████▍    | 46/84 [20:45<16:34, 26.17s/it]

Iteration 1
Sleeping for 60 seconds...


Generating Data...:  56%|█████▌    | 47/84 [21:57<24:32, 39.81s/it]

Saving CSV...
Iteration 1


Generating Data...:  57%|█████▋    | 48/84 [22:09<18:57, 31.59s/it]

Iteration 1


Generating Data...:  58%|█████▊    | 49/84 [22:29<16:25, 28.16s/it]

Iteration 1


Generating Data...:  60%|█████▉    | 50/84 [22:43<13:31, 23.86s/it]

Iteration 1


Generating Data...:  61%|██████    | 51/84 [22:58<11:41, 21.25s/it]

Iteration 1


Generating Data...:  62%|██████▏   | 52/84 [23:47<15:47, 29.62s/it]

Iteration 1


Generating Data...:  63%|██████▎   | 53/84 [24:07<13:45, 26.63s/it]

Iteration 1


Generating Data...:  64%|██████▍   | 54/84 [24:21<11:27, 22.91s/it]

Iteration 1


Generating Data...:  65%|██████▌   | 55/84 [24:42<10:41, 22.13s/it]

Iteration 1
Error: Extra data: line 1 column 3133 (char 3132)
Iteration 2


Generating Data...:  67%|██████▋   | 56/84 [25:04<10:20, 22.16s/it]

Iteration 1


Generating Data...:  68%|██████▊   | 57/84 [25:24<09:44, 21.66s/it]

Iteration 1
Sleeping for 60 seconds...


Generating Data...:  69%|██████▉   | 58/84 [26:38<16:10, 37.31s/it]

Saving CSV...
Iteration 1


Generating Data...:  70%|███████   | 59/84 [26:57<13:17, 31.91s/it]

Iteration 1


Generating Data...:  71%|███████▏  | 60/84 [27:10<10:29, 26.24s/it]

Iteration 1


Generating Data...:  73%|███████▎  | 61/84 [27:32<09:33, 24.92s/it]

Iteration 1
Error: Extra data: line 1 column 5713 (char 5712)
Iteration 2
Error: Extra data: line 1 column 5026 (char 5025)
Iteration 3
Error: Extra data: line 1 column 5070 (char 5069)


Generating Data...:  74%|███████▍  | 62/84 [28:28<12:34, 34.30s/it]

Iteration 1


Generating Data...:  75%|███████▌  | 63/84 [28:42<09:52, 28.19s/it]

Iteration 1


Generating Data...:  76%|███████▌  | 64/84 [28:53<07:38, 22.92s/it]

Iteration 1
Error: Extra data: line 1 column 5870 (char 5869)
Iteration 2


Generating Data...:  77%|███████▋  | 65/84 [29:25<08:08, 25.71s/it]

Iteration 1
Error: Extra data: line 1 column 2969 (char 2968)
Iteration 2


Generating Data...:  79%|███████▊  | 66/84 [29:50<07:40, 25.57s/it]

Iteration 1


Generating Data...:  80%|███████▉  | 67/84 [30:07<06:30, 22.95s/it]

Iteration 1


Generating Data...:  81%|████████  | 68/84 [30:17<05:02, 18.93s/it]

Iteration 1
Error: Extra data: line 1 column 3939 (char 3938)
Iteration 2


Generating Data...:  82%|████████▏ | 69/84 [30:43<05:16, 21.10s/it]

Iteration 1
Sleeping for 60 seconds...


Generating Data...:  83%|████████▎ | 70/84 [31:57<08:39, 37.11s/it]

Saving CSV...
Iteration 1


Generating Data...:  85%|████████▍ | 71/84 [32:19<07:03, 32.55s/it]

Iteration 1


Generating Data...:  86%|████████▌ | 72/84 [32:31<05:16, 26.39s/it]

Iteration 1


Generating Data...:  87%|████████▋ | 73/84 [32:41<03:53, 21.24s/it]

Iteration 1
Error: Extra data: line 1 column 6458 (char 6457)
Iteration 2


Generating Data...:  88%|████████▊ | 74/84 [33:10<03:56, 23.64s/it]

Iteration 1


Generating Data...:  89%|████████▉ | 75/84 [33:32<03:29, 23.28s/it]

Iteration 1


Generating Data...:  90%|█████████ | 76/84 [33:45<02:40, 20.01s/it]

Iteration 1


Generating Data...:  92%|█████████▏| 77/84 [34:12<02:35, 22.19s/it]

Iteration 1
Error: Extra data: line 1 column 4204 (char 4203)
Iteration 2
Error: Extra data: line 1 column 3662 (char 3661)
Iteration 3
Error: Extra data: line 1 column 4150 (char 4149)


Generating Data...:  93%|█████████▎| 78/84 [34:44<02:31, 25.22s/it]

Iteration 1
Error: Extra data: line 1 column 8788 (char 8787)
Iteration 2
Error: Extra data: line 1 column 3030 (char 3029)
Iteration 3


Generating Data...:  94%|█████████▍| 79/84 [35:52<03:09, 37.99s/it]

Iteration 1
Error: Extra data: line 1 column 4841 (char 4840)
Iteration 2


Generating Data...:  95%|█████████▌| 80/84 [36:18<02:17, 34.39s/it]

Iteration 1


Generating Data...:  96%|█████████▋| 81/84 [36:35<01:27, 29.25s/it]

Iteration 1
Sleeping for 60 seconds...


Generating Data...:  98%|█████████▊| 82/84 [38:01<01:32, 46.24s/it]

Saving CSV...
Iteration 1


Generating Data...:  99%|█████████▉| 83/84 [38:14<00:36, 36.36s/it]

Iteration 1


Generating Data...: 100%|██████████| 84/84 [38:26<00:00, 27.45s/it]


Unnamed: 0,anchor,positive,negative
0,Une entreprise de services a engagé un avocat ...,Les frais de constitution sont les dépenses li...,Les avances et acomptes sur immobilisations co...
1,Un entrepreneur a dû payer un notaire 3 000 di...,Les frais de constitution sont les dépenses li...,Les pertes sur créances irrécouvrables représe...
2,"Pour répondre aux exigences légales, une socié...",Les frais de constitution sont les dépenses li...,Les immobilisations corporelles en cours compr...
3,L'immatriculation de l'entreprise au registre ...,Les frais de constitution sont les dépenses li...,Les terrains bâtis désignent les parcelles de ...
4,Un expert-comptable a été engagé pour établir ...,Les frais de constitution sont les dépenses li...,Les impôts et taxes directs sont des prélèveme...
...,...,...,...
1668,Un café doit évaluer la valeur de ses stocks a...,Les dotations d'exploitation aux provisions po...,Les ouvrages d'infrastructure sont des constru...
1669,Un distributeur de produits cosmétiques consta...,Les dotations d'exploitation aux provisions po...,Les frais d'acquisition des immobilisations so...
1670,Une entreprise de nettoyage doit évaluer ses c...,Les dotations d'exploitation aux provisions po...,Les redevances de crédit-bail sont les paiemen...
1671,Un vendeur de voitures d'occasion doit ajuster...,Les dotations d'exploitation aux provisions po...,Les charges sociales désignent les contributio...


## Push your data to HuggingFace

In [None]:
!pip install datasets

In [None]:
Final = pd.read_csv("Final.csv")
Final.shape

(1643, 3)

- You need a HuggingFace acces token, to push your data .
  - Go to your Huggingface profile
  - Settings
  - Acces Tokens
  - Create a new token
  - Write
  - Copy the generated token

In [None]:
from datasets import Dataset, DatasetDict
from huggingface_hub import HfFolder


repo_id = "username/Reponame"
auth_token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

In [None]:
#To shuffle the rows of the Final DataFrame before splitting

Final = Final.sample(frac=1, random_state=42).reset_index(drop=True)

test_size = 0.2
val_size = 0.1

test_split = int(len(Final) * test_size)
val_split = int(len(Final) * val_size)

train_df = Final.iloc[:-test_split-val_split]
test_df = Final.iloc[-test_split:]
val_df = Final.iloc[-test_split-val_split:-test_split]

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df),
    "validation": Dataset.from_pandas(val_df)
})

dataset.push_to_hub(repo_id, token=auth_token)


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Noureddinesa/Invoices_embedding_3/commit/16dc23eadb0daa82573a6dc1a2c4321fa9bc727e', commit_message='Upload dataset', commit_description='', oid='16dc23eadb0daa82573a6dc1a2c4321fa9bc727e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Noureddinesa/Invoices_embedding_3', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Noureddinesa/Invoices_embedding_3'), pr_revision=None, pr_num=None)

# Install Necessary Packages

In [1]:
!pip install transformers[torch]



In [2]:
!pip install -U sentence-transformers



In [3]:
!pip install datasets



In [4]:
#https://sbert.net/docs/sentence_transformer/training_overview.html
#https://huggingface.co/BAAI/bge-large-en


In [5]:
from sentence_transformers import SentenceTransformer
import torch


#   Load Dataset

In [6]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("username/datasetname")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
test_dataset = dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


More infromation about the dataset over here: # https://sbert.net/docs/sentence_transformer/training_overview.html#dataset


In [7]:
train_dataset

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 1151
})

In [8]:
train_dataset.to_pandas()
# Anchor: The original sentence or query.
# Positive answer: A correct or relevant response to the anchor.
# Negative answer: An incorrect or irrelevant response to the anchor

Unnamed: 0,anchor,positive,negative
0,Une collectivité locale verse un acompte de 1 ...,Les avances et acomptes sur immobilisations co...,Les achats de matières et fournitures consomma...
1,Une société de sécurité engage un service de t...,Les transports regroupent les frais liés au dé...,Les redevances de crédit-bail sont les paiemen...
2,"Lors de l'importation de boissons gazeuses, l'...",Les impôts et taxes indirects sont des prélève...,Le mobilier de bureau désigne l'ensemble des m...
3,Une entreprise technologique développe un prot...,Les immobilisations corporelles en cours compr...,"Le compte ""6141. Etudes, recherches et documen..."
4,Une entreprise de construction à Essaouira res...,Les charges sociales désignent les contributio...,L'achat de marchandises du groupe B désigne l'...
...,...,...,...
1146,Un terrain pour un centre culturel a été aména...,Les terrains aménagés représentent des parcell...,Les services bancaires désignent les frais ass...
1147,Une société de distribution lance un projet de...,"Le compte ""6141. Etudes, recherches et documen...",Les entretiens et réparations désignent les dé...
1148,Un cabinet d'architecture verse une avance de ...,Les avances et acomptes sur immobilisations co...,Les frais d'acquisition des immobilisations so...
1149,Un hôtel doit payer des taxes sur les spectacl...,"Les impôts, taxes et droits assimilés sont des...","Les frais liés aux opérations de fusion, sciss..."


# Load Model

In [9]:
# 1. Load a model to finetune with 2. (Optional) model card data

#popular embedding models:
#https://huggingface.co/nomic-ai/nomic-embed-text-v1
#https://huggingface.co/BAAI/bge-large-en

model = SentenceTransformer("Alibaba-NLP/gte-multilingual-base",trust_remote_code=True)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Setting up Training Arguments

In [10]:
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers



In [11]:
# 3. Define a loss function
loss = MultipleNegativesRankingLoss(model)


In [12]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/mpnet-base-all-nli-triplet",
    # Optional training parameters:
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=100,
    logging_steps=50,
    run_name="mpnet-base-all-nli-triplet",  # Will be used in W&B if `wandb` is installed
)

# Train

In [13]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss
)


In [14]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmonojob111[0m ([33mmonojob111-jjbnnmn[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
50,0.4665,0.122677
100,0.1407,0.099875
150,0.0839,0.06537
200,0.0607,0.092077
250,0.0383,0.060587
300,0.0567,0.046577
350,0.018,0.054437
400,0.0143,0.050594
450,0.0037,0.049877
500,0.0044,0.059414


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=864, training_loss=0.051894362208528305, metrics={'train_runtime': 1557.2924, 'train_samples_per_second': 2.217, 'train_steps_per_second': 0.555, 'total_flos': 0.0, 'train_loss': 0.051894362208528305, 'epoch': 3.0})

# Test - Model any good?

In [None]:
from sentence_transformers.evaluation import TripletEvaluator

test_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
    name="all-nli-test",
)

test_evaluator(model)


{'all-nli-test_cosine_accuracy': 1.0}

In [15]:

# 8. Save the trained model
model.save_pretrained("models/mpnet-base-all-nli-triplet/checkpoint-864")
auth_token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# 9. (Optional) Push it to the Hugging Face Hub
model.push_to_hub("Model_Name",token=auth_token)

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/Noureddinesa/Invoices_gte-multilingual-base/commit/b0b85eb8671baa926cb4fa2b96e6727655da6629'