# Define training grid

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
TRAINING_GRID = {
    "base_model": [
        "bert-base-multilingual-cased",
        "bert-base-multilingual-uncased",
    ],
    "class_imbalance": [1, 2, 3],
    "device": ['cpu'],
    "epochs": [4, 6, 8],
    "batch_size": [8]
}

In [3]:
from typing import List
from copy import copy

def generate_hyperparameters_grid(values: dict) -> List[dict]:
  def recursive_parameters_generation(keys: List[str], i: int=0, current_params: dict = {}):
    grid = []
    for value in values[keys[i]]:
      current_params[keys[i]] = value
      if i < len(keys) - 1:
        grid += recursive_parameters_generation(keys=keys, i=i+1, current_params=current_params)
      else:
        grid.append(copy(current_params))
    return grid

  return recursive_parameters_generation(list(values.keys()))

In [4]:
generate_hyperparameters_grid(values= TRAINING_GRID)

[{'base_model': 'bert-base-multilingual-cased',
  'class_imbalance': 1,
  'device': 'cpu',
  'epochs': 4,
  'batch_size': 8},
 {'base_model': 'bert-base-multilingual-cased',
  'class_imbalance': 1,
  'device': 'cpu',
  'epochs': 6,
  'batch_size': 8},
 {'base_model': 'bert-base-multilingual-cased',
  'class_imbalance': 1,
  'device': 'cpu',
  'epochs': 8,
  'batch_size': 8},
 {'base_model': 'bert-base-multilingual-cased',
  'class_imbalance': 2,
  'device': 'cpu',
  'epochs': 4,
  'batch_size': 8},
 {'base_model': 'bert-base-multilingual-cased',
  'class_imbalance': 2,
  'device': 'cpu',
  'epochs': 6,
  'batch_size': 8},
 {'base_model': 'bert-base-multilingual-cased',
  'class_imbalance': 2,
  'device': 'cpu',
  'epochs': 8,
  'batch_size': 8},
 {'base_model': 'bert-base-multilingual-cased',
  'class_imbalance': 3,
  'device': 'cpu',
  'epochs': 4,
  'batch_size': 8},
 {'base_model': 'bert-base-multilingual-cased',
  'class_imbalance': 3,
  'device': 'cpu',
  'epochs': 6,
  'batch_siz

# Build csv

In [5]:
import os
import re

folder_path = "legislatie/"

data = []
pattern = r"(#\d+(\.\d+)?)([\s\S]*?)(?=#\d+(\.\d+)?|$)"

for file in os.listdir(folder_path):
  file_path = os.path.join(folder_path, file)

  with open(file_path, "r", encoding="utf-8") as f:
      content = f.read()

      # Use regex to find all paragraphs that start with (1), (2), etc.
      paragraphs = re.findall(pattern, content)

      # Store each paragraph with the file name

      for match in paragraphs:
        paragraph_text = match[2]
        data.append({"file_name": file, "paragraph": paragraph_text.strip()})

# Convert to DataFrame
paragraphs = pd.DataFrame(data)

# Display the DataFrame
print(paragraphs)

    file_name                                          paragraph
0   art10.txt  Este interzisă circulaţia pe drumurile publice...
1   art10.txt  Constatarea deficienţelor vehiculelor se face ...
2   art12.txt  Vehiculele care circulă pe drumurile publice t...
3   art12.txt  Pentru a circula pe drumurile publice, vehicul...
4   art12.txt  Vehiculele care nu sunt supuse înmatriculării ...
5   art12.txt  Este interzisă conducerea pe drumurile publice...
6   art13.txt  Autovehiculele şi remorcile se înmatriculează ...
7   art13.txt  Sunt exceptate de la prevederile alin. (1) mop...
8   art13.txt  Autovehiculele, remorcile și tractoarele agric...
9   art13.txt  Autovehiculele și remorcile destinate a fi tra...
10  art13.txt  Până la înmatriculare, vehiculele prevăzute la...
11  art13.txt  La cerere, instituțiilor din sistemul de apăra...
12  art13.txt  Pot beneficia de autorizații și numere pentru ...
13  art13.txt  Autorizația de circulație pentru probe este va...
14  art13.txt  Evidența v

In [6]:

questions = ["Care sunt echipamentele obligatorii pe care trebuie să le aibă un autovehicul pentru a putea circula pe drumurile publice?",
             "Ce condiție trebuie să îndeplinească autovehiculele, remorcile și tramvaiele pentru a fi înmatriculate, înregistrate sau admise în circulație?",\
             "Care sunt categoriile de vehicule exceptate de la obligația de omologare?",
             "Care sunt categoriile de vehicule exceptate de la obligația de omologare?",
             "Cine stabilește categoriile de vehicule care pot fi admise în circulație fără omologare?",
             "Ce document atestă omologarea unui vehicul?",
             "Unde se efectuează inspecția tehnică periodică a vehiculelor?",
             "Unde se efectuează inspecția tehnică periodică a vehiculelor?",
             "Ce categorii de vehicule au interdicția de a circula pe drumurile publice?",
             "Cine are responsabilitatea de a constata deficiențele vehiculelor și de a verifica starea tehnică a acestora în trafic?",
             "Care sunt categoriile de vehicule exceptate de la obligația de înmatriculare sau înregistrare pentru a circula pe drumurile publice?",
             "Ce obligație au vehiculele înmatriculate sau înregistrate pentru a putea circula pe drumurile publice?",
             "În ce condiții pot circula pe drumurile publice vehiculele care nu sunt supuse înmatriculării sau înregistrării?",
             "Unde se înmatriculează autovehiculele și remorcile și în ce condiții?",
             "Ce categorii de vehicule sunt exceptate de la obligația de înmatriculare?",
             "La ce instituții se înregistrează autovehiculele, remorcile și tractoarele agricole sau forestiere din dotarea unor instituții de stat?",
             "În ce condiții pot circula vehiculele înainte de a fi înmatriculate?",
             "Cine poate beneficia de autorizații și numere pentru probe pentru vehiculele care se supun înmatriculării?",
             "Cine ține evidența vehiculelor înmatriculate și pe ce criteriu teritorial?",
             "La ce autorități se înregistrează tramvaiele, troleibuzele, mopedele, tractoarele agricole sau forestiere și alte vehicule menționate?",
             "La ce autorități se înregistrează tramvaiele, troleibuzele, mopedele, tractoarele agricole sau forestiere și alte vehicule menționate?",
             "În ce condiții pot autoritățile din domeniul apărării, ordinii publice și securității naționale să prelucreze date din Registrul de evidență a vehiculelor înregistrate?"]


pgs = [paragraphs.iloc[0].paragraph,
       paragraphs.iloc[16].paragraph,
       paragraphs.iloc[17].paragraph,
       paragraphs.iloc[18].paragraph,
       paragraphs.iloc[19].paragraph,
       paragraphs.iloc[20].paragraph,
       paragraphs.iloc[22].paragraph,
       paragraphs.iloc[23].paragraph,
       paragraphs.iloc[14].paragraph,
       paragraphs.iloc[15].paragraph,
       paragraphs.iloc[10].paragraph,
       paragraphs.iloc[11].paragraph,
       paragraphs.iloc[12].paragraph,
       paragraphs.iloc[1].paragraph,
       paragraphs.iloc[2].paragraph,
       paragraphs.iloc[3].paragraph,
       paragraphs.iloc[5].paragraph,
       paragraphs.iloc[7].paragraph,
       paragraphs.iloc[9].paragraph,
       paragraphs.iloc[24].paragraph,
       paragraphs.iloc[25].paragraph,
       paragraphs.iloc[26].paragraph]


In [7]:
dataset = pd.DataFrame()
dataset['paragraphs'] = pgs
dataset['questions'] = questions

In [8]:
dataset.to_csv("dataset.csv", sep='#')

In [8]:
from tqdm import tqdm
from typing import List, Tuple, Union, Optional
import ast
from sentence_transformers import util, InputExample
from sklearn.model_selection import train_test_split

class ReRankingDataLoader:
  def __init__(
        self,
        dict_ds: pd.DataFrame,
        raw_ds: pd.DataFrame,
        class_imbalance: int = 1
    ):
        self.dict_ds = dict_ds
        self.raw_ds = raw_ds
        self.class_imbalance = class_imbalance

  def load_data(self) -> Union[pd.DataFrame, Tuple[List[InputExample], List[InputExample]]]:
    dataset  = []
    for i in tqdm(range(len(self.raw_ds))):
        question_row = self.raw_ds.iloc[i]
        descs = question_row.paragraphs
        if not descs:
            continue
        sample = pd.DataFrame()
        sample['description'] = [descs]

        sample['question'] = question_row['questions']
        sample['Score'] = 1


        negative_questions = self.dict_ds.loc[~self.dict_ds.paragraph.isin(list(sample.description))]
        if self.class_imbalance:
            negative_questions = negative_questions.sample(self.class_imbalance * len(sample))

        negative_questions = negative_questions[['paragraph']]
        negative_questions=negative_questions.rename(columns={'paragraph':'description'  })
        negative_questions['question'] = question_row['questions']
        negative_questions['Score'] = 0
        sample = pd.concat([sample, negative_questions], ignore_index=True)
        dataset.append(sample)
    dataset = pd.concat(dataset).reset_index(drop=True)
    print(dataset.columns)

    train, test = train_test_split(dataset, test_size=0.15, random_state=42)
    train_samples = []
    for i in tqdm(range(len(train))):
        current_sample = train.iloc[i]

        train_samples.append(
            InputExample(
                texts=[current_sample["description"], current_sample["question"]],
                label=current_sample["Score"],
            )
        )
    dev_samples = []
    for i in tqdm(range(len(test))):
        current_sample = test.iloc[i]

        dev_samples.append(
            InputExample(
                texts=[current_sample["description"], current_sample["question"]],
                label=current_sample["Score"],
            )
        )


    return (train_samples, dev_samples)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [9]:
loader = ReRankingDataLoader(
    dict_ds=paragraphs,
    raw_ds=dataset,
    class_imbalance=1)

# train_samples, dev_samples = loader.load_data()
train_samples, dev_samples = loader.load_data()

100%|██████████| 22/22 [00:00<00:00, 500.10it/s]


Index(['description', 'question', 'Score'], dtype='object')


100%|██████████| 37/37 [00:00<00:00, 26793.72it/s]
100%|██████████| 7/7 [00:00<00:00, 18157.16it/s]


In [11]:
print(len(train_samples))

37


# Model

In [10]:
from torch.utils.data import DataLoader
import math
from dataclasses import dataclass

@dataclass
class ReRankerConfig:
    base_model: str
    train_data: Union[List[InputExample], DataLoader]
    dev_data: List[InputExample]
    device: str
    epochs: int
    batch_size: int
    class_imbalance: int
    warmup_steps: Optional[int] = None

    def __post_init__(self):
        if not isinstance(self.train_data, DataLoader):
            self.train_data = DataLoader(
                self.train_data, shuffle=True, batch_size=self.batch_size
            )
        if not self.warmup_steps:
            self.warmup_steps = math.ceil(len(self.train_data) * self.epochs * 0.1)

    def to_dict(self):
        return {
            "base_model": self.base_model,
            "train_size": len(self.train_data) * self.batch_size,
            "dev_size": len(self.dev_data),
            "device": self.device,
            "epochs": self.epochs,
            "batch_size": self.batch_size,
            "warmup_steps": self.warmup_steps,
            "class_imbalance": self.class_imbalance
        }

In [11]:
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


class ReRanker:
    def __init__(self, config: ReRankerConfig) -> None:
        self.__config = config
        self.device = config.device
        self.epochs = config.epochs
        self.warmup_steps = config.warmup_steps
        self.class_imbalance = config.class_imbalance
        self.trained = False
        self.model = CrossEncoder(config.base_model, num_labels=1, device=self.device)
        self.evaluator = CEBinaryClassificationEvaluator.from_input_examples(config.dev_data, name=f"{config.base_model} re-ranker tables")
        self.prediction_threshold = 0.5

    def __call__(self, x: List[List[str]]) -> np.array:
        """
        The predict function
        @param x: The sample for the model to do predictions on. It should be a list of lists containing the pair `[candidate, query]`
        @retruns np.array: Returns an array with the probability for each pair to be a match
        """
        if self.trained:
            return self.model.predict(x).tolist()
        else:
            raise Exception(
                "Model is not trained! Train before making predictions!"
            )

    @property
    def config(self):
        return self.config.to_dict()

    def fit(
        self,
        train_dataset: List[InputExample] = None,
        test_dataset: List[InputExample] = None,
        output_path: str = None,
        ) -> Union[None, dict]:
        """
        The main training function
        @param train_dataset: The list of InputExamples used to train the model
        @param test_dataset: The list of InputExamples used to evaluate the model. If `None` is provided then no evaluation will occur.
        @param output_path: The path where the model weights should be saved locally. If `None` is provided then the model weigths will not be saved.
        @returns Union[None, dict]: Returns either `None` or the results from evaluation
        """

        if train_dataset:
            train = train_dataset
        else:
            train = self.__config.train_data

        if test_dataset:
            test = test_dataset
        else:
            test = self.__config.dev_data

        self.model.fit(
            train_dataloader=train,
            evaluator=self.evaluator,
            evaluation_steps=math.ceil(len(train) * 0.5),
            warmup_steps=self.warmup_steps,
            output_path=output_path,
        )
        self.trained = True

        if test:
            return self.evaluate(test)

    def evaluate(self, dataset: List[InputExample]):
        evaluation_inputs = [t.texts for t in dataset]
        evaluation_outputs = [t.label for t in dataset]

        predictions = self.model.predict(evaluation_inputs) > self.prediction_threshold

        return {
            "accuracy_score": accuracy_score(evaluation_outputs, predictions),
            "f1_score": f1_score(evaluation_outputs, predictions),
            "precision_score": precision_score(evaluation_outputs, predictions),
            "recall_score": recall_score(evaluation_outputs, predictions),
        }

# Training

In [12]:
import mlflow

mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment("Reranker Experiment")

results_df = pd.DataFrame()
for config in generate_hyperparameters_grid(TRAINING_GRID):

    loader = ReRankingDataLoader(
        dict_ds=paragraphs,
        raw_ds=dataset,
        class_imbalance=config["class_imbalance"])

    train_samples, dev_samples = loader.load_data()

    train_config = ReRankerConfig(
        base_model=config["base_model"],
        train_data=train_samples,
        dev_data=dev_samples,
        device=config["device"],
        epochs=config["epochs"],
        batch_size=config["batch_size"],
        class_imbalance=config["class_imbalance"]
    )
    with mlflow.start_run() as run:
        model=ReRanker(train_config)
        mlflow.log_params(train_config.to_dict())
        metrics = model.fit()
        mlflow.pyfunc.log_model(
            "reranking_crossencoder",
            python_model=model,
            input_example=[["Reference sentence", "Validation sentence"]],
        )
        mlflow.log_metrics(metrics)
        metrics["run_id"] = run.info.run_id

    print("\n------------------------Model Configurations ---------------------------")
    print("Model:", train_config.base_model,
      "\nDevice:",  train_config.device,
      "\nEpochs:", train_config.epochs,
      "\nBatch Size:", train_config.batch_size,
      "\nClass Imbalance:", train_config.class_imbalance)
    print("\nMetrics: ", metrics)
    metrics_df = pd.DataFrame([metrics])
    config_df = pd.DataFrame([config])
    run_df = pd.concat([config_df, metrics_df], axis=1)
    results_df = pd.concat([results_df, run_df])
    break

100%|██████████| 22/22 [00:00<00:00, 530.94it/s]


Index(['description', 'question', 'Score'], dtype='object')


100%|██████████| 37/37 [00:00<00:00, 24547.49it/s]
100%|██████████| 7/7 [00:00<00:00, 9464.90it/s]
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Do

🏃 View run rogue-tern-514 at: http://mlflow:5000/#/experiments/363813354049388447/runs/c88c693ef0a44916be3e14979e9a72ec
🧪 View experiment at: http://mlflow:5000/#/experiments/363813354049388447

------------------------Model Configurations ---------------------------
Model: bert-base-multilingual-cased 
Device: cpu 
Epochs: 4 
Batch Size: 8 
Class Imbalance: 1

Metrics:  {'accuracy_score': 0.42857142857142855, 'f1_score': 0.3333333333333333, 'precision_score': 1.0, 'recall_score': 0.2, 'run_id': 'c88c693ef0a44916be3e14979e9a72ec'}


In [13]:
model(["Care sunt dotarile necesare pentru autovehicule?", paragraphs.iloc[0].paragraph])

0.4791676700115204

In [19]:
!pip install \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  huggingface-hub==0.16.4

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
!pip install accelerate==0.31.0

Collecting accelerate==0.31.0
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
loaded_model = mlflow.pyfunc.load_model("models:/ReRanker Demo@production")

Downloading artifacts: 100%|██████████| 7/7 [00:09<00:00,  1.42s/it] 


In [16]:
loaded_model(["Care sunt dotarile necesare pentru autovehicule?", paragraphs.iloc[0].paragraph])

TypeError: 'PyFuncModel' object is not callable

In [24]:
print(loaded_model.metadata.get_input_schema().inputs)

[Array(string) (required)]


In [28]:
loaded_model._model_impl.predict([["Care sunt dotarile necesare pentru autovehicule?", paragraphs.iloc[0].paragraph]])

[0.4765057861804962]