In [29]:
import pandas as pd

In [30]:
from transformers import AutoTokenizer, AutoModel
import torch

In [31]:
import os

In [32]:
from tqdm import tqdm

In [33]:
tokenizer = AutoTokenizer.from_pretrained('antoinelouis/biencoder-camembert-base-mmarcoFR')
model = AutoModel.from_pretrained('antoinelouis/biencoder-camembert-base-mmarcoFR')

In [34]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [35]:
def get_embeddings(chunck, model, tokenizer):
    """
    Get the embedding of a passage
    :param chunck: the passage
    :param model: the model
    :param tokenizer: the tokenizer
    :return:
    """
    # Tokenize sentences
    encoded_input = tokenizer(chunck, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    chunk_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return chunk_embeddings.view(-1).numpy().tolist()

In [36]:
def read_file(filename):
    """
    To read the file
    :param filename: string the name of the file
    :return: string  the content of the file
    """
    with open(f'judilibre_json/data/{filename}') as file:
        content = file.read()
    return content

In [37]:
def split_text_into_passages(text, model, tokenizer, max_chars_per_section=128, prev_include=2):
    """
    Split text into passage with maximum number of char per passage and number of char of the previous
    passage to include in the following
    :param text:  string the text to split
    :param max_chars_per_section: int the max number of char per passage
    :param prev_include: int number of char of the previous to include in the following
    :return:
    """
    passages = []
    current_passage = []
    passage_ends = []
    lines = text.splitlines()
    for i in tqdm(range(len(lines))):
        line = lines[i]
        words = line.split()
        for j, word in enumerate(words):
            if len(" ".join(current_passage)) + len(word) + 1 <= max_chars_per_section:
                current_passage.append(word)
            else:
                passages.append(" ".join(current_passage))
                passage_ends.append(i + 1)
                if prev_include > 0:
                    current_passage = passages[-1].split()[-prev_include:]
    if current_passage:
        passages.append(" ".join(current_passage))
        passage_ends.append(len(lines))

    chunks = list(range(1, len(passages) + 1))

    df = pd.DataFrame({'chunck': chunks, 'line': passage_ends, 'passage': passages})
    df['embedding'] = df['passage'].apply(lambda chunck: get_embeddings(chunck, model, tokenizer))
    return df


In [38]:
def get_all_tsv_file():
    """
    Get the name of all data file
    :return:
    """
    path = 'judilibre_json/data'
    tsv_files = []
    if os.path.exists(path) and os.path.isdir(path):
        tsv_files = [f for f in os.listdir(path) if f.endswith(".tsv")]
    return tsv_files

In [39]:
def create_table(model, tokenizer, max_chars_per_section=128, prev_include=2):
    all_tsv_files = get_all_tsv_file()
    all_dfs = []
    for i in tqdm(range(len(all_tsv_files))):
        filename = all_tsv_files[i]
        id_dec, _ = os.path.splitext(filename)
        text = read_file(filename)
        df = split_text_into_passages(text, model, tokenizer, max_chars_per_section, prev_include)
        n_rows, _ = df.shape
        df.insert(0, 'id_dec', [id_dec] * n_rows)
        all_dfs.append(df)

    df_judilibre_v = pd.concat(all_dfs, ignore_index=True)
    df_judilibre_v.to_csv(f'judilibre_v/judilibre_v_{len(all_tsv_files)}.tsv', index=False)


In [40]:
create_table(model, tokenizer)

  0%|          | 0/30 [00:00<?, ?it/s]
100%|██████████| 89/89 [00:00<00:00, 125561.07it/s]
  3%|▎         | 1/30 [00:04<01:59,  4.11s/it]
100%|██████████| 137/137 [00:00<00:00, 108911.99it/s]
  7%|▋         | 2/30 [00:10<02:37,  5.63s/it]
100%|██████████| 137/137 [00:00<00:00, 14692.02it/s]
 10%|█         | 3/30 [00:15<02:14,  4.99s/it]
100%|██████████| 92/92 [00:00<00:00, 131833.27it/s]
 13%|█▎        | 4/30 [00:18<01:55,  4.46s/it]
100%|██████████| 48/48 [00:00<00:00, 105021.70it/s]
 17%|█▋        | 5/30 [00:20<01:30,  3.63s/it]
100%|██████████| 115/115 [00:00<00:00, 124895.12it/s]
 20%|██        | 6/30 [00:25<01:33,  3.90s/it]
100%|██████████| 112/112 [00:00<00:00, 147538.33it/s]
 23%|██▎       | 7/30 [00:29<01:30,  3.93s/it]
100%|██████████| 56/56 [00:00<00:00, 161319.38it/s]
 27%|██▋       | 8/30 [00:30<01:10,  3.22s/it]
100%|██████████| 68/68 [00:00<00:00, 146638.91it/s]
 30%|███       | 9/30 [00:33<01:02,  3.00s/it]
100%|██████████| 186/186 [00:00<00:00, 151601.35it/s]
 33%|███▎

In [41]:
df = pd.read_csv('judilibre_v/judilibre_v_30.tsv')

In [42]:
df

Unnamed: 0,id_dec,chunck,line,passage,embedding
0,JURITEXT6163873c947dd77ae6de0264,1,5,Grosses délivrées REPUBLIQUE FRANCAISE aux par...,"[0.19662752747535706, 0.1327817291021347, -0.1..."
1,JURITEXT6163873c947dd77ae6de0264,2,8,"4 ARRET 16 FEVRIER 2011 (n° 54, 5 pages) Numér...","[0.002567474963143468, 0.2334602177143097, -0...."
2,JURITEXT6163873c947dd77ae6de0264,3,12,Cour : du 15 Avril 2009 Tribunal de Commerce d...,"[-0.049415867775678635, -0.13155849277973175, ..."
3,JURITEXT6163873c947dd77ae6de0264,4,16,poursuites et de son représentant légal [Adres...,"[0.04500657320022583, 0.02714894898235798, 0.2..."
4,JURITEXT6163873c947dd77ae6de0264,5,18,"Cour assistée Me LE ROC'H Armelle, avocat au b...","[-0.03509717434644699, 0.0044215163215994835, ..."
...,...,...,...,...,...
3225,JURITEXT63ff0297002ac605de15b669,182,154,le fondement l'article 700 du code de procédur...,"[-0.08968550711870193, 0.11280424892902374, 0...."
3226,JURITEXT63ff0297002ac605de15b669,183,155,dépens en d'appel. Prononcé publiquement par m...,"[-0.08342866599559784, -0.1942174732685089, -0..."
3227,JURITEXT63ff0297002ac605de15b669,184,156,ayant été avisées dans les conditions prévues ...,"[0.09881575405597687, 0.22274042665958405, -0...."
3228,JURITEXT63ff0297002ac605de15b669,185,156,"par Madame Charbonnier, Conseillère faisant fo...","[0.10842771083116531, 0.048915620893239975, 0...."
