In [15]:
!pip install langchain
!pip install sentence-transformers
!pip install PyPDF2
!pip install pinecone-client
!pip install transformers
!pip install fpdf



In [16]:
import os
from PyPDF2 import PdfReader
import math
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
)
from getpass import getpass
import torch
from torch.cuda.amp import autocast

In [17]:
# Function to read the PDF content
def read_pdf(file_path):
    """
    Reads text from a PDF file.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Combined text of all pages.
    """
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text.strip()

In [18]:
# Function to split content into chunks
def create_chunks(text, chunk_size, overlap):
    """
    Splits text into chunks with overlapping context.

    Args:
        text (str): The input text.
        chunk_size (int): The maximum size of each chunk (characters).
        overlap (int): The number of overlapping characters between chunks.

    Returns:
        list: List of text chunks.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        word_length = len(word) + 1
        if current_length + word_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            overlap_words = (
                current_chunk[-math.ceil(overlap / len(current_chunk)) :]
                if current_chunk
                else []
            )
            current_chunk = overlap_words
            current_length = sum(len(w) + 1 for w in current_chunk)

        current_chunk.append(word)
        current_length += word_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [19]:
file_path = "/content/AFFAIRE C.P. ET M.N. c. FRANCE.pdf"
document_text = read_pdf(file_path)
chunks = create_chunks(document_text, chunk_size=3000, overlap=500)

In [20]:
len(chunks)

17

In [21]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Enter your Hugging Face API token: ")

Enter your Hugging Face API token: ··········


In [22]:
model_name = "facebook/mbart-large-50-many-to-one-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

In [23]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [24]:
def translate_chunks(chunks, src_lang, batch_size):
    """
    Translates a list of text chunks into English using MBart50.

    Args:
        chunks (list): List of text chunks in the source language.
        src_lang (str): Source language code.
        batch_size (int): Number of chunks to process in a single batch.

    Returns:
        list: Translated text chunks in English.
    """
    translations = []
    tokenizer.src_lang = src_lang

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

        with autocast():
            generated_tokens = model.generate(
                **encoded_input,
                max_length=128,
                num_beams=1
            )

        # Decode translations
        batch_translations = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        translations.extend(batch_translations)

    return translations

In [25]:
# Chunk translation
translated_chunks = translate_chunks(chunks, src_lang="fr_XX", batch_size=8)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  with autocast():


In [26]:
for i, translation in enumerate(translated_chunks, 1):
    print(f"Chunk {i} Translation:\n{translation}\n")

Chunk 1 Translation:
PART FIVE Case C.P. and M.N. v. FRANCE (Applications Nos. 56513/17 and 56515/17) JUDGMENT Art 8 Privacy and family life Refusal by the domestic courts to examine the action of the applicant, claiming to be the biological father of a child, challenging the legally established paternity with a view to establishing his own, in application of the rules for calculating the five-year limitation period combined with the obligation to bring the child before the courts in the case Applicant having failed

Chunk 2 Translation:
The applicant’s former partner had recognised the latter before his birth on 4 December 2007. JUDGMENT C.P. and M.N. v. FRANCE 25. At the beginning of March 2012, the applicant left her former partner and entered into a civil solidarity pact (PACS) with the applicant on 14 March 2012. 6. On 12 December 2012, the applicant applied to the Family Court (JAF) for the measures relating to the two children to be fixed and requested the establishment of an al

In [43]:
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

In [44]:
pc = Pinecone(api_key=getpass("Enter your Pinecone API token: "))

Enter your Pinecone API token: ··········


In [49]:
index_name = "legal-llm"
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

index = pc.Index(index_name)

In [50]:
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

embeddings = embedding_model.encode(translated_chunks)

for i, chunk in enumerate(translated_chunks):
    index.upsert(vectors=[(f"doc_{i}", embeddings[i], {"text": chunk})])

print("Translated chunks embedded and stored in Pinecone.")

Translated chunks embedded and stored in Pinecone.
