In [None]:
!pip install  spacy faiss-cpu torch transformers huggingface_hub
!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm
!pip uninstall bitsandbytes
!pip install bitsandbytes
!pip install pdfplumber
!pip install langchain
!pip install pandas
!pip install langdetect
!pip install rouge_score


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1
Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m

In [None]:
import os
import spacy
import faiss
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, pipeline
from huggingface_hub import login
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop_words
from spacy.lang.en.stop_words import STOP_WORDS as en_stop_words
import pdfplumber
import re
import math
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
from langdetect import detect
import time

# Connexion à Hugging Face
login(token=os.getenv('HF_TOKEN'))  # Utilise le secret HF_TOKEN

# Configuration de l'environnement
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

class DdbDataLoader:
    def __init__(self, directory_path):
        self.directory_path = directory_path
        self.nlp_fr = spacy.load('fr_core_news_sm')
        self.nlp_en = spacy.load('en_core_web_sm')
        self.finbert_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
        self.finbert_model = BertModel.from_pretrained('yiyanghkust/finbert-tone')
        self.vector_dim = self.finbert_model.config.hidden_size
        self.indices = {}  # Dictionnaire pour stocker les index FAISS par PDF
        self.metadata_by_pdf = {}  # Dictionnaire pour stocker les metadata par PDF
        self.text_chunks_by_pdf = {}  # Dictionnaire pour stocker les chunks par PDF
        self.embeddings_by_pdf = {}  # Dictionnaire pour stocker les embeddings par PDF

        # Initialiser le pipeline avec le modèle Gemma
        self.pipe = pipeline(
            "text-generation",
            model="google/gemma-2-2b-it",
            model_kwargs={"torch_dtype": torch.bfloat16},
            device="cuda",
        )
        # Initialiser le pipeline de traduction
        self.translator = pipeline(
            "translation_en_to_fr",
            model="Helsinki-NLP/opus-mt-en-fr"
        )
        # Initialiser le calculateur ROUGE
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    def get_pdf_names(self):
        return list(self.indices.keys())

    def extract_text_from_pdf(self, file_path):
        text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
        return text

    def clean_text(self, text, lang='fr'):
        cleaned_text = text.lower().strip()
        nlp = self.nlp_fr if lang == 'fr' else self.nlp_en
        doc = nlp(cleaned_text)
        stop_words = fr_stop_words if lang == 'fr' else en_stop_words
        cleaned_tokens = [token.text.lower() for token in doc if token.text.lower() not in stop_words]
        return ' '.join(cleaned_tokens)

    def load_pdfs(self):
        pdf_texts = {}
        for file_name in os.listdir(self.directory_path):
            if file_name.endswith('.pdf'):
                file_path = os.path.join(self.directory_path, file_name)
                print(f"Loading {file_path}...")
                text = self.extract_text_from_pdf(file_path)
                cleaned_text = self.clean_text(text, lang='fr')
                pdf_texts[file_name] = cleaned_text
        return pdf_texts

    def text_to_chunks(self, text, chunk_size=250, chunk_overlap=10):
        # Utiliser LangChain pour diviser le texte en chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,  # Taille maximale de chaque chunk
            chunk_overlap=chunk_overlap,  # Chevauchement entre les chunks pour préserver le contexte
            separators=["\n\n", "\n", ".", " ", ""]
        )
        chunks = text_splitter.split_text(text)
        return chunks

    def embed_chunks_with_finbert(self, text_chunks, batch_size=32):
        all_embeddings = []
        for i in range(0, len(text_chunks), batch_size):
            batch_chunks = text_chunks[i:i + batch_size]
            inputs = self.finbert_tokenizer(batch_chunks, return_tensors='pt', truncation=True, padding=True, max_length=510)

            with torch.no_grad():
                outputs = self.finbert_model(**inputs)

            # Calcul des embeddings (moyenne des états cachés)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

            # S'assurer que chaque embedding a la même dimension
            if len(batch_embeddings.shape) == 1:  # Si un seul vecteur, l'ajouter directement
                batch_embeddings = [batch_embeddings]
            for emb in batch_embeddings:
                if emb.shape == (self.vector_dim,):
                    all_embeddings.append(emb)
                else:
                    print(f"Embedding incorrect trouvé avec la forme: {emb.shape}")

        return np.array(all_embeddings)

    def store_embeddings_and_metadata(self, pdf_name, embeddings, text_chunks, metadata):
        index = faiss.IndexFlatL2(self.vector_dim)
        index.add(embeddings)
        self.indices[pdf_name] = index
        self.metadata_by_pdf[pdf_name] = metadata
        self.text_chunks_by_pdf[pdf_name] = text_chunks
        self.embeddings_by_pdf[pdf_name] = embeddings

    def build_index(self):
        pdf_texts = self.load_pdfs()

        for file_name, cleaned_text in pdf_texts.items():
            text_chunks = self.text_to_chunks(cleaned_text)
            embeddings = self.embed_chunks_with_finbert(text_chunks)

            # Stocker les embeddings, les chunks et les metadata pour chaque PDF séparément
            metadata = [(file_name, idx) for idx in range(len(text_chunks))]
            self.store_embeddings_and_metadata(file_name, embeddings, text_chunks, metadata)

        return self.indices

    def preprocess_query(self, query, lang='fr'):
        cleaned_query = query.lower().strip()
        nlp = self.nlp_fr if lang == 'fr' else self.nlp_en
        doc = nlp(cleaned_query)
        stop_words = fr_stop_words if lang == 'fr' else en_stop_words
        cleaned_tokens = [token.text.lower() for token in doc if token.text.lower() not in stop_words and not token.is_punct]
        return ' '.join(cleaned_tokens)

    def embed_query(self, query):
        inputs = self.finbert_tokenizer(query, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = self.finbert_model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def search_index(self, query_embedding, index, top_k):
        distances, indices = index.search(np.array([query_embedding]), top_k)
        nearest_distances = distances[0]
        nearest_indices = indices[0]
        return nearest_distances, nearest_indices

    def detect_language(self, text):
        try:
            return detect(text)
        except:
            return 'en'  # Retourner 'en' en cas d'erreur de détection

    def calculate_metrics(self, reference_text, generated_text):
            scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
            scores = scorer.score(reference_text, generated_text)

            rouge1_precision = scores['rouge1'].precision
            rouge1_recall = scores['rouge1'].recall
            rouge1_fmeasure = scores['rouge1'].fmeasure

            data = {
                'Metric': ['ROUGE-1 Precision', 'ROUGE-1 Recall', 'ROUGE-1 F-measure'],
                'Score': [rouge1_precision, rouge1_recall, rouge1_fmeasure]
            }
            df = pd.DataFrame(data)
            return df

    def generate_response(self, query, pdf_name, top_k=5):
        start_time = time.time()  # Début de la mesure du temps
        query_language = self.detect_language(query)

        cleaned_query = self.preprocess_query(query, lang=query_language)
        query_embedding = self.embed_query(cleaned_query)

        if pdf_name in self.indices:
            index = self.indices[pdf_name]
            text_chunks = self.text_chunks_by_pdf[pdf_name]
            metadata = self.metadata_by_pdf[pdf_name]
            search_start_time = time.time()

            distances, nearest_indices = self.search_index(query_embedding, index, top_k)
            search_end_time = time.time()

            relevant_chunks = [text_chunks[idx] for idx in nearest_indices]
            relevant_metadata = [metadata[idx] for idx in nearest_indices]

            combined_text = ' '.join(relevant_chunks)

            system_prompt = "You are a QA bot. Given the question, answer it accurately and comprehensively based on the provided information."
            formatted_query = f"Question: {query}\nAnswer:"
            input_text = f"{system_prompt}\n\n{formatted_query}\n\nContext:\n{combined_text}"
            generation_start_time = time.time()

            messages = [
                {"role": "user", "content": input_text},
            ]

            outputs = self.pipe(messages, max_new_tokens=256)
            assistant_response = outputs[0]["generated_text"][-1]["content"].strip()
            generation_end_time = time.time()

            if query_language == 'fr':
                assistant_response_translated = self.translator(assistant_response)[0]['translation_text']
            else:
                assistant_response_translated = assistant_response

            rouge_start_time = time.time()

            rouge_scores = self.rouge_scorer.score(combined_text, assistant_response_translated)

            metrics_df = self.calculate_metrics(combined_text, assistant_response_translated)
            rouge_end_time = time.time()
            # Mesure du temps total
            end_time = time.time()
            total_time = end_time - start_time
            search_time = search_end_time - search_start_time
            generation_time = generation_end_time - generation_start_time
            rouge_time = rouge_end_time - rouge_start_time

            print("Metrics:\n", metrics_df)
            print(f"Total Time: {total_time:.2f} seconds")
            print(f"Search Time: {search_time:.2f} seconds")
            print(f"Generation Time: {generation_time:.2f} seconds")
            print(f"ROUGE Calculation Time: {rouge_time:.2f} seconds")

            return assistant_response_translated
        else:
            return "Le PDF spécifié n'a pas été trouvé."


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os

# Chemin vers le dossier contenant les PDFs dans Google Drive
directory_path = '/content/drive/My Drive/BCT'

# Liste les fichiers dans le dossier
files = os.listdir(directory_path)
print(files)

Mounted at /content/drive
['Cir_2024_06.pdf', 'Cir_2024_05.pdf', 'Cir_2024_04.pdf', 'Cir_2024_02.pdf', 'Cir_2023_06.pdf', 'Cir_2023_03.pdf', 'Cir_2023_01.pdf', 'Cir_2022_01.pdf', 'Cir_2021_05.pdf', 'Cir_2021_03.pdf', 'Cir_2020_19.pdf', 'Circular_Economy_and_Finance.pdf', 'test.pdf', 'testt.pdf', 'Cir_2016_06_fr.pdf']


In [None]:
loader = DdbDataLoader(directory_path=directory_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
loader.build_index()

Loading /content/drive/My Drive/BCT/Cir_2024_06.pdf...
Loading /content/drive/My Drive/BCT/Cir_2024_05.pdf...
Loading /content/drive/My Drive/BCT/Cir_2024_04.pdf...
Loading /content/drive/My Drive/BCT/Cir_2024_02.pdf...
Loading /content/drive/My Drive/BCT/Cir_2023_06.pdf...
Loading /content/drive/My Drive/BCT/Cir_2023_03.pdf...
Loading /content/drive/My Drive/BCT/Cir_2023_01.pdf...
Loading /content/drive/My Drive/BCT/Cir_2022_01.pdf...
Loading /content/drive/My Drive/BCT/Cir_2021_05.pdf...
Loading /content/drive/My Drive/BCT/Cir_2021_03.pdf...
Loading /content/drive/My Drive/BCT/Cir_2020_19.pdf...
Loading /content/drive/My Drive/BCT/Circular_Economy_and_Finance.pdf...
Loading /content/drive/My Drive/BCT/test.pdf...
Loading /content/drive/My Drive/BCT/testt.pdf...
Loading /content/drive/My Drive/BCT/Cir_2016_06_fr.pdf...


{'Cir_2024_06.pdf': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x790ac4139d10> >,
 'Cir_2024_05.pdf': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7907ea88eca0> >,
 'Cir_2024_04.pdf': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7907ea88e430> >,
 'Cir_2024_02.pdf': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7907ebbe44e0> >,
 'Cir_2023_06.pdf': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7907ebbe7510> >,
 'Cir_2023_03.pdf': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7907ebbe78a0> >,
 'Cir_2023_01.pdf': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7907ebbe54d0> >,
 'Cir_2022_01.pdf': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'fais

In [None]:
# Exemple de requête
query = "What is the main idea of this circular?"
cleaned_query = loader.preprocess_query(query)

print("Cleaned Query:", cleaned_query)


# Générer la réponse et afficher les scores BLEU et ROUGE
response = loader.generate_response(query, pdf_name='Circular_Economy_and_Finance.pdf')

# Afficher la réponse générée par le modèle
print(f"\nRéponse générée : {response}")


Cleaned Query: what is the main idea of this circular


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Metrics:
               Metric     Score
0  ROUGE-1 Precision  0.811111
1     ROUGE-1 Recall  0.474026
2  ROUGE-1 F-measure  0.598361
Total Time: 8.76 seconds
Search Time: 0.00 seconds
Generation Time: 8.35 seconds
ROUGE Calculation Time: 0.02 seconds

Réponse générée : The main idea of this circular is to **explore the opportunities within the financial system for the development of circular finance.** 

The circular focuses on the **"3R model of opportunities"** which provides a framework for understanding the triggers for the adoption of circular finance in financial markets. It specifically highlights the **"risk"** as the first array of opportunities within this framework. 

The circular also emphasizes that it is a **new strand of literature** in the field of circular finance, and aims to contribute to its theoretical design and empirical evidence.


In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.43.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<0.113.0 (from gradio)
  Downloading fastapi-0.112.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from gradio)
  Downloading python_multipart-0.0.9-py3-none-any.whl.metadata (2.5 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.6.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting tomlkit==

In [None]:
import gradio as gr
import os
pdf_files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]

# Fonction pour générer une réponse à partir de la requête utilisateur
def chat_with_pdf(history, query, pdf_name):
    if history is None:
        history = []

    response = loader.generate_response(query=query, pdf_name=pdf_name)
    history.append((query, response))
    return history, history

# Définir l'interface Gradio
with gr.Blocks(css=".container {background-color: #FFDAB9;}") as interface:
    gr.Markdown(
        """
        # Chat with your PDF
        Ask a question and get an answer based on the content of the loaded PDFs.        """
    )

    with gr.Row():
        pdf_name_input = gr.Dropdown(
            label="Please select the PDF!",
            choices=pdf_files,
            value=pdf_files[0],
        )

    query_input = gr.Textbox(
        label="Your question",
        placeholder="Please type your question here...",
    )

    submit_button = gr.Button("Ask your question")

    # Affichage de l'historique de la conversation
    conversation_output = gr.Chatbot(label="Conversation")

    # Lier les éléments d'entrée avec la fonction de chat
    submit_button.click(
        chat_with_pdf,
        [gr.State([]), query_input, pdf_name_input],
        [conversation_output, gr.State([])]
    )

    # Agencement de l'interface
    with gr.Row():
        with gr.Column(scale=1):
            query_input
            submit_button

    conversation_output

# Lancez l'interface
interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://0f09284df4bccf34ea.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


