### Auteur : Mohamed Lamine OULD BOUYA
# Projet - Chatbot RAG : interroger vos PDF et pages web avec l’IA générative
Objectif : construire un chatbot RAG (Retrieval-Augmented Generation) capable d’extraire le contenu de documents (PDF/sites web), de le vectoriser, puis de répondre de façon contextuelle via un modèle de langue. Une interface Gradio permet de tester le système facilement : charger un document ou une URL, poser une question, obtenir une réponse sourcée.

## 1. Configuration de l'environnement

Installation des bibliothèques nécessaires

In [14]:
import sys
# Display the path of the Python interpreter that is running this kernel.
sys.executable

'c:\\Users\\ouldb\\anaconda3\\python.exe'

In [15]:
%pip install openai==0.28.1 python-dotenv bs4 PyPDF2 requests numpy gradio

Note: you may need to restart the kernel to use updated packages.


In [16]:
# Load the OpenAI API key from the environment (optionally populated from a
# .env file) so that the key is never written in the code.
import os

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is optional: without it the key must already be present
    # in the process environment.
    print("python-dotenv is not installed; using existing environment variables only.")
else:
    try:
        load_dotenv()  # reads variables from a .env file if one is found
    except Exception as exc:
        # Stay best-effort (the key may already be in the environment), but
        # say why the .env file was ignored instead of failing silently.
        print(f"Could not load the .env file: {exc}")

import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

# Optional check that never prints the key itself.
print("Clé OpenAI détectée ?", "oui" if os.getenv("OPENAI_API_KEY") else "non")

Clé OpenAI détectée ? oui


In [17]:
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import PyPDF2
import openai
import gradio as gr
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import PyPDF2
import openai
import gradio as gr

## 2. Modules d'extraction de contenu

Création des fonctions d'extraction du contenu à partir de différentes sources

In [18]:
def scrape_website(url, timeout=10):
    """Download a web page and return its visible text as a single string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely.

    Returns:
        The text fragments of the page, stripped and joined by single
        spaces. If the request or the parsing raises an exception, a message
        starting with ``"Error scraping website: "`` is returned instead.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error responses (404, 500, ...) as failures rather than
        # returning the body of the error page as document content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # <script> and <style> bodies are text nodes too; drop them so that
        # JavaScript/CSS source does not end up in the extracted text.
        for tag in soup(['script', 'style']):
            tag.decompose()
        return ' '.join(soup.stripped_strings)
    except Exception as e:
        return f"Error scraping website: {str(e)}"

def extract_pdf_content(pdf_path):
    """Read a PDF from disk and return the text of all its pages.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The extracted text of every page joined by single spaces. If opening
        or reading the file raises an exception, a message starting with
        ``"Error processing PDF: "`` is returned instead.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Extract page by page, in document order, then stitch together.
            page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
            return ' '.join(page_texts)
    except Exception as error:
        return "Error processing PDF: " + str(error)

## 3. Traitement du texte et génération d'embeddings

Implémentation du découpage du texte et de la génération d'embeddings

In [19]:
def split_into_chunks(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens of ``text``; inside a chunk
    they are re-joined with single spaces, so the original spacing and line
    breaks are not preserved.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() yield nothing and silently discard
    # the whole text; zero would raise an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

def generate_embeddings(text, model="text-embedding-ada-002"):
    """Return the embedding vector of ``text`` computed by the OpenAI API.

    This uses the module-level ``openai.Embedding.create`` interface.

    Args:
        text: The text to embed.
        model: Name of the embedding model. Vectors produced by different
            models should not be compared with each other, so use the same
            model for the document chunks and for the queries.

    Returns:
        The ``embedding`` field of the first item in the API response.
    """
    response = openai.Embedding.create(
        model=model,
        input=text
    )
    return response['data'][0]['embedding']

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors.

    Args:
        vec1: First vector (sequence of numbers or NumPy array).
        vec2: Second vector, of the same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # The similarity is undefined for a zero vector; return 0.0 rather than
    # dividing by zero and propagating NaN into the ranking.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

## 4. Implémentation du cœur du chatbot RAG

In [20]:
class RAGChatbot:
    """Answer questions about one loaded document (web page or PDF).

    The document text is split into chunks and every chunk is embedded.
    Each question is embedded as well, and the single most similar chunk is
    sent to the chat model as context for the answer.
    """

    # Prefixes of the messages that scrape_website / extract_pdf_content
    # return (instead of raising) when extraction fails.
    _EXTRACTION_ERROR_PREFIXES = ("Error scraping website:", "Error processing PDF:")

    def __init__(self):
        # List of {"content": str, "embedding": vector} dicts, or None until
        # a document has been loaded.
        self.chunks_with_embeddings = None

    def load_from_url(self, url):
        """Load content from a website.

        Raises:
            RuntimeError: If the page could not be scraped.
            ValueError: If the page contains no text.
        """
        content = scrape_website(url)
        self._process_content(content)

    def load_from_pdf(self, pdf_path):
        """Load content from a PDF.

        Raises:
            RuntimeError: If the PDF could not be read.
            ValueError: If no text could be extracted from the PDF.
        """
        content = extract_pdf_content(pdf_path)
        self._process_content(content)

    def _process_content(self, content):
        """Split content into chunks and store each chunk with its embedding.

        The previously loaded document is replaced only if every step
        succeeds.

        Raises:
            RuntimeError: If ``content`` is an extraction error message.
            ValueError: If ``content`` contains no text.
        """
        # The extraction helpers report failures as text; without this check
        # the error message itself would be embedded and used as context.
        if content.startswith(self._EXTRACTION_ERROR_PREFIXES):
            raise RuntimeError(content)
        # E.g. a PDF without a text layer: nothing to index, so say so
        # instead of reporting a successful load.
        if not content.strip():
            raise ValueError("No text could be extracted from the document")

        chunks = split_into_chunks(content)
        self.chunks_with_embeddings = [
            {"content": chunk, "embedding": generate_embeddings(chunk)}
            for chunk in chunks
        ]

    def find_relevant_chunk(self, query):
        """Return the text of the chunk most similar to the query.

        Raises:
            ValueError: If no content has been loaded yet.
        """
        # Check before embedding the query so that no API call is spent on a
        # request that cannot be answered.
        if not self.chunks_with_embeddings:
            raise ValueError("No content loaded; call load_from_url or load_from_pdf first")

        query_embedding = generate_embeddings(query)
        best_chunk = max(
            self.chunks_with_embeddings,
            key=lambda chunk: cosine_similarity(query_embedding, chunk["embedding"]),
        )
        return best_chunk["content"]

    def ask(self, query):
        """Answer the query using the most relevant chunk as context.

        Returns:
            The text of the model's reply, or a reminder to load content
            first when no document has been loaded.
        """
        if not self.chunks_with_embeddings:
            return "Please load content first using load_from_url or load_from_pdf"

        relevant_chunk = self.find_relevant_chunk(query)
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant using context to answer questions."},
                {"role": "user", "content": f"Context: {relevant_chunk}\n\nQuery: {query}"}
            ],
            max_tokens=200
        )
        return response['choices'][0]['message']['content']

## 5. Création d'une interface Web avec Gradio

In [21]:
# --- ipython-input-7-7774a7eacb87 ---
class RAGChatbotInterface:
    """Gradio front end for RAGChatbot: load a PDF or a URL, then chat about it."""

    def __init__(self):
        self.chatbot = RAGChatbot()
        # Not used by the methods of this class; the visible conversation is
        # held by the gr.Chatbot component. Kept for backward compatibility.
        self.chat_history = []

    def process_file(self, file):
        """Load an uploaded PDF into the chatbot and return a status message.

        Args:
            file: Either the path of the uploaded file as a string, or an
                object exposing that path as ``.name``.
        """
        if file is None:
            return "Error processing PDF: no file was provided"
        try:
            # Accept a plain path string as well as file-like objects whose
            # path is stored in .name.
            pdf_path = file if isinstance(file, str) else file.name
            self.chatbot.load_from_pdf(pdf_path)
            return "PDF successfully loaded! You can now ask questions."
        except Exception as e:
            return f"Error processing PDF: {str(e)}"

    def process_url(self, url):
        """Load a web page into the chatbot and return a status message."""
        try:
            self.chatbot.load_from_url(url)
            return "Website content successfully loaded! You can now ask questions."
        except Exception as e:
            return f"Error processing URL: {str(e)}"

    def chat(self, message, history):
        """Answer a message and append the exchange to the history.

        Args:
            message: The user's question.
            history: List of ``(question, answer)`` pairs, extended in
                place; ``None`` is treated as an empty history.

        Returns:
            A ``(reply, history)`` tuple, where ``reply`` is the answer or an
            error message if generating the answer failed.
        """
        if history is None:
            history = []
        try:
            response = self.chatbot.ask(message)
            history.append((message, response))
            return response, history
        except Exception as e:
            error_message = f"Error generating response: {str(e)}"
            history.append((message, error_message))
            return error_message, history

    def launch_interface(self, share=True):
        """Build the Gradio interface and launch it.

        Args:
            share: Forwarded to ``interface.launch``.
        """
        def submit_question(message, history):
            """Answer the question and clear the input box."""
            # Ignore blank submissions instead of sending them to the API.
            if not message or not message.strip():
                return "", history or []
            _, updated_history = self.chat(message, history)
            # The first output is the question textbox: return "" to clear
            # it, otherwise the answer would be written into the input box.
            return "", updated_history

        with gr.Blocks(title="RAG Chatbot") as interface:
            gr.Markdown("# 📚 RAG Chatbot: Learn from Any Document")

            with gr.Tab("PDF Input"):
                # type="filepath" so that the handler receives the file path
                pdf_upload = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
                pdf_status = gr.Textbox(label="PDF Status", interactive=False)
                pdf_upload.upload(fn=self.process_file, inputs=[pdf_upload], outputs=[pdf_status])

            with gr.Tab("URL Input"):
                url_input = gr.Textbox(label="Enter Website URL", placeholder="https://example.com")
                url_status = gr.Textbox(label="URL Status", interactive=False)
                url_button = gr.Button("Load Content")
                url_button.click(fn=self.process_url, inputs=[url_input], outputs=[url_status])

            chatbot = gr.Chatbot(label="Chat with Your Document", height=400)
            msg = gr.Textbox(label="Your Question", placeholder="Ask a question about the document...")
            clear = gr.Button("Clear Chat")

            msg.submit(fn=submit_question, inputs=[msg, chatbot], outputs=[msg, chatbot])
            clear.click(lambda: None, None, chatbot, queue=False)

        interface.launch(share=share)

## 6. Lancement de l'interface du chatbot RAG

In [22]:
# Create the interface and launch it.
# share=True requests a public share URL in addition to the local one.
rag_interface = RAGChatbotInterface()
rag_interface.launch_interface(share=True)

  chatbot = gr.Chatbot(label="Chat with Your Document", height=400)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://920a1263bacdea6879.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
