In [None]:
# Mount Google Drive at /content/drive; the data, index and result paths used
# by later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install \
    langchain langchain-core langchain-community \
    transformers sentence-transformers \
    faiss-cpu \
    tqdm \
    -U deepeval



In [None]:
import os
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_core.documents import Document
import json
import torch
from langchain_core.documents import Document
import json

In [None]:
# Step 1: Document Loading
def load_documents(directory_path):
    """Load the text files found under ``directory_path``.

    Files are selected with the glob pattern ``**/*.txt`` and read with
    ``TextLoader`` through a ``DirectoryLoader``.

    Args:
        directory_path: Directory handed to ``DirectoryLoader``.

    Returns:
        The list produced by the loader. This is a best-effort helper: if
        anything goes wrong the error is printed and an empty list is
        returned instead of raising.
    """
    try:
        found = DirectoryLoader(
            directory_path,
            glob="**/*.txt",
            loader_cls=TextLoader,
        ).load()
        print(f"Loaded {len(found)} documents from {directory_path}")
    except Exception as err:
        print(f"Error loading documents: {err}")
        found = []
    return found

In [None]:
# Step 2: Document Splitting
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Cut ``documents`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split (passed to ``split_documents`` of the
            splitter unchanged).
        chunk_size: ``chunk_size`` forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        The chunks returned by the splitter. The chunk count is printed.
    """
    pieces = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_documents(documents)
    print(f"Split into {len(pieces)} chunks")
    return pieces

In [None]:
# Step 3: Create Embeddings and Vector Store
def create_vectorstore(chunks, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed ``chunks`` and build an in-memory FAISS vectorstore from them.

    The index is NOT written to disk here; use ``save_vectorstore`` for that.

    Args:
        chunks: Non-empty collection of document chunks to index.
        embedding_model_name: Hugging Face model name used for the embeddings.

    Returns:
        A ``(vectorstore, embeddings)`` tuple.

    Raises:
        ValueError: If ``chunks`` is empty. ``load_documents`` returns ``[]``
            when loading fails, so an empty input is rejected up front with a
            clear message before the embedding model is loaded.
        Exception: Any error raised while creating the embeddings or the
            index is printed and re-raised unchanged.
    """
    if not chunks:
        raise ValueError(
            "Cannot create a FAISS vectorstore from an empty list of chunks; "
            "check that documents were loaded and split correctly."
        )

    try:
        # Initialize the embedding model (CPU-friendly)
        embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        # Build the FAISS index in memory
        vectorstore = FAISS.from_documents(chunks, embeddings)
        print(f"Created FAISS vectorstore with {len(chunks)} documents")

        return vectorstore, embeddings
    except Exception as e:
        print(f"Error creating vectorstore: {e}")
        raise

In [None]:
# Step 4: Save and Load FAISS Index
def save_vectorstore(vectorstore, path="faiss_index"):
    """Persist ``vectorstore`` under ``path`` by calling its ``save_local``.

    Best-effort: a failure is reported on stdout and swallowed, so this
    function always returns ``None``.

    Args:
        vectorstore: Object exposing ``save_local(path)``.
        path: Destination passed to ``save_local``.
    """
    try:
        vectorstore.save_local(path)
        outcome = f"Saved FAISS index to {path}"
    except Exception as err:
        outcome = f"Error saving vectorstore: {err}"
    print(outcome)

In [None]:
def load_vectorstore(path="faiss_index", embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Read a FAISS vectorstore back from ``path``.

    Args:
        path: Location previously written by ``save_vectorstore``.
        embedding_model_name: Embedding model name; it should be the one the
            index was built with.

    Returns:
        A ``(vectorstore, embeddings)`` tuple.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    embedding_options = dict(
        model_name=embedding_model_name,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )
    try:
        embeddings = HuggingFaceEmbeddings(**embedding_options)

        # NOTE: allow_dangerous_deserialization=True opts in to a load path the
        # library itself labels as dangerous; only point `path` at index files
        # you created yourself.
        vectorstore = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
        print(f"Loaded FAISS index from {path}")

        return vectorstore, embeddings
    except Exception as err:
        print(f"Error loading vectorstore: {err}")
        raise

In [None]:
# Step 5: Load CPU-friendly Language Model
def load_cpu_friendly_llm(model_id="PleIAs/Pleias-RAG-350M"):
    """Load a causal language model on the CPU and wrap it for LangChain.

    Args:
        model_id: Hugging Face model identifier for both tokenizer and model.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline.

    Raises:
        Exception: Any failure while loading is printed and re-raised.
    """
    try:
        print(f"Loading language model: {model_id}")

        # Load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            # "gpu" is not a valid device name; this loader targets the CPU
            # (float32 weights, see below), so place the whole model on "cpu".
            device_map="cpu",
            torch_dtype=torch.float32,  # Use float32 on CPU
            low_cpu_mem_usage=True
        )

        # Create text generation pipeline
        # NOTE(review): temperature/top_p normally only take effect when
        # sampling is enabled (do_sample=True) — confirm whether greedy
        # decoding is what is wanted here.
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.15
        )

        # Wrap the pipeline in HuggingFacePipeline for LangChain
        llm = HuggingFacePipeline(pipeline=pipe)
        print("Language model loaded successfully")

        return llm
    except Exception as e:
        print(f"Error loading language model: {e}")
        raise

In [None]:
# Step 6: Optional Reranking Function
def add_reranking(retriever, model_name="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3):
    """Return a retriever that reranks ``retriever``'s hits with a cross-encoder.

    LangChain retrievers are pydantic models, so assigning a replacement
    ``get_relevant_documents`` on the instance is rejected ("object has no
    field ..."). Instead of monkey-patching, this builds a small
    ``BaseRetriever`` subclass that delegates to ``retriever`` and reranks the
    result, so the returned object can be used like any other retriever
    (``.invoke(...)``, LCEL pipes).

    Args:
        retriever: Base retriever that supplies the candidate documents.
        model_name: Cross-encoder model name used for scoring.
        top_n: Number of best-scoring documents to keep.

    Returns:
        The reranking retriever. If a dependency cannot be imported or the
        cross-encoder cannot be loaded, a message is printed and the original
        ``retriever`` is returned unchanged.
    """
    try:
        from langchain_core.retrievers import BaseRetriever
        from sentence_transformers import CrossEncoder

        print(f"Adding reranking with model: {model_name}")
        # Load cross-encoder model
        cross_encoder = CrossEncoder(model_name)

        class _RerankingRetriever(BaseRetriever):
            """Delegate retrieval to the wrapped retriever, then rerank."""

            def _get_relevant_documents(self, query, *, run_manager=None):
                candidates = retriever.invoke(query)
                if not candidates:
                    return []

                # MS MARCO cross-encoders score (query, passage) pairs, in
                # that order.
                pairs = [(query, doc.page_content) for doc in candidates]
                scores = cross_encoder.predict(pairs)

                # Best score first; sort on the score only so documents are
                # never compared with each other.
                ranked = sorted(
                    zip(candidates, scores),
                    key=lambda scored: scored[1],
                    reverse=True,
                )
                return [doc for doc, _ in ranked[:top_n]]

        reranking_retriever = _RerankingRetriever()
        print("Reranking added successfully")

        return reranking_retriever
    except ImportError:
        print("Warning: sentence-transformers not installed. Reranking not added.")
        print("Install with: pip install sentence-transformers")
        return retriever
    except Exception as e:
        print(f"Error adding reranking: {e}")
        return retriever  # Return original retriever on error

In [None]:
# Step 7: Create QA Chain
def create_qa_chain(vectorstore, llm, use_reranking=False, max_context_docs=4):
    """Build a question-answering callable that also returns its sources.

    Args:
        vectorstore: Vector store exposing ``as_retriever``.
        llm: LangChain-compatible language model.
        use_reranking: When true, 10 candidates are fetched and passed through
            ``add_reranking``; otherwise 4 documents are fetched.
        max_context_docs: Upper bound on the number of retrieved documents put
            into the prompt (``None`` disables the cap). ``add_reranking``
            returns the plain retriever when reranking cannot be set up, which
            would otherwise push all 10 candidates into the prompt.

    Returns:
        A function ``qa_with_sources(query)`` returning
        ``{"result": <answer str>, "source_documents": <docs used>}``.
        ``query`` may be a string, or a dict (its ``"query"`` entry is used if
        present, otherwise its first value).

    Raises:
        Exception: Any error while building the chain is printed and re-raised.
    """
    try:
        # Create retriever
        retriever = vectorstore.as_retriever(
            search_kwargs={"k": 10 if use_reranking else 4}
        )

        # Add reranking if requested
        if use_reranking:
            retriever = add_reranking(retriever)

        # Create template for the prompt
        template = """Answer the question based on the following context:

        Context: {context}

        Question: {question}

        Answer: """

        prompt = ChatPromptTemplate.from_template(template)

        # Format the documents
        def format_docs(docs):
            return "\n\n".join([doc.page_content for doc in docs])

        # Retrieval is done explicitly in qa_with_sources, so the chain only
        # covers prompt -> model -> text.
        answer_chain = prompt | llm | StrOutputParser()

        def extract_query(query):
            """Normalise the accepted query shapes to a single value."""
            if not isinstance(query, dict):
                return str(query)
            if "query" in query:
                return query["query"]
            if not query:
                raise ValueError("Query dict is empty; expected a 'query' entry")
            # Fall back to the first value of the dict.
            return str(next(iter(query.values())))

        # Function to preserve the source documents
        def qa_with_sources(query):
            query_str = extract_query(query)

            # Retrieve once, so the reported sources are exactly the documents
            # the answer was generated from.
            docs = retriever.invoke(query_str)[:max_context_docs]
            answer = answer_chain.invoke(
                {"context": format_docs(docs), "question": query_str}
            )
            return {"result": answer, "source_documents": docs}

        print("QA chain created successfully")
        return qa_with_sources  # Return a function that preserves docs
    except Exception as e:
        print(f"Error creating QA chain: {e}")
        raise

In [None]:

# --- Configuration for the demo run ---
query = "What are the common symptoms of bipolar disorder?"
document_directory = "/content/drive/MyDrive/filtered_dataset"

index_path = "/content/drive/MyDrive/Traitement automatique des langues/RAG/faiss_index"
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
llm_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
use_reranking = True

# Vector store: build and persist it only when no saved index exists yet.
if not os.path.exists(index_path):
    print(f"No index found. Creating a new FAISS index from {document_directory}...")
    documents = load_documents(document_directory)
    chunks = split_documents(documents)
    vectorstore, embeddings = create_vectorstore(chunks, embedding_model)
    save_vectorstore(vectorstore, index_path)
else:
    print(f"Using existing FAISS index: {index_path}")
    vectorstore, embeddings = load_vectorstore(index_path, embedding_model)

# Language model and QA callable.
llm = load_cpu_friendly_llm(llm_model)
qa_chain = create_qa_chain(vectorstore, llm, use_reranking)

# Run the query.
print(f"\nProcessing query: {query}")
result = qa_chain({"query": query})

# Show the answer followed by a short preview of every source document.
print("\nAnswer:")
print(result["result"])
print("\nSources:")
for position, doc in enumerate(result["source_documents"], start=1):
    print(f"Source {position}:")
    print(f"  Document path: {doc.metadata.get('source', 'Unknown')}")
    print(f"  Content preview: {doc.page_content[:150]}...")
    print()

# Keep a copy of the question and answer on Drive.
answer_path = "/content/drive/MyDrive/Traitement automatique des langues/RAG/answer.txt"
with open(answer_path, "w", encoding="utf-8") as answer_file:
    answer_file.write(f"{query}\n{result['result']}")

Using existing FAISS index: /content/drive/MyDrive/Traitement automatique des langues/RAG/faiss_index


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded FAISS index from /content/drive/MyDrive/Traitement automatique des langues/RAG/faiss_index
Loading language model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


Language model loaded successfully
Adding reranking with model: cross-encoder/ms-marco-MiniLM-L-2-v2


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/62.5M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Error adding reranking: "VectorStoreRetriever" object has no field "get_relevant_documents"
QA chain created successfully

Processing query: What are the common symptoms of bipolar disorder?


  docs = retriever.get_relevant_documents(query_str)
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.



Answer:
Human: Answer the question based on the following context:

        Context: Depressive episodes 

Symptoms of the depressive phase of bipolar disorder include persistent feelings of sadness, irritability or anger, loss of interest in previously enjoyed activities, excessive or inappropriate guilt, hopelessness, sleeping too much or not enough, changes in appetite and/or weight, fatigue, problems concentrating, self-loathing or feelings of worthlessness, and thoughts of death or suicide. Although the DSM-5 criteria for diagnosing unipolar and bipolar episodes are the same, some clinical features are more common in the latter, including increased sleep, sudden onset and resolution of symptoms, significant weight gain or loss, and severe episodes after childbirth.

symptoms with bipolar disorder include attention deficit hyperactivity disorder, personality disorders, schizophrenia, and substance use disorder as well as many other medical conditions. Medical testing is not requir

-----------Eval----------

In [None]:
from transformers import pipeline
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import AnswerRelevancyMetric, ContextualPrecisionMetric
from deepeval.evaluate import evaluate
from deepeval.metrics.base_metric import BaseMetric
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import Optional, List

In [None]:
# Step 1: Load documents and the matching vector store for the evaluation
document_directory = "/content/drive/MyDrive/Traitement automatique des langues/RAG/Filtered/filtered_dataset"
documents = load_documents(document_directory)
chunks = split_documents(documents)
index_path = "/content/drive/MyDrive/Traitement automatique des langues/RAG/Filtered/faiss_index_filtered"
if os.path.exists(index_path):
    print(f"Using existing FAISS index: {index_path}")
    vectorstore, _ = load_vectorstore(index_path)
else:
    # Without this branch `vectorstore` would be undefined on a fresh kernel,
    # or silently be the index an earlier cell built from a different corpus.
    print(f"No index found. Creating a new FAISS index from {document_directory}...")
    vectorstore, _ = create_vectorstore(chunks)
    save_vectorstore(vectorstore, index_path)
llm = load_cpu_friendly_llm()
qa_chain = create_qa_chain(vectorstore, llm, use_reranking=True)




Loaded 463 documents from /content/drive/MyDrive/Traitement automatique des langues/RAG/Filtered/filtered_dataset
Split into 6970 chunks
Using existing FAISS index: /content/drive/MyDrive/Traitement automatique des langues/RAG/Filtered/faiss_index_filtered
Loaded FAISS index from /content/drive/MyDrive/Traitement automatique des langues/RAG/Filtered/faiss_index_filtered
Loading language model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


Device set to use cpu


Language model loaded successfully
Adding reranking with model: cross-encoder/ms-marco-MiniLM-L-2-v2
Error adding reranking: "VectorStoreRetriever" object has no field "get_relevant_documents"
QA chain created successfully


In [None]:
# Step 2: Use model to generate answers
# Use the first GPU when one is available and fall back to the CPU (-1)
# otherwise, so this cell also runs on a CPU-only runtime.
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-multilingual",
    device=0 if torch.cuda.is_available() else -1,
)

# def generate_synthetic_qa(context):
#     q_prompt = (
#     f"Based on the following text about mental health or psychological disorders, "
#     f"generate a clinically relevant question that could be answered by the text:\n{context}"
# )

#     a_prompt = (
#     f"The following text discusses topics related to mental health or psychological conditions. "
#     f"Based on it, answer the question below in a medically accurate and concise way:\n{context}"
# )

def _generate_text(prompt, max_new_tokens):
    """Run ``generator`` on ``prompt`` and return only the newly generated text."""
    text = generator(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
    # Some pipelines return the prompt followed by the continuation; drop the
    # echoed prompt so it does not end up being used as the "question".
    if text.startswith(prompt):
        text = text[len(prompt):]
    return text.strip()


def generate_synthetic_qa(context):
    """Generate a synthetic (question, reference answer) pair for ``context``.

    The module-level ``generator`` pipeline is called twice: first to produce
    a question about ``context`` (up to 50 new tokens), then to answer that
    question from ``context`` (up to 80 new tokens). Prompts are in French.

    Args:
        context: Source text the question and answer are based on.

    Returns:
        A ``(question, answer)`` tuple of whitespace-stripped strings.
    """
    q_prompt = (
        f"À partir du texte suivant sur la santé mentale ou les troubles psychologiques, "
        f"générez une question cliniquement pertinente qui pourrait être répondue par ce texte :\n{context}"
    )

    a_prompt = (
        f"Le texte suivant traite de sujets liés à la santé mentale ou à des troubles psychologiques. "
        f"En vous basant sur ce contenu, répondez à la question ci-dessous de manière médicalement précise et concise :\n{context}"
    )

    question = _generate_text(q_prompt, max_new_tokens=50)
    answer = _generate_text(f"{a_prompt}\nQuestion: {question}", max_new_tokens=80)

    return question, answer

Device set to use cuda:0


In [None]:
# Step 3: construct test samples
def _make_test_case(chunk):
    """Build one test case: synthetic Q/A for ``chunk`` plus the RAG answer."""
    context_text = chunk.page_content
    question, reference_answer = generate_synthetic_qa(context_text)
    rag_result = qa_chain({"query": question})["result"]
    return LLMTestCase(
        input=question,
        context=[context_text],
        expected_output=reference_answer,
        actual_output=rag_result,
    )


# Only the first five chunks are used for the evaluation set.
samples = [_make_test_case(chunk) for chunk in chunks[:5]]

synthetic_dataset = EvaluationDataset(test_cases=samples)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
# Step 4: define evaluation metric
class EmbeddingSimilarityMetric(BaseMetric):
    """Cosine similarity between a generated answer and its supporting context.

    Both texts are embedded with a ``SentenceTransformer`` model; the score is
    the cosine similarity of the two embeddings and the test case passes when
    the score is at least ``threshold``.

    After ``measure`` the outcome is also available on the instance as
    ``score``, ``success`` and ``reason``, and through ``is_successful()``,
    in addition to the original ``is_pass`` / ``name`` / ``rationale`` helpers.
    """

    def __init__(self, threshold=0.75,
                 model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.threshold = threshold
        self.model = SentenceTransformer(model_name)
        # Filled in by measure().
        self.score = None
        self.success = None
        self.reason = None

    def measure(self, test_case: LLMTestCase):
        """Score ``test_case`` and record the outcome on the metric.

        Raises:
            ValueError: If the test case has no ``context`` or no
                ``actual_output`` to compare.
        """
        if not test_case.context:
            raise ValueError(
                "EmbeddingSimilarityMetric requires a non-empty `context` on the test case"
            )
        if test_case.actual_output is None:
            raise ValueError(
                "EmbeddingSimilarityMetric requires `actual_output` on the test case"
            )

        # The context entries are joined into a single text before embedding.
        context = " ".join(test_case.context)
        emb = self.model.encode([test_case.actual_output, context],
                                normalize_embeddings=True)
        score = float(cosine_similarity(emb[:1], emb[1:])[0][0])

        self.score = score
        self.success = self.is_pass(score)
        self.reason = self.rationale()
        return score

    async def a_measure(self, test_case: LLMTestCase):
        """Async entry point; delegates to the synchronous ``measure``."""
        return self.measure(test_case)

    def is_successful(self) -> bool:
        """Whether the most recent ``measure`` call met the threshold."""
        return bool(self.success)

    def is_pass(self, score: float) -> bool:
        """Whether ``score`` meets the configured threshold."""
        return score >= self.threshold

    def name(self):
        """Identifier of this metric."""
        return "embedding_similarity"

    @property
    def __name__(self):
        # Same identifier as name(), exposed as the metric's display name.
        return "embedding_similarity"

    def rationale(self) -> str:
        """Human-readable description of what the score represents."""
        return ("Cosine similarity between generated answer and its "
                "supporting context")

In [None]:
# evaluate: one similarity score per sample, in sample order
metric = EmbeddingSimilarityMetric(threshold=0.75)
results = [metric.measure(test_case) for test_case in samples]

In [None]:
# --- Output ---
import csv

# Show the first sample, followed by the scores of all samples.
first_sample = samples[0]
print("Question:", first_sample.input)
print("Expected Answer:", first_sample.expected_output)
print("RAG Answer:", first_sample.actual_output)
print("Similarity Score:", results)

csv_file = "/content/drive/MyDrive/Traitement automatique des langues/RAG/evaluation_results.csv"
csv_header = ["Sample Index", "Question", "Expected Answer", "RAG Answer", "Similarity Score"]

# One row per sample (1-based index), score rounded to 4 decimals. The rows
# are produced lazily, so they are written one at a time.
csv_rows = (
    [
        number,
        sample.input,
        sample.expected_output,
        sample.actual_output,
        round(results[number - 1], 4),
    ]
    for number, sample in enumerate(samples, start=1)
)

with open(csv_file, mode="w", newline='', encoding="utf-8") as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(csv_header)
    csv_writer.writerows(csv_rows)

print(f"Results：{csv_file}")

Question: À partir du texte suivant sur la santé mentale ou les troubles psychologiques, générez une question cliniquement pertinente qui pourrait être répondue par ce texte :
La grossophobie est un néologisme désignant l'ensemble des attitudes et des comportements hostiles qui stigmatisent et discriminent les personnes grosses, en surpoids ou obèses. Elle a pour origine des préjugés et des stéréotypes négatifs selon lesquels le fait d'être gros est une question de volonté personnelle et que les personnes grosses seraient ainsi les seules responsables de leur surpoids, en négligeant les autres facteurs à l'origine du surpoids. La grossophobie peut être définie de manière statistique comme une tendance à l'inégalité de santé et de comportement.
2 Annexe n°7 : Série Renseignements généraux du patient
« Au moins une autre mesure
Expected Answer: Le texte suivant traite de sujets liés à la santé mentale ou à des troubles psychologiques. En vous basant sur ce contenu, répondez à la question