In [1]:
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install unsloth langchain chromadb pdfplumber python-dotenv pandas transformers
!pip install -U langchain-community
!pip install sentence-transformers
!pip install bitsandbytes
!pip install accelerate
!pip install einops
!pip install numpy
# Import required libraries
import torch
import csv
import pandas as pd
from datasets import load_dataset
from transformers import TrainingArguments, DataCollatorForSeq2Seq, pipeline
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template, train_on_responses_only
from trl import SFTTrainer
from langchain.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import uuid


## Load LLM

In [2]:
# 1. First load model and tokenizer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=2048,
    load_in_4bit=torch.cuda.is_available(),
    # Add flash attention for optimization
    attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,
)

# 2. Then apply chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

# 3. Prepare for inference
model = FastLanguageModel.for_inference(model)

# 4. Verify template application
test_messages = [{"role": "user", "content": "Hello"}]
try:
    tokenizer.apply_chat_template(test_messages)
except Exception as e:
    raise ValueError("Chat template application failed!") from e

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Embedding model and vector database

In [3]:
# Define the method name
!pip install pdfplumber
!pip install chromadb
# method = "FineTuning + RAG"

# Load the PDF file
loader = PDFPlumberLoader("FAQ.pdf")
pages = loader.load()

# Split the text for better context continuity
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=256, separators=["\n\n", "\n", ". ", "! ", "? ", " "])
chunks = text_splitter.split_documents(pages)

# Replace the existing embedding function with:
embedding_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={
        "normalize_embeddings": True,
        "batch_size": 32  # Better for multilingual processing
    },
)

# Function to create vectorstore with normalized embeddings
def create_vectorstore(chunks, embedding_function, vectorstore_path):
    # Add explicit Chroma client configuration
    from chromadb.config import Settings
    client_settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=vectorstore_path,
        anonymized_telemetry=False,
    )

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        client_settings=client_settings,  # Add this line
        persist_directory=vectorstore_path,
        collection_metadata={
            "hnsw:space": "cosine",
            "dimension": 384
        },
    )
    vectorstore.persist()  # Explicitly persist the data
    return vectorstore

vectorstore_path = "/content/vectorstore"  # Use absolute path
import os
import shutil
import time

# Completely remove existing directory with proper permissions
if os.path.exists(vectorstore_path):
    shutil.rmtree(vectorstore_path, ignore_errors=True)
    time.sleep(2)  # Increase wait time


# Create fresh directory with write permissions
os.makedirs(vectorstore_path, exist_ok=True, mode=0o777)
# Set directory permissions explicitly
os.chmod(vectorstore_path, 0o777)
# Now create the vectorstore
try:
    vectorstore = create_vectorstore(
        chunks=chunks,
        embedding_function=embedding_function,
        vectorstore_path=vectorstore_path
    )
except Exception as e:
    print(f"Error creating vectorstore: {e}")
    # Fallback to in-memory
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=None
    )



system_message = {
    "role": "system",
    "content": """You are an official advisor at IMT Mines Alès.
Answer STRICTLY using only this context: \n{context}
Rules:
1. If answer isn't in context, say "I don't have that information".
2. Keep responses under 3 sentences.
3. Never mention you're an AI.
"""
}







  embedding_function = HuggingFaceEmbeddings(


Error creating vectorstore: [91mYou are using a deprecated configuration of Chroma.

[94mIf you do not have data you wish to migrate, you only need to change how you construct
your Chroma client. Please see the "New Clients" section of https://docs.trychroma.com/deployment/migration.
________________________________________________________________________________________________

If you do have data you wish to migrate, we have a migration tool you can use in order to
migrate your data to the new Chroma architecture.
Please `pip install chroma-migrate` and run `chroma-migrate` to migrate your data and then
change how you construct your Chroma client.

See https://docs.trychroma.com/deployment/migration for more information or join our discord at https://discord.gg/MMeYNTmh3x for help![0m


## Hybrid Retrieval

In [4]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
!pip install rank_bm25
# Create semantic retriever
semantic_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
# texts = [doc.page_content for doc in chunks]

# Create keyword retriever
# metadatas = [{"source": doc.metadata.get("source", "unknown")} for doc in chunks]
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = 2

# Combine retrievers
ensemble_retriever = EnsembleRetriever(
    retrievers=[semantic_retriever, bm25_retriever],
    weights=[0.6, 0.4]
)



In [5]:

def validate_response(response: str, context: str) -> str:
    # First check: Empty response
    if not response.strip():
        return "I don't have that information."

    # Second check: Direct contradiction
    validation_pipe = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=0 if torch.cuda.is_available() else -1,
    )

    result = validation_pipe(
        sequences=context,
        candidate_labels=["relevant", "irrelevant"],
        hypothesis_template="This context supports the response: '{}'",
        multi_label=False,
    )

    if result['labels'][0] == "irrelevant":
        return "I cannot confirm this information."

    return response


In [6]:
from transformers import StoppingCriteria

class StopOnTokens(StoppingCriteria):
    def __init__(self, stop_token_ids):
        self.stop_token_ids = stop_token_ids

    def __call__(self, input_ids, scores, **kwargs):
        return any(token in self.stop_token_ids for token in input_ids[0][-3:])

# Add this right after the class definition:
stop_token_ids = [tokenizer.eos_token_id] + [
    tokenizer.encode(token, add_special_tokens=False)[-1]  # Get last token ID
    for token in ["\nReferences:", "\nSource:"]
]

## Interactive chat

In [7]:
# Add these imports at the top
from typing import Dict, List
from langchain.schema import HumanMessage, AIMessage

# 1. Conversation State Manager
class ChatSession:
    def __init__(self):
        self.history: List[Dict] = []
        self.context_window = 2048  # Match model's max_seq_length

    def add_interaction(self, question: str, answer: str):
        self.history.extend([
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ])

    def get_recent_history(self, token_limit: int = 512) -> str:
        """Return truncated conversation history"""
        history_text = "\n".join(
            f"{msg['role'].capitalize()}: {msg['content']}"
            for msg in self.history[-4:]  # Last 2 exchanges
        )
        return self._truncate_text(history_text, token_limit)

    def _truncate_text(self, text: str, max_tokens: int) -> str:
        tokens = tokenizer.encode(text, add_special_tokens=False)
        return tokenizer.decode(tokens[-max_tokens:], skip_special_tokens=True)

# 2. Enhanced RAG Pipeline with History
def interactive_rag_pipeline(question: str, session: ChatSession) -> str:
    # Incorporate conversation history in retrieval
    combined_query = f"{session.get_recent_history()}\n{question}"
    recent_history = session.get_recent_history()
    # Retrieve documents with history context
    docs = ensemble_retriever.invoke(combined_query)
    print(f"Retrieved \n{docs}")
    context = "\n".join([d.page_content for d in docs[:3]])

    # Build messages with history
    messages = [
        system_message.copy(),
        *session.history,
        {"role": "user", "content": question}
    ]
    messages[0]["content"] = messages[0]["content"].replace("{context}", context)
    print(f"Messages: {messages}")
    # Generate response
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to("cuda")
    print(f"debugging: {prompt}")
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=256,
        temperature=0.5,
        stopping_criteria=[StopOnTokens(stop_token_ids)],
        pad_token_id=tokenizer.eos_token_id,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    clean_response = response.split("assistant<|end_header_id|>")[-1].split("<|eot_id|>")[0].strip()

    # Validate and format
    final_response = validate_response(clean_response, context)
    session.add_interaction(question, final_response)

    return final_response



## Gradio

In [8]:
!pip install gradio
import gradio as gr
from typing import List, Tuple
def gradio_chat_interface(message: str, history: List[Tuple[str, str]]):
    """Handles chat interactions"""
    if not hasattr(gr, "session"):
        gr.session = ChatSession()

    if message.lower() == "/reset":
        gr.session = ChatSession()
        return "History reset!"

    response = interactive_rag_pipeline(message, gr.session)
    return response

# Simplified ChatInterface without newer parameters
demo = gr.ChatInterface(
    fn=gradio_chat_interface,
    title="IMT Mines Ales RAG Assistant",
    description="Ask me anything! Type /reset to clear history.",
    examples=["Where is IMT Mines Ales?", "Explain quantum physics"],
    theme="soft"
)

if __name__ == "__main__":
    demo.launch()



  self.chatbot = Chatbot(


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7467da35f55b6f4561.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
