1. Install Required Packages and Import Libraries

In [None]:
# Installer les packages requis
!pip install faiss-cpu langchain_community sentence-transformers huggingface-hub

# Imports
import os
import warnings

import pandas as pd
from huggingface_hub import login  # This was missing!
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from transformers import (
    AutoModelForQuestionAnswering,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)
warnings.filterwarnings('ignore')

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dote

2. Authenticate with Hugging Face and Load Dataset

In [None]:
# Read the Hugging Face token from the environment instead of hardcoding it,
# so the secret is never saved together with the notebook.
# Create a token at https://huggingface.co/settings/tokens and export it as HF_TOKEN.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()

# Only log in when a token is available.
if HF_TOKEN:
    try:
        login(token=HF_TOKEN)
        print("✅ Hugging Face authentication successful")
    except Exception as e:
        print(f"⚠️ Authentication failed: {e}")
else:
    print("⚠️ No HF_TOKEN provided, using public models only")


def build_documents(frame):
    """Add a ``combined_content`` column and convert every row to a document.

    Args:
        frame: DataFrame with ``context`` and ``instruction`` columns.

    Returns:
        A ``(enriched_frame, documents)`` tuple: a new frame carrying the
        ``combined_content`` column (the input frame is not modified), and the
        documents produced by ``DataFrameLoader.load()`` with that column as
        page content.
    """
    context = frame["context"].fillna("").astype(str)
    instruction = frame["instruction"].fillna("").astype(str)
    # Strip so rows with an empty context do not start with a stray space.
    combined = (context + " " + instruction).str.strip()
    enriched = frame.assign(combined_content=combined)
    loader = DataFrameLoader(enriched, page_content_column="combined_content")
    return enriched, loader.load()


# Load Dataset
print("📊 Loading dataset...")
try:
    url = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl"
    df = pd.read_json(url, lines=True)
    print(f"✅ Dataset loaded: {len(df)} documents")

    # Use both context and instruction for better content
    df, documents = build_documents(df)
    print(f"✅ Documents processed: {len(documents)}")

except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    # Best-effort fallback: a tiny in-memory sample with the same two columns,
    # so the rest of the notebook can still run.
    sample_data = {
        'context': [
            "Cheesemaking is the process of producing cheese from milk. It involves coagulating milk proteins to form curds and whey.",
            "Kubernetes is an open-source container orchestration platform that automates deployment, scaling, and management of containerized applications.",
            "Machine learning is a subset of artificial intelligence that enables systems to learn from data without being explicitly programmed."
        ],
        'instruction': [
            "Explain the process of cheesemaking",
            "What are the benefits of using Kubernetes?",
            "Define machine learning"
        ]
    }
    df, documents = build_documents(pd.DataFrame(sample_data))
    print(f"✅ Using sample dataset: {len(documents)} documents")


⚠️ No HF_TOKEN provided, using public models only
📊 Loading dataset...
✅ Dataset loaded: 15011 documents
✅ Documents processed: 15011


3. Split Documents into Chunks

In [None]:
# Break the loaded documents into overlapping chunks: at most 1000 characters
# each (length measured with len), with 150 characters shared between neighbours.
print("📝 Splitting documents into chunks...")
splitter_options = dict(chunk_size=1000, chunk_overlap=150, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
docs = text_splitter.split_documents(documents)
print(f"✅ Created {len(docs)} document chunks")


📝 Splitting documents into chunks...
✅ Created 19410 document chunks


4. Generate Embeddings

In [None]:
def _build_embeddings(model_id):
    """Create a HuggingFaceEmbeddings for ``model_id`` on CPU, without normalization."""
    return HuggingFaceEmbeddings(
        model_name=model_id,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )


print("🔗 Creating embeddings...")
try:
    # Preferred model
    modelPath = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = _build_embeddings(modelPath)
    print(f"✅ Using embedding model: {modelPath}")
except Exception as e:
    # Any failure with the preferred model switches to the alternative one;
    # an error raised here is not caught.
    print(f"⚠️ Fallback to smaller model due to: {e}")
    modelPath = "sentence-transformers/paraphrase-MiniLM-L3-v2"
    embeddings = _build_embeddings(modelPath)
    print(f"✅ Using fallback embedding model: {modelPath}")

# Smoke test: embed one short sentence and report the vector length.
test_text = "This is a test document."
try:
    query_result = embeddings.embed_query(test_text)
    print(f"✅ Embedding test successful, dimension: {len(query_result)}")
except Exception as e:
    print(f"❌ Embedding test failed: {e}")

🔗 Creating embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Using embedding model: sentence-transformers/all-MiniLM-L6-v2
✅ Embedding test successful, dimension: 384


5. Create a Vector Store (FAISS)

In [None]:
# Embed every chunk and index the vectors in an in-memory FAISS store.
print("🗄️ Creating vector store...")
try:
    db = FAISS.from_documents(docs, embeddings)
    print("✅ FAISS vector store created successfully")
except Exception as e:
    print(f"❌ Error creating vector store: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas raising stops the run here and keeps the traceback and state.
    raise

🗄️ Creating vector store...
✅ FAISS vector store created successfully


6. Load and Wrap the LLM

In [None]:
# Load a local generative model and wrap it for LangChain.
#
# The model must be a text-generating one: the retrieval chain sends a single
# prompt string through HuggingFacePipeline, while an extractive
# "question-answering" pipeline expects separate question/context inputs and
# therefore fails on every query.
print("🤖 Loading language model...")
try:
    model_name = "google/flan-t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # NOTE(review): truncation is intentionally not enabled here; it would cut
    # the end of an over-long prompt, which is presumably where the chain puts
    # the question. Confirm prompt lengths against the model's input limit.
    qa_pipeline = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256
    )

    llm = HuggingFacePipeline(pipeline=qa_pipeline)
    print(f"✅ Using local model: {model_name}")

except Exception as e:
    print(f"⚠️ Local model failed: {e}")
    print("🔄 Trying Hugging Face Hub model...")

    try:
        # Imported lazily: only needed when the local model cannot be loaded.
        from langchain_community.llms import HuggingFaceHub
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-small",
            huggingfacehub_api_token=HF_TOKEN,
            model_kwargs={"temperature": 0.7, "max_length": 512}
        )
        print("✅ Using Hugging Face Hub model: google/flan-t5-small")
    except Exception as e2:
        print(f"❌ Both models failed: {e2}")
        print("Please check your Hugging Face token or internet connection")
        # Re-raise instead of calling exit(): in a notebook exit() shuts the
        # kernel down, whereas raising stops the run and keeps the traceback.
        raise

🤖 Loading language model...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Device set to use cuda:0


✅ Using local model: distilbert-base-cased-distilled-squad


7. Build Retrieval QA Chain

In [None]:
# Combine the FAISS retriever (top 3 chunks per query) with the LLM into a
# RetrievalQA chain that also returns the retrieved source documents.
print("🔗 Building retrieval QA chain...")
try:
    retriever = db.as_retriever(search_kwargs={"k": 3})
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # Changed from "refine" to "stuff" for better compatibility
        retriever=retriever,
        return_source_documents=True,
        verbose=True
    )
    print("✅ QA chain created successfully")
except Exception as e:
    print(f"❌ Error creating QA chain: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas raising stops the run here and keeps the traceback and state.
    raise

🔗 Building retrieval QA chain...
✅ QA chain created successfully


8. Test the System

In [None]:
print("\n🧪 Testing the system...")


def ask_question(question):
    """Send ``question`` to the module-level ``qa`` chain and print the outcome.

    Prints the question, then the answer and the first 200 characters of the
    first source document when one is returned. Any exception raised while
    querying or printing is reported on stdout instead of propagating.

    Args:
        question: The query text passed to the chain under the ``"query"`` key.

    Returns:
        The chain's result mapping, or ``None`` if an exception occurred.
    """
    print(f"\n❓ Question: {question}")
    try:
        result = qa.invoke({"query": question})
        print(f"✅ Answer: {result['result']}")

        has_sources = bool(result.get('source_documents'))
        if has_sources:
            print(f"📚 Source: {result['source_documents'][0].page_content[:200]}...")
    except Exception as e:
        print(f"❌ Error: {e}")
        result = None
    return result

# Smoke-test the pipeline with a few questions covered by the indexed data.
test_questions = [
    "What is cheesemaking?",
    "What are the main advantages of using Kubernetes?",
    "How does machine learning work?"
]

# ask_question returns None when a query fails, so collect the results to
# avoid announcing success after a run in which questions errored out.
test_results = [ask_question(question) for question in test_questions]
failed_questions = [
    question
    for question, result in zip(test_questions, test_results)
    if result is None
]

if failed_questions:
    print(f"\n❌ {len(failed_questions)} of {len(test_questions)} test questions failed; see the errors above.")
else:
    print("\n🎉 RAG system setup complete!")
    print("You can now use ask_question('Your question here') to query the system.")


🧪 Testing the system...

❓ Question: What is cheesemaking?


[1m> Entering new RetrievalQA chain...[0m
❌ Error: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

The goal of cheese making is to control the spoiling of milk into cheese. The milk is traditionally from a cow, goat, sheep or buffalo, although, in theory, cheese could be made from the milk of any mammal. Cow's milk is most commonly used worldwide. The cheesemaker's goal is a consistent product with specific characteristics (appearance, aroma, taste, texture). The process used to make a Camembert will be similar to, but not quite the same as, that used to make Cheddar.

Some cheeses may be deliberately left to ferment from naturally airborne spores and bacteria; this approach generally leads to a less consistent product but one that is valuable in a niche market.

What is Kraft Dinner?

Culturing
Cheese is mad