In [1]:
# ============================================================================
# CELL 1: Install Dependencies
# ============================================================================
print("📦 Installing dependencies (this may take 2-3 minutes)...")
print("⚠️  You may see some dependency warnings - these are harmless!\n")

# Install core dependencies
!pip install -q sentence-transformers chromadb --no-warn-conflicts
!pip install -q transformers accelerate bitsandbytes --no-warn-conflicts
!pip install -q google-generativeai --no-warn-conflicts

print("\n✅ All dependencies installed!")
print("📝 Note: Dependency warnings can be ignored - they don't affect functionality.")

📦 Installing dependencies (this may take 2-3 minutes)...


✅ All dependencies installed!


In [2]:
# ============================================================================
# CELL 2: Import Libraries & Check GPU
# ============================================================================
import json
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from google.colab import files
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import google.generativeai as genai
from getpass import getpass
import warnings
warnings.filterwarnings('ignore')

# Report which accelerator (if any) this runtime exposes
if not torch.cuda.is_available():
    print("⚠️  No GPU detected. Please enable GPU: Runtime > Change runtime type > T4 GPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is reported in bytes; show it in decimal gigabytes
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(
        f"✅ GPU Available: {gpu_name}",
        f"💾 GPU Memory: {gpu_memory:.1f} GB",
        sep="\n",
    )

print("✅ Libraries imported successfully!")


✅ GPU Available: Tesla T4
💾 GPU Memory: 15.8 GB
✅ Libraries imported successfully!


In [3]:
# ============================================================================
# CELL 3: Upload and Load Data
# ============================================================================
print("📁 Please upload your rupp-data.txt file...")
uploaded = files.upload()

# Nothing to load if no file came back from the upload dialog; fail with a
# clear message instead of an IndexError on the lookup below.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select rupp-data.txt.")

# Load the JSON data from the first uploaded file
filename = next(iter(uploaded))
with open(filename, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Later code indexes raw_data[0] and iterates over the records, so the file
# has to hold a non-empty JSON array.
if not isinstance(raw_data, list) or not raw_data:
    raise ValueError(f"{filename} must contain a non-empty JSON array of Q&A records.")

# Compute the category set once and reuse it for both summary lines
category_names = sorted({item['category'] for item in raw_data})

print(f"\n✅ Loaded {len(raw_data)} Q&A pairs")
print(f"📊 Categories found: {len(category_names)}")
print(f"📂 Categories: {', '.join(category_names)}")
print("\n🔍 Sample entry:")
print(json.dumps(raw_data[0], indent=2))


📁 Please upload your rupp-data.txt file...


Saving rupp-data.txt to rupp-data (1).txt

✅ Loaded 256 Q&A pairs
📊 Categories found: 108
📂 Categories: academic, academic_calendar_detail, academic_integrity, academic_probation, academic_support, adding_dropping, admission_process, admissions, after_graduation, alumni, application_process, attendance_policy, biology_program, campus, career_prep, ceremonies, changing_programs, chemistry_program, community, computer_science_detailed, contact, contact_departments, contact_specific, continuing_education, cost_living, course_load, credits, departments, development, disability, diversity, documents, double_major, electives, employment, engineering_detailed, english_detailed, english_requirements, environment, exam_format, exams, extracurricular, facilities_detail, faculty, famous_alumni, fees, fees_scholarship, financial, food, future_plans, general_info, geography_program, grade_disputes, graduation, graduation_requirements, history, history_program, homework, housing_detail, internationa

In [4]:
# ============================================================================
# CELL 4: Data Processing
# ============================================================================
def process_documents(data):
    """Turn raw Q&A records into retrieval-ready documents.

    Args:
        data: iterable of mappings, each providing 'question', 'answer'
            and 'category' keys.

    Returns:
        A list with one dict per record, in input order. Each dict has an
        'id' ("doc_<position>"), a 'text' combining question and answer, and
        a 'metadata' dict carrying the category, question, answer and the
        integer position ('doc_id').
    """
    def _to_document(position, record):
        # Build the document for a single record at the given position
        question = record['question']
        answer = record['answer']
        return {
            'id': f"doc_{position}",
            'text': f"Question: {question}\nAnswer: {answer}",
            'metadata': {
                'category': record['category'],
                'question': question,
                'answer': answer,
                'doc_id': position,
            },
        }

    return [_to_document(position, record) for position, record in enumerate(data)]

# Build the document list from the uploaded records
documents = process_documents(raw_data)
print(f"✅ Processed {len(documents)} documents")

# Preview the first document so the structure can be checked by eye
print(
    f"\n📄 Sample processed document:",
    f"ID: {documents[0]['id']}",
    f"Text preview: {documents[0]['text'][:150]}...",
    f"Category: {documents[0]['metadata']['category']}",
    sep="\n",
)

✅ Processed 256 documents

📄 Sample processed document:
ID: doc_0
Text preview: Question: What is RUPP?
Answer: RUPP (Royal University of Phnom Penh) is Cambodia's oldest and largest national research university, established in 19...
Category: general_info


In [5]:
# ============================================================================
# CELL 5: Initialize Embedding Model (sentence-transformers)
# ============================================================================
print(
    "🤖 Loading embedding model: all-MiniLM-L6-v2...",
    "⏳ This will take 10-20 seconds...",
    sep="\n",
)
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("✅ Embedding model loaded!")

# Encode one throwaway query to confirm the model works and read off its dimension
test_embedding = embedder.encode("test query")
print(
    f"📊 Embedding dimension: {len(test_embedding)}",
    "📏 Model size: ~80MB",
    "⚡ Speed: ~1000 sentences/second",
    sep="\n",
)

🤖 Loading embedding model: all-MiniLM-L6-v2...
⏳ This will take 10-20 seconds...
✅ Embedding model loaded!
📊 Embedding dimension: 384
📏 Model size: ~80MB
⚡ Speed: ~1000 sentences/second


In [6]:
# ============================================================================
# CELL 6: Generate Embeddings for All Documents
# ============================================================================
def generate_embeddings(documents, embedder):
    """Encode the 'text' of every document with the given embedder.

    Args:
        documents: sequence of dicts that each have a 'text' entry.
        embedder: object exposing ``encode(texts, **options)``.

    Returns:
        Whatever ``embedder.encode`` returns for the collected texts; the call
        requests normalized embeddings converted to NumPy, in batches of 32,
        with a progress bar.
    """
    corpus = []
    for entry in documents:
        corpus.append(entry['text'])
    print(f"🔄 Generating embeddings for {len(corpus)} documents...")

    encode_options = {
        'batch_size': 32,
        'show_progress_bar': True,
        'normalize_embeddings': True,
        'convert_to_numpy': True,
    }
    return embedder.encode(corpus, **encode_options)

# Embed the whole corpus and report the resulting array's shape and footprint
embeddings = generate_embeddings(documents, embedder)
print(
    f"✅ Generated embeddings with shape: {embeddings.shape}",
    f"💾 Memory usage: ~{embeddings.nbytes / 1e6:.2f} MB",
    sep="\n",
)

🔄 Generating embeddings for 256 documents...


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

✅ Generated embeddings with shape: (256, 384)
💾 Memory usage: ~0.39 MB


In [7]:
# ============================================================================
# CELL 7: Setup ChromaDB Vector Store
# ============================================================================
print("🗄️  Setting up ChromaDB vector database...")

# Initialize ChromaDB client
client = chromadb.Client(Settings(
    anonymized_telemetry=False,
    allow_reset=True
))

# Create or reset collection
collection_name = "rupp_qa"
try:
    client.delete_collection(collection_name)
    print("🗑️  Cleared existing collection")
except Exception:
    # Best-effort cleanup: on a first run there is no collection to delete.
    # `Exception` (not a bare `except`) keeps KeyboardInterrupt and SystemExit
    # propagating. NOTE(review): the exact exception chromadb raises for a
    # missing collection appears to vary by version, hence the broad catch.
    pass

# "hnsw:space": "cosine" makes query distances cosine distances, which the
# rest of the notebook converts to a similarity with `1 - distance`.
collection = client.create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"}
)

# Add documents to collection (embeddings, texts, metadata and ids share one order)
print("💾 Adding documents to vector store...")
collection.add(
    embeddings=embeddings.tolist(),
    documents=[doc['text'] for doc in documents],
    metadatas=[doc['metadata'] for doc in documents],
    ids=[doc['id'] for doc in documents]
)

print(f"✅ Vector store created with {collection.count()} documents!")
print(f"🔍 Search algorithm: HNSW (fast approximate search)")

🗄️  Setting up ChromaDB vector database...
💾 Adding documents to vector store...
✅ Vector store created with 256 documents!
🔍 Search algorithm: HNSW (fast approximate search)


In [8]:
# ============================================================================
# CELL 8: Retrieval Function
# ============================================================================
def retrieve_context(query, n_results=3, category_filter=None):
    """Look up the stored documents closest to a query.

    Args:
        query: question text to embed and search for.
        n_results: number of hits requested from the collection.
        category_filter: when truthy, restrict the search to documents whose
            'category' metadata equals this value.

    Returns:
        A list of dicts with 'text', 'metadata' and 'distance' keys, in the
        order returned by the collection. 'distance' is None when the query
        result carries no distances.
    """
    # Embed the single query using the same normalization as the corpus
    query_vector = embedder.encode([query], normalize_embeddings=True)[0]

    if category_filter:
        where_clause = {"category": category_filter}
    else:
        where_clause = None

    hits = collection.query(
        query_embeddings=[query_vector.tolist()],
        n_results=n_results,
        where=where_clause,
    )

    # Results are nested per query; only one query was sent, so read index 0
    return [
        {
            'text': text,
            'metadata': hits['metadatas'][0][rank],
            'distance': hits['distances'][0][rank] if hits['distances'] else None,
        }
        for rank, text in enumerate(hits['documents'][0])
    ]

# Smoke-test the retriever with one known question
print("\n🧪 Testing retrieval system...")
test_query = "Can I take fewer courses?"
print(f"🔍 Query: '{test_query}'")
test_results = retrieve_context(test_query, n_results=2)

print("\n📚 Top 2 Retrieved documents:")
for i, doc in enumerate(test_results, 1):
    # The collection uses cosine distance, so 1 - distance is the similarity
    print(
        f"\n{'='*60}",
        f"Document {i}:",
        f"Category: {doc['metadata']['category']}",
        f"Question: {doc['metadata']['question']}",
        f"Similarity: {1 - doc['distance']:.3f}",
        f"Text preview: {doc['text'][:100]}...",
        sep="\n",
    )

print("\n✅ Retrieval system working!")


🧪 Testing retrieval system...
🔍 Query: 'Can I take fewer courses?'

📚 Top 2 Retrieved documents:

Document 1:
Category: course_load
Question: Can I take fewer courses?
Similarity: 0.761
Text preview: Question: Can I take fewer courses?
Answer: Part-time enrollment with fewer courses may be possible....

Document 2:
Category: course_load
Question: How many courses should I take per semester?
Similarity: 0.628
Text preview: Question: How many courses should I take per semester?
Answer: Full-time students typically take 5-7...

✅ Retrieval system working!


In [9]:
# Authenticate with HuggingFace
from huggingface_hub import login
from getpass import getpass

# Prompt for the access token without echoing it to the notebook output,
# then register it with huggingface_hub for this session.
# NOTE(review): presumably required because the Llama checkpoint loaded in the
# next cell is access-gated — confirm.
token = getpass("Enter your HuggingFace token: ")
login(token=token)
print("✅ Authenticated!")

Enter your HuggingFace token: ··········
✅ Authenticated!


In [12]:
# ============================================================================
# CELL 9: Load Llama-3.1-8B-Instruct (Primary LLM)
# ============================================================================
print("🦙 Loading Llama-3.1-8B-Instruct...")
print("⏳ This will take 2-3 minutes (one-time download)...")
print("💾 Model will use ~6GB GPU memory with 4-bit quantization")

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Configure 4-bit quantization to fit in free Colab GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load tokenizer; reuse EOS as the padding token so `pad_token` is always set
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load model with quantization.
# `trust_remote_code` is left at its default (False): the Llama architecture
# ships with transformers, so there is no need to allow code from the model
# repository to run in this runtime.
# `torch_dtype` is kept for compatibility with older transformers releases;
# newer ones emit a deprecation warning suggesting `dtype` instead.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

print("✅ Llama-3.1-8B loaded successfully!")
print(f"📊 Model size: ~4.5GB (4-bit quantized)")
print(f"⚡ Expected speed: 40-60 tokens/second")
print(f"🎯 Quality: Near GPT-3.5 level (88% benchmark)")

🦙 Loading Llama-3.1-8B-Instruct...
⏳ This will take 2-3 minutes (one-time download)...
💾 Model will use ~6GB GPU memory with 4-bit quantization


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

✅ Llama-3.1-8B loaded successfully!
📊 Model size: ~4.5GB (4-bit quantized)
⚡ Expected speed: 40-60 tokens/second
🎯 Quality: Near GPT-3.5 level (88% benchmark)


In [13]:
# ============================================================================
# CELL 10: Generate Answer Function with Llama-3.1
# ============================================================================
def generate_answer_llama(query, context_docs, max_tokens=256, temperature=0.3):
    """Generate an answer with the loaded Llama-3.1 model from retrieved context.

    Args:
        query: the student's question.
        context_docs: retrieved documents; each must have a 'text' entry.
        max_tokens: maximum number of new tokens to generate.
        temperature: sampling temperature. Values <= 0 (or None) switch to
            greedy decoding, because sampling needs a strictly positive value.

    Returns:
        The model's reply as a stripped string, containing only the newly
        generated text (never the prompt).
    """

    # Build context from retrieved documents
    context = "\n\n".join(
        f"Reference {i+1}:\n{doc['text']}"
        for i, doc in enumerate(context_docs)
    )

    # Create chat messages in Llama-3.1 format
    messages = [
        {
            "role": "system",
            "content": "You are a helpful academic advisor at RUPP (Royal University of Phnom Penh). Provide clear, accurate, and professional information based on the context given. Keep answers concise but complete."
        },
        {
            "role": "user",
            "content": f"""Context information from RUPP policies:
{context}

Student Question: {query}

Instructions:
- Provide a clear and accurate answer based ONLY on the context above
- If the context doesn't contain enough information, say so
- Keep the tone professional and helpful
- Be specific and cite relevant policies when applicable"""
        }
    ]

    # Format prompt using Llama-3.1 chat template
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize. The chat template has already inserted the special tokens, so
    # the tokenizer must not add them a second time.
    # NOTE(review): truncation at 2048 tokens would cut the END of the prompt
    # (the question and the generation header) if the context were very long.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
        add_special_tokens=False
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    generation_kwargs = {
        "max_new_tokens": max_tokens,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        # Greedy decoding: sampling with a non-positive temperature is invalid
        generation_kwargs["do_sample"] = False

    # Generate
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on the word "assistant" would cut off any answer that
    # itself contains that word (e.g. "teaching assistant").
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    return response

# Announce readiness, then smoke-test generation on a paraphrased question
# against context retrieved for the original wording
print("✅ Llama-3.1 generation function ready!\n\n🧪 Testing Llama-3.1 generation...")
test_context = retrieve_context("Can I take fewer courses?", n_results=2)
test_answer = generate_answer_llama("Can I study part-time?", test_context, max_tokens=150)
print("\n💬 Test Answer:", test_answer, sep="\n")

✅ Llama-3.1 generation function ready!

🧪 Testing Llama-3.1 generation...

💬 Test Answer:
According to RUPP policies, part-time enrollment with fewer courses is possible. This allows you to extend your graduation time while accommodating other responsibilities. However, please note that part-time status may have implications for scholarships. I recommend checking with the Studies Office for more information and guidance.


In [15]:
# ============================================================================
# CELL 12: Complete RAG Pipeline
# ============================================================================
def rag_query(user_query, category=None, n_results=3, use_gemini=False, max_tokens=256):
    """Run the full retrieve-then-generate pipeline for one question.

    Args:
        user_query: the question to answer.
        category: optional category name used to filter retrieval.
        n_results: number of documents to retrieve.
        use_gemini: request the Gemini backend. It is only used when a truthy
            `generate_answer_gemini` exists at module level; otherwise the
            call falls back to Llama-3.1 and reports that in the result.
        max_tokens: generation budget passed to the Llama backend.

    Returns:
        A dict with 'query', 'answer', 'sources' (category, question and
        similarity per retrieved document), 'num_sources', 'generation_time'
        (seconds) and 'model_used' (the backend that actually answered).
    """
    import time  # local import keeps the function self-contained

    print(f"\n{'='*70}")
    print(f"🔍 Query: {user_query}")
    if category:
        print(f"📂 Category filter: {category}")
    print(f"{'='*70}\n")

    # Step 1: Retrieve relevant context
    print("📚 Step 1: Retrieving relevant documents...")
    context_docs = retrieve_context(
        query=user_query,
        n_results=n_results,
        category_filter=category
    )

    print(f"✅ Retrieved {len(context_docs)} documents")
    for i, doc in enumerate(context_docs, 1):
        sim_score = 1 - doc['distance']
        print(f"   {i}. [{doc['metadata']['category']}] Similarity: {sim_score:.3f}")

    # Resolve the backend once. The Gemini generator is optional and may be
    # undefined in this session, so look it up without risking a NameError.
    gemini_fn = globals().get('generate_answer_gemini')
    gemini_active = bool(use_gemini and gemini_fn)
    if use_gemini and not gemini_active:
        print("⚠️  Gemini not configured. Falling back to Llama-3.1.")

    # Step 2: Generate answer
    print(f"\n🤖 Step 2: Generating answer with {'Gemini' if gemini_active else 'Llama-3.1'}...")

    start_time = time.time()

    if gemini_active:
        answer = gemini_fn(user_query, context_docs)
    else:
        answer = generate_answer_llama(user_query, context_docs, max_tokens=max_tokens)

    generation_time = time.time() - start_time

    print(f"✅ Answer generated in {generation_time:.2f} seconds")

    # Step 3: Format response
    result = {
        'query': user_query,
        'answer': answer,
        'sources': [
            {
                'category': doc['metadata']['category'],
                'question': doc['metadata']['question'],
                'similarity': 1 - doc['distance']
            }
            for doc in context_docs
        ],
        'num_sources': len(context_docs),
        'generation_time': generation_time,
        # Report the backend that actually produced the answer
        'model_used': 'Gemini' if gemini_active else 'Llama-3.1-8B'
    }

    return result

print("✅ Complete RAG pipeline ready!")

✅ Complete RAG pipeline ready!


In [16]:
# ============================================================================
# CELL 13: Test RAG System with Multiple Queries
# ============================================================================
# Sample questions spanning several categories
test_queries = [
    "Can I take fewer courses?",
    "Are there summer classes?",
    "What happens if I fail a course?",
    "How do I withdraw from a class?",
]

print("🧪 TESTING RAG SYSTEM", "="*70, sep="\n")

for i, query in enumerate(test_queries, 1):
    # Banner identifying the current test
    print(
        f"\n{'#'*70}",
        f"TEST {i}/{len(test_queries)}",
        f"{'#'*70}",
        sep="\n",
    )

    result = rag_query(query, use_gemini=False, n_results=3, max_tokens=200)

    print(f"\n💬 ANSWER:", result['answer'], sep="\n")

    print(f"\n📖 SOURCES ({result['num_sources']}):")
    for j, source in enumerate(result['sources'], 1):
        print(
            f"   {j}. [{source['category']}] {source['question'][:60]}...",
            f"      Similarity: {source['similarity']:.3f}",
            sep="\n",
        )

    print(
        f"\n⏱️  Generation time: {result['generation_time']:.2f}s",
        f"🤖 Model: {result['model_used']}",
        sep="\n",
    )

print("\n" + "="*70, "✅ All tests completed!", sep="\n")

🧪 TESTING RAG SYSTEM

######################################################################
TEST 1/4
######################################################################

🔍 Query: Can I take fewer courses?

📚 Step 1: Retrieving relevant documents...
✅ Retrieved 3 documents
   1. [course_load] Similarity: 0.761
   2. [course_load] Similarity: 0.628
   3. [attendance_policy] Similarity: 0.579

🤖 Step 2: Generating answer with Llama-3.1...
✅ Answer generated in 9.51 seconds

💬 ANSWER:
Based on the provided context, you may be able to take fewer courses through part-time enrollment. However, this would extend your graduation time. I recommend checking with the Studies Office at RUPP to discuss your part-time status and any implications for scholarships.

📖 SOURCES (3):
   1. [course_load] Can I take fewer courses?...
      Similarity: 0.761
   2. [course_load] How many courses should I take per semester?...
      Similarity: 0.628
   3. [attendance_policy] What happens if I miss too man

In [17]:
# ============================================================================
# CELL 14: Interactive Chat Interface
# ============================================================================
def interactive_rag():
    """Interactive query interface for RUPP chatbot.

    Reads questions from standard input in a loop and answers them through
    `rag_query`. Recognized commands: 'categories', 'stats', 'switch' and
    'quit'. The loop ends on 'quit', on Ctrl-C, or when input is exhausted
    (EOF). Any other error is reported and the loop continues.
    """
    from collections import Counter  # stdlib; only needed by this function

    print("\n" + "="*70)
    print("🎓 RUPP Q&A CHATBOT - Interactive Mode")
    print("="*70)
    print("Commands:")
    print("  • Type your question to get an answer")
    print("  • 'categories' - Show all available categories")
    print("  • 'stats' - Show system statistics")
    print("  • 'switch' - Switch between Llama and Gemini")
    print("  • 'quit' - Exit the chatbot")
    print("="*70 + "\n")

    # Count documents per category once instead of rescanning for every category
    category_counts = Counter(doc['metadata']['category'] for doc in documents)
    categories = sorted(category_counts)
    use_gemini = False

    while True:
        try:
            user_input = input("\n❓ Your question: ").strip()

            if not user_input:
                continue

            command = user_input.lower()

            if command == 'quit':
                print("👋 Thank you for using RUPP Q&A Chatbot. Goodbye!")
                break

            if command == 'categories':
                print(f"\n📂 Available categories ({len(categories)}):")
                for i, cat in enumerate(categories, 1):
                    print(f"   {i}. {cat} ({category_counts[cat]} Q&As)")
                continue

            if command == 'stats':
                print(f"\n📊 System Statistics:")
                print(f"   • Total Q&A pairs: {len(documents)}")
                print(f"   • Categories: {len(categories)}")
                print(f"   • Embedding model: all-MiniLM-L6-v2 (384d)")
                print(f"   • LLM model: {'Gemini Pro' if use_gemini else 'Llama-3.1-8B'}")
                print(f"   • Vector DB: ChromaDB (HNSW)")
                continue

            if command == 'switch':
                # The Gemini generator is optional and may not be defined in
                # this session; look it up without risking a NameError.
                if globals().get('generate_answer_gemini'):
                    use_gemini = not use_gemini
                    print(f"🔄 Switched to {'Gemini Pro' if use_gemini else 'Llama-3.1-8B'}")
                else:
                    print("⚠️  Gemini not configured. Using Llama-3.1 only.")
                continue

            # Ask for optional category filter
            filter_cat = input("📂 Filter by category? (press Enter to skip): ").strip()
            if filter_cat in categories:
                category_filter = filter_cat
            else:
                category_filter = None
                if filter_cat:
                    print(f"⚠️  Category '{filter_cat}' not found. Searching all categories...")

            # Run RAG query
            result = rag_query(
                user_input,
                category=category_filter,
                use_gemini=use_gemini,
                n_results=3,
                max_tokens=250
            )

            print(f"\n💬 ANSWER:")
            print(result['answer'])

            print(f"\n📖 SOURCES:")
            for i, source in enumerate(result['sources'], 1):
                print(f"   {i}. [{source['category']}] {source['question']}")
                print(f"      Relevance: {source['similarity']:.1%}")

            print(f"\n⏱️  Response time: {result['generation_time']:.2f}s")

        except (KeyboardInterrupt, EOFError):
            # EOFError means no more input will ever arrive; retrying would
            # loop forever, so treat it like an interrupt.
            print("\n\n👋 Interrupted. Goodbye!")
            break
        except Exception as e:
            print(f"\n❌ Error: {str(e)}")
            print("Please try again or type 'quit' to exit.")

# Run interactive mode.
# interactive_rag() loops on input() until the user types 'quit' or interrupts,
# so this cell blocks and later cells do not run until the session ends.
print("\n🚀 Starting interactive chatbot...")
interactive_rag()



🚀 Starting interactive chatbot...

🎓 RUPP Q&A CHATBOT - Interactive Mode
Commands:
  • Type your question to get an answer
  • 'categories' - Show all available categories
  • 'stats' - Show system statistics
  • 'switch' - Switch between Llama and Gemini
  • 'quit' - Exit the chatbot


❓ Your question: what is rupp?
📂 Filter by category? (press Enter to skip): 

🔍 Query: what is rupp?

📚 Step 1: Retrieving relevant documents...
✅ Retrieved 3 documents
   1. [vision] Similarity: 0.653
   2. [general_info] Similarity: 0.647
   3. [transportation] Similarity: 0.605

🤖 Step 2: Generating answer with Llama-3.1...
✅ Answer generated in 3.81 seconds

💬 ANSWER:
Based on the provided context, RUPP stands for Royal University of Phnom Penh.

📖 SOURCES:
   1. [vision] What is RUPP's mission?
      Relevance: 65.3%
   2. [general_info] What is RUPP?
      Relevance: 64.7%
   3. [transportation] How do I get to RUPP?
      Relevance: 60.5%

⏱️  Response time: 3.81s

❓ Your question: does rupp hav

In [18]:
# ============================================================================
# CELL 15: Evaluation & Performance Metrics
# ============================================================================
def evaluate_retrieval(test_cases):
    """Evaluate top-1 retrieval quality against expected categories.

    Args:
        test_cases: sized collection of dicts with 'query' and
            'expected_category' keys.

    Returns:
        A list with one dict per test case: 'query', 'expected', 'retrieved',
        'correct' and 'similarity'. An empty list is returned (after a notice)
        when there are no test cases. A query that retrieves nothing is
        recorded as incorrect with retrieved category 'N/A' and similarity 0.0.
    """

    print("\n📊 RETRIEVAL EVALUATION")
    print("="*70)

    # Without cases there is nothing to average; avoid dividing by zero below
    if not test_cases:
        print("⚠️  No test cases provided - nothing to evaluate.")
        return []

    results = []

    for test in test_cases:
        query = test['query']
        expected_category = test['expected_category']

        # Retrieve top result (may be empty, e.g. if the collection has no documents)
        context_docs = retrieve_context(query, n_results=1)
        if context_docs:
            top_hit = context_docs[0]
            retrieved_category = top_hit['metadata']['category']
            similarity = 1 - top_hit['distance']
        else:
            retrieved_category = 'N/A'
            similarity = 0.0

        is_correct = bool(context_docs) and retrieved_category == expected_category

        results.append({
            'query': query,
            'expected': expected_category,
            'retrieved': retrieved_category,
            'correct': is_correct,
            'similarity': similarity
        })

        status = "✅" if is_correct else "❌"
        print(f"{status} {query[:45]:45} | Expected: {expected_category:15} | Got: {retrieved_category:15} | Sim: {similarity:.3f}")

    total_correct = sum(r['correct'] for r in results)
    accuracy = total_correct / len(results) * 100
    avg_similarity = np.mean([r['similarity'] for r in results])

    print(f"\n{'='*70}")
    print(f"🎯 Retrieval Accuracy: {accuracy:.1f}% ({total_correct}/{len(results)})")
    print(f"📊 Average Similarity: {avg_similarity:.3f}")
    print(f"{'='*70}")

    return results

# Starter cases for evaluate_retrieval; replace with ones that match your data
example_test_cases = [
    dict(query='Can I take fewer courses?', expected_category='course_load'),
    dict(query='Are there summer classes?', expected_category='summer_courses'),
]

print(
    "\n💡 To run evaluation, create test cases and call:",
    "eval_results = evaluate_retrieval(your_test_cases)",
    sep="\n",
)


💡 To run evaluation, create test cases and call:
eval_results = evaluate_retrieval(your_test_cases)


In [20]:
# ============================================================================
# CELL 16: Save & Export Results
# ============================================================================
def save_conversation(queries_and_answers, filename='rag_conversation.json'):
    """Write Q&A results plus a system description to JSON and download it.

    Args:
        queries_and_answers: JSON-serializable conversation data, stored under
            the 'conversations' key.
        filename: path of the JSON file to create.
    """
    # Fixed description of the pipeline that produced the answers
    system_info = dict(
        embedding_model='sentence-transformers/all-MiniLM-L6-v2',
        llm_model='meta-llama/Meta-Llama-3.1-8B-Instruct',
        vector_db='ChromaDB',
        total_documents=len(documents),
        retrieval_k=3,
    )
    payload = dict(system_info=system_info, conversations=queries_and_answers)

    # ensure_ascii=False keeps non-ASCII text readable in the exported file
    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, indent=2, ensure_ascii=False)

    print(f"✅ Results saved to {filename}")

    # Hand the file to the browser via the Colab helper
    files.download(filename)
    print(f"📥 File downloaded!")

def export_to_csv(results, filename='rag_results.csv'):
    """Write one CSV row per RAG result and download the file.

    Args:
        results: iterable of result dicts with 'query', 'answer',
            'model_used', 'generation_time' and 'sources' entries.
        filename: path of the CSV file to create.
    """
    import csv

    def _as_row(record):
        # Only the first source is exported; 'N/A' marks a result without sources
        sources = record['sources']
        top_source = sources[0] if sources else None
        return [
            record['query'],
            record['answer'],
            record['model_used'],
            f"{record['generation_time']:.2f}",
            top_source['category'] if top_source is not None else 'N/A',
            f"{top_source['similarity']:.3f}" if top_source is not None else 'N/A',
        ]

    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['Query', 'Answer', 'Model', 'Time', 'Top_Category', 'Similarity'])
        for record in results:
            writer.writerow(_as_row(record))

    print(f"✅ Exported to {filename}")
    files.download(filename)

# Usage hint for the two export helpers
print(
    "\n💡 To save your results:",
    "save_conversation(your_results)",
    "export_to_csv(your_results)",
    sep="\n",
)


💡 To save your results:
save_conversation(your_results)
export_to_csv(your_results)


In [21]:
# ============================================================================
# CELL 17: Install FastAPI & Expose to Public
# ============================================================================
!pip install -q fastapi uvicorn pyngrok python-multipart
print("✅ FastAPI dependencies installed!")


✅ FastAPI dependencies installed!


In [22]:
# ============================================================================
# CELL 18: FastAPI Server for Next.js
# ============================================================================
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional
import uvicorn
from pyngrok import ngrok
import nest_asyncio

# Allow nested event loops (required for Colab)
nest_asyncio.apply()

# Initialize FastAPI
app = FastAPI(title="RUPP RAG API", version="1.0.0")

# Enable CORS for Next.js.
# Credentials (cookies / Authorization headers) are disabled: the CORS rules
# do not allow credentialed requests together with wildcard origins, and no
# endpoint in this API relies on cookies or auth headers. If credentials are
# ever needed, list the allowed origins explicitly instead of "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, specify your domain
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request/Response Models
class QueryRequest(BaseModel):
    # Request body accepted by POST /query.
    # `#` comments are used instead of a docstring so the generated API schema
    # is unchanged.
    question: str  # forwarded to rag_query as user_query
    category: Optional[str] = None  # optional category filter; None means no filter
    n_results: Optional[int] = 3  # number of documents to retrieve
    use_gemini: Optional[bool] = False  # forwarded to rag_query's use_gemini flag

class Source(BaseModel):
    # One retrieved document as reported back to the client; the fields match
    # the per-source dicts that rag_query puts in its 'sources' list.
    category: str
    question: str
    similarity: float  # computed by rag_query as 1 - distance

class QueryResponse(BaseModel):
    # Response body of POST /query, built from the dict returned by rag_query.
    # NOTE(review): rag_query also returns 'num_sources', which is not declared
    # here — presumably dropped by pydantic's default handling of extra keys;
    # confirm.
    query: str
    answer: str
    sources: List[Source]
    generation_time: float  # seconds spent generating the answer
    model_used: str

# Health check endpoint
@app.get("/")
def read_root():
    # Landing endpoint: reports that the API is up and lists the routes.
    # (A `#` comment rather than a docstring, so the API schema is unchanged.)
    endpoint_index = {
        "/query": "POST - Ask a question",
        "/categories": "GET - List all categories",
        "/health": "GET - Check system health",
    }
    payload = {"status": "online", "message": "RUPP RAG API"}
    payload["endpoints"] = endpoint_index
    return payload

# Query endpoint
@app.post("/query", response_model=QueryResponse)
async def query_endpoint(request: QueryRequest):
    # Answer one question through the RAG pipeline. Any failure is reported as
    # HTTP 500 with the exception text as detail.
    # NOTE(review): rag_query is a blocking call inside an async handler, so
    # other requests presumably wait while an answer is being generated.
    try:
        # The Optional request fields accept an explicit JSON null; fall back
        # to their declared defaults instead of forwarding None downstream.
        n_results = 3 if request.n_results is None else request.n_results
        use_gemini = bool(request.use_gemini)

        # Run RAG query
        result = rag_query(
            user_query=request.question,
            category=request.category,
            n_results=n_results,
            use_gemini=use_gemini,
            max_tokens=256
        )

        return QueryResponse(**result)

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

# Get categories endpoint
@app.get("/categories")
def get_categories():
    # List every distinct category in the loaded documents, sorted by name.
    unique_categories = {doc['metadata']['category'] for doc in documents}
    ordered = sorted(unique_categories)
    return {"categories": ordered, "total": len(ordered)}

# Health check
@app.get("/health")
def health_check():
    # Liveness report: document count and whether CUDA is visible to torch.
    return dict(
        status="healthy",
        model="Llama-3.1-8B",
        documents=len(documents),
        gpu_available=torch.cuda.is_available(),
    )

print("✅ FastAPI app configured!")

✅ FastAPI app configured!


In [27]:
# ============================================================================
# CELL 19: Start Server & Get Public URL  (COLAB SAFE VERSION)
# ============================================================================
from getpass import getpass
import threading
import nest_asyncio
import uvicorn
from pyngrok import ngrok

# Patch the running event loop so uvicorn works in Colab
nest_asyncio.apply()

# Get ngrok auth token (free: https://dashboard.ngrok.com/get-started/your-authtoken)
print("🔑 Get your FREE ngrok token from: https://dashboard.ngrok.com/get-started/your-authtoken")
ngrok_token = getpass("Enter ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

# Kill previous ngrok if exists
!pkill ngrok || echo "No existing ngrok process."

# Start ngrok tunnel
public_url = ngrok.connect(8000)
print(f"\n{'='*70}")
print(f"🌐 PUBLIC API URL: {public_url}")
print(f"{'='*70}")
print("\n📝 Copy this URL for your Next.js app!\n")

print("🧪 Test endpoints:")
print(f"   • Health: {public_url}/health")
print(f"   • Categories: {public_url}/categories")
print(f"   • Query: {public_url}/query  (POST)")
print("\n⚠️  Keep this cell running! Server will stop if you interrupt it.")
print("="*70)


# ------------------------------
# Run Uvicorn in a background thread
# ------------------------------
def start_server():
    """Serve the FastAPI `app` with uvicorn on 0.0.0.0:8000."""
    uvicorn.run(app, host="0.0.0.0", port=8000)

# daemon=True: the thread does not keep the interpreter alive at shutdown.
# Port 8000 matches the port given to ngrok.connect() earlier in this cell.
server_thread = threading.Thread(target=start_server, daemon=True)
server_thread.start()


🔑 Get your FREE ngrok token from: https://dashboard.ngrok.com/get-started/your-authtoken
Enter ngrok auth token: ··········

🌐 PUBLIC API URL: NgrokTunnel: "https://grouseless-nonphysically-craig.ngrok-free.dev" -> "http://localhost:8000"

📝 Copy this URL for your Next.js app!

🧪 Test endpoints:
   • Health: NgrokTunnel: "https://grouseless-nonphysically-craig.ngrok-free.dev" -> "http://localhost:8000"/health
   • Categories: NgrokTunnel: "https://grouseless-nonphysically-craig.ngrok-free.dev" -> "http://localhost:8000"/categories
   • Query: NgrokTunnel: "https://grouseless-nonphysically-craig.ngrok-free.dev" -> "http://localhost:8000"/query  (POST)

⚠️  Keep this cell running! Server will stop if you interrupt it.
