In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree recursively and print the full path of every file found.
# The directory-name list yielded by os.walk is not needed, hence the `_` placeholder.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## ===============================
# AI Educational Chatbot with RAG
## ===============================
### This system provides:
#### 1. Multi-source knowledge retrieval (PDFs, Wikipedia, Google search fallback)
#### 2. Gradio web interface
#### 3. Performance evaluation
## ===============================

In [None]:
# 1. Install required packages
# NOTE(review): the import cell below does `from googlesearch import search`, but no
# package providing `googlesearch` is installed here — confirm it is preinstalled in
# the image, otherwise the imports fail on a fresh kernel.
# NOTE(review): versions are unpinned, so a re-run may pull different releases.

!pip install langchain langchain-community gradio sentence-transformers 
!pip install transformers pypdf faiss-cpu nltk chromadb PyMuPDF wikipedia sacrebleu

# 2. Import core libraries

import re
import fitz 
from typing import List, Dict, Tuple
from pypdf import PdfReader
from googlesearch import search
import wikipedia
import torch
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.prompts import ChatPromptTemplate

from transformers import MarianMTModel, MarianTokenizer, pipeline
from sacrebleu import corpus_bleu
import gradio as gr

## - PDF Processing :

In [None]:
# 3. PDF Processing
def clean_text(text: str) -> str:
    """Normalize raw text extracted from a PDF into a single clean line.

    Steps, in order:
      1. Drop "Page <n>" / "Chapter <n>" markers wherever they occur.
      2. Remove bullet glyphs and the en dash / middle dot separators.
      3. Collapse every whitespace run (including newlines) to one space.
      4. Remove spaces that precede , ! ? : or the Arabic semicolon.
      5. Strip leading and trailing whitespace.

    Args:
        text: Raw extracted text; may be empty.

    Returns:
        The cleaned text on one line; "" when nothing is left.
    """
    # Running headers/footers.
    text = re.sub(r'Page \d+|Chapter \d+', '', text)
    # One pass over the full glyph set. The earlier version ran two overlapping
    # passes and a split('\n')/join that could never see a newline, because
    # whitespace had already been collapsed before it.
    text = re.sub(r'[\u2022\u2023\u25CF\u25A0–·▪►]', '', text)
    # Removing glyphs can leave double spaces, so normalize whitespace afterwards.
    text = re.sub(r'\s+', ' ', text)
    # "word ," -> "word,"
    text = re.sub(r' +([,!?؛:])', r'\1', text)
    return text.strip()

def process_pdf(file_path: str, skip_pages: int = 20) -> str:
    """Extract the text of a PDF, skipping its leading pages, and clean it.

    Args:
        file_path: Path of the PDF to open with PyMuPDF (`fitz`).
        skip_pages: Number of leading pages to ignore. Negative values are
            treated as 0. If the document has no more than `skip_pages`
            pages, no page is read and the result is the cleaned empty string.

    Returns:
        The concatenated page text after passing through `clean_text`.
    """
    doc = fitz.open(file_path)
    try:
        # Clamp so a negative value cannot index pages from the end of the document.
        first_page = max(skip_pages, 0)
        page_texts = [
            doc[page_num].get_text()
            for page_num in range(first_page, len(doc))
        ]
    finally:
        # Close the document even when text extraction fails part-way through.
        doc.close()

    return clean_text("".join(page_texts))


## - Knowledge Base :

In [None]:
def process_pdfs_and_create_vectorstore(
    pdf_folder: str,
    skip_pages: int = 20,
    chunk_size: int = 50,
    chunk_overlap: int = 5,
):
    """Chunk every PDF in a folder and index the chunks in a Chroma vectorstore.

    Args:
        pdf_folder: Directory scanned (non-recursively) for PDF files.
        skip_pages: Leading pages to skip in each PDF; forwarded to `process_pdf`.
        chunk_size: Maximum chunk length in characters (measured with `len`).
            Defaults to the original hard-coded value of 50.
        chunk_overlap: Characters shared by consecutive chunks. Defaults to the
            original hard-coded value of 5.

    Returns:
        A Chroma vectorstore built from all chunks, or None when the folder
        produced no chunks at all.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    documents = []

    # Sorted so the indexing order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(pdf_folder)):
        # Case-insensitive match so "Notes.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(pdf_folder, filename)
        text = process_pdf(file_path, skip_pages=skip_pages)

        # Each chunk remembers which file it came from.
        documents.extend(
            Document(page_content=chunk, metadata={"source": filename})
            for chunk in text_splitter.split_text(text)
        )

    if not documents:
        return None

    # Load the embedding model only once there is something to embed.
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    return Chroma.from_documents(
        documents=documents,
        embedding=embedding_model
    )

def query_vectorstore(vectorstore, question: str, k: int = 3) -> List[Document]:
    """Return the `k` documents the vectorstore ranks as most similar to `question`.

    A missing store (None) is treated as an empty knowledge base and yields [].
    """
    hits = [] if vectorstore is None else vectorstore.similarity_search(question, k=k)
    return hits


## - Load Question Answering model :

In [None]:
# pip install -q transformers
from transformers import pipeline

# Hugging Face checkpoint used for answer generation.
checkpoint = "google/flan-t5-large"

# Text-to-text generation pipeline; the helper functions below call this object.
model = pipeline('text2text-generation', model = checkpoint)

# Long-form system prompt used only by the smoke test at the end of this cell.
# The stray "less" line that used to follow "Response Format" was a markdown
# code-fence language tag pasted in by accident; it has been removed.
input_prompt = """System Prompt:

You are an expert educational chatbot specialized in answering questions based strictly on provided study materials.

Strict Rules:
Language: Respond ONLY in English, regardless of the input language.

Sources Priority:

First: Use the provided PDF documents as the main and preferred source.

Second: Refer to verified English Wikipedia knowledge if the PDFs do not contain the answer.

Third: Use a general web search as a last fallback.

Tone & Style: Maintain a professional, academic tone appropriate for educational settings.

Accuracy Guidelines:

If uncertain or unable to verify, respond with: “I couldn't verify this from my sources.”

Never hallucinate or fabricate information.

Cite your source explicitly when possible.

Response Format (Always Follow This Template):
[Source: PDFs/Wikipedia/Web]  
[Confidence: High/Medium/Low]  

Answer: [Concise and accurate response goes here]
Examples of Interactions (Follow Format Exactly):
User: What is Artificial intelligence?
Bot:
[Source: PDFs]
[Confidence: High]
Answer: Artificial intelligence (AI) refers to the simulation of human intelligence processes by machines, especially computer systems.

User: Explain Gradient Descent
Bot:
[Source: Wikipedia]
[Confidence: Medium]
Answer: Gradient descent is an optimization algorithm used to minimize the cost function in machine learning by iteratively moving in the direction of the steepest descent as defined by the negative of the gradient.

User: What’s ML?
Bot:
[Source: PDFs]
[Confidence: High]
Answer: Machine learning (ML) is a subset of artificial intelligence that focuses on the development of algorithms that enable computers to learn from and make predictions or decisions based on data.

"""
# Smoke test: feed the prompt on its own (no user question) to confirm the
# pipeline loads and generates. Sampling is enabled, so the output varies per run.
generated_text = model(input_prompt, max_length=512, do_sample=True)[0]['generated_text']

print("Response :" + generated_text)


def generate_from_prompt(prompt: str, system_prompt: str = None, max_length: int = 200) -> str:
    """Generate text for `prompt` with the module-level `model` pipeline.

    Args:
        prompt: User-facing prompt. A blank or whitespace-only prompt returns ""
            without calling the model.
        system_prompt: Optional instructions placed on their own line before the
            prompt. Skipped when None or empty.
        max_length: Forwarded to the pipeline as `max_length`.

    Returns:
        The `generated_text` of the first result, or "" if generation raised
        (the error is printed so callers can fall back to another source).
    """
    if not prompt.strip():
        return ""

    try:
        full_prompt = f"{system_prompt}\n{prompt}" if system_prompt else prompt
        # The pipeline is bound to `model` in the loading cell. The previous name,
        # `generation_model`, was never defined, so every call raised NameError,
        # was swallowed below, and returned "".
        result = model(
            full_prompt,
            max_length=max_length,
            do_sample=True,
            top_k=4,
            top_p=0.95
        )
        return result[0]["generated_text"]
    except Exception as e:
        # Best effort by design: report the failure and let the caller fall back.
        print(f"Generation error: {e}")
        return ""

def generate_from_wikipedia(topic: str, system_prompt: str = None) -> str:
    """Fallback generator: expand a three-sentence English Wikipedia summary of `topic`.

    Returns the generated text, or "" when the lookup or the generation raises
    (the error is printed).
    """
    try:
        wikipedia.set_lang("en")
        intro = wikipedia.summary(topic, sentences=3)
        expansion_request = f"Expand on this: {intro}"
        generated = generate_from_prompt(expansion_request, system_prompt=system_prompt)
    except Exception as err:
        print(f"Wikipedia error: {err}")
        generated = ""
    return generated

def get_google_search_link(query: str) -> str:
    """Last-resort fallback: return the first result URL of a Google search.

    Any failure (including a search that yields no results) is printed and
    reported as "".
    """
    try:
        results = search(query, num=1, stop=1, pause=2)
        top_link = next(results)
    except Exception as err:
        print(f"Google search error: {err}")
        return ""
    return top_link


## - Evaluation :

In [None]:
from sacrebleu.metrics import BLEU
# Module-level scorer built once with default settings; calculate_bleu reads it.
bleu_metric = BLEU()  # Create a BLEU scorer instance

def calculate_bleu(predictions: List[str], references: List[str]) -> float:
    """Corpus-level BLEU of `predictions` against one reference per prediction."""
    # SacreBLEU takes a list of reference streams; there is exactly one stream here.
    reference_streams = [references]
    bleu_result = bleu_metric.corpus_score(predictions, reference_streams)
    return bleu_result.score

def calculate_exact_match(predictions: List[str], references: List[str]) -> float:
    """Fraction of predictions equal to their reference, ignoring case and outer whitespace.

    Pairs are formed positionally with `zip`; the denominator is the number of
    predictions. An empty prediction list scores 0.0.
    """
    if not predictions:
        return 0.0

    hit_count = 0
    for predicted, expected in zip(predictions, references):
        if predicted.lower().strip() == expected.lower().strip():
            hit_count += 1
    return hit_count / len(predictions)

In [None]:
# Disabled smoke test for calculate_bleu / calculate_exact_match: the whole cell is
# one string literal, so the statements inside it never execute.
"""preds = ["The capital of France is Paris.", "Water boils at 100 degrees."]
refs = ["The capital of France is Paris.", "Water boils at 100°C."]

print("BLEU Score:", calculate_bleu(preds, refs))
print("Exact Match:", calculate_exact_match(preds, refs))
"""

## - RAG Pipeline :

In [None]:
def generate_response(
    user_question: str,
    vectorstore,
    system_prompt: str
) -> str:
    """Answer a question from the knowledge base, falling back to Wikipedia, then Google.

    Args:
        user_question: The question to answer.
        vectorstore: Store queried through `query_vectorstore`; None is treated
            as "no context".
        system_prompt: Instructions placed once at the top of the model prompt.

    Returns:
        The first non-blank result of: generation over the retrieved context,
        generation over a Wikipedia summary, a message containing a Google
        result link, or a fixed apology when everything else fails.
    """
    # 1. Search the knowledge base.
    relevant_docs = query_vectorstore(vectorstore, user_question)
    context = "\n".join(doc.page_content for doc in relevant_docs)

    # generate_from_prompt prepends the system prompt itself, so it is left out
    # of this body; embedding it here as well sent it to the model twice.
    generation_prompt = f"""User: {user_question}
Context: {context}
Bot:"""

    answer = generate_from_prompt(generation_prompt, system_prompt)

    # 2. No answer from the knowledge base: try Wikipedia.
    if not answer.strip():
        answer = generate_from_wikipedia(user_question, system_prompt)

    # 3. Still nothing: return a search link, or an apology.
    if not answer.strip():
        link = get_google_search_link(user_question)
        if link:
            answer = f"I couldn't find a precise answer. You might find this helpful: {link}"
        else:
            answer = "Sorry, I couldn't find an answer."

    return answer


def evaluate_system(
    test_cases: List[Tuple[str, str]],
    vectorstore,
    system_prompt: str
) -> Dict[str, float]:
    """Score the chatbot on (question, reference answer) pairs.

    Every question is answered with `generate_response`; the answers are then
    compared with the references using BLEU and exact match.

    Returns:
        A dict with the keys "bleu" and "exact_match".
    """
    predictions = [
        generate_response(question, vectorstore, system_prompt)
        for question, _ in test_cases
    ]
    references = [expected for _, expected in test_cases]

    bleu_score = calculate_bleu(predictions, references)
    exact_match_score = calculate_exact_match(predictions, references)
    return {"bleu": bleu_score, "exact_match": exact_match_score}


In [None]:

# Build the knowledge base from the PDFs in the attached Kaggle dataset folder.
# process_pdfs_and_create_vectorstore returns None when it produces no chunks;
# query_vectorstore treats a None store as "no context".
vectorstore = process_pdfs_and_create_vectorstore("/kaggle/input/ai-pdfs/AI")


# Short system prompt used for every call in this cell.
system_prompt = """You are an educational assistant. Provide clear and concise answers based on the provided context."""

# Interactive check: input() blocks until a question is typed, so this cell
# cannot finish unattended.
question = input("ask : ")
response = generate_response(question, vectorstore, system_prompt)
print("Response:", response)


# Minimal (question, reference answer) pairs for a quick metric run.
test_cases = [
    ("What is AI?", "AI stands for Artificial Intelligence."),
    ("What is the capital of France?", "The capital of France is Paris.")
]

# Generates one answer per test case and prints the BLEU / exact-match dict.
scores = evaluate_system(test_cases, vectorstore, system_prompt)
print("Evaluation Scores:", scores)


## - Gradio Interface :

In [None]:
# 9. Gradio Interface
# This cell used to be wrapped in triple quotes to disable it, but the nested
# docstring closed the outer string early and the cell was a SyntaxError. It also
# relied on an `EducationalChatbot` class that this notebook never defines. It is
# now a regular function built on the notebook's own helpers; nothing is launched
# until the caller invokes `.launch()` on the returned object.
def create_chat_interface(
    pdf_folder: str,
    system_prompt: str = "You are an educational assistant. Provide clear and concise answers based on the provided context.",
):
    """Build the Gradio web interface for the chatbot (without launching it).

    Args:
        pdf_folder: Folder of PDFs indexed with `process_pdfs_and_create_vectorstore`.
        system_prompt: Instructions forwarded to `generate_response` for every
            question.

    Returns:
        The configured `gr.Interface`.
    """
    # Index the PDFs once; every request reuses the same store.
    vectorstore = process_pdfs_and_create_vectorstore(pdf_folder)

    def respond(message: str) -> str:
        """Answer a single question typed into the text box."""
        return generate_response(message, vectorstore, system_prompt)

    demo = gr.Interface(
        fn=respond,
        inputs=gr.Textbox(label="Ask your question about AI", placeholder="e.g., What is the difference between AI and ML?"),
        outputs="text",
        title="AI Educational Assistant (English)",
        description="Ask me anything about Artificial Intelligence (English only)",
        examples=[
            "What is the difference between AI and ML?",
            "Explain neural networks in simple terms"
        ],
        cache_examples=True
    )

    return demo