<a href="https://colab.research.google.com/github/NadaOsamaAhmed/Myproject/blob/master/project_dm_psychology2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Ensure the Colab directory that will hold the uploaded PDF corpus exists.
# exist_ok=True makes re-running this cell harmless when the folder is already there.
os.makedirs(name="/content/pdfs", exist_ok=True)
print("Folder created!")

Folder created!


In [8]:
!pip install --upgrade cohere sentence-transformers faiss-cpu torch gradio PyPDF2

import torch
import cohere
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
from PyPDF2 import PdfReader
import numpy as np
import re
import os

# Initialize
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Cohere API key.
# SECURITY: never hardcode the key in the notebook -- it gets committed and
# shared together with the file. The key is read from the COHERE_API_KEY
# environment variable, or asked for interactively (input is not echoed).
# Any key that was ever stored in this notebook must be treated as leaked
# and rotated in the Cohere dashboard.
import getpass

COHERE_API_KEY = os.environ.get("COHERE_API_KEY") or getpass.getpass("Enter your Cohere API key: ")
COHERE_API_KEY = COHERE_API_KEY.strip()
if not COHERE_API_KEY:
    raise ValueError("A Cohere API key is required (set COHERE_API_KEY or enter it when prompted)")
co = cohere.Client(COHERE_API_KEY)

# Load embedding model
# The same model embeds both the document chunks and the incoming questions.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding_model.to(device)

# Step 1: Load PDFs
# Reads psychology_1.pdf .. psychology_10.pdf from pdf_folder. Loading is
# best-effort: a file that cannot be read or that yields no text is reported
# and skipped, and the run only aborts when nothing at all could be loaded.
print("Loading PDFs...")
documents = []     # full extracted text, one entry per successfully loaded PDF
doc_metadata = []  # parallel to `documents`: {"source": file name, "doc_id": file number}

pdf_folder = "/content/pdfs"

for i in range(1, 11):
    pdf_name = f"psychology_{i}.pdf"
    pdf_path = os.path.join(pdf_folder, pdf_name)
    try:
        reader = PdfReader(pdf_path)
        # `or ""` guards against pages for which extract_text() yields nothing;
        # join() avoids rebuilding the string once per page.
        text = "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Deliberately broad: one missing or corrupt file must not stop the others.
        print(f"✗ Error reading PDF {i}: {e}")
        continue

    if not text.strip():
        print(f"Warning: Document {i} is empty, skipped")
        continue

    documents.append(text)
    doc_metadata.append({"source": pdf_name, "doc_id": i})
    print(f"✓ Document {i} loaded: {len(text)} chars")

if not documents:
    raise ValueError("No valid documents found!")

print(f"\nTotal documents loaded: {len(documents)}")

# Step 2: Split documents into chunks
print("\nSplitting documents into chunks...")

def split_text_into_chunks(text, chunk_size=500, chunk_overlap=50):
    """
    Split text into overlapping chunks, preferring to cut at sentence ends.

    Each chunk nominally covers ``chunk_size`` characters. When the nominal cut
    is not the end of the text, the cut is moved to just after the last
    sentence ending ('. ', '.\\n', '? ' or '! ') found within 100 characters
    on either side of it, so a chunk may be up to 100 characters shorter or
    longer than ``chunk_size``. Consecutive chunks share ``chunk_overlap``
    characters.

    Args:
        text: The text to split.
        chunk_size: Nominal chunk length in characters; must be positive.
        chunk_overlap: Characters repeated at the start of the next chunk;
            must be smaller than ``chunk_size``.

    Returns:
        List of stripped, non-empty chunk strings in document order. Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            not smaller than ``chunk_size`` (either would prevent progress).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    boundary_window = 100  # how far around the nominal cut to look for a sentence end
    sentence_endings = ('. ', '.\n', '? ', '! ')

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size

        # If not at the end, try to break at a sentence boundary
        if end < text_length:
            # Never look behind `start`: a negative slice index would wrap
            # around to the end of the text and corrupt the computed cut.
            window_start = max(end - boundary_window, start)
            boundary_text = text[window_start:end + boundary_window]
            sentence_end = max(boundary_text.rfind(marker) for marker in sentence_endings)

            if sentence_end != -1:
                # +1 keeps the punctuation mark inside the current chunk.
                end = window_start + sentence_end + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # The chunk just emitted already reaches the end of the text; stepping
        # back by the overlap would only emit a duplicate tail fragment.
        if end >= text_length:
            break

        # Step back by the overlap, but always move forward so the loop
        # terminates even when a sentence boundary produced a very short chunk.
        next_start = end - chunk_overlap
        start = next_start if next_start > start else end

    return chunks

# Flat list of every chunk across all documents, plus a parallel list giving
# the metadata dict of the document each chunk came from (same index).
chunks = []
chunk_metadata = []

for doc, meta in zip(documents, doc_metadata):
    doc_chunks = split_text_into_chunks(doc, chunk_size=500, chunk_overlap=50)
    chunks.extend(doc_chunks)
    # Every chunk of a document points at that document's metadata dict.
    chunk_metadata.extend([meta] * len(doc_chunks))
    # Label with the recorded doc_id rather than the list position, which
    # drifts from the file number as soon as an earlier PDF was skipped.
    print(f"Document {meta['doc_id']}: {len(doc_chunks)} chunks")

print(f"\nTotal chunks: {len(chunks)}")

# Step 3: Create embeddings
# One vector per chunk; row i of `embeddings` corresponds to chunks[i].
print("\nCreating embeddings...")
embeddings = embedding_model.encode(chunks, show_progress_bar=True, device=device)
# FAISS works on float32 matrices, so normalise the dtype explicitly.
embeddings = np.array(embeddings).astype('float32')

# Step 4: Create FAISS index
# Exact (brute-force) L2 index; vector ids are the row positions, i.e. the
# same indices used by `chunks` and `chunk_metadata`.
print("Building FAISS index...")
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"✓ FAISS index built with {index.ntotal} vectors")

# Step 5: RAG Query Function
def rag_query(question, top_k=3, use_cohere=True):
    """
    Answer a question with retrieval-augmented generation.

    The question is embedded with the module-level ``embedding_model``, the
    nearest chunks are looked up in the module-level FAISS ``index``, and,
    when requested, Cohere generates an answer grounded in those chunks.

    Args:
        question: The question to answer.
        top_k: Number of nearest chunks to request from the index.
        use_cohere: If True, send the question and the retrieved chunks to
            ``co.chat``; if False, return the retrieved text as the answer.

    Returns:
        dict with the keys:
            "answer": the generated answer, an error message if the Cohere
                call raised, or the retrieved texts when ``use_cohere`` is
                False.
            "sources": source file names of the retrieved chunks, without
                duplicates, in retrieval order.
            "context": the formatted retrieved chunks, cut to 1000
                characters followed by "..." when longer.
            "citations": citations reported by Cohere, or an empty list.
    """
    # Embed the question
    query_embedding = embedding_model.encode([question], device=device)
    query_embedding = np.array(query_embedding).astype('float32')

    # Search FAISS (distances are not needed here)
    _, indices = index.search(query_embedding, top_k)

    # FAISS pads the result with -1 when fewer than top_k vectors exist;
    # using -1 as a list index would silently return the LAST chunk.
    hit_ids = [int(i) for i in indices[0] if i >= 0]

    # Retrieve relevant chunks
    retrieved_chunks = [chunks[i] for i in hit_ids]
    retrieved_sources = [chunk_metadata[i]['source'] for i in hit_ids]

    # Build context
    context = "\n\n".join(
        f"[Source {rank}: {src}]\n{chunk}"
        for rank, (chunk, src) in enumerate(zip(retrieved_chunks, retrieved_sources), start=1)
    )

    if use_cohere:
        # Use Cohere for generation
        try:
            response = co.chat(
                message=question,
                documents=[{"text": chunk} for chunk in retrieved_chunks],
                model="command-a-03-2025",  # or command-r
                temperature=0.3,
                prompt_truncation="AUTO"
            )
            answer = response.text
            # Normalise a missing or None attribute to an empty list so the
            # "citations" entry has the same type on every path.
            citations = getattr(response, 'citations', None) or []
        except Exception as e:
            # Best-effort: surface the failure as the answer instead of
            # crashing the UI callback.
            answer = f"Error with Cohere API: {str(e)}\n\nPlease check your API key"
            citations = []
    else:
        # Fallback: return context only
        answer = f"Retrieved texts:\n\n{context}"
        citations = []

    return {
        "answer": answer,
        # dict.fromkeys de-duplicates while keeping retrieval order (a set would not).
        "sources": list(dict.fromkeys(retrieved_sources)),
        "context": (context[:1000] + "...") if len(context) > 1000 else context,
        "citations": citations
    }

# Step 6: Gradio Interface
def chat_interface(question):
    """
    Gradio callback: answer the question typed into the UI.

    Args:
        question: Text from the question box; may be None or blank.

    Returns:
        The answer text produced by ``rag_query``, or a prompt asking for a
        question when the input is None, empty or only whitespace.
    """
    # Guard against None as well as blank input so the callback never raises
    # AttributeError on a missing value.
    if question is None or not question.strip():
        return "Please enter a question"

    result = rag_query(question, top_k=3, use_cohere=True)

    return result['answer']

# Create Gradio UI
# Single-page UI: a question box, a search button, the answer box and a few
# clickable example questions. The button is wired to chat_interface.
# NOTE(review): the recorded run emitted a warning pointing at this
# gr.Blocks(...) line; newer Gradio releases may expect `theme` to be passed
# elsewhere -- TODO confirm against the installed Gradio version.
with gr.Blocks(title="Psychology RAG System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Psychology & Mental Health RAG System")
    gr.Markdown("Ask any question about psychology and mental health disorders")

    question_input = gr.Textbox(
        label="Question",
        placeholder="Example: What is depression and what are its symptoms?",
        lines=3
    )

    submit_btn = gr.Button("Search", variant="primary")

    answer_output = gr.Textbox(label="Answer", lines=15)

    # Clicking "Search" sends the question text to chat_interface and shows
    # the returned string in the answer box.
    submit_btn.click(
        fn=chat_interface,
        inputs=[question_input],
        outputs=[answer_output]
    )

    # Examples: clicking one fills the question box.
    gr.Examples(
        examples=[
            ["What is depression?"],
            ["What is the difference between anxiety and stress?"],
            ["How is OCD treated?"],
            ["What are the symptoms of PTSD?"],
            ["Explain cognitive behavioral therapy"]
        ],
        inputs=[question_input]
    )

# Launch
# Print a three-line banner, then start the app. share=True requests a public
# link; debug=True keeps the cell running so errors and logs stay visible.
print("\n" + "=" * 50, "Starting Gradio interface...", "=" * 50, sep="\n")
demo.launch(share=True, debug=True)


Using device: cpu
Loading PDFs...
âœ“ Document 1 loaded: 12010 chars
âœ“ Document 2 loaded: 11758 chars
âœ“ Document 3 loaded: 10381 chars
âœ“ Document 4 loaded: 19475 chars
âœ“ Document 5 loaded: 13275 chars
âœ“ Document 6 loaded: 14793 chars
âœ“ Document 7 loaded: 8503 chars
âœ“ Document 8 loaded: 13938 chars
âœ“ Document 9 loaded: 8638 chars
âœ“ Document 10 loaded: 14948 chars

Total documents loaded: 10

Splitting documents into chunks...
Document 1: 26 chunks
Document 2: 26 chunks
Document 3: 23 chunks
Document 4: 43 chunks
Document 5: 29 chunks
Document 6: 32 chunks
Document 7: 19 chunks
Document 8: 31 chunks
Document 9: 19 chunks
Document 10: 31 chunks

Total chunks: 279

Creating embeddings...


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Building FAISS index...
âœ“ FAISS index built with 279 vectors


  with gr.Blocks(title="Psychology RAG System", theme=gr.themes.Soft()) as demo:



Starting Gradio interface...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f7e14e4c2ec3de3f02.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://f7e14e4c2ec3de3f02.gradio.live


