In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Authenticate this session with Kaggle before any kagglehub download call is made.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition data, the sentence-transformers dataset and two model
# checkpoints. Each call's return value is kept in a `*_path` variable.
# NOTE(review): none of these `*_path` variables is referenced by the other
# visible cells, which use hard-coded '/kaggle/input/...' paths instead, and the
# phi checkpoint is not used anywhere in the visible notebook — confirm whether
# the hard-coded paths should be derived from these values.
casml_generative_ai_hackathon_path = kagglehub.competition_download('casml-generative-ai-hackathon')
erenakbulut_sentence_transformers_path = kagglehub.dataset_download('erenakbulut/sentence-transformers')
microsoft_phi_transformers_2_1_path = kagglehub.model_download('Microsoft/phi/Transformers/2/1')
google_gemma_transformers_2b_it_3_path = kagglehub.model_download('google/gemma/Transformers/2b-it/3')

print('Data source import complete.')


In [None]:
# Install the notebook's third-party dependencies into the runtime.
# NOTE(review): accelerate is pinned to 0.22.0 in the first command but listed
# again, unpinned and together with --upgrade, in the second one — confirm which
# version is actually intended.
!pip install --upgrade transformers accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21
!pip install bitsandbytes==0.41.1 chromadb==0.4.12 datasets accelerate --upgrade huggingface_hub
# NOTE(review): torchvision is not imported by any visible cell, and 0.4.0 looks
# far older than the other pinned packages — verify this install is needed and
# compatible with the installed torch.
!pip install torchvision==0.4.0 -f https://download.pytorch.org/whl/torch_stable.html
!pip install -U sentence-transformers pypdf2 pdfplumber

In [None]:
import sys
from torch import cuda, bfloat16
import torch
import transformers
import chromadb
from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

In [None]:
# Local checkpoint directory; passed to from_pretrained for both the model and the tokenizer.
model_id = '/kaggle/input/gemma/transformers/2b-it/3'

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the causal LM in half precision and let accelerate place it on the
# available device(s). Precision and placement are load-time options of
# from_pretrained; requesting them only later on `transformers.pipeline(...)`
# with an already-instantiated model does not reload or move the weights.
# trust_remote_code is intentionally not passed: it permits executing Python
# code shipped with a checkpoint and is only needed for checkpoints that bring
# their own modelling code.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

In [None]:

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# NOTE(review): `model` is passed as an instantiated object, so torch_dtype and
# device_map given here presumably do not change how the weights were loaded —
# confirm against the transformers pipeline documentation.
# NOTE(review): max_length presumably bounds prompt and generated tokens
# together; verify that 10000 is within the model's context window.
query_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",
        max_length = 10000,
)

In [None]:
import pandas as pd

# Load the competition queries. Later code reads the `query_id` and `question`
# columns of this frame row by row.
df = pd.read_json('/kaggle/input/casml-generative-ai-hackathon/Dataset_RAG (1)/queries.json')

# Show the first rows as the cell output.
df.head()

In [None]:
# Expose the transformers pipeline through LangChain's LLM wrapper; this object
# is handed to RetrievalQA.from_chain_type as `llm` further down.
llm = HuggingFacePipeline(pipeline=query_pipeline)

In [None]:
from PyPDF2 import PdfReader
import pdfplumber

def _overlap_with_previous(previous_text, current_text, overlap_size):
    """Prefix ``current_text`` with the last ``overlap_size`` words of ``previous_text``."""
    current_words = current_text.split()
    if overlap_size <= 0:
        # ``words[-0:]`` selects *every* word, so "no overlap" needs its own branch.
        tail_words = []
    else:
        # Slicing past the start simply yields the whole (shorter) page.
        tail_words = previous_text.split()[-overlap_size:]
    return " ".join(tail_words + current_words)


def _sections_from_pages(pages, overlap_size):
    """Build ``{"page", "text"}`` dicts from page objects, skipping pages without text."""
    sections = []
    previous_text = None  # Text of the last non-empty page, source of the overlap words
    for page_number, page in enumerate(pages, start=1):
        text = page.extract_text()
        if not (text and text.strip()):
            continue
        text = text.strip()
        if previous_text:
            section_text = _overlap_with_previous(previous_text, text, overlap_size)
        else:
            # First non-empty page: nothing to overlap with, keep the text as extracted.
            section_text = text
        sections.append({"page": page_number, "text": section_text})
        previous_text = text
    return sections


def load_book_with_page_overlap(pdf_path, overlap_size=0):
    """Load and preprocess the book from PDF with word overlap between pages.

    The PDF is read with PyPDF2 first; if that raises, the whole file is read
    again with pdfplumber. Sections are built independently for each attempt, so
    pages extracted before a PyPDF2 failure are not duplicated by the fallback.

    Args:
        pdf_path: Path of the PDF file to read.
        overlap_size (int): Number of trailing words of the previous non-empty
            page that are prepended to each page. Values <= 0 disable overlap.

    Returns:
        list[dict] | None: One ``{"page": <1-based page number>, "text": <str>}``
        entry per page that yielded text, or ``None`` if both readers failed.
    """
    try:
        # Try loading with PyPDF2
        reader = PdfReader(pdf_path, strict=False)  # Ignore structural issues
        return _sections_from_pages(reader.pages, overlap_size)
    except Exception as e:
        print(f"Error loading PDF with PyPDF2: {e}")
        print("Falling back to pdfplumber...")

    try:
        # Use pdfplumber as a fallback
        with pdfplumber.open(pdf_path) as pdf:
            return _sections_from_pages(pdf.pages, overlap_size)
    except Exception as fallback_error:
        print(f"Error loading PDF with pdfplumber: {fallback_error}")
        return None

# Read the competition book, carrying five words over from each page to the next.
pdf_path = '/kaggle/input/casml-generative-ai-hackathon/Dataset_RAG (1)/book.pdf'
book_data = load_book_with_page_overlap(pdf_path, overlap_size=5)

# Both None and an empty list are reported as a failed load.
load_status = (
    f"Loaded {len(book_data)} pages successfully."
    if book_data
    else "Failed to load the PDF."
)
print(load_status)

In [None]:

# Preview: page number and the first 150 characters of the first five sections.
for section in book_data[:5]:
    print(f"Page {section['page']} - Text: {section['text'][:150]}...")

In [None]:
from datasets import Dataset
# Build a datasets.Dataset from the list of page dicts and print its summary.
# NOTE(review): `dataset` is not referenced by any other visible cell.
dataset = Dataset.from_list(book_data)
print(dataset)

In [None]:
# Run the sentence-embedding model on the GPU when one is visible to torch and
# fall back to the CPU otherwise, instead of failing on a CPU-only runtime.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
local_model_path = "/kaggle/input/sentence-transformers/minilm-l6-v2/all-MiniLM-L6-v2"

# Embedding function used when the chunks are indexed into the vector store.
embeddings = HuggingFaceEmbeddings(model_name=local_model_path, model_kwargs=model_kwargs)

In [None]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# One LangChain Document per loaded page; the page number travels along as metadata.
documents = list(
    map(
        lambda section: Document(
            page_content=section['text'],
            metadata={"page": section['page']},
        ),
        book_data,
    )
)

# Splitter configuration: chunk_size=1000 with chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Chunks produced from the page documents; these are what gets indexed.
all_splits = text_splitter.split_documents(documents)

In [None]:
# Embed every chunk with `embeddings` and store it in a Chroma collection that is
# persisted under the "chroma_db" directory.
# NOTE(review): re-running this cell against an existing "chroma_db" directory
# may add the same chunks a second time — confirm, and clear the directory
# first if so.
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

In [None]:
# Retriever over the Chroma store, created with as_retriever()'s default settings.
retriever = vectordb.as_retriever()

# Module-level RetrievalQA chain ("stuff" chain type); query_and_retrieve calls
# it through qa.run(prompt).
# NOTE(review): the prompt passed to qa.run already embeds retrieved context, and
# this chain has its own retriever — presumably it retrieves again using the
# whole prompt as the query; confirm that is intended.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True
)

In [None]:
import csv
import json

def _merge_ranked_results(*result_lists):
    """Interleave ranked hit lists (rank 1 of each, then rank 2, ...) without repeated passages."""
    merged = []
    seen_texts = set()
    longest = max((len(results) for results in result_lists), default=0)
    for rank in range(longest):
        for results in result_lists:
            if rank >= len(results):
                continue
            doc = results[rank]
            if doc.page_content in seen_texts:
                continue
            seen_texts.add(doc.page_content)
            merged.append(doc)
    return merged


def query_and_retrieve(query, query_id, vectordb, k=5, max_contexts=2, max_pages=3):
    """
    Perform a precision-based retrieval and generate a structured answer.

    Two similarity searches are issued: a "definition explanation" variant and an
    "example application" variant of the query. Their hits are merged by rank,
    alternating between the two lists and dropping repeated passages, so both
    searches can contribute to the context. The page references are read from
    the same merged ranking, most relevant first, so they describe the passages
    that were actually retrieved for this query.

    Args:
        query (str): The query string.
        query_id: Identifier copied unchanged into the ``ID`` field of the result.
        vectordb: The vector database object to perform similarity searches. Its
            hits must expose ``page_content`` and a ``metadata`` mapping.
        k (int): Number of hits requested from each similarity search.
        max_contexts (int): Number of merged passages joined into the context.
        max_pages (int): Maximum number of distinct pages listed as references.

    Returns:
        dict: ``ID``, ``context``, ``answer`` and ``references``, where
        ``references`` is a JSON string holding ``sections`` and ``pages``.
    """
    # Stage 1: Core concept retrieval
    concept_results = vectordb.similarity_search(
        query=f"definition explanation {query}",
        k=k,
    )

    # Stage 2: Supporting detail retrieval
    detail_results = vectordb.similarity_search(
        query=f"example application {query}",
        k=k,
    )

    # Combine both stages into one ranking that alternates between them.
    ranked_docs = _merge_ranked_results(concept_results, detail_results)

    # Context: the best-ranked passages of the merged list.
    combined_context = ' '.join(doc.page_content for doc in ranked_docs[:max_contexts])

    # References: distinct pages in relevance order (kept as strings in the output).
    sources = []
    for doc in ranked_docs:
        if len(sources) >= max_pages:
            break
        page = str(doc.metadata.get('page', 'Unknown'))
        if page not in sources:
            sources.append(page)

    # Generate structured answer using the combined context
    prompt = f""" refer context and answer


### Input:
**Context:**
{combined_context}

**Question:**
{query}

### Output:
**Answer:**

"""
    # Call the module-level QA chain; a failure is reported and replaced by a
    # placeholder answer so that one bad query does not abort a whole batch.
    try:
        result = qa.run(prompt)
    except Exception as e:
        print(f"Error during model execution: {e}")
        result = "Error generating answer"

    # Map the query to a precise section
    section = map_to_precise_section(query, combined_context)

    # Format the response
    return {
        'ID': query_id,
        'context': combined_context,
        'answer': result,
        'references': json.dumps({
            'sections': [section],
            'pages': sources
        })
    }


def create_submission_file(queries_df, vector_store):
    """
    Answer every query in ``queries_df`` and write the answers to 'submission.csv'.

    Args:
        queries_df (pd.DataFrame): Test-set queries; the ``query_id`` and
            ``question`` columns are read from each row.
        vector_store: The vector database forwarded to ``query_and_retrieve``.
    """
    # One result dict per input row, kept in the frame's row order.
    answers = []
    for _, record in queries_df.iterrows():
        query_id, question = record['query_id'], record['question']
        answers.append(query_and_retrieve(question, query_id, vector_store))

    # Each result dict becomes one CSV row; the index is not written.
    pd.DataFrame(answers).to_csv('submission.csv', index=False)
    print("Submission file saved as 'submission.csv'")

def map_to_precise_section(query, context):
    """
    Map the query to the most relevant section based on query and context keywords.

    Keywords are matched as case-insensitive substrings. The query is checked
    against every keyword first; the retrieved context is consulted only when
    the query itself contains none of them, so an incidental word in a long
    context cannot override what the question is actually about. Among several
    matching keywords the one listed first in the mapping wins.

    Args:
        query (str): The user question; ``None`` is treated as empty.
        context (str): Retrieved context text; ``None`` is treated as empty.

    Returns:
        str: The mapped section path, or "introduction_to_psychology" when no
        keyword matches either text.
    """
    section_mapping = {
        'scientific': 'psychological_research/approaches_to_research',
        'brain': 'biopsychology/the_brain_and_behavior',
        'memory': 'memory/how_memory_functions',
        'development': 'developmental_psychology/introduction',
        'personality': 'personality/introduction_to_personality',
        'emotion': 'emotion_and_motivation/emotions',
        'learning': 'learning/introduction',
        'social': 'social_psychology/introduction'
    }

    # Lower-case each text once instead of once per keyword.
    query_text = (query or "").lower()
    context_text = (context or "").lower()

    # Query takes priority over context.
    for text in (query_text, context_text):
        for key, section in section_mapping.items():
            if key in text:
                return section
    return "introduction_to_psychology"


In [None]:
# Answer every loaded query against the vector store and write 'submission.csv'.
create_submission_file(df, vectordb)