In [14]:
# Install required packages
!pip install -q langchain sentence-transformers faiss-cpu pypdf tiktoken requests

# Import libraries
import os
from typing import Optional, List, Mapping, Any
import requests
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms.base import LLM
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

## Step 1: Custom DeepSeek LLM Wrapper
class DeepSeekLLM(LLM):
    api_key: str
    model: str = "deepseek-chat"
    temperature: float = 0.7
    max_tokens: int = 1000

    @property
    def _llm_type(self) -> str:
        return "deepseek"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": self.temperature,
            "max_tokens": self.max_tokens
        }
        response = requests.post(
            "https://api.deepseek.com/v1/chat/completions",
            headers=headers,
            json=payload
        )
        return response.json()["choices"][0]["message"]["content"]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {
            "model": self.model,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens
        }

## Step 2: Load and Process PDF
def load_and_chunk_pdf(file_path):
    # Load PDF
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    
    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len
    )
    return text_splitter.split_text(text)

## Step 3: Create Vector Store
def create_vector_store(chunks):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    return FAISS.from_texts(chunks, embeddings)

## Step 4: Initialize RAG Pipeline
def initialize_rag_pipeline(api_key, vector_store):
    # Create prompt template
    prompt_template = """Use the following context to answer the question.
    If you don't know the answer, say you don't know. Be precise and technical.

    Context:
    {context}

    Question: {question}
    Answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # Initialize LLM
    llm = DeepSeekLLM(api_key=api_key)

    # Create QA chain
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True
    )

## Step 5: Query Function
def ask_question(qa_chain, question):
    result = qa_chain({"query": question})
    print("🧠 Answer:", result["result"])
    print("\n📚 Sources:")
    for i, doc in enumerate(result["source_documents"]):
        print(f"\nSource {i+1}:")
        print(doc.page_content[:300] + "...")

## Main Execution
if __name__ == "__main__":
    # Get API key from Kaggle Secrets
    from kaggle_secrets import UserSecretsClient
    secrets = UserSecretsClient()
    DEEPSEEK_API_KEY = secrets.get_secret("DEEPSEEK_API_KEY")
    
    # Load and process PDF (replace with your file path)
    print("Loading and processing PDF...")
    chunks = load_and_chunk_pdf("/kaggle/input/human-and-ai-written/Human_and_AI_Written_Text_Detection_Using_Deep_Learning_and_Machine_Learning.pdf")
    
    # Create vector store
    print("Creating vector database...")
    vector_store = create_vector_store(chunks)
    
    # Initialize RAG pipeline
    print("⚙️ Initializing RAG pipeline...")
    qa_chain = initialize_rag_pipeline(DEEPSEEK_API_KEY, vector_store)
    
    # Example questions
    questions = [
        "What is the main contribution of this paper?",
        "What methodology did the authors use?",
        "What were the key findings?",
        "What datasets were used in this research?"
    ]
    
    # Ask questions
    print("\n💬 Ready for questions! Here are some examples:")
    for q in questions:
        print(f"\n❓ Question: {q}")
        ask_question(qa_chain, q)
    
    # Interactive mode
    print("\n🎤 Enter your own questions (type 'quit' to exit):")
    while True:
        user_question = input("\nYour question: ")
        if user_question.lower() == 'quit':
            break
        ask_question(qa_chain, user_question)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


📄 Loading and processing PDF...
🔍 Creating vector database...
⚙️ Initializing RAG pipeline...

💬 Ready for questions! Here are some examples:

❓ Question: What is the main contribution of this paper?
🧠 Answer: The main contribution of the paper appears to be the development and evaluation of machine learning models (specifically Bi-LSTM and Bi-GRU) for classifying human-generated versus AI-generated text. The paper details preprocessing techniques (tokenization, padding, vectorization) and model architectures to distinguish subtle differences between AI and human writing. 

However, the provided context is fragmented, and key sections (e.g., abstract, conclusions) are missing, so the exact novelty (e.g., dataset creation, model performance benchmarks) cannot be definitively stated. For a precise answer, the full paper would need to be reviewed. 

Based on the excerpts, the focus is on:
1. **Methodology**: Use of bidirectional RNN variants (Bi-LSTM/Bi-GRU) for improved context understan


Your question:  exit


🧠 Answer: It seems like your question is incomplete or unclear. If you're asking about the performance of AI text detection models (e.g., Bi-GRU, LSTM, XGBoost, etc.) based on the provided context, I can summarize:

- The **Bi-GRU model** achieves the highest accuracy (99.69%) in distinguishing AI-generated vs. human-written text, outperforming other methods like XGBoost (99%), DistilBERT (92%), and ensemble models.  
- **LSTM and Bi-GRU** misclassify fewer AI-generated texts as human-written (617 and 541 errors, respectively) compared to other models.  
- The table and bar chart highlight the superior performance of deep learning models (especially Bi-GRU) over machine learning and other deep learning approaches.  

If you meant something else by "exit," please clarify your question. Otherwise, let me know if you'd like more details on specific models or metrics.

📚 Sources:

Source 1:
AI text when ChatGPT is told to write like a chemist,” Cell Reports
Physical Science, vol. 4, no. 11


Your question:  exit


KeyboardInterrupt: 