In [None]:
# IMPORTANT: some Kaggle data sources are private.
# Run this cell first so the data-source downloads in the next cell can
# authenticate; kagglehub.login() asks for Kaggle credentials.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: run this cell to import your Kaggle data sources,
# then feel free to delete this cell.
# NOTE: this notebook environment differs from Kaggle's Python
# environment, so there may be missing libraries used by your
# notebook.

# Fetch the competition, the article dataset and the model entry.
# The returned values are kept in *_path variables; nothing in the visible
# part of this file reads them afterwards (main() uses a fixed
# /kaggle/input/... folder instead).
gemini_long_context_path = kagglehub.competition_download('gemini-long-context')
kane0068_ai_etic_articles_path = kagglehub.dataset_download('kane0068/ai-etic-articles')
google_gemini_1_5_flash_api_api_gemini_1_5_flash_1_path = kagglehub.model_download('google/gemini-1.5-flash-api/Api/gemini-1.5-flash/1')

print('Data source import complete.')


# ❓ InquisitAI - The Art of Questioning ❓

## 🌟 What's the foundation of thinking?
It's asking questions. From Einstein to Socrates, history's greatest thinkers all started by asking questions. Today, I'm introducing InquisitAI, a system built on this fundamental principle.

## 🤖 Introduction
- InquisitAI is an AI-powered document analysis system
- Leveraging the power of Gemini AI, the system doesn't just read documents - it generates intelligent questions about them
- Each question learns from the previous answer, diving deeper into the content
- The system begins by processing PDF and TXT files, continuously questioning and exploring


## 🧠 Knowledge and Questioning Process

High-quality questioning requires deep knowledge. It demands the insight and experience that an expert gains over years of study. InquisitAI, powered by the Gemini model:
- Analyzes hundreds of pages in seconds
- Identifies relationships between texts
- Catches subtle details in research methodology
- Most importantly, generates critical questions like an expert

Through these questions and answers, users:
- Discover research blind spots
- Identify new research opportunities
- Detect methodological weaknesses
- Enhance their own thinking processes


## 🔄 Question Generation Mechanism

How does the system generate questions?

- 🔍 Gap identification
- 📊 Methodological concern detection
- ⚡ Contradiction discovery
- 🎯 Unexplored impact analysis
- ✅ Validity assessment of conclusions

## 💡 Potential Use Cases

### 📚 Academic Research
- Literature review
- Methodology assessment
- Research gap identification

### 💼 Business World
- Market research analysis
- Competitive analysis
- Strategy document evaluation

### 🎓 Education
- Student work assessment
- Course material analysis
- Critical thinking skills development

## 🤝 AI and Human Collaboration

InquisitAI exemplifies how artificial intelligence complements rather than competes with humans. The system expands human thinking capacity, accelerates research processes, and offers new perspectives. This allows researchers, students, and professionals to dedicate their time to creative thinking and innovative solutions rather than mechanical analysis.

### ✨ Every great discovery, every significant invention, begins with the right question. InquisitAI harnesses the power of artificial intelligence to generate these questions for you, guiding you toward new discoveries. Because we know that without asking the right questions, we cannot reach the right answers. Let InquisitAI keep your questions flowing.

In [None]:
# Importing libraries

import json
from typing import List, Dict, Optional, Union
from dataclasses import dataclass
import random
from datetime import datetime
import google.generativeai as genai
from kaggle_secrets import UserSecretsClient
import pypdf
import os
from pathlib import Path
import mimetypes


# Initialize Gemini API.
# The key is read from the Kaggle secret named "GEMINI_API_KEY" and applied
# globally via genai.configure, so it must run before any model is created.
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("GEMINI_API_KEY")
genai.configure(api_key=api_key)

@dataclass
class TokenUsage:
    """Mutable counter of token consumption.

    InquisitAI keeps one instance as a running total and adds the counts of
    every model response to it.
    """

    # Tokens sent to the model (prompt side).
    prompt_tokens: int
    # Tokens generated by the model (candidate side).
    completion_tokens: int
    # Total token count as reported by the response metadata.
    total_tokens: int

@dataclass
class Document:
    """Bookkeeping record for one file handed to the model."""

    # Extracted text; InquisitAI.load_documents stores "" here because the
    # raw file bytes are passed to Gemini instead.
    content: str
    # Path of the source file, as a string.
    file_path: str
    # Lower-cased file extension without the leading dot (e.g. "pdf").
    file_type: str
    # Moment the record was created.
    timestamp: datetime

@dataclass
class Question:
    """A question produced by the model about the loaded documents."""

    # Question text (or an error / "SATISFIED" marker set by the generator).
    content: str
    # Moment the question was created.
    timestamp: datetime
    # Token counts for the generating call, keyed by "prompt_tokens",
    # "completion_tokens" and "total_tokens".
    token_usage: Dict

@dataclass
class Answer:
    """The model's answer to a generated question."""

    # Answer text, stripped of surrounding whitespace by the producer.
    content: str
    # Moment the answer was created.
    timestamp: datetime
    # Token counts for the answering call, keyed by "prompt_tokens",
    # "completion_tokens" and "total_tokens".
    token_usage: Dict

@dataclass
class Analysis:
    """Structured quality assessment of one question/answer pair."""

    # Parsed assessment with the keys "scores", "needs_followup" and
    # "recommendations" (defaults are substituted when parsing fails).
    content: Dict
    # Moment the analysis was created.
    timestamp: datetime
    # Token counts for the analysis call, keyed by "prompt_tokens",
    # "completion_tokens" and "total_tokens".
    token_usage: Dict

def upload_to_gemini(file_path: str) -> Dict:
    """Package a local file as an inline content part.

    Despite the name, nothing is sent anywhere here: the file is read from
    disk and returned together with a guessed MIME type.

    Args:
        file_path: Location of the file to read.

    Returns:
        A dict with "mime_type" (guessed from the file name, falling back to
        'text/plain' when no guess is possible) and "data" (the raw bytes).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    guessed_type, _encoding = mimetypes.guess_type(file_path)
    raw_bytes = Path(file_path).read_bytes()

    return {
        # Unknown extensions are treated as plain text.
        "mime_type": guessed_type if guessed_type is not None else 'text/plain',
        "data": raw_bytes,
    }




class InquisitAI:
    """Question-driven document analysis on top of a Gemini chat session.

    Typical flow: ``load_documents`` seeds a chat with the file contents,
    ``run_discussion`` alternates question / answer / analysis turns, and
    ``export_results`` writes the collected discussion to a JSON file.

    Attributes:
        current_context: Start of the most recent answer (at most 500
            characters); drives follow-up question generation.
        discussion_thread: Every discussion entry recorded so far.
        discussion_history: Entries written out by ``export_results``.
        documents: Records for the files in the current chat session.
        total_token_usage: Running token totals across all model calls.
    """

    # Exact reply the model is asked to give when no follow-up is needed.
    SATISFIED_TOKEN = "SATISFIED"

    def __init__(
        self,
        model_name: str = "gemini-1.5-flash-latest",
        generation_config: Optional[Dict] = None,
    ):
        """Create the model, an empty chat session and empty bookkeeping.

        Args:
            model_name: Gemini model identifier passed to the SDK.
            generation_config: Generation settings; when omitted the
                defaults below are used.
        """
        if generation_config is None:
            generation_config = {
                "temperature": 1,
                "top_p": 0.95,
                "top_k": 64,
                "max_output_tokens": 8192,
                "response_mime_type": "text/plain",
            }

        self.current_context = ""  # Holds the start of the last answer
        self.discussion_thread = []  # Follows the flow of the discussion

        self.model = genai.GenerativeModel(
            model_name=model_name,
            generation_config=generation_config,
        )
        self.chat_session = self.model.start_chat()

        self.documents: List[Document] = []

        self.discussion_history = []
        self.total_token_usage = TokenUsage(0, 0, 0)

    def update_token_count(self, response) -> Dict:
        """Add a response's token counts to the running totals.

        Args:
            response: Model response exposing ``usage_metadata`` with
                ``prompt_token_count``, ``candidates_token_count`` and
                ``total_token_count``.

        Returns:
            The counts of this single response, keyed by "prompt_tokens",
            "completion_tokens" and "total_tokens".
        """
        metadata = response.usage_metadata
        token_usage = {
            "prompt_tokens": metadata.prompt_token_count,
            "completion_tokens": metadata.candidates_token_count,
            "total_tokens": metadata.total_token_count,
        }

        self.total_token_usage.prompt_tokens += token_usage["prompt_tokens"]
        self.total_token_usage.completion_tokens += token_usage["completion_tokens"]
        self.total_token_usage.total_tokens += token_usage["total_tokens"]

        print("\nToken Usage for this response:")
        print(f"Prompt tokens: {token_usage['prompt_tokens']}")
        print(f"Completion tokens: {token_usage['completion_tokens']}")
        print(f"Total tokens: {token_usage['total_tokens']}")

        return token_usage

    def load_documents(self, file_paths: List[str]) -> bool:
        """Read the given files and start a fresh chat session seeded with them.

        Paths that are not existing files are reported and skipped. The new
        session replaces the previous one, so ``documents`` is replaced (not
        extended) and the follow-up context is cleared to match it.

        Args:
            file_paths: Paths of the files to hand to the model.

        Returns:
            True when at least one file was read and the model's reply to
            the verification prompt mentions "confirm" or "understand";
            False otherwise, including on any error.
        """
        try:
            uploaded_files = []
            loaded_documents = []

            for file_path in file_paths:
                path = Path(file_path)
                if not path.is_file():
                    print(f"File not found: {file_path}")
                    continue

                print(f"\nUploading document: {path.name}")
                # NOTE(review): the file bytes travel inline with the chat
                # history; very large inputs may need the SDK's file upload
                # API instead - confirm against the request size limits.
                uploaded_files.append(upload_to_gemini(str(path)))

                loaded_documents.append(Document(
                    content="",  # Content will be handled by Gemini
                    file_path=str(path),
                    file_type=path.suffix.lower()[1:],
                    timestamp=datetime.now()
                ))

            if not uploaded_files:
                print("Error: No valid documents were uploaded")
                return False

            # Initialize chat session with uploaded files
            history = [{
                "role": "user",
                "parts": uploaded_files
            }]
            self.chat_session = self.model.start_chat(history=history)

            # Keep the bookkeeping in step with what the new session holds.
            self.documents = loaded_documents
            self.current_context = ""

            # Verify document understanding
            verify_prompt = f"I have uploaded {len(uploaded_files)} documents for analysis. Please confirm you can access and understand their content."
            response = self.chat_session.send_message(verify_prompt)
            self.update_token_count(response)

            # Keyword heuristic only: any reply containing either word passes.
            reply = response.text.lower()
            if "confirm" in reply or "understand" in reply:
                print("\nDocuments successfully loaded and processed:")
                for doc in self.documents:
                    print(f"- {Path(doc.file_path).name}")
                return True

            print("Error: Model did not confirm document understanding")
            return False

        except Exception as e:
            # Deliberately broad: file errors and SDK/network errors are all
            # reported the same way and turned into a False result.
            print(f"Error in document loading: {str(e)}")
            return False

    @staticmethod
    def _is_satisfied(text: str) -> bool:
        """Return True when *text* is the SATISFIED marker, ignoring quotes,
        trailing punctuation, markdown emphasis and letter case."""
        normalized = text.strip().strip("\"'`*.! ").upper()
        return normalized == InquisitAI.SATISFIED_TOKEN

    @staticmethod
    def _extract_first_question(text: str) -> str:
        """Reduce a model reply to its first question.

        A leading list marker ("1.", "2)", "-", "*", "•") is dropped and the
        text is cut after the first question mark. Text without a usable
        question mark is returned unchanged (apart from stripping).
        """
        cleaned = text.strip()

        marker, separator, remainder = cleaned.partition(" ")
        is_numbered = (
            len(marker) > 1 and marker[-1] in ".)" and marker[:-1].isdigit()
        )
        if separator and (is_numbered or marker in ("-", "*", "•")):
            cleaned = remainder.lstrip()

        end = cleaned.find("?")
        if end <= 0:
            # No question mark, or nothing in front of it: leave as is.
            return cleaned
        return cleaned[: end + 1]

    @staticmethod
    def _parse_analysis(raw_text: str) -> Dict:
        """Extract and validate the JSON object in an analysis reply.

        Everything outside the outermost braces (markdown fences, preamble)
        is ignored.

        Raises:
            ValueError: If no JSON object is found, it cannot be decoded
                (json.JSONDecodeError is a ValueError), or a required key is
                missing.
        """
        start = raw_text.find("{")
        end = raw_text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("No JSON object found in analysis")

        parsed = json.loads(raw_text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("Analysis is not a JSON object")

        required_keys = {"scores", "needs_followup", "recommendations"}
        if not required_keys.issubset(parsed):
            raise ValueError("Missing required keys in analysis")
        return parsed

    def generate_question(self) -> "Question":
        """Ask the model for one focused question about the documents.

        Without prior context an opening question is requested; otherwise a
        follow-up to the stored answer excerpt, where the model may reply
        "SATISFIED" instead.

        Returns:
            A Question whose content is the first question in the reply, the
            exact string "SATISFIED", or an error text when no documents are
            loaded (in which case no model call is made).
        """
        if not self.documents:
            return Question(
                content="Error: No documents loaded",
                timestamp=datetime.now(),
                token_usage={"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
            )

        if not self.current_context:
            prompt = f"""Based on the {len(self.documents)} documents provided earlier,
            generate ONE specific, focused question that critically examines:
            - A potential gap or limitation in the research
            - A methodological concern
            - An unexplored implication
            - A conflicting finding
            - The validity of a specific conclusion

            Requirements:
            1. Generate ONLY ONE question
            2. The question must be specific and focused on a single issue
            3. Avoid broad, multi-part questions
            4. Focus on concrete rather than abstract concerns

            Format: Return only the single question, without any preamble or additional text."""
        else:
            prompt = f"""Based on the previous answer: "{self.current_context}"

            If the answer was inadequate or raised new issues, generate ONE specific follow-up question that:
            1. Addresses any unclear or unsupported claims in the previous answer
            2. Probes deeper into a specific point made
            3. Challenges a specific assumption

            Requirements:
            1. Generate ONLY ONE focused question
            2. The question must directly relate to the previous answer
            3. If the answer was fully satisfactory, respond with exactly "SATISFIED"

            Format: Return only the single question or "SATISFIED", without any other text."""

        response = self.chat_session.send_message(prompt)
        token_usage = self.update_token_count(response)

        # The model sometimes returns several questions or a numbered list;
        # keep only the first one.
        content = self._extract_first_question(response.text)
        if self._is_satisfied(content):
            # Normalise so callers can compare against the exact marker.
            content = self.SATISFIED_TOKEN

        return Question(
            content=content,
            timestamp=datetime.now(),
            token_usage=token_usage
        )

    def get_answer(self, question: str) -> "Answer":
        """Ask the model to answer *question* within the current chat.

        Args:
            question: The question text to answer.

        Returns:
            An Answer holding the stripped reply and its token counts.
        """
        prompt = f"""Provide a specific, detailed answer to this question:
        {question}

        Requirements:
        1. Address the question directly and specifically
        2. Provide concrete examples or evidence where possible
        3. Acknowledge any limitations or uncertainties
        4. Stay focused on the specific question asked

        Format: Provide a clear, structured response that directly addresses the question."""

        response = self.chat_session.send_message(prompt)
        token_usage = self.update_token_count(response)

        return Answer(
            content=response.text.strip(),
            timestamp=datetime.now(),
            token_usage=token_usage
        )

    def analyze_answer(self, question: str, answer: str) -> "Analysis":
        """Have the model score an answer for relevance, depth and clarity.

        Args:
            question: The question that was asked.
            answer: The answer to assess.

        Returns:
            An Analysis whose content has "scores", "needs_followup" and
            "recommendations". When the reply cannot be parsed, neutral
            default values are used and the parsing error is printed.
        """
        analysis_prompt = f"""Analyze this Q&A concisely:
        Question: {question}
        Answer: {answer}

        Provide a structured analysis with scores (1-10) for relevance, depth, and clarity.
        Format your response EXACTLY like this example:
        {{
            "scores": {{
                "relevance": 8,
                "depth": 7,
                "clarity": 9
            }},
            "needs_followup": true,
            "recommendations": ["Be more specific", "Add examples"]
        }}"""

        response = self.chat_session.send_message(analysis_prompt)
        token_usage = self.update_token_count(response)

        try:
            analysis_content = self._parse_analysis(response.text)
        except ValueError as e:
            print(f"Analysis parsing error: {str(e)}")
            analysis_content = {
                "scores": {"relevance": 5, "depth": 5, "clarity": 5},
                "needs_followup": True,
                "recommendations": ["Error parsing analysis - using default values"]
            }

        return Analysis(
            content=analysis_content,
            timestamp=datetime.now(),
            token_usage=token_usage
        )

    def run_discussion(self, max_turns: int = 5) -> List[Dict]:
        """Run up to *max_turns* question / answer / analysis rounds.

        The loop ends early when the question generator reports
        "SATISFIED". Each completed turn is appended to both
        ``discussion_thread`` and ``discussion_history`` so that
        ``export_results`` sees it without extra wiring.

        Args:
            max_turns: Upper bound on the number of turns.

        Returns:
            The entries of this run, each with "turn", "question", "answer",
            "analysis" and "token_usage" ({"total": ...}). Empty when no
            documents are loaded.
        """
        discussion = []

        if not self.documents:
            # Without documents the generator only yields an error text;
            # do not spend model calls on answering it.
            print("Error: No documents loaded")
            return discussion

        print("\n=== Starting Document Analysis Discussion ===")

        for turn_number in range(1, max_turns + 1):
            print(f"\n--- Turn {turn_number}/{max_turns} ---")

            question = self.generate_question()
            if question.content == self.SATISFIED_TOKEN:
                print("\nDiscussion complete - All points adequately addressed")
                break

            print(f"\nQuestion: {question.content}")

            answer = self.get_answer(question.content)
            print(f"\nAnswer: {answer.content}")

            # Store minimal context for next question
            self.current_context = answer.content[:500]  # Limit context size

            analysis = self.analyze_answer(question.content, answer.content)
            print("\nAnalysis:", json.dumps(analysis.content, indent=2))

            entry = {
                "turn": turn_number,
                "question": question.content,
                "answer": answer.content,
                "analysis": analysis.content,
                "token_usage": {
                    "total": (
                        question.token_usage["total_tokens"] +
                        answer.token_usage["total_tokens"] +
                        analysis.token_usage["total_tokens"]
                    )
                }
            }

            discussion.append(entry)
            self.discussion_thread.append(entry)
            self.discussion_history.append(entry)

            print(f"\nToken usage this turn: {entry['token_usage']['total']}")

        return discussion

    def export_results(self, filepath: Union[str, Path]) -> bool:
        """Write metadata, token totals and the discussion to a JSON file.

        Missing parent directories are created.

        Args:
            filepath: Destination file (string or Path).

        Returns:
            True on success; False when the file cannot be written or the
            data cannot be serialised (the error is printed).
        """
        try:
            output = {
                "metadata": {
                    "timestamp": datetime.now().isoformat(),
                    "document_count": len(self.documents),
                    "document_types": [doc.file_type for doc in self.documents]
                },
                "token_usage": {
                    "prompt_tokens": self.total_token_usage.prompt_tokens,
                    "completion_tokens": self.total_token_usage.completion_tokens,
                    "total_tokens": self.total_token_usage.total_tokens
                },
                "discussion": self.discussion_history
            }

            target = Path(filepath)
            target.parent.mkdir(parents=True, exist_ok=True)
            with open(target, 'w', encoding='utf-8') as f:
                json.dump(output, f, indent=2, ensure_ascii=False)

            print(f"\nResults exported to: {filepath}")
            return True

        except (OSError, TypeError, ValueError) as e:
            # OSError: filesystem problems; TypeError/ValueError: content
            # that json cannot serialise.
            print(f"Error exporting results: {str(e)}")
            return False

def get_documents_from_folder(folder_path: str) -> List[str]:
    """Collect the PDF and TXT files under a folder, recursively.

    The result is sorted so the document order (and therefore the order in
    which files are handed to the model) is reproducible across runs and
    file systems. Extension matching is case-insensitive.

    Args:
        folder_path: Directory to search.

    Returns:
        Paths of the matching files as strings; empty when the folder does
        not exist or holds no matching file.
    """
    supported_suffixes = {'.pdf', '.txt'}
    documents = []
    root = Path(folder_path)

    # A missing path or a plain file cannot be searched.
    if not root.is_dir():
        print(f"Folder not found: {root}")
        return documents

    matches = sorted(
        candidate
        for candidate in root.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in supported_suffixes
    )
    for match in matches:
        documents.append(str(match))
        print(f"Found document: {match.name}")

    print(f"\nTotal documents found: {len(documents)}")
    return documents

def main(
    documents_folder: str = "/kaggle/input/ai-etic-articles",
    max_turns: int = 5,
    output_dir: Union[str, Path] = "output",
) -> None:
    """Analyze every PDF/TXT file in a folder and export the discussion.

    Args:
        documents_folder: Folder searched (recursively) for documents.
        max_turns: Maximum number of discussion turns.
        output_dir: Directory that receives the timestamped JSON export;
            created when missing.
    """
    # Create output directory if it doesn't exist
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)

    # Initialize analyzer
    analyzer = InquisitAI()

    documents = get_documents_from_folder(documents_folder)
    if not documents:
        print("No PDF or TXT documents found in the specified folder")
        return

    if not analyzer.load_documents(documents):
        print("Analysis failed due to document loading error")
        return

    # Run analysis
    discussion = analyzer.run_discussion(max_turns=max_turns)
    analyzer.discussion_history = discussion

    # Export results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = output_dir / f"discussion_analysis_{timestamp}.json"
    analyzer.export_results(str(output_file))

    # Print final statistics
    print("\n=== Final Statistics ===")
    print(f"Documents analyzed: {len(analyzer.documents)}")
    print(f"Discussion turns: {len(discussion)}")
    print(f"Total tokens used: {analyzer.total_token_usage.total_tokens}")


if __name__ == "__main__":
    main()