In [None]:
## Yousaf Khaliq, Danlei Zhu, Haoyuan Wang
# We used ChatGPT and Gemini 2.5 Pro to help with the coding.

In [None]:
# @title Complete AI Research Assistant Code (RAG + TF-IDF Similarities)

# --- 1. Install Dependencies ---
!pip install -qU \
    arxiv \
    pypdf \
    langchain \
    -U langchain-community\
    openai \
    tiktoken \
    faiss-cpu \
    gradio \
    seaborn \
    matplotlib \
    scikit-learn \
    # aiohttp # Likely no longer needed as we won't re-download \
    # wordcloud # Not used in the provided similarity code snippet
    # azure.identity # Not used in the provided similarity code snippet

print("Dependencies installed successfully!")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.3/302.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.2/661.2 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.6/322.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00

In [None]:
# --- 2. Import Libraries and Set Up ---
import os
import shutil
import arxiv
import gradio as gr
from getpass import getpass
import datetime
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any # For type hinting from new code

# LangChain components
from langchain.document_loaders import PyPDFLoader # Keep for RAG indexing
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.schema import Document

# Imports from the new similarity code
from pypdf import PdfReader # For direct PDF reading for similarity text extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("Libraries imported.")

# --- OpenAI API Key Setup ---
# The key is read without echo and stored in the process environment, where the
# functions below look it up through os.environ.
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API Key (needed for RAG chat): ")

if os.environ.get("OPENAI_API_KEY"):
    print("OpenAI API Key set.")
else:
    # An empty entry is accepted; only the RAG chat depends on the key.
    print("⚠️ OpenAI API Key not provided. RAG Chat features will NOT work.")

# --- Constants and Directories ---
PDF_FOLDER = "arxiv_pdfs"          # downloaded paper PDFs
REF_FOLDER = "arxiv_references"    # generated reference text files
VECTORSTORE_PATH = "faiss_index"   # location reserved for the RAG index

# Create both working folders up front; existing folders are left untouched.
for _folder in (PDF_FOLDER, REF_FOLDER):
    os.makedirs(_folder, exist_ok=True)
print(f"Directories '{PDF_FOLDER}' and '{REF_FOLDER}' created/ensured.")

Libraries imported.
Enter your OpenAI API Key (needed for RAG chat): ··········
OpenAI API Key set.
Directories 'arxiv_pdfs' and 'arxiv_references' created/ensured.


In [None]:
# --- 3. Core Functions ---

# --- Helper Functions ---
def clean_filename(title: str) -> str:
    """Turn an arbitrary title into a filesystem-friendly name.

    Characters that are not allowed in filenames (backslash, slash, ``*``,
    ``?``, ``:``, double quote, ``<``, ``>``, ``|``) are dropped, every run of
    whitespace becomes a single underscore, and the result is cut to at most
    100 characters.
    """
    forbidden_chars = re.compile(r'[\\/*?:"<>|]')
    whitespace_run = re.compile(r'\s+')

    without_forbidden = forbidden_chars.sub("", title)
    underscored = whitespace_run.sub('_', without_forbidden)
    return underscored[:100]

def format_reference(result: arxiv.Result) -> str:
    """Build a one-line, human-readable reference for an arXiv search result.

    Args:
        result: The arXiv result to describe. Its ``authors``, ``published``,
            ``title`` and ``pdf_url`` attributes and ``get_short_id()`` are read.

    Returns:
        ``"<authors> (<year>). *<title>*. arXiv:<id>. Available at: <pdf url>"``.
        If any attribute cannot be read, the problem is printed and an
        ``"Error formatting reference for <entry id>"`` string is returned
        instead; this function does not raise.
    """
    try:
        authors = ", ".join(author.name for author in result.authors)
        year = result.published.year
        # Collapse every whitespace run (including hard line breaks) into one
        # space. Simply deleting "\n" would glue together the words on either
        # side of a line break that is not followed by indentation.
        title = " ".join(result.title.split())
        arxiv_id = result.get_short_id()
        pdf_link = result.pdf_url
        return f"{authors} ({year}). *{title}*. arXiv:{arxiv_id}. Available at: {pdf_link}"
    except Exception as e:
        # Look the id up defensively so the error path itself cannot raise.
        entry_id = getattr(result, "entry_id", "<unknown entry>")
        print(f"Error formatting reference for {entry_id}: {e}")
        return f"Error formatting reference for {entry_id}"

# --- Paper Fetching (Downloads PDFs needed by RAG & TF-IDF) ---
def fetch_papers(topic: str, sort_by: str, max_results: int) -> tuple[List[Dict[str, Any]], List[str], str | None, str | None]:
    """Search arXiv for ``topic``, download each hit as a PDF and record its reference.

    Every call starts from a clean slate: PDF_FOLDER, REF_FOLDER and
    VECTORSTORE_PATH are deleted, then the two download folders are recreated.

    Args:
        topic: Free-text arXiv query.
        sort_by: "Relevance" or "Date" (newest submissions first). Any other
            value leaves the arXiv library's default ordering in place.
        max_results: Maximum number of search results to request.

    Returns:
        ``(paper_details, references_list, pdf_zip_path, ref_zip_path)``.
        ``paper_details`` holds one dict per successfully downloaded paper
        (index, title, filename, filepath, arxiv_id, summary, url, doi,
        published, authors, and an empty ``clean_text`` placeholder).
        ``references_list`` holds one formatted reference per search result;
        failed downloads are prefixed with "[Download Failed]".
        The two zip paths point at archives of PDF_FOLDER and REF_FOLDER.
        When the search finds nothing or fails, ``([], [], None, None)`` is
        returned; errors are printed rather than raised.
    """
    print(f"Fetching papers for topic: '{topic}', sorting by: {sort_by}, max results: {max_results}")
    # --- Cleanup ---
    if os.path.exists(PDF_FOLDER): shutil.rmtree(PDF_FOLDER)
    if os.path.exists(REF_FOLDER): shutil.rmtree(REF_FOLDER)
    if os.path.exists(VECTORSTORE_PATH): shutil.rmtree(VECTORSTORE_PATH) # Clear RAG index
    os.makedirs(PDF_FOLDER, exist_ok=True)
    os.makedirs(REF_FOLDER, exist_ok=True)
    print("Cleared previous downloads, references, and RAG index.")

    try:
        # Configure arXiv search parameters.
        # int(): UI sliders may hand over a float, while a result count must be an integer.
        search_params = {"query": topic, "max_results": int(max_results)}
        if sort_by == "Relevance": search_params["sort_by"] = arxiv.SortCriterion.Relevance
        elif sort_by == "Date":
             search_params["sort_by"] = arxiv.SortCriterion.SubmittedDate
             search_params["sort_order"] = arxiv.SortOrder.Descending
        client = arxiv.Client() # Instantiate client
        search = arxiv.Search(**search_params)
        # Use client.results for current arxiv library version
        results = list(client.results(search)) # Execute search

        if not results:
            print("No papers found.")
            return [], [], None, None # Return empty lists and None for zip paths

        print(f"Found {len(results)} papers. Downloading...")
        # --- Process Results ---
        paper_details = []         # List to store detailed dict for each paper
        references_list = []       # List for the formatted references text file
        ref_filename = f"references_{clean_filename(topic)}_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.txt"
        ref_filepath = os.path.join(REF_FOLDER, ref_filename)

        with open(ref_filepath, "w", encoding="utf-8") as ref_file:
            for i, result in enumerate(results):
                progress = f"[{i+1}/{len(results)}]"
                try:
                    # Collapse all whitespace runs (including line breaks) to a
                    # single space; deleting "\n" outright would glue together
                    # the words on either side of an unindented line break.
                    title = " ".join(result.title.split())
                    filename_base = clean_filename(f"{result.get_short_id()}_{title}")
                    pdf_filename = f"{filename_base}.pdf"
                    pdf_filepath = os.path.join(PDF_FOLDER, pdf_filename) # Path (relative to the working directory) of the downloaded PDF

                    print(f"{progress} Downloading '{title}' to {pdf_filename}...")
                    result.download_pdf(dirpath=PDF_FOLDER, filename=pdf_filename) # Download the PDF

                    # Store detailed info needed by both RAG and TF-IDF parts
                    paper_info = {
                        "index": i + 1, # User-facing 1-based index (position in the search results)
                        "title": title,
                        "filename": pdf_filename, # Just the filename
                        "filepath": pdf_filepath, # Path used for loading
                        "arxiv_id": result.get_short_id(),
                        "summary": result.summary, # Needed for TF-IDF
                        # Keep other potentially useful metadata
                        "url": result.pdf_url,
                        "doi": result.doi,
                        "published": str(result.published),
                        "authors": [a.name for a in result.authors],
                        # Placeholder to be filled later
                        "clean_text": "", # For TF-IDF full text
                    }
                    paper_details.append(paper_info)

                    # Add formatted reference to list and file
                    ref_string = format_reference(result)
                    references_list.append(ref_string)
                    ref_file.write(ref_string + "\n\n")
                    print(f"{progress} Download complete.")

                except Exception as e: # Catch download/processing errors; keep going with the next paper
                    print(f"{progress} ⚠️ Failed to download/process paper {result.entry_id} ('{result.title}'): {type(e).__name__} - {e}")
                    # Attempt to add a placeholder reference even on failure
                    try:
                        ref_string = format_reference(result)
                        references_list.append(f"[Download Failed] {ref_string}")
                        ref_file.write(f"[Download Failed] {ref_string}\n\n")
                    except Exception as ref_e: print(f"Could not format reference for failed paper {result.entry_id}: {ref_e}")

        print(f"Downloads finished. References saved in '{ref_filepath}'.")
        # Create zip archives for download buttons
        pdf_zip_path = shutil.make_archive(PDF_FOLDER, 'zip', PDF_FOLDER)
        ref_zip_path = shutil.make_archive(REF_FOLDER, 'zip', REF_FOLDER)
        return paper_details, references_list, pdf_zip_path, ref_zip_path

    except Exception as e: # Catch broader errors during fetching
        print(f"❌ An error occurred during arXiv fetching: {e}")
        import traceback
        traceback.print_exc()
        return [], [], None, None


# --- RAG Core: Indexing Function (Uses OpenAI Embeddings) ---
def load_and_index_papers_for_rag(paper_details: List[Dict[str, Any]]) -> FAISS | None:
    """Build the FAISS index that backs the RAG chat.

    Each paper whose ``filepath`` exists on disk is loaded page by page with
    PyPDFLoader. Every page is tagged with ``source``, ``filename``, ``title``
    and ``paper_index`` metadata. The pages are split into overlapping chunks
    (1500 characters, 200 overlap) and embedded with OpenAIEmbeddings into a
    FAISS store.

    Returns the vector store, or None when the API key is missing, nothing
    could be loaded or split, or embedding/indexing raised. Progress and
    problems are reported with print().
    """
    if not os.environ.get("OPENAI_API_KEY"):
         print("⚠️ Cannot index papers for RAG: OpenAI API Key not set.")
         return None

    # Only papers whose PDF actually exists on disk can be indexed.
    downloaded = [entry for entry in paper_details if os.path.exists(entry['filepath'])]
    if not downloaded:
        print("⚠️ No valid PDF files found for RAG indexing based on fetched details.")
        return None

    print("Loading PDFs for RAG indexing (using Langchain PyPDFLoader)...")
    pages = []
    papers_loaded = 0
    for entry in downloaded:
        pdf_path = entry['filepath']
        paper_no = entry['index']
        print(f" -> RAG Indexing: Attempting to load Paper {paper_no} ('{entry['title'][:40]}...') from {entry['filename']}")
        try:
            loaded_pages = PyPDFLoader(pdf_path).load()
            if not loaded_pages:
                 print(f"    -> WARNING: No documents loaded by PyPDFLoader for Paper {paper_no}. PDF might be empty or unreadable by this loader.")
                 continue

            print(f"    -> Loaded {len(loaded_pages)} pages for Paper {paper_no}.")
            papers_loaded += 1
            for page_doc in loaded_pages:
                # Attribution metadata carried along with every chunk later on.
                page_doc.metadata.update({
                    'source': pdf_path,
                    'filename': entry['filename'],
                    'title': entry['title'],
                    'paper_index': paper_no,
                })
                # The loader is expected to supply 'page'; fall back to -1 if it did not.
                if 'page' not in page_doc.metadata:
                     print(f"   -> WARNING: 'page' metadata missing in loaded doc for {entry['filename']}")
                     page_doc.metadata['page'] = -1
            pages.extend(loaded_pages)
        except Exception as load_error:
            print(f"    -> ERROR loading Paper {paper_no} with PyPDFLoader: {type(load_error).__name__} - {load_error}. Skipping for RAG.")

    if not pages:
        print("❌ No documents could be loaded by Langchain loader for RAG indexing.")
        return None
    print(f"RAG Indexing: Successfully loaded pages from {papers_loaded}/{len(downloaded)} valid PDFs using PyPDFLoader.")

    print(f"RAG Indexing: Splitting text for {len(pages)} pages...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    chunks = splitter.split_documents(pages)

    if not chunks:
        print("⚠️ RAG Indexing: No text chunks generated after splitting. Check PDF text content and splitter settings.")
        return None

    # Diagnostic tally: how many chunks each paper contributed.
    chunk_counts = {}
    for piece in chunks:
        paper_key = piece.metadata.get('paper_index', 'Unknown')
        chunk_counts.setdefault(paper_key, 0)
        chunk_counts[paper_key] += 1
    print("RAG Indexing: Chunks created per paper index:")
    if chunk_counts:
         for paper_key, how_many in sorted(chunk_counts.items()):
              print(f"  - Paper {paper_key}: {how_many} chunks")
    else:
         print("  -> No chunks were associated with paper indices (check metadata addition).")

    print(f"RAG Indexing: Split into {len(chunks)} total chunks. Creating OpenAI embeddings...")
    try:
        embedder = OpenAIEmbeddings() # Picks the API key up from the environment
        print("RAG Indexing: Initializing FAISS vector store...")
        store = FAISS.from_documents(chunks, embedder)
        print(f"RAG Indexing: FAISS index created successfully with {store.index.ntotal} vectors.")
        return store
    except Exception as index_error:
        print(f"❌ RAG Indexing: Error creating OpenAI embeddings or FAISS index: {index_error}")
        return None

# --- RAG Core: LLM and Prompt Setup ---
def setup_llm_and_prompt() -> (ChatOpenAI | None, str | None):
    """Create the chat model and the question-answering prompt text.

    Returns a ``(llm, prompt_template_string)`` pair. The prompt is returned
    as a plain string containing ``{context}`` and ``{question}`` placeholders.
    ``(None, None)`` is returned when the OpenAI API key is missing or when
    constructing the model raises.
    """
    api_key_available = bool(os.environ.get("OPENAI_API_KEY"))
    if not api_key_available:
         print("⚠️ Cannot set up LLM: OpenAI API Key not set.")
         return None, None

    print("Setting up LLM (gpt-3.5-turbo) and QA Prompt Template String...")
    try:
        # Prompt text: answer strictly from the retrieved excerpts and attribute
        # statements to the paper they came from.
        qa_prompt = """You are an AI research assistant analyzing academic papers. Use the following pieces of context, which are excerpts from specific research papers identified by their metadata (title, page number), to answer the question accurately and concisely.

        **Instructions:**
        1.  Base your answer *exclusively* on the provided context below.
        2.  If the context does not contain the information needed to answer the question, explicitly state that the information is not available in the provided excerpts.
        3.  Do not introduce outside knowledge or make assumptions beyond the text given.
        4.  When asked to compare papers, clearly attribute points to the specific paper they came from, based *only* on the context provided for each. Use clear headings or bullet points for each paper. Double-check your attribution.
        5.  Structure your answer clearly. For summaries, provide the key points. For comparisons, highlight similarities and differences methodically.

        Context:
        ---------
        {context}
        ---------

        **Based strictly on the context above, answer the following question:**
        Question: {question}

        Answer:"""
        # A low temperature keeps answers close to the supplied context.
        chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1, streaming=False)
    except Exception as setup_error:
        print(f"❌ Error setting up LLM or Prompt String: {setup_error}")
        return None, None

    print("LLM and Prompt Template String setup complete.")
    return chat_model, qa_prompt


# --- TF-IDF Similarity Calculation Functions ---

def clean_extracted_text_for_similarity(text: str, min_line_length: int = 20) -> str:
    """Strip boilerplate from extracted PDF text before TF-IDF analysis.

    The noise patterns below (arXiv ids, DOIs, proceedings headers, page,
    figure and table markers, inline/LaTeX equations, copyright notices,
    URLs, percentages, and words of one or two characters) are removed
    case-insensitively. Each remaining line is stripped and kept only if it
    has at least ``min_line_length`` characters or mentions "references" or
    "bibliography". Finally, repeated whitespace is condensed.
    """
    noise_patterns = (
        r'arXiv:\d+\.\d+v\d+',
        r'DOI:\s?[\w./-]+',
        r'Proceedings of .*?\d{4}',
        r'Page\s\d+',
        r'Figure\s\d+[:.]?',
        r'Table\s\d+[:.]?',
        r'\$[^$]+\$',
        r'\\begin\{equation\}.*?\\end\{equation\}',
        r'©\d{4}\s(?:IEEE|ACM|Springer|Elsevier)',
        r'http[s]?://\S+',
        r'\b(?:figure|table|equation)\s*\d+\b',
        r'\b\d+%\b',
        r'\b\w{1,2}\b',  # very short words
    )
    regex_flags = re.IGNORECASE | re.MULTILINE | re.DOTALL

    # Patterns are applied one after another, in the order listed above.
    stripped_text = text
    for noise in noise_patterns:
        stripped_text = re.sub(noise, '', stripped_text, flags=regex_flags)

    section_keyword = re.compile(r'\b(references|bibliography)\b', re.IGNORECASE)
    kept_lines = [
        candidate
        for candidate in (raw_line.strip() for raw_line in stripped_text.split('\n'))
        if len(candidate) >= min_line_length or section_keyword.search(candidate)
    ]

    joined = '\n'.join(kept_lines)
    joined = re.sub(r'\n{3,}', '\n\n', joined)  # condense blank lines
    return re.sub(r'\s{2,}', ' ', joined)       # condense repeated whitespace

def extract_full_text_for_similarity(pdf_filepath: str) -> str:
    """Read a local PDF with pypdf and return its cleaned text for similarity analysis.

    Pages whose text cannot be extracted are reported and skipped. The page
    texts are joined with spaces and passed through
    clean_extracted_text_for_similarity. An empty string is returned when the
    file is missing, unreadable, yields no text, or has 50 words or fewer
    after cleaning.
    """
    if not os.path.exists(pdf_filepath):
        print(f"Error (TF-IDF Text Extraction): PDF file not found at {pdf_filepath}")
        return ""

    try:
        page_texts = []
        for page_no, page in enumerate(PdfReader(pdf_filepath).pages, start=1):
            try:
                extracted = page.extract_text()
            except Exception as page_e:
                # A single bad page must not discard the rest of the document.
                print(f"  - Warning (TF-IDF): Could not extract text from page {page_no} in {os.path.basename(pdf_filepath)}: {page_e}")
                continue
            if extracted:
                page_texts.append(extracted)

        if not page_texts:
            print(f"Warning (TF-IDF): No text extracted from PDF: {os.path.basename(pdf_filepath)}")
            return ""

        cleaned = clean_extracted_text_for_similarity(" ".join(page_texts))
        # Too little text after cleaning is treated the same as no text.
        if len(cleaned.split()) > 50:
            return cleaned
        return ""
    except Exception as read_error:
        print(f"Error (TF-IDF): Reading PDF {os.path.basename(pdf_filepath)}: {read_error}")
        return ""

def fit_tfidf_vectorizer(papers_with_text: List[Dict[str, Any]]) -> TfidfVectorizer | None:
    """Fit a TF-IDF vectorizer on the summary plus cleaned full text of each paper.

    Only papers with a non-empty ``clean_text`` contribute a document.

    Args:
        papers_with_text: Paper dicts carrying ``summary`` and ``clean_text``.

    Returns:
        The fitted vectorizer, or None when there is no text to fit on or
        fitting fails (for example, no terms survive the filters).
    """
    print("Fitting TF-IDF Vectorizer...")
    # Combine summary and cleaned full text for fitting the vocabulary
    all_texts = [p["summary"] + " " + p.get("clean_text", "") for p in papers_with_text if p.get("clean_text")]
    if not all_texts:
         print("Error (TF-IDF): No text available to fit vectorizer.")
         return None

    # Document-frequency filters must be satisfiable for the corpus size.
    # With 3 or fewer documents, max_df=0.6 allows a term in at most
    # 0.6 * 3 = 1.8 documents while min_df=2 demands at least 2. scikit-learn
    # rejects that ("max_df corresponds to < documents than min_df"), which
    # used to disable similarity for 2-3 papers. Tiny corpora are unfiltered.
    if len(all_texts) < 4:
        min_df, max_df = 1, 1.0
    else:
        min_df, max_df = 2, 0.6 # Filter very common/rare terms

    vectorizer = TfidfVectorizer(
        stop_words='english', max_df=max_df, min_df=min_df,
        sublinear_tf=True, norm='l2', ngram_range=(1, 2), # Use unigrams and bigrams
        token_pattern=r'(?u)\b[a-zA-Z]{3,}\b', max_features=10000 # Keep words >= 3 letters
    )
    try:
        vectorizer.fit(all_texts)
        feature_names = vectorizer.get_feature_names_out()
        print(f"TF-IDF Vocabulary size: {len(feature_names)}")
        return vectorizer
    except ValueError as ve: # Raised e.g. when no terms remain after filtering
         print(f"Error (TF-IDF): Fitting vectorizer failed, likely due to insufficient valid terms. {ve}")
         return None
    except Exception as e:
        print(f"Error (TF-IDF): Fitting vectorizer failed: {e}")
        return None

# --- REMOVED Reference Extraction/Processing Functions ---

def calculate_similarity_matrices(papers_with_text: List[Dict[str, Any]], vectorizer: TfidfVectorizer | None):
    """Compute the paper-to-paper cosine similarity matrix from TF-IDF vectors.

    Only papers with a non-empty ``clean_text`` take part; each one is
    represented by its summary followed by its cleaned full text.

    Returns ``(papers_used, matrix)``. Both are None when no vectorizer is
    given or fewer than two papers have text. If vectorizing or the
    similarity computation raises, the papers are still returned and the
    matrix is None.
    """
    print("Calculating TF-IDF Paper-to-Paper Similarity Matrix...")
    if not vectorizer:
        print("Error (TF-IDF): Vectorizer not available.")
        return None, None

    usable_papers = [paper for paper in papers_with_text if paper.get("clean_text")]
    if len(usable_papers) < 2:
        print("Warning (TF-IDF): Need at least 2 papers with extracted text for similarity calculation.")
        return None, None

    # One document per paper: summary + full text, in the same order as usable_papers.
    documents = [
        " ".join((paper["summary"], paper.get("clean_text", "")))
        for paper in usable_papers
    ]

    try:
        similarity = cosine_similarity(vectorizer.transform(documents))
        print(" -> Calculated Paper-to-Paper Similarity Matrix (TF-IDF).")
    except Exception as sim_error:
        print(f"Error calculating paper-to-paper TF-IDF similarity: {sim_error}")
        similarity = None

    return usable_papers, similarity


# --- Visualization Functions ---
def plot_similarity_heatmap(matrix, labels, title) -> plt.Figure | None:
    """Render a square similarity matrix as an annotated heatmap.

    Args:
        matrix: Square array (it must expose ``.shape``) with one row and
            column per label, or None.
        labels: One entry per paper. Only the count is used; the axes are
            labelled P1, P2, ... instead.
        title: Figure title, also used in log and placeholder messages.

    Returns:
        A Matplotlib figure. When the inputs are inconsistent or plotting
        fails, a placeholder figure carrying a short message is returned
        instead, so callers always receive something displayable. Every
        returned figure has already been closed in pyplot, which prevents
        inline display and stops figures accumulating across calls.
    """
    def _placeholder_figure(message, **text_kwargs):
        """Build a blank, already-closed figure that only shows ``message``."""
        placeholder_fig, placeholder_ax = plt.subplots()
        placeholder_ax.text(0.5, 0.5, message, ha='center', va='center',
                            transform=placeholder_ax.transAxes, **text_kwargs)
        placeholder_ax.set_xticks([])
        placeholder_ax.set_yticks([])
        # Release the figure from pyplot's registry; the object stays usable.
        plt.close(placeholder_fig)
        return placeholder_fig

    # Basic validation
    if matrix is None or labels is None or len(labels) == 0 or matrix.shape[0] != len(labels) or matrix.shape[1] != len(labels):
        print(f"⚠️ Cannot plot heatmap '{title}': Invalid data provided.")
        return _placeholder_figure(f'Data unavailable for\n{title}', fontsize=10, wrap=True)

    print(f"Generating heatmap: {title}")
    fig = None
    try:
        plt.style.use('seaborn-v0_8-darkgrid')
        # Grow the canvas with the number of papers, but never below 8x6.
        fig_width = max(8, len(labels) * 0.8)
        fig_height = max(6, len(labels) * 0.7)
        fig, ax = plt.subplots(figsize=(fig_width, fig_height))
        short_labels = [f"P{i+1}" for i in range(len(labels))] # Use P1, P2...
        annot = len(labels) <= 12 # Cell values become unreadable on larger grids
        sns.heatmap(matrix, annot=annot, fmt=".2f", cmap="YlGnBu", vmin=0, vmax=1, xticklabels=short_labels, yticklabels=short_labels, ax=ax, linewidths=0.5, linecolor='gray', annot_kws={"size": 9})
        ax.set_title(title, fontsize=14, pad=15)
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=9)
        plt.setp(ax.get_yticklabels(), rotation=0, fontsize=9)
        fig.tight_layout(pad=1.5)
        return fig
    except Exception as e:
        print(f"❌ Error generating heatmap '{title}': {e}")
        return _placeholder_figure(f'Error plotting\n{title}')
    finally:
        # Runs on success and on failure, so a half-built figure is closed too.
        if fig is not None:
            plt.close(fig)

# --- REMOVED plot_shared_reference_heatmap function ---


# --- 4. Gradio Web Application ---

# --- State Management ---
# Template for the per-session application state; every entry starts empty.
initial_state = dict(
    # RAG components
    vector_store=None,
    llm=None,
    prompt_template_string=None,
    # Data produced by fetching
    paper_details=[],
    references_list=[],
    pdf_zip_path=None,
    ref_zip_path=None,
)

# --- Utility function for parsing paper references in queries ---
def parse_paper_references(query: str, paper_details: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Work out which fetched papers a chat query refers to.

    Three strategies are tried:
      1. numeric references such as ``paper 3`` (matched against ``index``);
      2. quoted titles such as ``paper titled "X"`` (exact, case-insensitive);
      3. only if neither found anything: any paper whose full title occurs in
         the query, skipping titles that contain, or are contained in, the
         title of a paper already selected.

    Returns the matching paper dicts without duplicates, in the order they
    were identified. The outcome is also printed.
    """
    candidates: List[Dict[str, Any]] = []

    # 1) 'paper <number>'
    for number_text in re.findall(r'paper\s+(\d+)', query, re.IGNORECASE):
        try:
            wanted_index = int(number_text)
        except ValueError:
            continue
        for paper in paper_details:
            if paper['index'] == wanted_index:
                candidates.append(paper)
                break  # only the first paper carrying this index counts

    # 2) 'paper titled "..."'
    for quoted_title in re.findall(r'paper titled\s+["\']([^"\']+)["\']', query, re.IGNORECASE):
        wanted_title = quoted_title.strip().lower()
        for paper in paper_details:
            if paper['title'].strip().lower() == wanted_title:
                if paper not in candidates:
                    candidates.append(paper)
                break  # only the first title match is considered

    # 3) Fallback: a bare title mentioned anywhere in the query
    if not candidates:
        lowered_query = query.lower()
        for paper in paper_details:
            lowered_title = paper['title'].lower()
            if lowered_title not in lowered_query:
                continue
            overlaps_selected = False
            for chosen in candidates:
                chosen_title = chosen['title'].lower()
                if lowered_title in chosen_title or chosen_title in lowered_title:
                    overlaps_selected = True
                    break
            if not overlaps_selected and paper not in candidates:
                candidates.append(paper)

    # Drop repeats (e.g. the same index mentioned twice), keeping first-seen order.
    distinct: List[Dict[str, Any]] = []
    seen_indices = set()
    for paper in candidates:
        if paper['index'] in seen_indices:
            continue
        seen_indices.add(paper['index'])
        distinct.append(paper)

    if distinct:
        print(f"Parsed query, identified target paper(s): {[p['index'] for p in distinct]}")
    else:
        print("Parsed query, no specific paper references identified.")
    return distinct


# --- Gradio Interface Definition ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")) as demo:
    app_state = gr.State(initial_state) # Holds the application's state

    gr.Markdown("# 🤖 AI Research Assistant (RAG Chat + TF-IDF Similarity)") # Updated title
    gr.Markdown("Fetches research papers from arXiv, processes them for RAG chat and TF-IDF similarity analysis.")

    with gr.Row():
        # --- Left Column: Controls ---
        with gr.Column(scale=1, min_width=300):
            topic_input = gr.Textbox(label="Research Topic", placeholder="e.g., Diffusion Models for Image Generation")
            sort_by_input = gr.Radio(label="Sort Papers By", choices=["Relevance", "Date"], value="Relevance")
            max_results_input = gr.Slider(label="Number of Papers", minimum=1, maximum=25, value=5, step=1)
            fetch_button = gr.Button("🔍 Fetch & Process Papers", variant="primary")
            status_output = gr.Markdown(value="*Status: Waiting for input...*")

        # --- Right Column: Paper List & Downloads ---
        with gr.Column(scale=2):
            gr.Markdown("### Fetched Papers List")
            papers_output = gr.Textbox(label="Paper Index & Title (Scrollable)", lines=10, interactive=False, max_lines=15)
            with gr.Row():
                 download_pdfs_button = gr.DownloadButton(label="⬇️ Download PDFs (.zip)", visible=False, size="sm")
                 download_refs_button = gr.DownloadButton(label="⬇️ Download References (.zip)", visible=False, size="sm")

    gr.Markdown("---") # Separator

    # --- Similarity Visualizations Accordion (MODIFIED) ---
    with gr.Accordion("📊 Similarity Analysis (TF-IDF)", open=False): # Updated title
        gr.Markdown("Visualization based on TF-IDF analysis of paper text (summary + full text). Uses P1, P2... labels; cross-reference with the list above.")
        with gr.Row(): # Only one plot now
            tfidf_plot_output = gr.Plot(label="Paper-to-Paper Similarity (TF-IDF)")
            # --- REMOVED shared_ref_plot_output ---
        # similarity_legend_output = gr.Markdown(value="") # Placeholder for legend if needed

    gr.Markdown("---") # Separator

    # --- Chat Interface ---
    gr.Markdown("### 💬 Chat with the Papers (RAG - OpenAI Powered)")
    gr.Markdown("Ask questions about the papers using index (`summarize paper 1`) or title (`compare paper titled 'X'...`).")
    chatbot_output = gr.Chatbot(label="Conversation", height=450, bubble_full_width=False, avatar_images=None)
    with gr.Row():
        question_input = gr.Textbox(label="Your Question", placeholder="Ask about the papers...", scale=4, show_label=False)
        ask_button = gr.Button("💬 Ask", scale=1, variant="secondary")
        clear_chat_button = gr.Button("🧹 Clear", scale=1)

    # --- Event Handlers ---

    # 1. Fetch Button Click Logic (Orchestrates Fetching, RAG Indexing, TF-IDF Processing)
    def fetch_and_process_papers(topic: str, sort_by: str, max_results: int, current_app_state: Dict[str, Any]):
        """Handles fetching, RAG setup, TF-IDF processing, and plot generation."""
        if not topic.strip(): # Handle empty topic
            # Reset necessary UI elements
            return { status_output: gr.update(value="*Status: Please enter a research topic.*"), papers_output: gr.update(value=""), download_pdfs_button: gr.update(visible=False), download_refs_button: gr.update(visible=False), chatbot_output: gr.update(value=[]), tfidf_plot_output: gr.update(value=None), # shared_ref_plot_output removed
                     app_state: current_app_state }

        # Initial status update and disabling UI elements
        yield { status_output: gr.update(value="*Status: Fetching papers from arXiv...*"), fetch_button: gr.update(interactive=False), chatbot_output: gr.update(value=[]), tfidf_plot_output: gr.update(value=None), # shared_ref_plot_output removed
               }

        # --- Step 1: Fetch Papers & Basic Details ---
        new_state = initial_state.copy() # Reset state for new fetch
        paper_details, references_list, pdf_zip, ref_zip = fetch_papers(topic, sort_by, max_results)
        new_state["paper_details"] = paper_details # Includes placeholders
        new_state["references_list"] = references_list
        new_state["pdf_zip_path"] = pdf_zip
        new_state["ref_zip_path"] = ref_zip

        if not paper_details: # Handle case where no papers were found/downloaded
             yield { status_output: gr.update(value="*Status: No papers found or download failed.*"), papers_output: gr.update(value="No papers found."), download_pdfs_button: gr.update(visible=False), download_refs_button: gr.update(visible=False), fetch_button: gr.update(interactive=True), app_state: new_state }; return

        # --- Display Fetched Paper List ---
        papers_display_text = "\n".join([f"{p['index']}. {p['title']}" for p in paper_details])
        yield { status_output: gr.update(value="*Status: Papers fetched. Processing for RAG Chat...*"), papers_output: gr.update(value=papers_display_text), download_pdfs_button: gr.update(value=pdf_zip, visible=bool(pdf_zip)), download_refs_button: gr.update(value=ref_zip, visible=bool(ref_zip)), app_state: new_state }

        # --- Step 2: RAG Indexing (Uses OpenAI Embeddings) ---
        vector_store = load_and_index_papers_for_rag(paper_details)
        llm, prompt_template_str = None, None
        if vector_store:
            new_state["vector_store"] = vector_store
            yield { status_output: gr.update(value="*Status: RAG indexing complete. Setting up LLM...*"), app_state: new_state }
            llm, prompt_template_str = setup_llm_and_prompt()
            if llm and prompt_template_str:
                new_state["llm"] = llm; new_state["prompt_template_string"] = prompt_template_str
                yield { status_output: gr.update(value="*Status: RAG setup complete. Processing for TF-IDF Similarity...*"), app_state: new_state }
            else: yield { status_output: gr.update(value="*Status: RAG indexing OK, but LLM setup failed. Chat disabled.*"), app_state: new_state }
        else: yield { status_output: gr.update(value="*Status: RAG indexing failed. Chat disabled. Processing for TF-IDF Similarity...*"), app_state: new_state }

        # --- Step 3: TF-IDF Similarity Processing ---
        yield { status_output: gr.update(value="*Status: Extracting text & calculating TF-IDF similarity... (May take time)*") }
        processed_paper_details_for_tfidf = new_state["paper_details"] # Modify the list in state
        print("TF-IDF Phase: Extracting full text...")
        successful_extractions = 0
        for paper in processed_paper_details_for_tfidf: # Modify list in place
             if os.path.exists(paper['filepath']):
                  paper['clean_text'] = extract_full_text_for_similarity(paper['filepath'])
                  if paper['clean_text']: successful_extractions += 1
                  else: print(f"    -> Warning: No substantial text extracted for TF-IDF from Paper {paper['index']}")
             else: paper['clean_text'] = ""
        print(f"TF-IDF Phase: Successfully extracted text from {successful_extractions}/{len(processed_paper_details_for_tfidf)} papers.")

        # Fit vectorizer only if text was extracted
        tfidf_vectorizer = None
        if successful_extractions > 0:
             tfidf_vectorizer = fit_tfidf_vectorizer(processed_paper_details_for_tfidf)

        # Calculate matrix (MODIFIED Call)
        tfidf_papers, full_sim_matrix = None, None # Initialize
        if tfidf_vectorizer:
             # *** ADJUSTED CALL: Only expect 2 return values ***
             tfidf_papers, full_sim_matrix = calculate_similarity_matrices(
                 processed_paper_details_for_tfidf, tfidf_vectorizer
             )

        # Generate Plots (MODIFIED)
        tfidf_plot_fig = None
        # --- REMOVED shared_ref_plot_fig variable ---

        if tfidf_papers: # Check if calculate_similarity_matrices returned valid papers
             paper_titles_for_plot = [p['title'] for p in tfidf_papers] # Use titles from the returned list
             tfidf_plot_fig = plot_similarity_heatmap(full_sim_matrix, paper_titles_for_plot, "Paper-to-Paper Similarity (TF-IDF)")
             # --- REMOVED call to plot_shared_reference_heatmap ---
        else: print("Skipping TF-IDF plot generation as insufficient data was processed.")

        # --- Final Status Update (MODIFIED) ---
        final_status = "*Status: Ready! "
        if new_state.get("llm"): final_status += "RAG Chat active. "
        else: final_status += "RAG Chat disabled. "
        # *** ADJUSTED check ***
        if tfidf_plot_fig: final_status += "TF-IDF Similarity plot generated."
        else: final_status += "Similarity analysis skipped or failed."

        # MODIFIED final yield - removed shared_ref_plot_output
        yield {
            status_output: gr.update(value=final_status),
            tfidf_plot_output: gr.update(value=tfidf_plot_fig),
            fetch_button: gr.update(interactive=True), # Re-enable fetch button
            app_state: new_state # Store final state
        }

    # MODIFIED click handler outputs - removed shared_ref_plot_output
    # Register the fetch pipeline as the click handler. The outputs list names
    # every component the handler may update; the plot slot holds only the
    # TF-IDF heatmap.
    fetch_button.click(
        fn=fetch_and_process_papers,
        inputs=[topic_input, sort_by_input, max_results_input, app_state],
        outputs=[
            status_output,
            papers_output,
            download_pdfs_button,
            download_refs_button,
            chatbot_output,
            tfidf_plot_output,
            fetch_button,
            app_state,
        ],
    )


    # 2. Ask Button Click Logic (Chat Handling - UPDATED Question for Summaries)
    def handle_chat_message(question: str, history: List[List[str]], current_app_state: Dict[str, Any]):
        """Answer one chat question with RAG and stream chat-history updates.

        Generator used as a Gradio event handler. Every yield is the triple
        ``(history, history, current_app_state)`` because the chatbot component
        is registered twice in the handler's outputs.

        Steps:
          1. Ignore blank questions; if the vector store, LLM or prompt template
             string is missing from the state, append an explanatory message.
          2. Append a ``(question, None)`` placeholder row and yield it.
          3. Retrieve chunks, per referenced paper when
             ``parse_paper_references`` identifies targets, otherwise with a
             general similarity search on the question.
          4. Run a "stuff" QA chain over the de-duplicated chunks. A
             single-paper "paper N" summary request is rewritten into an
             explicit summary instruction that names the paper title.
          5. If the generated text does not look like a refusal, add a title
             prefix (index-based queries only) and a "Sources (RAG)" list.

        Args:
            question: Text typed by the user; ``None`` is treated as empty.
            history: Chat rows as (user, bot) pairs; ``None`` is treated as an
                empty history. The list is mutated in place.
            current_app_state: State dict read for the keys ``vector_store``,
                ``llm``, ``prompt_template_string`` and ``paper_details``.

        Yields:
            ``(history, history, current_app_state)`` after each visible update.
        """
        # Normalise inputs so the .strip() / .append() calls below cannot raise
        # when the UI hands over None instead of an empty value.
        question = question or ""
        history = history if history is not None else []

        vs = current_app_state.get("vector_store")
        llm = current_app_state.get("llm")
        prompt_template_str = current_app_state.get("prompt_template_string")
        paper_details = current_app_state.get("paper_details", [])

        # --- Pre-computation & RAG Readiness Checks ---
        if not question.strip():
            yield history, history, current_app_state
            return
        if not (vs and llm and prompt_template_str):
            missing = [
                label
                for label, component in (
                    ("Vector Store", vs),
                    ("LLM", llm),
                    ("QA Prompt Template", prompt_template_str),
                )
                if not component
            ]
            error_msg = f"⚠️ RAG system not ready. Missing: {', '.join(missing)}. Please fetch/process papers first."
            print(f"ERROR in handle_chat: {error_msg}")
            history.append((question, error_msg))
            yield history, history, current_app_state
            return

        # Placeholder row so the question shows up while the answer is computed.
        history.append((question, None))
        yield history, history, current_app_state

        # Query classification, computed once and reused for both the question
        # rewrite and the response prefix.
        is_index_query = bool(re.search(r'paper\s+\d+', question, re.IGNORECASE)) and not re.search(r'paper titled', question, re.IGNORECASE)
        is_summary_request = "summarize" in question.lower() or "summary" in question.lower()

        # --- Parse Query & Retrieve Documents for RAG ---
        target_paper_details = parse_paper_references(question, paper_details) or []
        retrieval_k = 5
        try:
            relevant_docs, retrieval_failures = _retrieve_rag_documents(vs, question, target_paper_details, retrieval_k)
        except Exception as e:
            print(f"❌ RAG: Error during retrieval: {e}")
            history[-1] = (question, f"⚠️ Retrieval error: {e}")
            yield history, history, current_app_state
            return
        # Keep every per-paper failure, not just the last one encountered.
        retrieval_failure_reason = "\n".join(retrieval_failures)
        retrieval_successful = bool(relevant_docs)

        # --- Generate Response using QA Chain ---
        llm_answer = retrieval_failure_reason if retrieval_failure_reason else "Sorry, RAG couldn't find relevant information for your query in the available document excerpts."
        is_llm_refusal = True  # Assume refusal until the LLM produces a usable answer
        answer_from_llm = False  # True once llm_answer holds actual LLM output
        source_docs_used = []

        if retrieval_successful:
            try:
                # Drop near-empty chunks and exact duplicate chunk texts.
                unique_docs = []
                seen_doc_content = set()
                for doc in relevant_docs:
                    content = doc.page_content
                    if content and len(content.strip()) > 10 and content not in seen_doc_content:
                        unique_docs.append(doc)
                        seen_doc_content.add(content)

                if unique_docs:
                    print(f"RAG: Running QA chain with {len(unique_docs)} unique chunks.")
                    qa_prompt = PromptTemplate(template=prompt_template_str, input_variables=["context", "question"])
                    # NOTE: the whitespace inside this literal is part of the
                    # prompt text sent to the LLM; it is kept unchanged.
                    document_prompt_template = """**Source: '{title}' - page {page}**
                    {page_content}"""
                    document_prompt = PromptTemplate(input_variables=["page_content", "title", "page"], template=document_prompt_template)

                    # For "summarize paper N" on exactly one paper, replace the
                    # question with an explicit summary instruction naming the title.
                    effective_question = question
                    if is_index_query and len(target_paper_details) == 1 and is_summary_request:
                        paper_title = target_paper_details[0]['title']
                        effective_question = f"Provide a concise summary of the key points, findings, and conclusions presented in the provided context excerpts from the paper titled '{paper_title}'."
                        print(f"RAG: Modified question for LLM: {effective_question}")

                    # Execute QA chain with the (possibly rewritten) question.
                    qa_chain = load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt, document_prompt=document_prompt, verbose=False)
                    result = qa_chain({"input_documents": unique_docs, "question": effective_question}, return_only_outputs=False)
                    generated_text = result['output_text'].strip()

                    is_llm_refusal = _looks_like_refusal(generated_text)
                    llm_answer = generated_text  # Store the LLM's actual output
                    answer_from_llm = True

                    if not is_llm_refusal:
                        # Sources are only kept for answers that are not refusals.
                        source_docs_used = result.get('input_documents', [])
                        print(f"RAG: LLM generated a valid answer.")
                    else:
                        print(f"RAG: LLM generated a refusal or very short answer: '{generated_text[:100]}...'")
                else:
                    print("RAG: No unique, non-empty documents after filtering.")
            except Exception as e:
                import traceback
                print(f"❌ RAG: Error running QA chain: {e}")
                traceback.print_exc()
                error_type = type(e).__name__
                history[-1] = (question, f"⚠️ Answer generation error: {error_type}")
                yield history, history, current_app_state
                return
        else:
            # llm_answer already holds the failure message; is_llm_refusal stays True.
            print(f"RAG: Retrieval did not find any relevant documents. Failure Reason: {retrieval_failure_reason if retrieval_failure_reason else 'N/A'}")

        # --- Add Title Prefix for Index-Based Queries (never on refusals) ---
        response_prefix = ""
        if is_index_query and target_paper_details and not is_llm_refusal:
            if len(target_paper_details) == 1:
                p = target_paper_details[0]
                # Only call the answer a summary when a summary was requested.
                label = "Summary for Paper" if is_summary_request else "Answer for Paper"
                response_prefix = f"**{label} {p['index']} ('{p['title']}')**: \n\n"
            else:
                titles_str = " vs ".join([f"Paper {p['index']} ('{p['title']}')" for p in target_paper_details])
                response_prefix = f"**Comparison of {titles_str}**: \n\n"
        final_answer_text = response_prefix + llm_answer

        # --- Source Attribution (Conditional on NOT being a refusal) ---
        if not is_llm_refusal and source_docs_used:
            final_answer_text += _format_rag_sources(source_docs_used)

        # Tell the user when some referenced papers contributed no chunks, so a
        # partial answer is not mistaken for one covering every paper.
        if answer_from_llm and retrieval_failures:
            final_answer_text += "\n\n⚠️ *Note:* " + " ".join(retrieval_failures)

        history[-1] = (question, final_answer_text)

        yield history, history, current_app_state

    def _retrieve_rag_documents(vs, question, target_paper_details, retrieval_k):
        """Collect chunks for the question; return ``(documents, failure_messages)``.

        With target papers, each paper is searched separately (filtered on the
        ``source`` metadata equal to the paper's ``filepath``), trying the paper
        title, then the question, then a generic query. A paper that yields no
        chunk adds one message to ``failure_messages``. Without targets, a
        single unfiltered search on the question is made with ``retrieval_k + 2``.
        """
        if not target_paper_details:
            print("RAG: Retrieving generally using question.")
            return vs.similarity_search(question, k=retrieval_k + 2), []

        print(f"RAG: Retrieving for: {[p['index'] for p in target_paper_details]}")
        collected_docs, failure_messages = [], []
        for details in target_paper_details:
            fpath = details['filepath']; paper_title = details['title']; paper_index = details['index']
            docs = []
            for query in (paper_title, question, "research paper content"):
                docs = vs.similarity_search(query=query, k=retrieval_k, filter={'source': fpath})
                if docs:
                    break
            if not docs:
                reason = f"Could not retrieve any text chunks for Paper {paper_index}. The PDF might be image-based or have failed processing during RAG indexing (check console logs)."
                print(f"    -> WARNING: {reason}")
                failure_messages.append(reason)
            collected_docs.extend(docs)
        return collected_docs, failure_messages

    def _looks_like_refusal(generated_text: str) -> bool:
        """Return True when the text contains a refusal phrase or is a short apology."""
        refusal_phrases = [
            "provided context does not contain", "information is not available",
            "couldn't find information", "do not have information",
            "cannot answer based on the context", "context provided does not mention",
            "based on the text provided", "relevant information was not found",
            "i cannot provide a summary", "unable to summarize", "does not seem to contain",
            "no specific information", "no details provided"
        ]
        lowered = generated_text.lower()
        if any(phrase in lowered for phrase in refusal_phrases):
            return True
        return len(generated_text.split()) < 15 and ("sorry" in lowered or "unable" in lowered)

    def _format_rag_sources(source_docs) -> str:
        """Build the "Sources (RAG)" suffix: one sorted, de-duplicated line per title/page."""
        cited_sources_info = []
        seen_source_keys = set()
        for doc in source_docs:
            meta = doc.metadata
            title = meta.get('title', 'Unknown')
            page_num_0_indexed = meta.get('page', -1)
            source_key = f"{title}_{page_num_0_indexed}"
            if source_key in seen_source_keys:
                continue
            seen_source_keys.add(source_key)
            # Only do arithmetic on real integer page numbers; anything else
            # (missing, -1, non-int) is cited without a page.
            has_page = isinstance(page_num_0_indexed, int) and page_num_0_indexed != -1
            page_str = f", page ~{page_num_0_indexed + 1}" if has_page else ""
            cited_sources_info.append(f"'{title}'{page_str}")
        if not cited_sources_info:
            return ""
        cited_sources_info.sort()
        return "\n\n*Sources (RAG):* \n" + "\n".join([f"- {s}" for s in cited_sources_info])

    # Connect Ask button and Enter key in textbox to the chat handler
    # Both triggers share one registration: run the chat handler, then blank
    # the question textbox once the handler has finished.
    for register_chat_trigger in (ask_button.click, question_input.submit):
        register_chat_trigger(
            handle_chat_message,
            inputs=[question_input, chatbot_output, app_state],
            outputs=[chatbot_output, chatbot_output, app_state],
        ).then(lambda: gr.update(value=""), outputs=[question_input])

    # 3. Clear Chat Button Logic
    def clear_chat_history():
        """Return a pair of empty lists, one per output slot of the clear button."""
        cleared_slots = ([], [])
        return cleared_slots

    clear_chat_button.click(
        fn=clear_chat_history,
        inputs=None,
        outputs=[chatbot_output, chatbot_output],
    )


# --- 5. Launch the Gradio App ---
print("\nLaunching Gradio App...")
# Launch options:
#   debug=True -> keeps the cell running so errors and logs stay visible in the notebook
#   share=True -> serves the app on a public URL (used when running in Colab)
launch_options = {"debug": True, "share": True}
# queue() turns on request queuing before the server is started.
demo.queue().launch(**launch_options)

  chatbot_output = gr.Chatbot(label="Conversation", height=450, bubble_full_width=False, avatar_images=None)
  chatbot_output = gr.Chatbot(label="Conversation", height=450, bubble_full_width=False, avatar_images=None)



Launching Gradio App...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d3c6791eb52b8b82a5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  state[block._id] = block.__class__(**kwargs)


Fetching papers for topic: 'AI research assistant', sorting by: Relevance, max results: 25
Cleared previous downloads, references, and RAG index.
Found 25 papers. Downloading...
[1/25] Downloading 'Decoding AI's Nudge: A Unified Framework to Predict Human Behavior in AI-assisted Decision Making' to 2401.05840v1_Decoding_AI's_Nudge_A_Unified_Framework_to_Predict_Human_Behavior_in_AI-assisted_Decisi.pdf...
[1/25] Download complete.
[2/25] Downloading 'AI-Empowered Human Research Integrating Brain Science and Social Sciences Insights' to 2411.12761v1_AI-Empowered_Human_Research_Integrating_Brain_Science_and_Social_Sciences_Insights.pdf...
[2/25] Download complete.
[3/25] Downloading 'AI empowering research: 10 ways how science can benefit from AI' to 2307.10265v1_AI_empowering_research_10_ways_how_science_can_benefit_from_AI.pdf...
[3/25] Download complete.
[4/25] Downloading 'Human-AI Collaborative Game Testing with Vision Language Models' to 2501.11782v1_Human-AI_Collaborative_Game_Test

  state[block._id] = block.__class__(**kwargs)


TF-IDF Phase: Successfully extracted text from 25/25 papers.
Fitting TF-IDF Vectorizer...
TF-IDF Vocabulary size: 9368
Calculating TF-IDF Paper-to-Paper Similarity Matrix...
 -> Calculated Paper-to-Paper Similarity Matrix (TF-IDF).
Generating heatmap: Paper-to-Paper Similarity (TF-IDF)


  state[block._id] = block.__class__(**kwargs)


Parsed query, identified target paper(s): [1]
RAG: Retrieving for: [1]
RAG: Running QA chain with 5 unique chunks.
RAG: Modified question for LLM: Provide a concise summary of the key points, findings, and conclusions presented in the provided context excerpts from the paper titled 'Decoding AI's Nudge: A Unified Framework to Predict Human Behavior in AI-assisted Decision Making'.
RAG: LLM generated a valid answer.
Parsed query, identified target paper(s): [10]
RAG: Retrieving for: [10]
RAG: Running QA chain with 5 unique chunks.
RAG: Modified question for LLM: Provide a concise summary of the key points, findings, and conclusions presented in the provided context excerpts from the paper titled 'Collective Attention in Human-AI Teams'.
RAG: LLM generated a valid answer.
Parsed query, identified target paper(s): [25]
RAG: Retrieving for: [25]
RAG: Running QA chain with 5 unique chunks.
RAG: Modified question for LLM: Provide a concise summary of the key points, findings, and conclusions

