In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Authenticate before the downloads in the next cell; the header above notes
# that some of the data sources are private.
# NOTE(review): presumably prompts for Kaggle credentials — confirm against kagglehub docs.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the three data sources this notebook uses: a competition, a dataset and
# a model. Each call's return value is kept in a *_path variable.
# NOTE(review): presumably these are local directory paths — confirm against kagglehub docs.
gemini_long_context_path = kagglehub.competition_download('gemini-long-context')
kane0068_pdf_files_path = kagglehub.dataset_download('kane0068/pdf-files')
google_gemini_1_5_flash_api_api_gemini_1_5_flash_1_path = kagglehub.model_download('google/gemini-1.5-flash-api/Api/gemini-1.5-flash/1')

print('Data source import complete.')


This notebook implements a `ResearchAssistant` class that analyzes academic papers with Google's Gemini API. Its main features are:

# 🔬 Advanced Paper Processing Capabilities:

- Extracts detailed information from academic PDFs
- Parses metadata, abstracts, and full content
- Extracts keywords and references automatically
- Handles errors robustly and logs every step

# 🧠 AI-Powered Analysis Features:

- Uses Gemini AI to identify:
  - Connections between papers
  - Research gaps
  - Future research directions
  - Key findings
- Advanced text similarity calculations
- Topic clustering
- Citation network analysis

# 📊 Visualization and Reporting:

- Generates comprehensive markdown reports
- Creates multiple visualizations:
  - Paper similarity heatmaps
  - Interactive citation network graphs
  - Topic distribution charts
  - Methodology distribution pie charts



# 🔒 Smart Rate Limiting:

- Implements request rate limiting for the Gemini API
- Prevents overwhelming the API with too many requests

# 🔍 Key Technical Highlights:

- Uses NLP techniques such as TF-IDF vectorization
- Implements network analysis with NetworkX
- Supports batch processing of multiple research papers
- Offers flexible configuration of cache directories and logging

In essence, this is a powerful, AI-enhanced research assistant that can automatically analyze, connect, and visualize insights from academic literature, making complex research synthesis much more efficient and insightful.

In [None]:
pip install PyPDF2

In [None]:
# Importing Libraries

import os
import json
import PyPDF2
import google.generativeai as genai
from typing import List, Dict, Any, Optional
import numpy as np
from datetime import datetime
import logging
from pathlib import Path
import re
from functools import wraps
from collections import defaultdict
import pandas as pd
from dataclasses import dataclass, asdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import time
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns


# Interactive plotting
import plotly.graph_objects as go

In [None]:
# Define DataTypes
@dataclass
class Paper:
    """A single academic paper together with the text extracted from it.

    The optional list fields default to ``None``; callers in this file treat
    a falsy value as "nothing extracted" (``paper.keywords or []``,
    ``if paper.references``), so ``None`` and ``[]`` are interchangeable.
    """

    title: str
    authors: List[str]
    # None when no publication year could be located in the document.
    year: Optional[int]
    abstract: str
    content: str
    filepath: str
    references: Optional[List[str]] = None
    citations: Optional[List[str]] = None
    keywords: Optional[List[str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the paper as a plain dictionary (``dataclasses.asdict``)."""
        return asdict(self)

@dataclass
class Analysis:
    """Aggregated results of analysing a set of papers.

    ``token_counts`` and ``total_tokens`` are declared here (with defaults, so
    existing constructor calls keep working) because ``analyze_papers``
    assigns them after construction and ``generate_report`` reads them;
    declaring them makes them visible to ``asdict``/``repr``.
    """

    connections: List[Dict]
    research_gaps: List[str]
    future_directions: List[str]
    key_findings: List[str]
    methodology_analysis: Dict
    topic_clusters: Dict
    citation_network: Dict
    similarity_scores: Dict
    # Paper title -> token count; None until token statistics are attached.
    token_counts: Optional[Dict] = None
    # Sum of the per-paper token counts.
    total_tokens: int = 0

In [None]:
#For Gemini Request Control

class RateLimiter:
    """Decorator that allows at most ``max_requests`` calls per ``time_period`` seconds.

    The limiter keeps a sliding window of call timestamps in
    ``request_times``. When the window is full, the wrapped call blocks
    (``time.sleep``) until the oldest entry has expired. One limiter instance
    shares its window across every call of the function it decorates.
    """

    def __init__(self, max_requests, time_period):
        """
        Parameters:
        - max_requests: maximum number of calls allowed inside one window
        - time_period: window length in seconds
        """
        self.max_requests = max_requests
        self.time_period = time_period
        self.request_times = []

    def _prune(self, now):
        # Keep only the timestamps that are still inside the window.
        self.request_times = [
            t for t in self.request_times if now - t < self.time_period
        ]

    def __call__(self, func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # A monotonic clock is used so that system clock adjustments
            # cannot shorten or lengthen the window.
            now = time.monotonic()
            self._prune(now)

            while len(self.request_times) >= self.max_requests:
                wait_time = self.time_period - (now - self.request_times[0])
                if wait_time > 0:
                    time.sleep(wait_time)
                # Re-read the clock after sleeping: the call is recorded at
                # the moment it really runs, and the expired entry is
                # dropped, so the window neither drifts nor grows.
                now = time.monotonic()
                self._prune(now)

            self.request_times.append(now)
            return func(*args, **kwargs)
        return wrapper





class ResearchAssistant:
    def __init__(self, api_key: str, cache_dir: str = "./cache"):
        """Set up logging, the cache directory, the Gemini client and the TF-IDF vectorizer.

        Parameters:
        - api_key: Google Generative AI API key passed to ``genai.configure``
        - cache_dir: directory created (including missing parents) for caching
        """
        self.setup_logging()
        self.logger = logging.getLogger(__name__)

        # Cache config; parents=True so nested paths such as "out/run1/cache" work.
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Google API config
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name='gemini-1.5-flash-latest')
        self.chat = self.model.start_chat()

        # Analysis tools.
        # max_df stays at 1.0: a fractional cut-off such as 0.95 removes every
        # term that occurs in *all* documents whenever fewer than 20 papers
        # are analysed, which makes the similarity of two papers always 0 and
        # makes a single paper fail with "max_df corresponds to < documents
        # than min_df". IDF weighting already down-weights ubiquitous terms.
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            min_df=1,  # Include terms that appear in at least 1 document
            max_df=1.0,  # Do not drop terms for being too frequent (see above)
            token_pattern=r'(?u)\b\w+\b',  # Match any word character
            strip_accents='unicode'
        )
        # Reuse the model created above instead of building a second one.
        self.tokenizer = self.model.count_tokens
        self.logger.info('Research Assistant initialized successfully')


    def count_tokens_in_text(self, text: str) -> int:
        ##Count tokens in given text using Gemini's tokenizer
        try:
            result = self.tokenizer(text)
            return result.total_tokens
        except Exception as e:
            self.logger.error(f"Error counting tokens: {str(e)}")
            return 0

    def setup_logging(self):
        """Configure root logging at DEBUG level with a log file and a console handler."""
        log_handlers = [
            logging.FileHandler('research_assistant_debug.log'),
            logging.StreamHandler(),
        ]
        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        logging.basicConfig(
            level=logging.DEBUG,
            format=log_format,
            handlers=log_handlers,
        )




    def extract_text_from_pdf(self,filepath : str) -> Optional[Dict[str, str]]:

        try:
            self.logger.debug(f"Starting PDF extraction from {filepath}")

            if not os.path.exists(filepath):
                self.logger.error(f'File Not Found : {filepath}')
                return None

            with open(filepath,'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                if not pdf_reader.pages:
                    self.logger.error("PDF has no page")
                    return None

                # Extract Metadata
                metadata = {}

                if pdf_reader.metadata:
                    metadata = {
                        'title' : pdf_reader.metadata.get('/Title',''),
                        'author': pdf_reader.metadata.get('/Author',''),
                        'subject': pdf_reader.metadata.get('/Subject',''),
                        'keywords' : pdf_reader.metadata.get('/Keywords','')
                    }
                else:
                    self.logger.warning('No metadata found in PDF')
                # EXtract Text
                text = ""
                abstract = ""

                for i ,page in enumerate(pdf_reader.pages):
                    try:
                        content = page.extract_text()
                        self.logger.debug(f"Page {i+1} extracted,length :{len(content)}")
                        text += content

                    #Abstract from the first page
                        if i == 0:
                            abstract_match = re.search(r'Abstract\s*(.*?)(?=\n\n|\n[A-Z]{2,})', content, re.DOTALL)
                            if abstract_match:
                                abstract = abstract_match.group(1).strip()
                                self.logger.debug(f"Abstract found , length :{len(abstract)}")

                            else:
                                self.logger.warning('No abstract pattern found on first page')
                    except Exception as e:
                        self.logger.error(f"Error extracting text from page {i+1}: {str(e)}")
                result =  {
                    'metadata' :metadata,
                    'abstract' : abstract,
                    'content' : text
                }

                self.logger.info(f"PDF extraction completed. "
                               f"Metadata keys: {list(metadata.keys())}, "
                               f"Abstract length: {len(abstract)}, "
                               f"Content length: {len(text)}")

                return result

        except PyPDF2.PdfReadError as e:
            self.logger.error(f"PDF reading error: {str(e)}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error in PDF extraction: {str(e)}")
            return None



    def test_gemini_connection(self):
        try:
            response = self.model.generate_content("Please respond with 'Connection successful' if you receive this message.")
            print(f"Gemini response: {response.text}")
            return True
        except Exception as e:
            print(f"Gemini connection failed: {str(e)}")
            return False

    def parse_paper(self, filepath: str) -> Optional[Paper]:
        """Build a ``Paper`` from the PDF at ``filepath``.

        Title, authors and subject come from the PDF metadata; the year is
        searched in the subject or, when the subject is empty, in the full
        text. Returns ``None`` when extraction fails or any error occurs
        (the error is logged with its traceback).
        """
        try:
            self.logger.info(f"Starting to parse paper: {filepath}")

            extracted_data = self.extract_text_from_pdf(filepath)
            if not extracted_data:
                self.logger.error("No data extracted from PDF")
                return None

            # Log the extracted data
            self.logger.debug(f"Extracted metadata: {extracted_data['metadata']}")
            self.logger.debug(f"Abstract length: {len(extracted_data['abstract'])}")
            self.logger.debug(f"Content length: {len(extracted_data['content'])}")

            # Validate and clean metadata. Values are coerced to str so a null
            # metadata entry does not abort the whole paper.
            metadata = extracted_data['metadata']
            title = str(metadata.get('title') or '').strip()
            author_field = str(metadata.get('author') or '')
            authors = [author.strip() for author in author_field.split(',') if author.strip()]
            subject = str(metadata.get('subject') or '').strip()

            # Year extraction. The specific patterns come first: the generic
            # one matches everything they match, so listing it first would
            # make them unreachable. The generic pattern refuses to match
            # inside a longer run of digits (e.g. an identifier).
            year = None
            year_patterns = [
                r"©\s*20[0-2][0-9]",  # Copyright year
                r"Published.*20[0-2][0-9]",  # Publication year
                r"(?<!\d)20[0-2][0-9](?!\d)",  # Any standalone year
            ]
            search_text = subject or extracted_data['content']

            for pattern in year_patterns:
                year_match = re.search(pattern, search_text)
                if year_match:
                    year_str = re.search(r"20[0-2][0-9]", year_match.group()).group()
                    year = int(year_str)
                    self.logger.debug(f"Year found: {year} using pattern: {pattern}")
                    break

            if year is None:
                self.logger.warning("No year found in document")

            # Log validation results
            self.logger.info(f"Parsed data - Title: {bool(title)}, "
                           f"Authors: {len(authors)}, "
                           f"Year: {year}")

            return Paper(
                title=title or "Untitled",
                authors=authors or ["Unknown"],
                year=year,
                abstract=extracted_data['abstract'],
                content=extracted_data['content'],
                filepath=filepath,
                references=None,
                keywords=None
            )

        except Exception as e:
            self.logger.error(f"Error parsing paper: {str(e)}", exc_info=True)
            return None


    def extract_references(self, content: str) -> List[str]:
        """Return the non-empty lines that follow the first "References" in ``content``.

        The section ends at the first blank line (or at the end of the text).
        An empty list is returned when "References" does not occur.
        """
        section = re.search(r"References\s*(.*?)(?=\n\n|\Z)", content, re.DOTALL)
        if not section:
            return []

        stripped_lines = (entry.strip() for entry in section.group(1).split('\n'))
        return [entry for entry in stripped_lines if entry]

    def extract_keywords(self, content: str) -> List[str]:
        """Return the keywords listed after a "Keywords" / "Key words" / "Index Terms" label.

        The labels are tried in that order (case-insensitively); the first one
        found wins. Its text, up to the next blank line, is split on commas
        and semicolons. Errors are logged and yield an empty list.
        """
        found = []
        label_patterns = (
            r'Keywords:?\s*(.*?)(?=\n\n|\Z)',
            r'Key\s+words:?\s*(.*?)(?=\n\n|\Z)',
            r'Index\s+Terms:?\s*(.*?)(?=\n\n|\Z)',
        )
        try:
            for label_pattern in label_patterns:
                hit = re.search(label_pattern, content, re.DOTALL | re.IGNORECASE)
                if not hit:
                    continue
                # Split by the common separators and drop empty pieces.
                pieces = (piece.strip() for piece in re.split(r'[;,]', hit.group(1)))
                found = [piece for piece in pieces if piece]
                break
        except Exception as e:
            self.logger.error(f"Error extracting keywords: {str(e)}")

        return found



    def analyze_papers(self, papers: List[Paper]) -> Analysis:
        """Run every analysis step over ``papers`` and bundle the results.

        Token counts are gathered first, then the Gemini analysis, similarity
        scores, topic clusters, citation network and methodology analysis are
        computed in that order. Per-paper and total token counts are attached
        to the returned ``Analysis``. Any error is logged and re-raised.
        """
        try:
            # Count tokens before analysis
            token_counts = {}
            counted = []

            for paper in papers:
                # Combine all text content for token counting
                paper_text = f"""
                Title: {paper.title}
                Abstract: {paper.abstract}
                Authors: {', '.join(paper.authors)}
                Year: {paper.year}
                Keywords: {', '.join(paper.keywords or [])}
                Content: {paper.content}
                """

                paper_tokens = self.count_tokens_in_text(paper_text)
                token_counts[paper.title] = paper_tokens
                counted.append(paper_tokens)

            total_tokens = sum(counted)

            self.logger.info(f"Total tokens processed: {total_tokens}")
            self.logger.info("Token counts per paper:")
            for title in token_counts:
                self.logger.info(f"- {title}: {token_counts[title]} tokens")

            # Individual analysis steps
            gemini_analysis = self._analyze_with_gemini(papers)
            structural = {
                'similarity_scores': self._calculate_similarity_scores(papers),
                'topic_clusters': self._cluster_topics(papers),
                'citation_network': self._analyze_citation_network(papers),
                'methodology_analysis': self._analyze_methodologies(papers),
            }

            analysis = Analysis(
                connections=gemini_analysis['connections'],
                research_gaps=gemini_analysis['research_gaps'],
                future_directions=gemini_analysis['future_directions'],
                key_findings=gemini_analysis['key_findings'],
                **structural,
            )

            # Attach the token statistics to the analysis
            analysis.token_counts = token_counts
            analysis.total_tokens = total_tokens

            return analysis

        except Exception as e:
            self.logger.error(f"Error in Papers Analysis: {str(e)}")
            raise

    # Per-minute limits: Gemini 1.5 Flash ~10-15 requests, Gemini 1.5 Pro ~2 requests, Gemini 1.5 8B ~10-15
    @RateLimiter(max_requests=15, time_period=60)
    def _analyze_with_gemini(self, papers: List[Paper]) -> Dict:
        """Ask Gemini to analyse ``papers`` and parse its reply into sections.

        The reply is read line by line. A line containing "key findings",
        "research gaps", "future ... direction" or "connection" switches the
        current section and is itself discarded; every other non-empty line
        is added to the current section.

        Returns a dict with the keys ``'connections'`` (list of dicts with
        ``papers``, ``description`` and a fixed ``strength`` of 0.8),
        ``'research_gaps'``, ``'future_directions'`` and ``'key_findings'``
        (lists of strings). API errors are logged and re-raised.
        """
        try:
            self.logger.info("Attempting to connect to Gemini...")
            prompt = self._construct_analysis_prompt(papers)
            self.logger.info("Sending prompt to Gemini...")
            response = self.chat.send_message(prompt)
            self.logger.info(f"Received response from Gemini: {response.text[:100]}...")
        except Exception as e:
            self.logger.error(f"Gemini API error: {str(e)}")
            raise

        analysis = {
            'connections': [],
            'research_gaps': [],
            'future_directions': [],
            'key_findings': []
        }

        # Flexible header matching
        current_section = None
        for raw_line in response.text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            # Detect section headers by keyword rather than by exact title
            lower_line = line.lower()
            if "key findings" in lower_line:
                current_section = 'key_findings'
                continue
            elif "research gaps" in lower_line:
                current_section = 'research_gaps'
                continue
            elif "future" in lower_line and "direction" in lower_line:
                current_section = 'future_directions'
                continue
            elif "connection" in lower_line:
                current_section = 'connections'
                continue

            # Remove leading list markers ("* ", "- ", also nested). This has
            # to run for every content line; it must not sit inside one of
            # the header branches above, which all end in `continue`. Only a
            # marker followed by whitespace is removed, so a leading "**bold**"
            # is left intact.
            line = re.sub(r'^(?:[*\-]\s+)+', '', line)

            # Add content to relevant section
            if current_section and line:
                if current_section == 'connections':
                    connection = {
                        'papers': [p.title for p in papers],
                        'description': line,
                        'strength': 0.8
                    }
                    analysis[current_section].append(connection)
                else:
                    # Remove bold markers
                    line = line.replace('**', '')
                    if line and not line.endswith(':'):  # Skip heading lines
                        analysis[current_section].append(line)

        return analysis

    def _construct_analysis_prompt(self,papers : List[Paper]) -> str:
        # Creating an analysis prompts for the Gemini -            !!!Can be İmproved
        prompt = """
        Analyze for the following academic papers and provide:
        1.Connections between papers(including methodological and theoretical links)
        2.Research gaps in the field
        3.Promising future research directions
        4.Key findings and their implications

        Papers to analyze:
        """

        for paper in papers:
            prompt += f"\nTitle: {paper.title}\n"
            prompt += f"Abstract: {paper.abstract}\n"
            prompt += f"Authors: {', '.join(paper.authors)}\n"
            prompt += f"Year: {paper.year}\n"
            prompt += f"Keywords: {', '.join(paper.keywords or [])}\n"
            prompt += f"Content excerpt: {paper.content[:2000]}...\n\n"

        return prompt



    def _calculate_similarity_scores(self, papers: List[Paper]) -> Dict:

       ## Calculate similarity scores between papers with improved text processing and error handling.

        try:
            # Preprocess texts
            texts = []
            for paper in papers:
                # Combine title, abstract, and content for better similarity calculation
                text = f"{paper.title} {paper.abstract} {paper.content}"
                # Basic text cleaning
                text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces
                text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
                text = text.lower().strip()
                texts.append(text)

            if not texts:
                self.logger.error("No valid texts found for similarity calculation")
                return {}

            # Check if texts contain only stop words or are empty
            non_empty_texts = [text for text in texts if text.strip()]
            if not non_empty_texts:
                self.logger.warning("All texts are empty after preprocessing")
                return self._create_empty_similarity_matrix(papers)

            try:
                # Calculate TF-IDF
                tfidf_matrix = self.vectorizer.fit_transform(texts)

                # Check if we have any features
                if tfidf_matrix.shape[1] == 0:
                    self.logger.warning("No features extracted from texts")
                    return self._create_empty_similarity_matrix(papers)

                # Calculate cosine similarity
                similarity_matrix = cosine_similarity(tfidf_matrix)

                # Store results
                similarity_scores = {}
                for i, paper1 in enumerate(papers):
                    similarity_scores[paper1.title] = {}
                    for j, paper2 in enumerate(papers):
                        if i != j:
                            similarity_scores[paper1.title][paper2.title] = float(similarity_matrix[i][j])

                return similarity_scores

            except ValueError as ve:
                self.logger.error(f"Vectorization error: {str(ve)}")
                return self._create_empty_similarity_matrix(papers)

        except Exception as e:
            self.logger.error(f"Error in similarity calculation: {str(e)}")
            return self._create_empty_similarity_matrix(papers)

    def _create_empty_similarity_matrix(self, papers: List[Paper]) -> Dict:

        #Create an empty similarity matrix when similarity calculation fails.

        similarity_scores = {}
        for paper1 in papers:
            similarity_scores[paper1.title] = {}
            for paper2 in papers:
                if paper1.title != paper2.title:
                    similarity_scores[paper1.title][paper2.title] = 0.0
        return similarity_scores

    def _cluster_topics(self,papers :List[Paper] ) -> Dict:
        # Clustiring topics in article
        all_keywords = []
        for paper in papers:
            if paper.keywords:
                all_keywords.extend(paper.keywords)

        # Frequence Analysis
        keyword_freq = defaultdict(int)
        for keyword in all_keywords:
            keyword_freq[keyword] += 1

        #Clustering Topics
        clusters = defaultdict(list)
        for paper in papers:
            if paper.keywords:
                main_keyword = max(paper.keywords , key = lambda k : keyword_freq[k])
                clusters[main_keyword].append(paper.title)

        return dict(clusters)

    def _analyze_citation_network(self, papers: List[Paper]) -> Dict:

        #Analyze citation network with improved error handling and correct graph metrics calculation.

        try:
            G = nx.DiGraph()

            # Adding Nodes
            for paper in papers:
                G.add_node(paper.title)

            # Adding Edges
            for paper in papers:
                if paper.references:
                    for ref in paper.references:
                        for other_paper in papers:
                            # More robust reference matching
                            if (ref.lower() in other_paper.title.lower() or
                                other_paper.title.lower() in ref.lower()):
                                G.add_edge(paper.title, other_paper.title)

            # Calculate network metrics
            metrics = {
                'centrality': nx.degree_centrality(G),
                'pagerank': nx.pagerank(G),
                'node_count': G.number_of_nodes(),
                'edge_count': G.number_of_edges()
            }

            return metrics

        except Exception as e:
            self.logger.error(f"Error in citation network analysis: {str(e)}")
            # Return empty metrics if analysis fails
            return {
                'centrality': {},
                'pagerank': {},
                'node_count': 0,
                'edge_count': 0
            }



    def _analyze_methodologies(self, papers: List[Paper]) -> Dict:

        methodologies = defaultdict(int)
        datasets = defaultdict(int)

        # Expanded keyword patterns
        methodology_patterns = [
        r'(?:method|approach|technique|algorithm|methodology)s?\s*(?::|is|are|was|were)\s*([^.]*)',
        r'(?:we|authors)\s+(?:use|used|employ|employed|apply|applied)\s+([^.]*)',
        r'(?:proposed|developed|implemented)\s+(?:approach|method|technique)\s+([^.]*)'
        ]

        dataset_patterns = [
        r'(?:dataset|database|corpus|data\s+set)s?\s*(?::|is|are|was|were)\s*([^.]*)',
        r'(?:data|samples)\s+(?:were|was|is|are)\s+(?:collected|gathered|obtained)\s+(?:from)?\s*([^.]*)',
        r'(?:we|authors)\s+(?:use|used|collect|collected)\s+data\s+(?:from)?\s*([^.]*)'
        ]

        for paper in papers:
            # Find methodologies
            for pattern in methodology_patterns:
                matches = re.finditer(pattern, paper.content, re.IGNORECASE)
                for match in matches:
                    if match.group(1).strip():
                        methodologies[match.group(1).strip()] += 1

            # Find datasets
            for pattern in dataset_patterns:
                matches = re.finditer(pattern, paper.content, re.IGNORECASE)
                for match in matches:
                    if match.group(1).strip():
                        datasets[match.group(1).strip()] += 1

        return {
        'methodologies': dict(methodologies),
        'datasets': dict(datasets)
        }


    def generate_report(self,analysis : Analysis , output_dir : str = "./reports"):

        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok = True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = output_dir / f"research_analysis_{timestamp}.md"

        # Markdown report create
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write("# Research Analysis Report\n\n")

            f.write("## Token Statistics\n\n")
            f.write(f"Total tokens processed: {analysis.total_tokens:,}\n\n")
            f.write("### Tokens per Paper\n")
            for title, count in analysis.token_counts.items():
                f.write(f"- {title}: {count:,} tokens\n")
            f.write("\n")

            # BConnections
            f.write("## Paper Connections\n\n")
            for conn in analysis.connections:
                f.write(f"- **Papers**: {' & '.join(conn['papers'])}\n")
                f.write(f"  - {conn['description']}\n")
                f.write(f"  - Strength: {conn['strength']:.2f}\n\n")

            # Research gaps
            f.write("## Research Gaps\n\n")
            for gap in analysis.research_gaps:
                f.write(f"- {gap}\n")

            # Future directions
            f.write("\n## Future Research Directions\n\n")
            for direction in analysis.future_directions:
                f.write(f"- {direction}\n")

            f.write("\n## Key Findings\n\n")
            for finding in analysis.key_findings:
                f.write(f"- {finding}\n")

            # Sİmilarity Scores
            f.write("\n## Paper Similarities\n\n")
            f.write("| Paper 1 | Paper 2 | Similarity Score |\n")
            f.write("|---------|----------|------------------|\n")
            for paper1, scores in analysis.similarity_scores.items():
                for paper2, score in scores.items():
                    f.write(f"| {paper1} | {paper2} | {score:.3f} |\n")

            # Topic clusters
            f.write("\n## Topic Clusters\n\n")
            for topic, papers in analysis.topic_clusters.items():
                f.write(f"### {topic}\n")
                for paper in papers:
                    f.write(f"- {paper}\n")

            # Metodoloji analyss
            f.write("\n## Methodology Analysis\n\n")
            f.write("### Common Methodologies\n")
            for method, count in analysis.methodology_analysis['methodologies'].items():
                f.write(f"- {method}: {count} occurrences\n")

            f.write("\n### Datasets Used\n")
            for dataset, count in analysis.methodology_analysis['datasets'].items():
                f.write(f"- {dataset}: {count} occurrences\n")

            # Citation network metrics
            f.write("\n## Citation Network Analysis\n\n")
            f.write("### Network Metrics\n")
            f.write(f"- Number of nodes: {analysis.citation_network['node_count']}\n")
            f.write(f"- Number of edges: {analysis.citation_network['edge_count']}\n")

            f.write("\n### PageRank Scores\n")
            for paper, score in analysis.citation_network['pagerank'].items():
                f.write(f"- {paper}: {score:.3f}\n")

            f.write("\n### Centrality Scores\n")
            for paper, score in analysis.citation_network['centrality'].items():
                f.write(f"- {paper}: {score:.3f}\n")

        # Create visualizations
        self._generate_visualizations(analysis, output_dir, timestamp)

        self.logger.info(f"Report generated successfully: {report_file}")
        return report_file




    def _generate_visualizations(self, analysis: Analysis, output_dir: Path, timestamp: str):
        """
        Parameters:
        - analysis: Analysis object containing all analysis results
        - output_dir: Path to output directory
        - timestamp: Timestamp string for unique filenames
        """
        try:
            # visualizations subdirectory
            vis_dir = output_dir / 'visualizations'
            vis_dir.mkdir(exist_ok=True)

            # Common figure settings for larger visualizations
            plt.rcParams.update({
                'font.size': 8,  # Smaller base font size
                'figure.figsize': (16, 12),  # Larger default figure size
                'figure.dpi': 300  # Higher resolution
            })

            # 1. Similarity Heatmap
            similarity_matrix = []
            paper_titles = list(analysis.similarity_scores.keys())

            # Group papers by topic for better organization
            topic_groups = defaultdict(list)
            for paper in paper_titles:
                topic = next((t for t, papers in analysis.topic_clusters.items()
                             if paper in papers), 'Other')
                topic_groups[topic].append(paper)

            # Sort papers by topic
            sorted_papers = []
            for topic in sorted(topic_groups.keys()):
                sorted_papers.extend(sorted(topic_groups[topic]))

            for paper1 in sorted_papers:
                row = []
                for paper2 in sorted_papers:
                    if paper1 == paper2:
                        row.append(1.0)
                    else:
                        row.append(analysis.similarity_scores[paper1].get(paper2, 0))
                similarity_matrix.append(row)

            plt.figure(figsize=(20, 16))
            sns.heatmap(similarity_matrix,
                        xticklabels=sorted_papers,
                        yticklabels=sorted_papers,
                        cmap='YlOrRd',
                        annot = True,
                        square=True)
            plt.xticks(rotation=45, ha='right', fontsize=6)
            plt.yticks(fontsize=6)
            plt.title('Paper Similarity Heatmap (Grouped by Topic)', pad=20)

            # Add topic separators and labels
            current_idx = 0
            for topic, papers in topic_groups.items():
                if current_idx > 0:
                    plt.axhline(y=current_idx, color='white', linewidth=2)
                    plt.axvline(x=current_idx, color='white', linewidth=2)
                current_idx += len(papers)

            plt.tight_layout()
            plt.savefig(vis_dir / f'similarity_heatmap_{timestamp}.png',
                        bbox_inches='tight')
            plt.close()

            # 2. Enhanced Citation Network Graph
            G = nx.DiGraph()

            # Filter edges based on configurable threshold
            similarity_threshold = 0.2 #0.3  # Can be made configurable
            for paper1, scores in analysis.similarity_scores.items():
                G.add_node(paper1)
                for paper2, score in scores.items():
                    if score > similarity_threshold:
                        G.add_node(paper2)
                        G.add_edge(paper1, paper2, weight=score)

            #   interactive network visualization using plotly
            pos = nx.spring_layout(G, k=1/np.sqrt(len(G.nodes())), iterations=50)

            edge_trace = go.Scatter(
                x=[], y=[], line=dict(width=2, color='#888'),
                hoverinfo='none', mode='lines')

            node_trace = go.Scatter(
                x=[], y=[], text=[], mode='markers+text',
                hoverinfo='text', textposition='bottom center',
                marker=dict(
                    showscale=True,
                    colorscale='YlGnBu',
                    size=20,
                    colorbar=dict(
                        thickness=15,
                        title='Node Connections',
                        xanchor='left',
                        titleside='right'
                    )
                )
            )

            # Add edges to trace
            for edge in G.edges():
                x0, y0 = pos[edge[0]]
                x1, y1 = pos[edge[1]]
                edge_trace['x'] += tuple([x0, x1, None])
                edge_trace['y'] += tuple([y0, y1, None])

            # Add nodes to trace
            for node in G.nodes():
                x, y = pos[node]
                node_trace['x'] += tuple([x])
                node_trace['y'] += tuple([y])
                node_trace['text'] += tuple([node])

            # Create figure
            fig = go.Figure(data=[edge_trace, node_trace],
                         layout=go.Layout(
                             title='Interactive Paper Citation Network',
                             showlegend=False,
                             hovermode='closest',
                             margin=dict(b=0, l=0, r=0, t=40),
                             annotations=[dict(
                                 text="",
                                 showarrow=False,
                                 xref="paper", yref="paper"
                             )],
                             xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                             yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
                         ))

            fig.write_html(vis_dir / f'interactive_citation_network_{timestamp}.html')

            # 3. Topic Distribution
            topic_counts = {topic: len(papers) for topic, papers in analysis.topic_clusters.items()}

            # Sort topics by count for better visualization
            sorted_topics = dict(sorted(topic_counts.items(), key=lambda x: x[1], reverse=True))

            plt.figure(figsize=(16, 8))
            bars = plt.bar(range(len(sorted_topics)), sorted_topics.values())
            plt.xticks(range(len(sorted_topics)), sorted_topics.keys(),
                       rotation=45, ha='right', fontsize=8)

            # Add value labels on top of bars
            for bar in bars:
                height = bar.get_height()
                plt.text(bar.get_x() + bar.get_width()/2., height,
                        f'{int(height)}',
                        ha='center', va='bottom')

            plt.title('Distribution of Papers Across Topics')
            plt.tight_layout()
            plt.savefig(vis_dir / f'topic_distribution_{timestamp}.png')
            plt.close()

            # 4. Methodology Pie Chart with Grouping
            methodology_counts = analysis.methodology_analysis['methodologies']
            if methodology_counts:
                # Group small slices into "Other" category
                threshold = 0.05  # 5% threshold for grouping
                total = sum(methodology_counts.values())
                grouped_methods = {}
                other_sum = 0

                for method, count in methodology_counts.items():
                    if count/total < threshold:
                        other_sum += count
                    else:
                        grouped_methods[method] = count

                if other_sum > 0:
                    grouped_methods['Other'] = other_sum

                plt.figure(figsize=(20, 16))
                plt.pie(grouped_methods.values(),
                        labels=grouped_methods.keys(),
                        autopct='%1.1f%%',
                        textprops={'fontsize': 6})
                plt.title('Methodology Distribution (Methods <5% Grouped as Other)')
                plt.tight_layout()
                plt.savefig(vis_dir / f'methodology_distribution_{timestamp}.png')
                plt.close()

            self.logger.info(f"Enhanced visualizations generated successfully in {vis_dir}")

        except Exception as e:
            self.logger.error(f"Error generating visualizations: {str(e)}")
            raise


    def batch_process_papers(self, directory: str) -> List[Paper]:
        """Parse every ``*.pdf`` file found directly inside ``directory``.

        Files are visited in sorted path order so that repeated runs over the
        same directory return the papers in the same sequence; directory
        listing order on its own is not guaranteed. Keywords and references
        are extracted from the paper content only when parsing left them
        empty.

        A file that cannot be parsed, or that raises while being processed,
        is logged and skipped so that one bad PDF does not abort the batch.

        Args:
            directory: Path of the directory to scan (not recursive).

        Returns:
            The successfully parsed papers; an empty list when the directory
            does not exist or holds no matching files.
        """
        directory_path = Path(directory)

        # A mistyped path used to yield an empty batch without any trace;
        # report it in the log while still returning an empty list.
        if not directory_path.is_dir():
            self.logger.warning(f"PDF directory not found: {directory_path}")
            return []

        papers = []
        for pdf_file in sorted(directory_path.glob("*.pdf")):
            try:
                paper = self.parse_paper(str(pdf_file))
                if not paper:
                    self.logger.warning(f"Could not parse {pdf_file}")
                    continue

                # Fill in only what the parser left empty.
                if not paper.keywords:
                    paper.keywords = self.extract_keywords(paper.content)
                if not paper.references:
                    paper.references = self.extract_references(paper.content)

                papers.append(paper)
                self.logger.info(f"Successfully processed {pdf_file}")
            except Exception as e:
                # Best effort by design: record the failure and move on.
                self.logger.error(f"Error processing {pdf_file}: {str(e)}")

        return papers

    def save_analysis(self, analysis: Analysis, filepath: str):
        """Write the analysis results to ``filepath`` as UTF-8 encoded JSON.

        The payload is serialized in memory before the file is opened, so a
        serialization error cannot truncate an existing file. The parent
        directory of ``filepath`` is created when it does not exist yet.

        Args:
            analysis: Analysis whose result fields are written out.
            filepath: Destination of the JSON file; overwritten if present.

        Raises:
            Exception: Any error raised while reading the fields, serializing,
                creating the directory or writing is logged and re-raised.
        """
        # NOTE(review): other code in this file also reads
        # ``analysis.total_tokens`` and ``analysis.token_counts``; they are not
        # persisted here -- confirm whether they should be.
        try:
            analysis_dict = {
                'connections': analysis.connections,
                'research_gaps': analysis.research_gaps,
                'future_directions': analysis.future_directions,
                'key_findings': analysis.key_findings,
                'methodology_analysis': analysis.methodology_analysis,
                'topic_clusters': analysis.topic_clusters,
                'citation_network': analysis.citation_network,
                'similarity_scores': analysis.similarity_scores,
            }

            # Serialize first: opening with 'w' truncates the target, so doing
            # this afterwards would destroy a previous save on failure.
            payload = json.dumps(analysis_dict, indent=2, ensure_ascii=False)

            target = Path(filepath)
            target.parent.mkdir(parents=True, exist_ok=True)

            with open(target, 'w', encoding='utf-8') as f:
                f.write(payload)

            self.logger.info(f"Analysis saved successfully to {filepath}")
        except Exception as e:
            self.logger.error(f"Error saving analysis: {str(e)}")
            raise

    def load_analysis(self, filepath: str) -> Analysis:
        """Rebuild an ``Analysis`` from a JSON file at ``filepath``.

        The file is read as UTF-8 and must decode to a mapping that contains
        every key listed in ``field_names``; each value is passed to the
        ``Analysis`` constructor under the same name.

        Args:
            filepath: Location of the JSON file to read.

        Returns:
            The reconstructed ``Analysis``.

        Raises:
            Exception: Any error raised while reading, decoding or building
                the object is logged and re-raised.
        """
        # Constructor arguments, looked up in the stored data in this order.
        field_names = (
            'connections',
            'research_gaps',
            'future_directions',
            'key_findings',
            'methodology_analysis',
            'topic_clusters',
            'citation_network',
            'similarity_scores',
        )

        try:
            with open(filepath, 'r', encoding='utf-8') as source:
                stored = json.load(source)

            restored = Analysis(**{name: stored[name] for name in field_names})

            self.logger.info(f"Analysis loaded successfully from {filepath}")
            return restored
        except Exception as exc:
            self.logger.error(f"Error loading analysis: {str(exc)}")
            raise

In [None]:
# Example: running the Research Assistant end to end on Kaggle.
from kaggle_secrets import UserSecretsClient
# 1. Create the Research Assistant, reading the Gemini API key from Kaggle secrets.

user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("GEMINI_API_KEY")

assistant = ResearchAssistant(
    api_key=api_key,
    cache_dir="./cache",

)

# 2. Batch-process the PDF files.
pdf_directory = "/kaggle/input/pdf-files/pdf"  # directory containing the PDFs
papers = assistant.batch_process_papers(pdf_directory)

# 3. Analyze the parsed papers.
analysis = assistant.analyze_papers(papers)

# 4. Generate the report; the returned value is kept in report_path.
report_path = assistant.generate_report(
    analysis=analysis,
    output_dir="/kaggle/working/reports"  # directory where the reports are saved
)

# 5. Save the analysis as JSON.
assistant.save_analysis(
    analysis=analysis,
    filepath="/kaggle/working/analysis_results.json"
)

# 6. (Optional) Load a previously saved analysis:
# loaded_analysis = assistant.load_analysis("./analysis_results.json")

# (Optional) Process a single PDF file:
# single_paper = assistant.parse_paper("/kaggle/input/example-article/1751-0473-7-7.pdf")
# if single_paper:
#     print(f"Title: {single_paper.title}")
#     print(f"Authors: {','.join(single_paper.authors)}")
#     print(f"Year: {single_paper.year}")
#     print(f"Keywords: {', '.join(single_paper.keywords)}")

In [None]:
# Print the total token count stored on the analysis produced in the previous
# cell, followed by one line per entry of its token_counts mapping
# (presumably keyed by paper title -- confirm against analyze_papers).
print(f"Total tokens processed: {analysis.total_tokens}")
for title, count in analysis.token_counts.items():
    print(f"{title}: {count} tokens")