In [13]:
from PyPDF2 import PdfReader
from collections import Counter
import re
import os
import pandas as pd

def extract_sections(pdf_path, sections_of_interest):
    """
    Gather the text that follows selected section headings in a PDF.

    A line is treated as a heading only when, after stripping surrounding
    whitespace, it is exactly equal to one of ``sections_of_interest``.
    Every later non-blank line is appended (prefixed with a single space)
    to the most recently seen heading, and the active heading carries over
    from one page to the next. Lines before the first heading are dropped.

    Returns a dict mapping each requested section name to its accumulated
    text; a section whose heading never appears maps to "".
    """
    reader = PdfReader(pdf_path)
    collected = dict.fromkeys(sections_of_interest, "")

    active_heading = None
    for page in reader.pages:
        raw_text = page.extract_text()
        if not raw_text:
            # Nothing extractable on this page.
            continue

        # Round-trip through UTF-8, dropping anything that cannot be encoded.
        cleaned_text = raw_text.encode("utf-8", errors="ignore").decode("utf-8")
        for raw_line in cleaned_text.split("\n"):
            stripped = raw_line.strip()
            if stripped in sections_of_interest:
                active_heading = stripped
                continue
            if active_heading and stripped:
                collected[active_heading] += " " + stripped

    return collected


def count_keywords(section_texts, keywords, keyword_list):
    """
    Tally keyword-variation hits, sentence by sentence, across sections.

    ``keywords`` is a list of groups; the first entry of each group is the
    primary term under which hits are reported, and every entry of the
    group (the primary term included) is searched for as a
    case-insensitive substring. Each (sentence, variation) match adds one
    to the primary term's count and records the stripped sentence, so a
    sentence matching several variations is counted and stored once per
    matching variation.

    ``keyword_list`` is accepted for interface compatibility and is not
    used here.

    Returns ``(keyword_counts, keyword_sentences)``, both keyed by primary
    term.
    """
    sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'

    primary_terms = [group[0] for group in keywords]
    keyword_counts = dict.fromkeys(primary_terms, 0)
    keyword_sentences = {term: [] for term in primary_terms}

    for text in section_texts.values():
        sentences = re.split(sentence_boundary, text)
        for group in keywords:
            primary = group[0]
            for sentence in sentences:
                lowered_sentence = sentence.lower()
                for variation in group:
                    if variation.lower() not in lowered_sentence:
                        continue
                    keyword_counts[primary] += 1
                    keyword_sentences[primary].append(sentence.strip())

    return keyword_counts, keyword_sentences

def process_papers(folder_path, sections_of_interest, keywords):
    """
    Process all PDF papers in a folder and compile keyword counts.

    Each file in ``folder_path`` whose name ends in ".pdf" (compared
    case-insensitively, so "paper.PDF" is included) is passed through
    ``extract_sections`` and ``count_keywords``. Files are visited in
    sorted filename order so that the row order of the result and the
    order of collected sentences are reproducible between runs.

    Parameters:
        folder_path: directory containing the PDF files.
        sections_of_interest: section headings forwarded to
            ``extract_sections``.
        keywords: list of keyword groups; the first entry of each group
            is the primary term and becomes a column name.

    Returns:
        (df, sentence_results) where ``df`` is a DataFrame with a
        "Paper Name" column followed by one count column per primary
        term (one row per PDF), and ``sentence_results`` maps each
        primary term to the matching sentences gathered from all papers.
    """
    keyword_list = [group[0] for group in keywords]
    sentence_results = {term: [] for term in keyword_list}
    results = []

    # os.listdir yields entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(folder_path, filename)
        section_texts = extract_sections(pdf_path, sections_of_interest)
        keyword_counts, keyword_sentences = count_keywords(section_texts, keywords, keyword_list)

        results.append([filename] + [keyword_counts.get(term, 0) for term in keyword_list])
        for term, sentences in keyword_sentences.items():
            sentence_results[term].extend(sentences)

    df = pd.DataFrame(results, columns=["Paper Name"] + keyword_list)

    return df, sentence_results

if __name__ == "__main__":
    # Section headings whose text is scanned for keywords.
    sections_of_interest = ["Abstract", "Introduction", "Discussion", "Conclusion"]

    # Keyword groups. The first entry of each group is the primary term: it
    # names the output column and is itself searched for. The remaining
    # entries are additional variations counted under that primary term.
    # Trailing "disabled variation" notes list terms that are deliberately
    # left out of the active search.
    keywords = [
        ["Novelty", "novel", "innovative", "groundbreaking"],
        ["Simplicity", "minimalistic", "concise", "parsimonious", "lightweight"],
        ["Generalization", "generalisable", "generalizable", "transferability", "out-of-distribution", "domain adaptation"],
        ["Flexibility/Extensibility", "flexible", "flexibility", "extensibility", "extensible", "adaptable", "modular", "scalable"],
        ["Robustness", "resilient", "fault-tolerant", "noise tolerance"],  # disabled variation: "robust"
        ["Realistic output", "authentic", "plausible"],  # disabled variation: "realistic"
        ["Formal description/analysis", "formal", "mathematical", "rigorous", "analytical", "axiomatic", "proof-based"],
        ["Theoretical guarantees", "guarantee", "provable", "convergence proof", "theoretical bound", "performance bound"],
        ["Approximation", "approximation theory"],  # disabled variations: "estimation", "approximate"
        ["Quantitative evidence (e.g. experiments)", "quantitative", "numerical results", "empirical study", "measurable"],
        ["Qualitative evidence (e.g. examples)", "case study", "illustrative"],  # disabled variations: "qualitative", "descriptive"
        ["Scientific methodology", "hypothesis-driven", "scientific"],  # disabled variation: "empirical"
        ["Controllability (of model owner)", "governability", "ownership", "model steering"],
        ["Human-like mechanism", "biologically inspired", "cognitive"],
        ["Low cost", "cheap", "cost", "affordable", "resource-efficient", "budget-friendly"],
        ["Large scale", "scalability", "big data", "high-capacity", "massive-scale"],
        ["Promising"],  # disabled variation: "breakthrough"
        ["Generality", "broad applicability", "domain-independent", "versatile"],
        ["Principled", "theoretically sound", "axiomatic", "methodologically rigorous"],
        ["Exactness", "error-free"],  # disabled variation: "exact"
        ["Preciseness", "high-fidelity"],  # disabled variation: "precise"
        ["Concreteness", "grounded", "verifiable"],  # disabled variation: "correct"
        ["Automatic", "self-operating", "hands-free"],  # disabled variation: "autonomous"
        ["Performance"],  # disabled variations: "benchmark", "optimization"
        ["Accuracy", "precision", "recall", "F1-score", "error rate", "reliability"],
        ["Avoiding train/test discrepancy", "train/test", "discrepancy", "distribution shift", "generalization gap"],
        ["State-of-the-art", "SOTA", "best performing", "cutting-edge", "latest"],
        ["Efficiency", "efficient"],
        ["Reduced training time", "training time", "fast training", "speed-up", "low latency"],
        ["Memory efficiency", "memory-efficient", "low memory footprint", "RAM optimization"],
        ["Data efficiency", "data-efficient", "few-shot", "self-supervised", "low data regime"],
        ["Label efficiency (reduced need for labeled data)", "label-efficient", "semi-supervised", "weak supervision"],
        ["Energy efficiency", "energy-efficient", "low power", "green AI", "sustainable AI"],
        ["Effectiveness"],  # disabled variation: "effective"
        ["Successful"],  # disabled variation: "validated"
        ["Building on classic work", "classic work", "foundational", "historical perspective"],
        ["Building on recent work", "recent work", "latest advancements", "current research"],
        ["Unifying ideas or integrating components", "unifying", "integrative", "synergistic", "compositional"],
        ["Identifying limitations", "limitations", "weaknesses", "failure modes"],  # disabled variation: "constraints"
        ["Critique", "criticism", "critical review"],
        ["Understanding (for researchers)", "understanding", "conceptual clarity"],  # disabled variation: "interpretation"
        ["Improvement"],
        ["Progress"],
        ["Used in practice/Popular", "used in practice", "popular", "adopted", "real-world usage"],
        ["Reproducibility", "reproduce", "replication", "repeatability", "consistent results"],
        ["Easy to implement", "simple to use", "straightforward"],
        ["Requires few resources", "resources", "low-resource", "minimal requirements"],
        ["Parallelizability / distributed", "parallelizability", "parallelization", "distributed"],
        ["Facilitating use (e.g. sharing code)", "sharing code", "open-source"],  # disabled variation: "collaborative"
        ["Scales up", "scale up", "expands", "large-scale deployment"],
        ["Applies to real world", "real world", "practical application", "real-world relevance"],
        ["Learning from humans", "human learning", "human-in-the-loop", "interactive learning"],
        ["Practical", "applied AI"],  # disabled variation: "practice"
        ["Useful"],  # disabled variations: "beneficial", "usefulness"
        ["Interpretable (to users)", "interpretable", "explainable"],
        ["Transparent (to users)", "transparent", "transparency", "accountability"],
        ["Privacy", "privacy", "private", "confidentiality", "data protection"],
        ["Fairness", "equitable", "bias mitigation"],  # disabled variation: "unbiased"
        ["Not socially biased", "social bias", "socially biased", "fairness-aware", "bias-free", "equitable AI"],
        ["User influence", "user impact", "user effect", "human influence", "user control", "user agency"],
        ["Collective influence", "collective", "group influence", "crowd dynamics", "social influence", "peer effects"],
        ["Deferral to humans", "human oversight", "human intervention", "human in the loop", "human-AI collaboration"],
        ["Critiqability", "contestability", "scrutability", "reviewability"],  # disabled variation: "feedback"
        ["Beneficence", "beneficable", "welfare", "positive impact", "well-being", "prosocial"],
        ["Non-maleficence", "harm avoidance", "ethical AI", "AI safety", "harm reduction"],
        ["Justice", "equity", "bias mitigation", "equal treatment", "social justice"],
        ["Respect for Persons", "human dignity", "respect for individuals", "respect for rights", "human rights"],
        ["Autonomy (power to decide)", "autonomy", "autonome", "self-determination", "independence", "user agency", "free choice"],
        ["Explicability", "explicable", "interpretability", "transparency", "explainability", "understandability"],
        ["Respect for Law and public interest", "respect for law", "respect for public interest", "compliance", "regulatory adherence", "legal AI", "governance"],
        ["Security", "secure", "cybersecurity", "privacy protection", "adversarial robustness", "data security"],
        ["Easy to work with", "user-friendly", "ease of use"],
        ["Realistic world model", "world model", "real-world applicability", "realistic simulation", "grounded AI", "embodied intelligence"],
        ["Fast", "speed", "low latency", "real-time"],
    ]

    # Process both paper collections first; the folder names are resolved
    # relative to the current working directory.
    keyword_matrix_neurips, sentences_neurips = process_papers("Paper Neurips", sections_of_interest, keywords)
    keyword_matrix_icml, sentences_icml = process_papers("Paper ICML", sections_of_interest, keywords)

    # Each output filename is defined once so the file written and the
    # message printed cannot drift apart.
    neurips_csv = "keyword_counts_neurips.csv"
    keyword_matrix_neurips.to_csv(neurips_csv, index=False)
    print(f"Keyword counts saved to {neurips_csv}")

    icml_csv = "keyword_counts_icml.csv"
    keyword_matrix_icml.to_csv(icml_csv, index=False)
    print(f"Keyword counts saved to {icml_csv}")

Keyword counts saved to keyword_counts_neurips.csv
Keyword counts saved to keyword_counts_icml.csv


In [8]:
# Print sentences containing keyword occurrences
# for key, sentences in sentences_neurips.items():
#     print(f"\nSentences containing '{key}' in NeurIPS papers:")
#     for sentence in sentences:
#         print(f"- {sentence}")
        
# for key, sentences in sentences_icml.items():
#     print(f"\nSentences containing '{key}' in ICML papers:")
#     for sentence in sentences:
#         print(f"- {sentence}")