In [1]:
from PyPDF2 import PdfReader
from collections import Counter
import re
import os
import pandas as pd

def extract_sections(pdf_path, sections_of_interest):
    """
    Gather the text that follows each requested section heading in a PDF.

    A line is treated as a heading only when, after stripping surrounding
    whitespace, it is equal to one of the entries in ``sections_of_interest``
    (the comparison is exact and case-sensitive). Every following non-empty
    line is added to that section until another requested heading appears;
    the active section carries over from one page to the next. Lines seen
    before the first heading are dropped.

    Returns a dict mapping every requested section name to its collected
    text. Each collected line is prefixed with a single space; a section
    whose heading never appears maps to "".
    """
    reader = PdfReader(pdf_path)
    collected = {name: [] for name in sections_of_interest}
    active = None

    for page in reader.pages:
        raw = page.extract_text()
        if not raw:
            # Nothing extractable on this page.
            continue

        # Round-trip through UTF-8, silently dropping anything that cannot
        # be encoded.
        raw = raw.encode("utf-8", errors="ignore").decode("utf-8")

        for raw_line in raw.split("\n"):
            stripped = raw_line.strip()
            if stripped in sections_of_interest:
                active = stripped
                continue
            if active and stripped:
                collected[active].append(stripped)

    # Join once at the end instead of growing a string line by line.
    return {
        name: "".join(" " + piece for piece in pieces)
        for name, pieces in collected.items()
    }


def count_keywords(section_texts, keywords, keyword_list):
    """
    Count sentences that mention each keyword group and collect those sentences.

    Args:
        section_texts: dict mapping a section name to its text.
        keywords: list of keyword groups. The first entry of each group is the
            primary term (used as the result key); every entry of the group,
            including the first, is searched for as a variation.
        keyword_list: unused; kept so existing callers keep working.

    Returns:
        A ``(keyword_counts, keyword_sentences)`` tuple of dicts keyed by
        primary term. Matching is a case-insensitive substring test per
        sentence: each (sentence, variation) hit adds 1 to the count and
        appends the stripped sentence, so a sentence containing several
        distinct variations of one group is recorded once per variation.
    """
    keyword_counts = {group[0]: 0 for group in keywords}
    keyword_sentences = {group[0]: [] for group in keywords}

    # Lowercase the variations once and drop case-insensitive duplicates
    # (e.g. "Privacy" and "privacy"); otherwise the same needle is tested
    # twice and every matching sentence is counted twice.
    normalized_groups = [
        (group[0], list(dict.fromkeys(variation.lower() for variation in group)))
        for group in keywords
    ]

    # Split after "." or "?" followed by whitespace, except after patterns
    # such as "e.g." or "Dr." that the lookbehinds exclude.
    sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'

    for text in section_texts.values():
        # Pair each stripped sentence with its lowercase form, computed once.
        prepared = [
            (sentence.strip(), sentence.lower())
            for sentence in re.split(sentence_boundary, text)
        ]
        for main_keyword, variations in normalized_groups:
            for stripped, lowered in prepared:
                for variation in variations:
                    if variation in lowered:
                        keyword_counts[main_keyword] += 1
                        keyword_sentences[main_keyword].append(stripped)

    return keyword_counts, keyword_sentences

def process_papers(folder_path, sections_of_interest, keywords):
    """
    Run keyword counting over every PDF in a folder and tabulate the results.

    Args:
        folder_path: directory to scan; only entries whose name ends with
            ".pdf" (case-sensitive) are processed.
        sections_of_interest: section headings passed to ``extract_sections``.
        keywords: list of keyword groups; the first entry of each group is
            the primary term and becomes a column name.

    Returns:
        A ``(df, sentence_results)`` tuple. ``df`` has one row per processed
        PDF with a "Paper Name" column followed by one count column per
        primary term. ``sentence_results`` maps each primary term to the
        matching sentences gathered across all processed PDFs.
    """
    keyword_list = [group[0] for group in keywords]
    sentence_results = {group[0]: [] for group in keywords}
    rows = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order and the sentence order reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".pdf"):
            continue

        pdf_path = os.path.join(folder_path, filename)
        section_texts = extract_sections(pdf_path, sections_of_interest)
        keyword_counts, keyword_sentences = count_keywords(section_texts, keywords, keyword_list)

        rows.append([filename] + [keyword_counts.get(keyword, 0) for keyword in keyword_list])
        for key, found in keyword_sentences.items():
            sentence_results[key].extend(found)

    df = pd.DataFrame(rows, columns=["Paper Name"] + keyword_list)

    return df, sentence_results

if __name__ == "__main__":
    # Section headings whose text is scanned for keywords.
    sections_of_interest = ["Abstract", "Introduction", "Discussion", "Conclusion"]

    # Keyword groups: the first entry is the primary term (it becomes the CSV
    # column name); every entry of a group is searched for as a variation.
    keywords = [
        ["Novelty", "novel"],
        ["Simplicity", "simple"],
        ["Generalization", "generalisable"],
        ["Flexibility/Extensibility", "flexible", "flexibility", "extensibility", "extensible"],
        ["Robustness", "robust"],
        ["Realistic output", "realistic"],
        ["Formal description/analysis", "formal", "mathematical"],
        ["Theoretical guarantees", "guarantee"],
        ["Approximation", "approximate"],
        ["Quantitative evidence (e.g. experiments)", "experiment"],
        ["Qualitative evidence (e.g. examples)", "example"],
        ["Scientific methodology", "scientific"],
        ["Controllability (of model owner)", "control"],
        ["Human-like mechanism", "human"],
        ["Low cost", "cheap", "cost"],
        ["Large scale", "scale"],
        ["Promising"],
        ["Generality", "general"],
        ["Principled", "principles"],
        ["Exactness", "exact"],
        ["Preciseness", "precise"],
        ["Concreteness", "correct"],
        ["Automatic", "automated"],
        ["Performance"],
        ["Accuracy"],
        ["Avoiding train/test discrepancy", "train/test", "discrepancy"],
        ["State-of-the-art"],
        ["Efficiency", "efficient"],
        ["Reduced training time", "training time"],
        ["Memory efficiency"],
        ["Data efficiency"],
        ["Label efficiency (reduced need for labeled data)"],
        ["Energy efficiency"],
        ["Effectiveness", "effective"],
        ["Successful"],
        ["Building on classic work", "classic work"],
        ["Building in recent work", "recent work"],
        ["Unifying ideas or integrating components", "unifying"],
        ["Identifying limitations", "limitations"],
        ["Critique", "criticism"],
        ["Understanding (for researchers)", "understanding"],
        ["Improvement"],
        ["Progress"],
        ["Used in practice/Popular", "practice", "popular"],
        ["Reproducibility", "reproduce"],
        ["Easy to implement", "implement"],
        ["Requires few resources", "resources"],
        ["Parallelizability / distributed", "parallelizability", "parallelization", "distributed"],
        ["Facilitating use (e.g. sharing code)", "sharing code"],
        ["Scales up", "scale up"],
        ["Applies to real world", "real world"],
        ["Learning from humans"],
        ["Practical", "practice"],
        ["Useful", "usefulness"],
        ["Interpretable (to users)", "interpretable"],
        ["Transparent (to users)", "transparent", "transparency"],
        ["Privacy", "privacy", "private"],
        ["Fairness", "fair"],
        ["Not socially biased", "social bias", "socially bias", "social", "society"],
        ["User influence", "user"],
        ["Collective influence", "collective"],
        ["Deferral to humans"],
        ["Critiqability", "criticism"],
        ["Beneficence", "beneficable"],
        ["Non-maleficence"],
        ["Justice"],
        ["Respect for Persons"],
        ["Autonomy (power to decide)", "autonomy", "autonome"],
        ["Explicability", "explicable"],
        ["Respect for Law and public interest", "respect for law", "respect for public interest"],
        ["Security", "secure"],
        ["Easy to work with"],
        ["Realistic world model", "world model"],
        ["Fast", "speed"],
    ]

    # Process both paper collections before anything is written to disk.
    keyword_matrix_neurips, sentences_neurips = process_papers(
        "Paper Neurips", sections_of_interest, keywords
    )
    keyword_matrix_icml, sentences_icml = process_papers(
        "Paper ICML", sections_of_interest, keywords
    )

    # Save each count matrix as CSV and report where it went.
    for matrix, csv_name in (
        (keyword_matrix_neurips, "keyword_counts_neurips.csv"),
        (keyword_matrix_icml, "keyword_counts_icml.csv"),
    ):
        matrix.to_csv(csv_name, index=False)
        print(f"Keyword counts saved to {csv_name}")

Keyword counts saved to keyword_counts_neurips.csv
Keyword counts saved to keyword_counts_icml.csv


In [7]:
# Print sentences containing keyword occurrences
# for key, sentences in sentences_neurips.items():
#     print(f"\nSentences containing '{key}' in NeurIPS papers:")
#     for sentence in sentences:
#         print(f"- {sentence}")
        
# for key, sentences in sentences_icml.items():
#     print(f"\nSentences containing '{key}' in ICML papers:")
#     for sentence in sentences:
#         print(f"- {sentence}")