In [1]:
from PyPDF2 import PdfReader
from collections import Counter
import re
import os
import pandas as pd
from IPython.display import display


def extract_sections(pdf_path, sections_of_interest):
    """
    Extract specific sections (e.g., Abstract, Introduction) from a PDF.
    """
    reader = PdfReader(pdf_path)
    section_texts = {section: "" for section in sections_of_interest}
    
    current_section = None
    for page in reader.pages:
        text = page.extract_text()
        if text:
            # Ensure text is properly encoded and cleaned
            text = text.encode("utf-8", errors="ignore").decode("utf-8")
            lines = text.split("\n")
            for line in lines:
                line_clean = line.strip()
                if line_clean in sections_of_interest:
                    current_section = line_clean
                elif current_section and line_clean:
                    section_texts[current_section] += " " + line_clean
    
    return section_texts


def count_keywords(section_texts, keywords, keyword_list):
    """
    Count occurrences of keyword variations in specified sections.
    """
    keyword_counts = {key[0]: 0 for key in keywords}  # Initialize counts for primary terms
    keyword_sentences = {key[0]: [] for key in keywords}  # Store sentences with occurrences
    
    for section, text in section_texts.items():
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        for keyword_group in keywords:
            main_keyword = keyword_group[0]
            variations = keyword_group
            for sentence in sentences:
                for variation in variations:
                    if variation.lower() in sentence.lower():
                        keyword_counts[main_keyword] += 1
                        keyword_sentences[main_keyword].append(sentence.strip())
    
    return keyword_counts, keyword_sentences

def process_papers(folder_path, sections_of_interest, keywords):
    """
    Process all papers in a folder, count keywords, and compile results into a DataFrame.
    """
    results = []
    keyword_list = [k[0] for k in keywords]
    sentence_results = {key[0]: [] for key in keywords}
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            section_texts = extract_sections(pdf_path, sections_of_interest)
            keyword_counts, keyword_sentences = count_keywords(section_texts, keywords, keyword_list)
            results.append([filename] + [keyword_counts.get(keyword, 0) for keyword in keyword_list])
            for key in keyword_sentences:
                sentence_results[key].extend(keyword_sentences[key])
    
    columns = ["Paper Name"] + keyword_list
    df = pd.DataFrame(results, columns=columns)
    
    return df, sentence_results

if __name__ == "__main__":
    # Define sections of interest
    sections_of_interest = ["Abstract", "Introduction", "Discussion", "Conclusion"]

    # Define keyword groups
    keywords = [
    ["Novelty", "novel", "innovative", "groundbreaking"],
    ["Simplicity", "minimalistic", "concise", "parsimonious", "lightweight"],
    ["Generalization", "generalisable", "generalizable", "transferability", "out-of-distribution", "domain adaptation"],
    ["Flexibility/Extensibility", "flexible", "flexibility", "extensibility", "extensible", "adaptable", "modular", "scalable"],
    ["Robustness", "resilient", "fault-tolerant", "noise tolerance"], 
    ["Realistic output", "authentic", "plausible"], 
    ["Formal description/analysis", "formal", "mathematical", "rigorous", "analytical", "axiomatic", "proof-based"],
    ["Theoretical guarantees", "guarantee", "provable", "convergence proof", "theoretical bound", "performance bound"],
    ["Approximation", "approximation theory"], 
    ["Quantitative evidence (e.g. experiments)", "quantitative", "numerical results", "empirical study", "measurable"],
    ["Qualitative evidence (e.g. examples)", "case study", "illustrative"], 
    ["Scientific methodology",  "hypothesis-driven", "scientific"], 
    ["Controllability (of model owner)", "governability", "ownership", "model steering"],
    ["Human-like mechanism", "biologically inspired", "cognitive"],
    ["Low cost", "cheap", "cost", "affordable", "resource-efficient", "budget-friendly"],
    ["Large scale", "scalability", "big data", "high-capacity", "massive-scale"],
    ["Promising"], 
    ["Generality", "broad applicability", "domain-independent", "versatile"],
    ["Principled", "theoretically sound", "axiomatic", "methodologically rigorous"],
    ["Exactness", "error-free"], 
    ["Preciseness", "high-fidelity"], 
    ["Concreteness", "grounded", "verifiable"], 
    ["Automatic", "self-operating", "hands-free"], 
    ["Performance"], 
    ["Accuracy", "precision", "recall", "F1-score", "error rate", "reliability"],
    ["Avoiding train/test discrepancy", "train/test", "discrepancy", "distribution shift", "generalization gap"],
    ["State-of-the-art", "SOTA", "best performing", "cutting-edge", "latest"],
    ["Efficiency", "efficient"],
    ["Reduced training time", "training time", "fast training", "speed-up", "low latency"],
    ["Memory efficiency", "memory-efficient", "low memory footprint", "RAM optimization"],
    ["Data efficiency", "data-efficient", "few-shot", "self-supervised", "low data regime"],
    ["Label efficiency (reduced need for labeled data)", "label-efficient", "semi-supervised", "weak supervision"],
    ["Energy efficiency", "energy-efficient", "low power", "green AI", "sustainable AI"],
    ["Effectiveness"], 
    ["Successful"], 
    ["Building on classic work", "classic work", "foundational", "historical perspective"],
    ["Building on recent work", "recent work", "latest advancements", "current research"],
    ["Unifying ideas or integrating components", "unifying", "integrative", "synergistic", "compositional"],
    ["Identifying limitations", "limitations", "weaknesses", "failure modes"], 
    ["Critique", "criticism", "critical review"],
    ["Understanding (for researchers)", "understanding", "conceptual clarity"], 
    ["Improvement"],
    ["Progress"],
    ["Used in practice/Popular", "used in practice", "popular", "adopted", "real-world usage"],
    ["Reproducibility", "reproduce", "replication", "repeatability", "consistent results"],
    ["Easy to implement", "simple to use", "straightforward"],
    ["Requires few resources", "resources", "low-resource", "minimal requirements"],
    ["Parallelizability / distributed", "parallelizability", "parallelization", "distributed"],
    ["Facilitating use (e.g. sharing code)", "sharing code", "open-source"], 
    ["Scales up", "scale up", "expands", "large-scale deployment"],
    ["Applies to real world", "real world", "practical application", "real-world relevance"],
    ["Learning from humans", "human learning", "human-in-the-loop", "interactive learning"],
    ["Practical", "applied AI"], 
    ["Useful"], 
    ["Interpretable (to users)", "interpretable", "explainable"],
    ["Transparent (to users)", "transparent", "transparency", "accountability"],
    ["Privacy", "privacy", "private", "confidentiality", "data protection"],
    ["Fairness", "equitable", "bias mitigation"], 
    ["Not socially biased", "social bias", "socially biased", "fairness-aware", "bias-free", "equitable AI"],
    ["User influence", "user impact", "user effect", "human influence", "user control", "user agency"],
    ["Collective influence", "collective", "group influence", "crowd dynamics", "social influence", "peer effects"],
    ["Deferral to humans", "human oversight", "human intervention", "human in the loop", "human-AI collaboration"],
    ["Critiqability", "contestability", "scrutability", "reviewability"], 
    ["Beneficence", "beneficable", "welfare", "positive impact", "well-being", "prosocial"],
    ["Non-maleficence", "harm avoidance", "ethical AI", "AI safety", "harm reduction"],
    ["Justice", "equity", "bias mitigation", "equal treatment", "social justice"],
    ["Respect for Persons", "human dignity", "respect for individuals", "respect for rights", "human rights"],
    ["Autonomy (power to decide)", "autonomy", "autonome", "self-determination", "independence", "user agency", "free choice"],
    ["Explicability", "explicable", "interpretability", "transparency", "explainability", "understandability"],
    ["Respect for Law and public interest", "respect for law", "respect for public interest", "compliance", "regulatory adherence", "legal AI", "governance"],
    ["Security", "secure", "cybersecurity", "privacy protection", "adversarial robustness", "data security"],
    ["Easy to work with", "user-friendly", "ease of use"],
    ["Realistic world model", "world model", "real-world applicability", "realistic simulation", "grounded AI", "embodied intelligence"],
    ["Fast", "speed", "low latency", "real-time"]
    ]


    # Process papers and save results
    keyword_matrix_neurips, sentences_neurips = process_papers("Paper Neurips", sections_of_interest, keywords)
    keyword_matrix_icml, sentences_icml = process_papers("Paper ICML", sections_of_interest, keywords)
    
    keyword_matrix_neurips.to_csv("keyword_counts_neurips.csv", index=False)
    print(f"Keyword counts saved to keyword_counts_neurips.csv")
    
    keyword_matrix_icml.to_csv("keyword_counts_icml.csv", index=False)
    print(f"Keyword counts saved to keyword_counts_icml.csv")

Keyword counts saved to keyword_counts_neurips.csv
Keyword counts saved to keyword_counts_icml.csv


In [2]:
# Ensure the entire DataFrame is displayed
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full content of each cell
pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping

In [8]:
keyword_to_search = "Fairness"  # Replace with the keyword you want to search for

results_neurips = []

if keyword_to_search in sentences_neurips:
    print(f"\nSentences containing '{keyword_to_search}' in NeurIPS papers:")
    for filename in os.listdir("Paper Neurips"):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join("Paper Neurips", filename)
            section_texts = extract_sections(pdf_path, sections_of_interest)
            keyword_counts, keyword_sentences = count_keywords(section_texts, keywords, [keyword_to_search])
            if keyword_to_search in keyword_sentences:
                for sentence in keyword_sentences[keyword_to_search]:
                    results_neurips.append([filename, sentence])
else:
    print(f"No sentences found for the keyword '{keyword_to_search}' in NeurIPS papers.")

# Create a DataFrame to store the results
df_results = pd.DataFrame(results_neurips, columns=["Paper Name", "Sentence"])
display(df_results)


Sentences containing 'Fairness' in NeurIPS papers:


Unnamed: 0,Paper Name,Sentence
0,NeurIPS-2023-llava-med-training-a-large-language-and-vision-assistant-for-biomedicine-in-one-day-Paper-Datasets_and_Benchmarks.pdf,"Further, by taking the self-enhancement bias into consideration for fairness, we expect that LLaV A-Med actually performs even closer to GPT-4 than the current numbers indicate."
1,NeurIPS-2023-llava-med-training-a-large-language-and-vision-assistant-for-biomedicine-in-one-day-Paper-Datasets_and_Benchmarks.pdf,"(2) Bias and Fairness: Since we do not have access to the training data of GPT4, the generated instruct data might reflect those biases, reinforcing social or cultural inequalities in the base model training."
2,NeurIPS-2023-data-selection-for-language-models-via-importance-resampling-Paper-Conference.pdf,"For example, DSIR can be used to collect more data on underrepresented subpopulations and fine-tune the model on this data to improve model fairness."
3,NeurIPS-2023-generating-images-with-multimodal-language-models-Paper-Conference.pdf,"In Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency , pages 610–623, 2021."
4,NeurIPS-2023-language-models-can-solve-computer-tasks-Paper-Conference.pdf,"Ensuring transparency, accountability, and fairness in AI systems will be vital in harnessing the benefits while minimizing potential harm."
5,NeurIPS-2023-magicbrush-a-manually-annotated-dataset-for-instruction-guided-image-editing-Paper-Datasets_and_Benchmarks.pdf,code repositories to guarantee reproducibility and fairness.
6,NeurIPS-2023-magicbrush-a-manually-annotated-dataset-for-instruction-guided-image-editing-Paper-Datasets_and_Benchmarks.pdf,"C.2 Baseline Details For all baselines, we adopt the default hyperparameters available in the official code repositories to guarantee reproducibility and fairness."
7,NeurIPS-2023-mathematical-capabilities-of-chatgpt-Paper-Datasets_and_Benchmarks.pdf,"In Proceedings of the conference on fairness, accountability, and transparency , pages 220–229, 2019."
8,NeurIPS-2023-aging-with-grace-lifelong-model-editing-with-discrete-key-value-adaptors-Paper-Conference.pdf,Fairlex: A multilingual benchmark for evaluating fairness in legal text processing.
9,NeurIPS-2023-are-emergent-abilities-of-large-language-models-a-mirage-Paper-Conference.pdf,"2022) Over All BIG-Bench T asksaccuracy alignment_score average average_log_probability avg_acc bias_level bleu bleurt bleurt_diff combined_bias correct correct_prob_mass custom_score difference_score exact_str_match f1 fairness full gender_bias_score gender_minority_bias_score gender_minority_stereotype_score gender_stereotype_score log10_p_dev log_likelihood macro_f1 main_words_match mean_accuracy multiple_choice_grade normalized_aggregate_score numeric_match_with_0_1_relative_error overall overall gender bias overall_alpha_avg overall_difference pair-wise-accuracy relative_score rougeLsum sequence_f1 targets_reachedMetric Figure 5: Emergent abilities appear only for specific metrics, not task-model families."


In [7]:
keyword_to_search = "Fairness"  # Replace with the keyword you want to search for

results_icml = []

if keyword_to_search in sentences_icml:
    print(f"\nSentences containing '{keyword_to_search}' in ICML papers:")
    for filename in os.listdir("Paper ICML"):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join("Paper ICML", filename)
            section_texts = extract_sections(pdf_path, sections_of_interest)
            keyword_counts, keyword_sentences = count_keywords(section_texts, keywords, [keyword_to_search])
            if keyword_to_search in keyword_sentences:
                for sentence in keyword_sentences[keyword_to_search]:
                    results_icml.append([filename, sentence])
else:
    print(f"No sentences found for the keyword '{keyword_to_search}' in ICML papers.")

# Create a DataFrame to store the results
df_results_icml = pd.DataFrame(results_icml, columns=["Paper Name", "Sentence"])
display(df_results_icml)


Sentences containing 'Fairness' in ICML papers:


Unnamed: 0,Paper Name,Sentence
0,ICML-2023-Guiding-Pretraining-in-Reinforcement-Learning-with-Large-Language-Models.pdf,"In Proceedings of the 2021 ACM conference on fairness, accountability, and trans- parency , pp."
1,ICML-2023-A-Toy-Model-of-Universality.pdf,"In 2022 ACM Confer- ence on Fairness, Accountability, and Transparency , pp."
2,ICML-2023-Muse-Text-To-Image-Generation-via-Masked-Generative-Transformers.pdf,"InConference on Fairness, Accountability and Trans- parency, pp."
3,ICML-2023-Muse-Text-To-Image-Generation-via-Masked-Generative-Transformers.pdf,"In Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency, pp."
4,ICML-2023-Muse-Text-To-Image-Generation-via-Masked-Generative-Transformers.pdf,"InProceedings of the 2021 ACM conference on Fairness, Accountability, and Transparency, pp."
5,ICML-2023-Change-is-Hard-A-Closer-Look-at-Subpopulation-Shift.pdf,"Finally, when subpopulations are defined based on a partic- ular attribute (e.g., demographic group) (Pfohl et al., 2022; Zong et al., 2022), the objective of maximizing performance for the worst-case group then becomes identical to minimax fairness (Lahoti et al., 2020; Martinez et al., 2020).In this work, we present a unified framework of subpopula- tion shift across these aforementioned scenarios."
6,ICML-2023-Change-is-Hard-A-Closer-Look-at-Subpopulation-Shift.pdf,Fairness without demographics in repeated loss minimiza- tion.
7,ICML-2023-Change-is-Hard-A-Closer-Look-at-Subpopulation-Shift.pdf,Fairness without demograph- ics through adversarially reweighted learning.
8,ICML-2023-Change-is-Hard-A-Closer-Look-at-Subpopulation-Shift.pdf,Minimax pareto fairness: A multi objective perspective.
9,ICML-2023-Change-is-Hard-A-Closer-Look-at-Subpopulation-Shift.pdf,Blind pareto fairness and subgroup ro- bustness.
