In [1]:
from PyPDF2 import PdfReader
from collections import Counter
import re
import os
import pandas as pd
from IPython.display import display


def extract_text_excluding_references(pdf_path):
    """
    Extract text from a PDF excluding the references section.
    """
    reader = PdfReader(pdf_path)
    full_text = ""
    
    for page in reader.pages:
        text = page.extract_text()
        if text:
            # Ensure text is properly encoded and cleaned
            text = text.encode("utf-8", errors="ignore").decode("utf-8")
            full_text += text + "\n"
    
    # Remove the references section
    references_index = full_text.lower().rfind("references")
    if references_index != -1:
        full_text = full_text[:references_index]
    
    return full_text


def count_keywords(text, keywords):
    """
    Count occurrences of keyword variations in specified sections.
    """
    keyword_counts = {key[0]: 0 for key in keywords}  # Initialize counts for primary terms
    keyword_sentences = {key[0]: [] for key in keywords}  # Store sentences with occurrences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    for keyword_group in keywords:
        main_keyword = keyword_group[0]
        variations = keyword_group
        for sentence in sentences:
            for variation in variations:
                if variation.lower() in sentence.lower():
                    keyword_counts[main_keyword] += 1
                    keyword_sentences[main_keyword].append(sentence.strip())
    
    return keyword_counts, keyword_sentences

def process_papers(folder_path, keywords):
    """
    Process all papers in a folder, count keywords, and compile results into a DataFrame.
    """
    results = []
    keyword_list = [k[0] for k in keywords]
    sentence_results = {key[0]: [] for key in keywords}
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            text = extract_text_excluding_references(pdf_path)
            keyword_counts, keyword_sentences = count_keywords(text, keywords)
            results.append([filename] + [keyword_counts.get(keyword, 0) for keyword in keyword_list])
            for key in keyword_sentences:
                sentence_results[key].extend(keyword_sentences[key])
    
    columns = ["Paper Name"] + keyword_list
    df = pd.DataFrame(results, columns=columns)
    
    return df, sentence_results

if __name__ == "__main__":
     # Define keyword groups
    keywords = [
    ["Novelty", "novel", "innovative", "groundbreaking"],
    ["Simplicity", "minimalistic", "concise", "parsimonious", "lightweight"],
    ["Generalization", "generalisable", "generalizable", "transferability", "out-of-distribution", "domain adaptation"],
    ["Flexibility/Extensibility", "flexible", "flexibility", "extensibility", "extensible", "adaptable", "modular", "scalable"],
    ["Robustness", "resilient", "fault-tolerant", "noise tolerance"], 
    ["Realistic output", "authentic", "plausible"], 
    ["Formal description/analysis", "formal", "mathematical", "rigorous", "analytical", "axiomatic", "proof-based"],
    ["Theoretical guarantees", "guarantee", "provable", "convergence proof", "theoretical bound", "performance bound"],
    ["Approximation", "approximation theory"], 
    ["Quantitative evidence (e.g. experiments)", "quantitative", "numerical results", "empirical study", "measurable"],
    ["Qualitative evidence (e.g. examples)", "case study", "illustrative"], 
    ["Scientific methodology",  "hypothesis-driven", "scientific"], 
    ["Controllability (of model owner)", "governability", "ownership", "model steering"],
    ["Human-like mechanism", "biologically inspired", "cognitive"],
    ["Low cost", "cheap", "cost", "affordable", "resource-efficient", "budget-friendly"],
    ["Large scale", "scalability", "big data", "high-capacity", "massive-scale"],
    ["Promising"], 
    ["Generality", "broad applicability", "domain-independent", "versatile"],
    ["Principled", "theoretically sound", "axiomatic", "methodologically rigorous"],
    ["Exactness", "error-free"], 
    ["Preciseness", "high-fidelity"], 
    ["Concreteness", "grounded", "verifiable"], 
    ["Automatic", "self-operating", "hands-free"], 
    ["Performance"], 
    ["Accuracy", "precision", "recall", "F1-score", "error rate", "reliability"],
    ["Avoiding train/test discrepancy", "train/test", "discrepancy", "distribution shift", "generalization gap"],
    ["State-of-the-art", "SOTA", "best performing", "cutting-edge", "latest"],
    ["Efficiency", "efficient"],
    ["Reduced training time", "training time", "fast training", "speed-up", "low latency"],
    ["Memory efficiency", "memory-efficient", "low memory footprint", "RAM optimization"],
    ["Data efficiency", "data-efficient", "few-shot", "self-supervised", "low data regime"],
    ["Label efficiency (reduced need for labeled data)", "label-efficient", "semi-supervised", "weak supervision"],
    ["Energy efficiency", "energy-efficient", "low power", "green AI", "sustainable AI"],
    ["Effectiveness"], 
    ["Successful"], 
    ["Building on classic work", "classic work", "foundational", "historical perspective"],
    ["Building on recent work", "recent work", "latest advancements", "current research"],
    ["Unifying ideas or integrating components", "unifying", "integrative", "synergistic", "compositional"],
    ["Identifying limitations", "limitations", "weaknesses", "failure modes"], 
    ["Critique", "criticism", "critical review"],
    ["Understanding (for researchers)", "understanding", "conceptual clarity"], 
    ["Improvement"],
    ["Progress"],
    ["Used in practice/Popular", "used in practice", "popular", "adopted", "real-world usage"],
    ["Reproducibility", "reproduce", "replication", "repeatability", "consistent results"],
    ["Easy to implement", "simple to use", "straightforward"],
    ["Requires few resources", "resources", "low-resource", "minimal requirements"],
    ["Parallelizability / distributed", "parallelizability", "parallelization", "distributed"],
    ["Facilitating use (e.g. sharing code)", "sharing code", "open-source"], 
    ["Scales up", "scale up", "expands", "large-scale deployment"],
    ["Applies to real world", "real world", "practical application", "real-world relevance"],
    ["Learning from humans", "human learning", "human-in-the-loop", "interactive learning"],
    ["Practical", "applied AI"], 
    ["Useful"], 
    ["Interpretable (to users)", "interpretable", "explainable"],
    ["Transparent (to users)", "transparent", "transparency", "accountability"],
    ["Privacy", "privacy", "private", "confidentiality", "data protection"],
    ["Fairness", "equitable", "bias mitigation"], 
    ["Not socially biased", "social bias", "socially biased", "fairness-aware", "bias-free", "equitable AI"],
    ["User influence", "user impact", "user effect", "human influence", "user control", "user agency"],
    ["Collective influence", "collective", "group influence", "crowd dynamics", "social influence", "peer effects"],
    ["Deferral to humans", "human oversight", "human intervention", "human in the loop", "human-AI collaboration"],
    ["Critiqability", "contestability", "scrutability", "reviewability"], 
    ["Beneficence", "beneficable", "welfare", "positive impact", "well-being", "prosocial", "altruistic", "altruism", "social good", "ethical principle"],
    ["Non-maleficence", "harm avoidance", "ethical AI", "AI safety", "harm reduction"],
    ["Justice", "equity", "bias mitigation", "equal treatment", "social justice"],
    ["Respect for Persons", "human dignity", "respect for individuals", "respect for rights", "human rights"],
    ["Autonomy (power to decide)", "autonomy", "autonome", "self-determination", "independence", "user agency", "free choice"],
    ["Explicability", "explicable", "interpretability", "transparency", "explainability", "understandability"],
    ["Respect for Law and public interest", "respect for law", "respect for public interest", "compliance", "regulatory adherence", "legal AI", "governance"],
    ["Security", "secure", "cybersecurity", "privacy protection", "adversarial robustness", "data security"],
    ["Easy to work with", "user-friendly", "ease of use"],
    ["Realistic world model", "world model", "real-world applicability", "realistic simulation", "grounded AI", "embodied intelligence"],
    ["Fast", "speed", "low latency", "real-time"]
    ]


    # Process papers and save results
    keyword_matrix_neurips, sentences_neurips = process_papers("Paper Neurips", keywords)
    keyword_matrix_icml, sentences_icml = process_papers("Paper ICML", keywords)
    
    keyword_matrix_neurips.to_csv("keyword_counts_neurips.csv", index=False)
    print(f"Keyword counts saved to keyword_counts_neurips.csv")
    
    keyword_matrix_icml.to_csv("keyword_counts_icml.csv", index=False)
    print(f"Keyword counts saved to keyword_counts_icml.csv")

Keyword counts saved to keyword_counts_neurips.csv
Keyword counts saved to keyword_counts_icml.csv


In [2]:
# Ensure the entire DataFrame is displayed
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full content of each cell
pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping

In [3]:
# Define the keyword to search for
keyword_to_search = "Fairness"  # Replace with the keyword you want to search for

# Initialize lists to store results
paper_names = []
flagged_sentences = []

# Process NeurIPS papers
if keyword_to_search in sentences_neurips:
    for filename in os.listdir("Paper Neurips"):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join("Paper Neurips", filename)
            text = extract_text_excluding_references(pdf_path)
            keyword_counts, keyword_sentences = count_keywords(text, [[keyword_to_search]])
            if keyword_to_search in keyword_sentences:
                for sentence in keyword_sentences[keyword_to_search]:
                    paper_names.append(filename)
                    flagged_sentences.append(sentence)

# Process ICML papers
if keyword_to_search in sentences_icml:
    for filename in os.listdir("Paper ICML"):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join("Paper ICML", filename)
            text = extract_text_excluding_references(pdf_path)
            keyword_counts, keyword_sentences = count_keywords(text, [[keyword_to_search]])
            if keyword_to_search in keyword_sentences:
                for sentence in keyword_sentences[keyword_to_search]:
                    paper_names.append(filename)
                    flagged_sentences.append(sentence)

# Create a DataFrame to store the results
df_flagged_sentences = pd.DataFrame({
    "Paper Name": paper_names,
    "Flagged Sentence": flagged_sentences
})

# Display the DataFrame
display(df_flagged_sentences)

Unnamed: 0,Paper Name,Flagged Sentence
0,NeurIPS-2023-llava-med-training-a-large-language-and-vision-assistant-for-biomedicine-in-one-day-Paper-Datasets_and_Benchmarks.pdf,"Further,\nby taking the self-enhancement bias into consideration for fairness, we expect that LLaV A-Med\nactually performs even closer to GPT-4 than the current numbers indicate."
1,NeurIPS-2023-data-selection-for-language-models-via-importance-resampling-Paper-Conference.pdf,"For example, DSIR can be used to collect more data on underrepresented\nsubpopulations and fine-tune the model on this data to improve model fairness."
2,NeurIPS-2023-magicbrush-a-manually-annotated-dataset-for-instruction-guided-image-editing-Paper-Datasets_and_Benchmarks.pdf,code repositories to guarantee reproducibility and fairness.
3,NeurIPS-2023-are-emergent-abilities-of-large-language-models-a-mirage-Paper-Conference.pdf,"2022) Over All BIG-Bench T asksaccuracy\nalignment_score\naverage\naverage_log_probability\navg_acc\nbias_level\nbleu\nbleurt\nbleurt_diff\ncombined_bias\ncorrect\ncorrect_prob_mass\ncustom_score\ndifference_score\nexact_str_match\nf1\nfairness\nfull\ngender_bias_score\ngender_minority_bias_score\ngender_minority_stereotype_score\ngender_stereotype_score\nlog10_p_dev\nlog_likelihood\nmacro_f1\nmain_words_match\nmean_accuracy\nmultiple_choice_grade\nnormalized_aggregate_score\nnumeric_match_with_0_1_relative_error\noverall\noverall gender bias\noverall_alpha_avg\noverall_difference\npair-wise-accuracy\nrelative_score\nrougeLsum\nsequence_f1\ntargets_reachedMetric\nFigure 5: Emergent abilities appear only for specific metrics, not task-model families."
4,NeurIPS-2023-are-emergent-abilities-of-large-language-models-a-mirage-Paper-Conference.pdf,"In 2022 ACM Conference on Fairness, Accountability, and Transparency ,\npages 1747–1764, 2022."
5,NeurIPS-2023-large-language-models-as-commonsense-knowledge-for-large-scale-task-planning-Paper-Conference.pdf,Further study about the fairness and bias of LLMs’ knowledge\nwould be beneficial.
6,NeurIPS-2023-qlora-efficient-finetuning-of-quantized-llms-Paper-Conference.pdf,"In Proceedings of the 2021 ACM conference on\nfairness, accountability, and transparency , pages 610–623, 2021."
7,NeurIPS-2023-principle-driven-self-alignment-of-language-models-from-scratch-with-minimal-human-supervision-Paper-Conference.pdf,"•Bias and fairness: TheDromedary model may inadvertently perpetuate or exacerbate existing\nbiases present in the pre-training data of its base language model, potentially leading to unfair or\ndiscriminatory outcomes."
8,NeurIPS-2023-principle-driven-self-alignment-of-language-models-from-scratch-with-minimal-human-supervision-Paper-Conference.pdf,Future work should address bias mitigation strategies to ensure fairness\nand inclusivity in AI applications.
9,NeurIPS-2023-doremi-optimizing-data-mixtures-speeds-up-language-model-pretraining-Paper-Conference.pdf,"Distributionally robust optimization (DRO),\nwhich is used in DoReMi to optimize the data mixture, can have a favorable impact on fairness [ 19]."


Sentence extraction without paper names: 

In [7]:
keyword_to_search = "User influence"

In [8]:
sentences_icml[keyword_to_search]

['However, user control-\nlability of the generated image, and fast adapta-\ntion to new tasks still remains an open challenge,\ncurrently mostly addressed by costly and long re-\ntraining and fine-tuning or ad-hoc adaptations to\nspecific image generation tasks.',
 'Recently,\na surge of methods have been proposed to gain wider and\nbetter user controllability.',
 'However, in\ncontrast to existing works that target a specific application,\nwithout a well defined objective, we propose a more general\napproach that allows us to unify different user control inputs\nin a more principled manner.']

In [9]:
sentences_neurips[keyword_to_search]

['Abstract\nAttaining a high degree of user controllability in visual generation often requires\nintricate, fine-grained inputs like layouts.']