In [13]:
from PyPDF2 import PdfReader
from collections import Counter
import re
import os
import pandas as pd
from IPython.display import display


def _find_references_start(text):
    """Return the index at which the references section starts, or -1.

    A line consisting only of "References" (optionally numbered, any case)
    is preferred, because that is how a section heading normally appears in
    extracted text. If no such line exists, the last whole-word occurrence
    of "references" is used. Matching whole words keeps words such as
    "preferences" from being mistaken for the section start.
    """
    heading_pattern = r"^[ \t]*(?:\d+\.?[ \t]*)?references[ \t]*$"
    headings = list(
        re.finditer(heading_pattern, text, flags=re.IGNORECASE | re.MULTILINE)
    )
    if headings:
        return headings[-1].start()

    words = list(re.finditer(r"\breferences\b", text, flags=re.IGNORECASE))
    if words:
        return words[-1].start()
    return -1


def extract_text_excluding_references(pdf_path, *, verbose=True):
    """
    Extract text from a PDF excluding the references section.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.
        verbose: When true (the default), print the extracted text before
            returning it. Pass ``False`` to suppress the dump.

    Returns:
        The text of all pages, each followed by a newline, truncated at the
        start of the references section when one is found. Pages for which
        no text could be extracted are skipped.
    """
    reader = PdfReader(pdf_path)

    page_texts = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            # Round-trip through UTF-8, dropping anything that cannot be
            # encoded, so the result is always safely encodable.
            text = text.encode("utf-8", errors="ignore").decode("utf-8")
            page_texts.append(text + "\n")
    # Join once instead of growing a string page by page.
    full_text = "".join(page_texts)

    # The search runs on the original (not lowercased) text so the index is
    # always valid for slicing it.
    references_index = _find_references_start(full_text)
    if references_index != -1:
        full_text = full_text[:references_index]

    if verbose:
        print(full_text)
    return full_text


def count_keywords(text, keywords):
    """
    Count the sentences of ``text`` that mention each keyword group.

    Args:
        text: The text to search.
        keywords: A list of keyword groups. Each group is a list of strings
            whose first element is the primary term (used as the key in the
            results) and whose elements are all searched for, as
            case-insensitive substrings.

    Returns:
        A tuple ``(keyword_counts, keyword_sentences)``:
        ``keyword_counts`` maps each primary term to the number of sentences
        containing at least one variation of its group, and
        ``keyword_sentences`` maps each primary term to those sentences
        (stripped of surrounding whitespace), in order of appearance.

    A sentence contributes at most once per group, even when several
    variations match it (for example "novel" and "Novelty", or two
    variations that differ only in case), so the count always equals the
    number of stored sentences and no sentence is listed twice for a group.
    """
    keyword_counts = {group[0]: 0 for group in keywords}
    keyword_sentences = {group[0]: [] for group in keywords}

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    # Lowercase every sentence once rather than once per variation.
    lowered_sentences = [(sentence, sentence.lower()) for sentence in sentences]

    for keyword_group in keywords:
        main_keyword = keyword_group[0]
        # A set removes variations that are identical after lowercasing.
        needles = {variation.lower() for variation in keyword_group}
        for sentence, lowered in lowered_sentences:
            if any(needle in lowered for needle in needles):
                keyword_counts[main_keyword] += 1
                keyword_sentences[main_keyword].append(sentence.strip())

    return keyword_counts, keyword_sentences

def process_papers(folder_path, keywords):
    """
    Process all papers in a folder, count keywords, and compile results into a DataFrame.

    Args:
        folder_path: Directory to scan. Only its direct entries whose names
            end in ".pdf" (in any letter case) are processed.
        keywords: A list of keyword groups; the first element of each group
            is the primary term and becomes a column name.

    Returns:
        A tuple ``(df, sentence_results)``: ``df`` has one row per PDF with
        a "Paper Name" column followed by one count column per primary
        term, and ``sentence_results`` maps each primary term to the
        matching sentences collected across all processed PDFs.
    """
    results = []
    keyword_list = [group[0] for group in keywords]
    sentence_results = {keyword: [] for keyword in keyword_list}

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and the order of collected sentences) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "X.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_excluding_references(pdf_path)
        keyword_counts, keyword_sentences = count_keywords(text, keywords)
        results.append(
            [filename] + [keyword_counts.get(keyword, 0) for keyword in keyword_list]
        )
        for key in keyword_sentences:
            sentence_results[key].extend(keyword_sentences[key])

    columns = ["Paper Name"] + keyword_list
    df = pd.DataFrame(results, columns=columns)

    return df, sentence_results

if __name__ == "__main__":
    # Define keyword groups.
    # Each inner list is one group: its first element is the primary term,
    # which becomes the column name in the result table and the key in the
    # sentence dictionary; every element of the group (the primary term
    # included) is searched for in the text.
    # NOTE(review): some groups contain entries that are redundant under
    # case-insensitive matching (e.g. "Privacy" / "privacy"), and some terms
    # appear in more than one group (e.g. "axiomatic", "bias mitigation",
    # "low latency", "user agency", "transparency") - confirm this is intended.
    keywords = [
        ["Novelty", "novel", "innovative", "groundbreaking"],
        ["Simplicity", "minimalistic", "concise", "parsimonious", "lightweight"],
        ["Generalization", "generalisable", "generalizable", "transferability", "out-of-distribution", "domain adaptation"],
        ["Flexibility/Extensibility", "flexible", "flexibility", "extensibility", "extensible", "adaptable", "modular", "scalable"],
        ["Robustness", "resilient", "fault-tolerant", "noise tolerance"], 
        ["Realistic output", "authentic", "plausible"], 
        ["Formal description/analysis", "formal", "mathematical", "rigorous", "analytical", "axiomatic", "proof-based"],
        ["Theoretical guarantees", "guarantee", "provable", "convergence proof", "theoretical bound", "performance bound"],
        ["Approximation", "approximation theory"], 
        ["Quantitative evidence (e.g. experiments)", "quantitative", "numerical results", "empirical study", "measurable"],
        ["Qualitative evidence (e.g. examples)", "case study", "illustrative"], 
        ["Scientific methodology",  "hypothesis-driven", "scientific"], 
        ["Controllability (of model owner)", "governability", "ownership", "model steering"],
        ["Human-like mechanism", "biologically inspired", "cognitive"],
        ["Low cost", "cheap", "cost", "affordable", "resource-efficient", "budget-friendly"],
        ["Large scale", "scalability", "big data", "high-capacity", "massive-scale"],
        ["Promising"], 
        ["Generality", "broad applicability", "domain-independent", "versatile"],
        ["Principled", "theoretically sound", "axiomatic", "methodologically rigorous"],
        ["Exactness", "error-free"], 
        ["Preciseness", "high-fidelity"], 
        ["Concreteness", "grounded", "verifiable"], 
        ["Automatic", "self-operating", "hands-free"], 
        ["Performance"], 
        ["Accuracy", "precision", "recall", "F1-score", "error rate", "reliability"],
        ["Avoiding train/test discrepancy", "train/test", "discrepancy", "distribution shift", "generalization gap"],
        ["State-of-the-art", "SOTA", "best performing", "cutting-edge", "latest"],
        ["Efficiency", "efficient"],
        ["Reduced training time", "training time", "fast training", "speed-up", "low latency"],
        ["Memory efficiency", "memory-efficient", "low memory footprint", "RAM optimization"],
        ["Data efficiency", "data-efficient", "few-shot", "self-supervised", "low data regime"],
        ["Label efficiency (reduced need for labeled data)", "label-efficient", "semi-supervised", "weak supervision"],
        ["Energy efficiency", "energy-efficient", "low power", "green AI", "sustainable AI"],
        ["Effectiveness"], 
        ["Successful"], 
        ["Building on classic work", "classic work", "foundational", "historical perspective"],
        ["Building on recent work", "recent work", "latest advancements", "current research"],
        ["Unifying ideas or integrating components", "unifying", "integrative", "synergistic", "compositional"],
        ["Identifying limitations", "limitations", "weaknesses", "failure modes"], 
        ["Critique", "criticism", "critical review"],
        ["Understanding (for researchers)", "understanding", "conceptual clarity"], 
        ["Improvement"],
        ["Progress"],
        ["Used in practice/Popular", "used in practice", "popular", "adopted", "real-world usage"],
        ["Reproducibility", "reproduce", "replication", "repeatability", "consistent results"],
        ["Easy to implement", "simple to use", "straightforward"],
        ["Requires few resources", "resources", "low-resource", "minimal requirements"],
        ["Parallelizability / distributed", "parallelizability", "parallelization", "distributed"],
        ["Facilitating use (e.g. sharing code)", "sharing code", "open-source"], 
        ["Scales up", "scale up", "expands", "large-scale deployment"],
        ["Applies to real world", "real world", "practical application", "real-world relevance"],
        ["Learning from humans", "human learning", "human-in-the-loop", "interactive learning"],
        ["Practical", "applied AI"], 
        ["Useful"], 
        ["Interpretable (to users)", "interpretable", "explainable"],
        ["Transparent (to users)", "transparent", "transparency", "accountability"],
        ["Privacy", "privacy", "private", "confidentiality", "data protection"],
        ["Fairness", "equitable", "bias mitigation"], 
        ["Not socially biased", "social bias", "socially biased", "fairness-aware", "bias-free", "equitable AI"],
        ["User influence", "user impact", "user effect", "human influence", "user control", "user agency"],
        ["Collective influence", "collective", "group influence", "crowd dynamics", "social influence", "peer effects"],
        ["Deferral to humans", "human oversight", "human intervention", "human in the loop", "human-AI collaboration"],
        ["Critiqability", "contestability", "scrutability", "reviewability"], 
        ["Beneficence", "beneficable", "welfare", "positive impact", "well-being", "prosocial"],
        ["Non-maleficence", "harm avoidance", "ethical AI", "AI safety", "harm reduction"],
        ["Justice", "equity", "bias mitigation", "equal treatment", "social justice"],
        ["Respect for Persons", "human dignity", "respect for individuals", "respect for rights", "human rights"],
        ["Autonomy (power to decide)", "autonomy", "autonome", "self-determination", "independence", "user agency", "free choice"],
        ["Explicability", "explicable", "interpretability", "transparency", "explainability", "understandability"],
        ["Respect for Law and public interest", "respect for law", "respect for public interest", "compliance", "regulatory adherence", "legal AI", "governance"],
        ["Security", "secure", "cybersecurity", "privacy protection", "adversarial robustness", "data security"],
        ["Easy to work with", "user-friendly", "ease of use"],
        ["Realistic world model", "world model", "real-world applicability", "realistic simulation", "grounded AI", "embodied intelligence"],
        ["Fast", "speed", "low latency", "real-time"]
    ]

    # Process every PDF in the "Paper Test" folder. The results are only
    # bound to the names below; nothing is written to disk here.
    #   keyword_matrix: DataFrame with one row per paper and one count
    #                   column per primary term.
    #   sentences:      dict mapping each primary term to the matching
    #                   sentences from all papers.
    keyword_matrix, sentences = process_papers("Paper Test", keywords)


To Transformers and Beyond: Large Language
Models for the Genome
Micaela E. Consens1, 2, 3, Cameron Dufault1, Michael Wainberg4, Duncan Forster2, 5, 6,
Mehran Karimzadeh2, 7, 8, 9, Hani Goodarzi7, 8, 9, Fabian J. Theis10, 11, 12, 13, Alan Moses1, 14,
and Bo Wang1, 2, 3, 15*
1Department of Computer Science, University of Toronto, Toronto, Ontario, Canada
2Vector Institute for Artificial Intelligence, Toronto, Ontario, Canada
3Peter Munk Cardiac Center, University Health Network, Toronto, Ontario, Canada
4Prosserman Centre for Population Health Research, Lunenfeld-Tanenbaum Research Institute, Toronto, Ontario,
Canada
5Department of Molecular Genetics, University of Toronto, Toronto, Ontario, Canada
6The Donnelly Centre, University of Toronto, Toronto, Ontario, Canada
7Department of Biochemistry & Biophysics, University of California, San Francisco, San Francisco, California, USA
8Department of Urology, University of California, San Francisco, San Francisco, California, USA
9Helen Diller

In [14]:
keyword_matrix

Unnamed: 0,Paper Name,Novelty,Simplicity,Generalization,Flexibility/Extensibility,Robustness,Realistic output,Formal description/analysis,Theoretical guarantees,Approximation,...,Non-maleficence,Justice,Respect for Persons,Autonomy (power to decide),Explicability,Respect for Law and public interest,Security,Easy to work with,Realistic world model,Fast
0,LLMs.pdf,14,1,3,3,0,0,0,0,1,...,0,0,0,0,17,0,0,0,0,2


In [12]:
sentences

{'Novelty': ['At the same time, novel techniques for capturing genomic information9,10such as\nchromatin accessibility11,12, methylation12,13, transcriptional status14,15, chromatin structure16and bound molecules12have\nprovided a large and varied source of omics data to mine17.',
  'Furthermore, transformers applied to genomic\ndata offer a novel conceptual framework, the attention mechanism , to study the organization and grammar of the genome.',
  'This will further accelerate the\napplication of deep learning models for genomics, where methodologies for improving transformer efficiency are now being\nadopted in newer models, and most recently, genomic models are being proposed with novel architectures that claim to be the\n“next transformer”49.',
  'The early research\nin this space holds promising and innovative prospects for the field.',
  'This is followed by a succinct review of previous deep learning architectures used in this field,\nestablishing a foundation for an examinati