In [None]:
!pip install PyMuPDF
!pip install spacy

In [3]:
import re

import fitz  # PyMuPDF for PDF parsing
import spacy

In [None]:
!python -m spacy download en_core_web_sm

In [5]:
def parse_pdf(path):
    fitzed_doc = fitz.open(path)
    text = ""
    for page in fitzed_doc:
        text += page.get_text()
    return text

In [6]:
nlp = spacy.load("en_core_web_sm")
paper = r"Papers/scitranslmed.adi0673 annotated.pdf"  # Path to the PDF file

In [7]:
raw_text = parse_pdf(paper)
doc = nlp(raw_text)

In [12]:
hypothesis_patterns = (
    r"(hypothesis|H\d+)[:–-]",
    r"(theor(ies|y)|hypothes[ei]s) (is|was|are|were|that)",
    r"(we|I|authors?|it is|study|theor(ies|y)|hypothes[ei]s) (hypothesi[sz]e|predict|propose|assume|expect|anticipate|foresee|postulate|conjecture|test|suggest)",
    r"(hypothesi[sz]e|predict|propose|assume|expect|anticipate|foresee|postulate|aim|conjecture|test|suggest)(s|d)? that"
    r"(investigate|examine|predict|test)(s|d)? (whether|if)",
    r"(evidence|data) (predict|suggest|propose|assume)"
    r"(aim|objective|goal) (of th(is|e) study)",
    r"if.*then",
    r"it would appear",
    r"it likely that",
    r"According to the (theor(ies|y)|hypothes[ei]s|views?)"
)

methods_patterns = (
    r"(methods?|methodology|procedures?|techniques?) (is|was|are|were|that|[:–-])",
    r"(we|I|authors?|the study) (used|employed|applied|conducted|performed|carried out|implemented)",
    r"(data (collection|gathering|acquisition)|samples?|participants?|subjects?) (were|was)",
    r"(measurements?|assays?|tests?|analyses?) (were|was) (performed|conducted|carried out|done)",
    r"(statistical|analysis|analytical) (methods?|approach|procedure)",
    r"(in order to (determine|measure|assess|evaluate|test|quantify))",
    r"(equipment|instruments?|software|apparatus|tools?) (was|were|used)",
    r"(protocol|procedure|experimental design) (was|were)",
    r"(collection period|duration|timeframe) (was|of)",
    r"(participants?|subjects?|samples?) (were |was )?(recruited|selected|enrolled|obtained)",
    r"(inclusion|exclusion) (criteria|requirements)",
    r"(we|I|authors?) (measure|perform|conduct|test|quantif(y|ied)|employ|adopt)",
    r"(was|were) (stained|embedded|measured|performed|calculated|obtained|identified|detected|monitored|isolated|stimulated|used|removed|harvested|acqired|gated|tested|considered)",
    r"(regression|ANOVA|t-test|chi-square|Wilcoxon|Mann-Whitney|Kruskal-Wallis|Fisher['’]s (exact)?) (test|analysis|model)",
    r"(correlation|association|relationship) (was|were) (assessed|examined|evaluated|tested)",
    r"(clustering|classification|regression|neural network|deep learning|random forest|support vector) (was|were) (performed|applied|used)",
    r"(principal component analysis|PCA|factor analysis|dimensionality reduction) (was|were) (performed|conducted)",
    r"(R|Python|MATLAB|SPSS|SAS|Stata|GraphPad|TensorFlow|PyTorch|scikit-learn) (was|were) used",
    r"(software|tool|package|library) (was|were) used",
    r"(cross-validation|bootstrapping|permutation test) (was|were) (performed|conducted|used)",
)

dataset_patterns = (
    r"(dataset|data set|database|data source|repository) (was|were|is|are|used|obtained|available|collected|generated|produced)",
    r"(available|accessible|open) (dataset|data set|database|data source|repository)",
    r"(downloaded|retrieved|obtained|accessed) from (the )?(database|repository|server|(web)?site|portal)",
    r"data( were)? (obtained|retrieved|downloaded|sourced)",
    r"(doi|accession number|repository|github|zenodo|figshare|osf|dryad)\s*[:–-]",
    r"(supplementary )?data (available )?(at|in|from|on)",
    r"(gene expression|genomic|proteomic|metabolomic|imaging) dataset",
    r"(data (necessary )?to reproduce|reproducib(le|ility))",
    r"(accession|catalog) (number|code)\s*[:–-]",
    r"(data|dataset) (was|were) (normalized|standardized|preprocessed|filtered|transformed|cleaned)",
    r"(normalized|standardized|preprocessed|filtered|transformed|cleaned) (the )?(data|dataset)",
)

experiment_patterns = (
    r"(experiment|assay|trial|test)\s*(\d+|[A-Z])?(\s*[:–-])",
    r"(test|examine|investigate|evaluate|assess|determine|measure) (the )?(hypothesis|prediction|effect|impact|role|function)",
    r"(performed|conducted|carried out|designed|set up) (an? )?(experiment|assay|trial|test|study)",
    r"experimental (design|setup|procedure|protocol|approach)",
    r"(treatment|condition|group) (was|were)( exposed| treated)",
    r"(control|experimental|treatment) group",
    r"(exposed|subjected|treated|stimulated|challenged) (to|with)",
    r"(intervention|treatment|manipulation) (was|were)",
    r"(validation|confirmatory|confirmation) (experiment|assay|test)",
    r"(parallel|sequential|longitudinal) (experiment|test|measurement)",
)

analysis_patterns = (
    r"(analysis|analyses) (was|were|performed|conducted)",
    r"(p-value(s)?|statistical significance) (was|were) (calculated|assessed|determined)",
    r"(supervised|unsupervised) (learning|modeling|analysis) (was|were)",
    r"(accuracy|precision|recall|F1 score|AUC|ROC curve) (was|were|are|is)",
    r"(we|I|authors?) (evaluated|assessed|validated) (the )?(model|results|performance|hypothesis)",
    r"(data|results) (was|were) (visualized|plotted|presented|graphed) (using|with|as)",
)

result_patterns = (
    r"(we|I|authors?) (analyzed|assessed|evaluated|quantified|examined|compared) (the )?(data|results|effects)",
    r"(results?|findings?|outcomes?|data|evidence|analysis) (show|demonstrate|indicate|reveal|suggest|support|imply|point|provide|confirm|validate)",
    r"(we|I|authors?|the study) (found|observed|noted|reported|detected|identified|demonstrate|confirm|establish|conclude|infer)",
    r"((no )?significant|(not )?statistically significant) (effect|difference|correlation|association|relationship) (was|were) (found|observed|detected)",
    r"(p (<|>|<=|>=|=) \d*\.?\d+| p-value(s)? (of|less than|greater than))",
    r"(higher|lower|greater|smaller|increased|decreased|stronger|weaker) (than|compared to|relative to)",
    r"(positive|negative|significant|weak|strong) (correlation|association|relationship) (was|were) (found|observed|detected)",
    r"(trend(s)?|pattern(s)?) (was|were) (observed|noted|evident|apparent)",
    r"(model|algorithm) (achieved|obtained|yielded|produced) (an? )?(accuracy|precision|recall|F1 score|AUC)",
    r"(effect|impact|influence|role) (of \w+ on \w+) (was|were) (observed|found|significant|notable)",
)

compiled_hypothesis_patterns = [re.compile(p, re.IGNORECASE) for p in hypothesis_patterns]
compiled_methods_patterns = [re.compile(p, re.IGNORECASE) for p in methods_patterns]
compiled_dataset_patterns = [re.compile(p, re.IGNORECASE) for p in dataset_patterns]
compiled_experiment_patterns = [re.compile(p, re.IGNORECASE) for p in experiment_patterns]
compiled_analysis_patterns = [re.compile(p, re.IGNORECASE) for p in analysis_patterns]
compiled_result_patterns = [re.compile(p, re.IGNORECASE) for p in result_patterns]


In [13]:
hypotheses = []
methods = []
datasets = []
experiments = []
analyses = []
results = []
for ind, sent in enumerate(doc.sents):
    sent_text = re.sub(r'\s+', ' ', sent.text.strip().replace("\n", " "))
    if sent_text and any(pattern.search(sent_text) for pattern in compiled_hypothesis_patterns):
        hypotheses.append(sent_text)
    elif sent_text and any(pattern.search(sent_text) for pattern in compiled_methods_patterns):
        methods.append(sent_text)
    elif sent_text and any(pattern.search(sent_text) for pattern in compiled_dataset_patterns):
        datasets.append(sent_text)
    elif sent_text and any(pattern.search(sent_text) for pattern in compiled_experiment_patterns):
        experiments.append(sent_text)
    elif sent_text and any(pattern.search(sent_text) for pattern in compiled_analysis_patterns):
        analyses.append(sent_text)
    elif sent_text and any(pattern.search(sent_text) for pattern in compiled_result_patterns):
        results.append(sent_text)

hypotheses, methods, datasets, experiments, analyses, results

(['We hypothesized that the existence of high-\xadaffinity peanut-\xadspecific IgG+ memory B cells and their abil- ity to undergo class switching to IgE are critical for allergy persistence in pediatric peanut allergy.',
  'In line with our model of IgE cell differentiation developed from mouse studies (2, 3, 7, 43), we proposed that allergen-\xadspecific IgG+ memory B cells ex- pressing CD23/FCER2 and germline IGHE are precursors of patho- genic IgE+ plasma cells in highly sensitized peanut-\xadallergic individuals.',
  'We propose that CD23+IGHE+IgG1+ memory B cells are involved in the persistence of food allergy by providing precursors of pathogenic IgE plasma cells.',
  'We hypothesized that IgG1+ memory B cells are the precursors of high-\xadaffinity, pathogenic IgE+ plasma cells in persistent peanut allergy.',
  '10X Genomics scRNA-\xadseq processing and analysis 10X Genomics 5′ scRNA-\xadseq data were processed with Cell Ranger version 3.1 and aligned to refdata-\xadcellranger-\

In [None]:
for ind, sent in enumerate(doc.sents):
    sent_text = re.sub(r'\s+', ' ', sent.text.strip().replace("\n", " "))
    print(f"{ind}: {sent_text}")