In [3]:
import os
import pandas as pd
import re
from pypdf import PdfReader

# Function to extract references from a PDF file using a context-aware pattern
def parse_references(pdf_path):
    """Extract the numbered bibliography of a PDF as an Author/Title table.

    Pages are scanned in order. Nothing is collected until a page containing
    the literal word "References" is seen; on any page that contains it, the
    text before its first occurrence is discarded. From then on, entries
    shaped like ``<number>. <author list>. <title>.`` are matched, and a match
    is kept only when its number is exactly one more than the last kept
    number, which filters out stray "<digits>. " sequences inside citations.

    Parameters
    ----------
    pdf_path
        Passed straight to ``PdfReader``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Author`` and ``Title``, one row per kept reference, with
        newlines replaced by spaces and surrounding whitespace stripped.
        If no reference is found the frame is empty but still has both
        columns (the previous implementation raised ``KeyError`` here).
    """
    # <number>. <text up to first dot>.<text up to next dot>.
    # DOTALL lets an entry wrap across line breaks in the extracted text.
    ref_pattern = re.compile(r'(\d+)\. (.*?\..*?)\.', re.DOTALL)

    def tidy(fragment):
        # Line breaks come from the PDF layout, not from the reference itself
        return fragment.replace('\n', ' ').strip()

    reader = PdfReader(pdf_path)
    refs = []
    ref_section = False  # Flips to True once the "References" heading is seen
    last_ref_num = 0     # Number of the last reference that was kept

    for page in reader.pages:
        # Treat a page with no extractable text as empty instead of failing
        text = page.extract_text() or ''
        # When the "References" bit is found, cut out the text preceding it
        if "References" in text:
            ref_section = True
            text = text.split("References", 1)[1]
        if not ref_section:
            continue
        for match in ref_pattern.finditer(text):
            current_ref_num = int(match.group(1))
            # Keep the entry only if its number continues the sequence
            if current_ref_num == last_ref_num + 1:
                # The pattern guarantees a dot inside group 2, so this split
                # always yields exactly (author, title)
                author, title = match.group(2).split('.', 1)
                refs.append((tidy(author), tidy(title)))
                last_ref_num = current_ref_num

    # Building from (author, title) tuples keeps the column layout even when
    # refs is empty and avoids the deprecated DataFrame.applymap
    return pd.DataFrame(refs, columns=['Author', 'Title'])

# Relative location of the paper whose bibliography is parsed
pdf = os.path.join('..', 'paper', 's40798-019-0202-3.pdf')

# Run the context-aware extraction on that PDF
references = parse_references(pdf)

# Save the table without the index column
references.to_csv(
    path_or_buf=os.path.join('..', 'results', 'paper_refs.csv'),
    index=False,
)

# Bare expression so the notebook renders the table as the cell output
references

Unnamed: 0,Author,Title
0,"Russell S, Norvig P",Artificial Intelligence: a modern approach
1,"Witten IH, Frank E, Hall MA, et al",Data Mining: practical Machine Learning tools ...
2,"Zaki MJ, Meira Jr, W",Data Mining and analysis: fundamental concepts...
3,"Passfield L, Hopker JG",A mine of information: can sports analytics pr...
4,"Rein R, Memmert D",Big data and tactical analysis in elite soccer...
...,...,...
98,"Dalton-Barron NE, McLaren SJ, Black CJ, et al",Identifying contextual influences on training ...
99,"McLaren SJ, Weston M, Smith A, et al",Variability of physical performance and player...
100,"Oliveira WK, Jesus K, Andrade AD, et al",Monitoring training load in beach volleyball p...
101,"Düking P, Achtzehn S, Holmberg HC, Sperlich B",Integrated framework of load monitoring by a c...


In [18]:
from Bio import Entrez

def search_pubmed(query, retmax='5'):
    """Run a relevance-sorted PubMed search and return the parsed response.

    Parameters
    ----------
    query
        Search term sent to ``Entrez.esearch`` as ``term``.
    retmax
        Maximum number of IDs requested; defaults to ``'5'``, the value that
        was previously hard-coded.

    Returns
    -------
    The object produced by ``Entrez.read`` for the XML response.
    """
    # NOTE(review): contact e-mail is hard-coded; consider reading it from
    # configuration or an environment variable before sharing the notebook.
    Entrez.email = 'matteo.mazzarelli@gmail.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=retmax,
                            retmode='xml',
                            term=query)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the handle even when parsing fails
        handle.close()
    return results

# Query PubMed once per parsed reference, echoing the query and the raw response
for index, row in references.iterrows():
    # First author = everything before the first comma of the author list
    author, _, _ = row['Author'].partition(',')
    author = author.strip()

    # Only the first five whitespace-separated words of the title are used
    title_words = row['Title'].split()
    title = ' '.join(title_words[:5])

    # Query string is "<first author> <shortened title>"
    query = f"{author} {title}"
    print(query)

    studies = search_pubmed(query)
    print(studies)

Russell S Artificial Intelligence: a modern approach
Witten IH Data Mining: practical Machine Learning
Zaki MJ Data Mining and analysis: fundamental
Passfield L A mine of information: can
Rein R Big data and tactical analysis
Blei DM Science and data science
Bunker RP A machine learning framework for
Israni ST Hu manizing artificial intelligence
Shortliffe EH Clinical decision support in the
Baker M Data science: industry allure
{'Count': '1', 'RetMax': '1', 'RetStart': '0', 'IdList': ['25859590'], 'TranslationSet': [{'From': 'Baker M', 'To': 'baker m[Author] OR baker m[Investigator]'}, {'From': 'Data science:', 'To': '"data science"[MeSH Terms] OR ("data"[All Fields] AND "science"[All Fields]) OR "data science"[All Fields]'}, {'From': 'industry', 'To': '"industrial development"[MeSH Terms] OR ("industrial"[All Fields] AND "development"[All Fields]) OR "industrial development"[All Fields] OR "industrialization"[All Fields] OR "industrialize"[All Fields] OR "industrialized"[All Fields] 