In [3]:
import os
import pandas as pd
import re
from pypdf import PdfReader

# Function to extract references from a PDF file using a context-aware pattern
def parse_references(pdf_path):
    reader = PdfReader(pdf_path)
    refs = []
    ref_section = False # Use as a flag to find the references section
    last_ref_num = 0 # To track the last reference number

    for page in reader.pages:
        # Extract text on a page by page basis
        text = page.extract_text()
        # When the "References" bit is found, cut out the text preceding it
        if "References" in text:
            ref_section = True
            text = text.split("References", 1)[1]
        if ref_section:
            # Find all matches of reference pattern
            for match in re.finditer(r'(\d+)\. (.*?\..*?)\.', text, re.DOTALL):
                current_ref_num = int(match.group(1))
                # Append reference if the current reference number is sequential
                if current_ref_num == last_ref_num + 1:
                    refs.append(match.group(2))
                    last_ref_num = current_ref_num
    # Use dot as column separator for Author and Title (as in the PDF)
    references = pd.DataFrame(refs)[0].str.split('.', n=1, expand=True)
    references.columns = ['Author', 'Title']
    # Replace newline characters with space and strip leading whitespace
    return references.applymap(lambda x: x.replace('\n', ' ').strip())

pdf = os.path.join('..','paper','s40798-019-0202-3.pdf')

# Extract references using the context-aware pattern from the uploaded PDF file
references = parse_references(pdf)

references.to_csv(os.path.join('..','results','paper_refs.csv'), index=False)

references

Unnamed: 0,Author,Title
0,"Russell S, Norvig P",Artificial Intelligence: a modern approach
1,"Witten IH, Frank E, Hall MA, et al",Data Mining: practical Machine Learning tools ...
2,"Zaki MJ, Meira Jr, W",Data Mining and analysis: fundamental concepts...
3,"Passfield L, Hopker JG",A mine of information: can sports analytics pr...
4,"Rein R, Memmert D",Big data and tactical analysis in elite soccer...
...,...,...
98,"Dalton-Barron NE, McLaren SJ, Black CJ, et al",Identifying contextual influences on training ...
99,"McLaren SJ, Weston M, Smith A, et al",Variability of physical performance and player...
100,"Oliveira WK, Jesus K, Andrade AD, et al",Monitoring training load in beach volleyball p...
101,"Düking P, Achtzehn S, Holmberg HC, Sperlich B",Integrated framework of load monitoring by a c...


In [52]:
from scholarly import scholarly

# Function to search for a paper and get its abstract
def get_abstract(author, title):
    search_query = scholarly.search_pubs(f'{author} {title}')
    try:
        paper = next(search_query)
        print(paper['bib']['abstract'])
        return paper['bib']['abstract'] if 'abstract' in paper['bib'] else "Abstract not available"
    except StopIteration:
        return "No results found"
    except KeyError:
        return "Abstract not available"
    except Exception as e:
        return f"An error occurred: {e}"

# Retrieve abstracts for the specified papers
abstracts = []
for index, row in references.iterrows():
    # Splitting the author field and taking the first part before the comma
    author = row['Author'].split(',')[0].strip()

    # Splitting the title field and taking the first five words
    title = ' '.join(row['Title'].split()[:3])
    abstracts.append(get_abstract(author, title))
print(abstracts)

(or artificially defined environments such as the simulated chessboard) and into the real world,  there is  In Human Compatible (Russell 2019), I suggest three principles underlying a new
With its comprehensive coverage, algorithmic perspective, and wealth of examples, this  book offers solid guidance in data mining for students, researchers, and practitioners alike.
of the science of sport alongside extensive skills for data handling and analysis. Next we  provide 2 examples of the kind of  that illustrate different ways of mining and modeling data to
amount of available data is becoming  big data technologies from industrial data analytics  domains address these problems. Further, the present work provide an overview how big data
ask why scientists should care about data science. To answer, we discuss data science from  three  Although each of the three is a critical component of data science, we argue that the
Machine learning (ML) is one of the intelligent methodologies that have s

MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [49]:
from scholarly import scholarly

def search_abstract(author, title):
    search_query = scholarly.search_pubs(f"{title} {author}")
    try:
        paper = next(search_query)
        return paper['bib']['abstract']
    except StopIteration:
        return "No results found"
    except KeyError:
        return "Abstract not available"
    except Exception as e:
        return f"An error occurred: {e}"

# Example usage
abstract = search_abstract("Russell S", "Artificial Intelligence")
print(abstract)


Stuart Russell Abstract A long tradition in philosophy and economics equates intelligence   that can be expected to achieve one’s objectives. This framework is so pervasive within AI that


In [41]:
from Bio import Entrez

def search_pubmed(query, email):
    Entrez.email = email
    handle = Entrez.esearch(db='pubmed',sort='relevance',
    retmax='5',
    retmode='xml',
    term=query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list, email):
    ids = ','.join(id_list)
    Entrez.email = email
    handle = Entrez.efetch(db='pubmed',
    retmode='xml',
    id=ids)
    results = Entrez.read(handle)
    return results

mazza = 'matteo.mazzarelli@gmail.com'

studyids = []
for index, row in references.iterrows():
    # Splitting the author field and taking the first part before the comma
    author = row['Author'].split(',')[0].strip()

    # Splitting the title field and taking the first five words
    title = ' '.join(row['Title'].split()[:8])

    # Forming the query and searching
    studies = search_pubmed(f"{author} {title}", mazza)
    studyids.append(studies['IdList'][:1])
print(studyids)

[['26185241'], [], [], ['27967295'], ['27610328'], ['28784795'], [], [], ['30398550'], ['25859590'], [], ['27252168'], [], ['19691366'], ['29283933'], ['24149722'], ['33782057'], ['26423706'], ['21833989'], [], [], [], [], ['24993662'], [], [], [], [], [], [], [], [], [], [], [], ['28692649'], [], [], ['26176890'], ['28125339'], [], ['29910456'], [], ['25816795'], [], ['29910361'], ['23409787'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['29276071'], [], ['27194668'], [], ['29629182'], ['29266094'], ['30044858'], ['27918659'], ['29283691'], ['25721800'], ['29321637'], ['27159303'], ['7563290'], ['9555629'], [], ['23486837'], [], [], ['27166289'], ['25977310'], ['27486488'], ['27101130'], ['27482527'], ['27095747'], ['30253926'], ['24149123'], ['25435773'], ['28530474'], ['25623172'], ['28601588'], ['26519521'], ['24585673'], ['29226164'], ['24918302'], ['29979279'], ['26118848'], [], ['29783763'], ['26333453']]


In [42]:
title_list = []
abstract_list = []
journal_list = []
language_list = []
pubdate_year_list = []
pubdate_month_list = []

papers = fetch_details(studyids, mazza)
    
for i, paper in enumerate(papers['PubmedArticle']):
    title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])

    try:
        abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
    except:
        abstract_list.append('No Abstract')

    journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
    language_list.append(paper['MedlineCitation']['Article']['Language'][0])

    try:
        pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
    except:
        pubdate_year_list.append('No Data')

    try:
        pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
    except:
        pubdate_month_list.append('No Data')

df = pd.DataFrame(list(zip(
    title_list, abstract_list, journal_list, language_list, pubdate_year_list, pubdate_month_list
)),
columns=[
    'Title', 'Abstract', 'Journal', 'Language', 'Year', 'Month'
])


TypeError: sequence item 0: expected str instance, list found