## Download PDFs to a folder

In [1]:
!pip install pandas requests

Note: you may need to restart the kernel to use updated packages.


In [23]:
import pandas as pd
import requests
import os

# Path to the CSV file that lists the papers
csv_file_path = '/Users/aidakostikova/Desktop/arxiv/top40_papers.csv'

# Folder where the PDFs are saved
pdf_folder = '/Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf'

# Ensure the directory exists
os.makedirs(pdf_folder, exist_ok=True)

# Column name in the CSV file that contains the PDF URLs
url_column = 'pdf_url'  # Replace with your actual column name

# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so without this a stalled connection would hang the cell forever.
request_timeout = 30

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Iterate over the rows and download each PDF
for index, row in df.iterrows():
    pdf_url = row[url_column]

    # Empty cells come back from read_csv as NaN; skip them with a clear message
    # instead of sending a request for the literal string "nan".
    if pd.isna(pdf_url) or not str(pdf_url).strip():
        print(f"Skipping row {index}: no URL in column '{url_column}'")
        continue

    try:
        response = requests.get(pdf_url, timeout=request_timeout)
        response.raise_for_status()  # Check if the download was successful
    except requests.RequestException as e:
        # Best effort: report the failure and carry on with the next paper
        print(f"Error downloading {pdf_url}: {e}")
        continue

    # Save the PDF; files are numbered from 1 following the row index
    pdf_filename = os.path.join(pdf_folder, f'paper_{index + 1}.pdf')
    with open(pdf_filename, 'wb') as f:
        f.write(response.content)

    print(f"Downloaded and saved: {pdf_filename}")


Downloaded and saved: /Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf/paper_1.pdf
Downloaded and saved: /Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf/paper_2.pdf
Downloaded and saved: /Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf/paper_3.pdf
Downloaded and saved: /Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf/paper_4.pdf
Downloaded and saved: /Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf/paper_5.pdf
Downloaded and saved: /Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf/paper_6.pdf
Downloaded and saved: /Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf/paper_7.pdf
Downloaded and saved: /Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf/paper_8.pdf
Downloaded and saved: /Users/aidakostikova/Desktop/arxiv/grobid_client_python/resources/test_pdf/paper_9.pdf
Downloaded and save

## Get affiliations from the papers

In [2]:
!git clone https://github.com/kermitt2/grobid_client_python
%cd grobid_client_python
!python setup.py install


Cloning into 'grobid_client_python'...
remote: Enumerating objects: 339, done.[K
remote: Counting objects: 100% (156/156), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 339 (delta 107), reused 91 (delta 85), pack-reused 183[K
Receiving objects: 100% (339/339), 1.49 MiB | 5.96 MiB/s, done.
Resolving deltas: 100% (198/198), done.
/Users/zhangran/Documents/GitHub/Quaterly-Arxiv/code/grobid_client_python
running install
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        *********************************************************************

In [5]:
from grobid_client.grobid_client import GrobidClient

In [6]:
import os
import shutil
# Switch the working directory to the grobid_client_python checkout; the
# relative paths used in the following cells are resolved from here.
os.chdir('/Users/zhangran/Documents/GitHub/Quaterly-Arxiv/code/grobid_client_python/')

In [7]:
# Move every PDF that already has a GROBID result from the input folder to the
# "done" folder (presumably so a later processing run only sees new files).
tei_suffix = ".grobid.tei.xml"

# shutil.move fails if the destination folder is missing, so create it up front
os.makedirs("../../data/done/", exist_ok=True)

for tei_name in os.listdir("./resources/test_out/"):
    # Ignore anything in the output folder that is not a GROBID result
    if not tei_name.endswith(tei_suffix):
        continue

    # "<name>.grobid.tei.xml" -> "<name>.pdf"
    pdf_name = tei_name[:-len(tei_suffix)] + ".pdf"
    try:
        shutil.move("../../data/pdfs/" + pdf_name, "../../data/done/" + pdf_name)
    except FileNotFoundError:
        # The PDF is not in the input folder (e.g. moved on an earlier run)
        continue
    except OSError as err:
        # Still best effort, but report real problems instead of hiding them
        print(f"Could not move {pdf_name}: {err}")

In [None]:
# Resolve the relative paths below from the grobid_client_python checkout
os.chdir('/Users/zhangran/Documents/GitHub/Quaterly-Arxiv/code/grobid_client_python/')

# Build the client from the configuration file in the checkout
client = GrobidClient(config_path="./config.json")

# Run the "processHeaderDocument" service over the input folder and write the
# results to the output folder
client.process(
    "processHeaderDocument",
    "../../data/pdfs",
    output="./resources/test_out/",
    consolidate_citations=True,
    tei_coordinates=True,
    force=True,
)

GROBID server is up and running


## Parsing GROBID.TEI.XML files

In [3]:
import xml.etree.ElementTree as ET
import os

# Directory containing the XML files.
# This is a relative path, so it is resolved against the kernel's current
# working directory.
xml_directory = './resources/test_out'

# Namespace dictionary to handle namespaces in the XML: maps the 'tei' prefix
# used in the find/findall paths to the TEI namespace URI.
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Function to extract and print affiliations from a single XML file
def process_xml_file(xml_file_path):
    """Print one "Author: <name>, Affiliation: <details>" line per author.

    Every ``author`` element in the TEI document is reported. The details are
    built from the direct children of the author's ``affiliation`` elements:
    each ``orgName`` becomes "<Type>: <text>" and each ``address`` becomes
    "Country: <country>".

    Args:
        xml_file_path: Path to a TEI XML file.

    Returns:
        None. The result is written to standard output.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Kept local so the function does not depend on notebook-global state
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    def child_text(parent, path):
        """Return the stripped text of the first match for path, or ''."""
        node = parent.find(path, tei_ns)
        if node is None or node.text is None:
            return ''
        return node.text.strip()

    # Parse straight from the file so the XML declaration decides the encoding
    root = ET.parse(xml_file_path).getroot()

    for author in root.findall('.//tei:author', tei_ns):
        forename = child_text(author, './/tei:forename')
        surname = child_text(author, './/tei:surname')
        author_name = f"{forename} {surname}".strip()

        affiliation_details = []
        for elem in author.findall('.//tei:affiliation/*', tei_ns):
            if elem.tag.endswith('orgName'):
                # An orgName may have no type attribute; fall back to a generic
                # label instead of calling .capitalize() on None
                label = (elem.get('type') or 'organization').capitalize()
                affiliation_details.append(f"{label}: {(elem.text or '').strip()}")
            elif elem.tag.endswith('address'):
                country = child_text(elem, './/tei:country')
                affiliation_details.append(f"Country: {country}")

        affiliation_info = ', '.join(affiliation_details)

        print(f"Author: {author_name}, Affiliation: {affiliation_info}")

# Collect the GROBID TEI files in the directory, then report on each of them
tei_filenames = [
    entry for entry in os.listdir(xml_directory)
    if entry.endswith('.grobid.tei.xml')
]
for filename in tei_filenames:
    tei_path = os.path.join(xml_directory, filename)
    process_xml_file(tei_path)


Author: Cristiano De Marchis, Affiliation: Department: Division of Functional and Restorative Neurosurgery, Department: Department of Neurosurgery, Institution: Eberhard Karls University, Country: Germany
Author: Thiago Santos Monteiro, Affiliation: 
Author: Cristina Simon-Martinez, Affiliation: Department: Division of Functional and Restorative Neurosurgery, Department: Department of Neurosurgery, Institution: Eberhard Karls University, Country: Germany
Author: Silvia Conforto, Affiliation: Department: Division of Functional and Restorative Neurosurgery, Department: Department of Neurosurgery, Institution: Eberhard Karls University, Country: Germany
Author: Alireza Gharabaghi, Affiliation: Department: Division of Functional and Restorative Neurosurgery, Department: Department of Neurosurgery, Institution: Eberhard Karls University, Country: Germany
