In [2]:
import gzip
import os
import shutil
import xml.etree.ElementTree as ET
from ftplib import FTP

import pandas as pd

In [3]:
# Remote host / directory to read from, and the local folder downloads go into
ftp_url, ftp_path = 'ftp.ncbi.nlm.nih.gov', '/pubmed/baseline/'
save_path = './pubmed_data/'

# Open the connection; login() is called without credentials, so ftplib's
# default login is used
ftp = FTP(host=ftp_url)
ftp.login()

# Move into the baseline directory and collect the names it contains
ftp.cwd(ftp_path)
files = ftp.nlst()





In [4]:
# Display the names returned by the directory listing (full list is echoed as the cell output)
files

['pubmed24n0002.xml.gz.md5',
 'pubmed24n0002.xml.gz',
 'pubmed24n0004.xml.gz.md5',
 'pubmed24n0004.xml.gz',
 'pubmed24n0003.xml.gz.md5',
 'pubmed24n0003.xml.gz',
 'pubmed24n0001.xml.gz.md5',
 'pubmed24n0001.xml.gz',
 'pubmed24n0006.xml.gz.md5',
 'pubmed24n0006.xml.gz',
 'pubmed24n0008.xml.gz.md5',
 'pubmed24n0008.xml.gz',
 'pubmed24n0005.xml.gz.md5',
 'pubmed24n0005.xml.gz',
 'pubmed24n0009.xml.gz.md5',
 'pubmed24n0009.xml.gz',
 'pubmed24n0007.xml.gz.md5',
 'pubmed24n0007.xml.gz',
 'pubmed24n0010.xml.gz.md5',
 'pubmed24n0010.xml.gz',
 'pubmed24n0013.xml.gz.md5',
 'pubmed24n0013.xml.gz',
 'pubmed24n0014.xml.gz.md5',
 'pubmed24n0014.xml.gz',
 'pubmed24n0011.xml.gz.md5',
 'pubmed24n0011.xml.gz',
 'pubmed24n0012.xml.gz.md5',
 'pubmed24n0012.xml.gz',
 'pubmed24n0015.xml.gz.md5',
 'pubmed24n0015.xml.gz',
 'pubmed24n0016.xml.gz.md5',
 'pubmed24n0016.xml.gz',
 'pubmed24n0018.xml.gz.md5',
 'pubmed24n0018.xml.gz',
 'pubmed24n0017.xml.gz.md5',
 'pubmed24n0017.xml.gz',
 'pubmed24n0021.xml.gz.md5',

In [None]:
def _abstract_text(article):
    """Return the complete abstract text of one ``PubmedArticle`` element, or None.

    Reading ``.text`` of the first ``AbstractText`` only yields the characters
    before that element's first child tag, and ignores any further
    ``AbstractText`` elements. Here every ``AbstractText`` under ``Abstract``
    is read with ``itertext()`` (which also walks nested child elements) and
    the non-empty pieces are joined with a single space.

    If nothing matches ``Abstract/AbstractText``, the first ``AbstractText``
    found anywhere in the article is used, as the previous lookup did.
    Returns None when there is no abstract text at all.
    """
    sections = article.findall('.//Abstract/AbstractText')
    if not sections:
        first = article.find('.//AbstractText')
        sections = [] if first is None else [first]
    parts = (''.join(section.itertext()).strip() for section in sections)
    return ' '.join(part for part in parts if part) or None


# Create a directory to save downloaded files (no error if it already exists)
os.makedirs(save_path, exist_ok=True)

# Download and extract gzip files
for file in files:
    if not file.endswith('.gz'):
        continue  # e.g. the '.xml.gz.md5' entries in the listing
    local_file = os.path.join(save_path, file)
    with open(local_file, 'wb') as f:
        ftp.retrbinary('RETR ' + file, f.write)
    # Decompress in chunks instead of reading the whole archive into memory;
    # local_file[:-3] strips the trailing '.gz' to get the .xml path
    with gzip.open(local_file, 'rb') as f_in, open(local_file[:-3], 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Close FTP connection as soon as the downloads are finished, so it is not
# left idle during parsing.
# NOTE(review): presumably the server drops idle control connections, which
# would make a quit() issued after the long parse fail -- confirm.
ftp.quit()

# Parse XML files and store required data
data = []
# sorted() makes the row order reproducible; os.listdir order is arbitrary
for file in sorted(os.listdir(save_path)):
    if not file.endswith('.xml'):
        continue
    root = ET.parse(os.path.join(save_path, file)).getroot()
    for article in root.findall('.//PubmedArticle'):
        data.append({
            # findtext returns None for a missing element instead of raising;
            # `or None` maps an empty element to None as `.text` did
            'ID': article.findtext('.//PMID') or None,
            # Date stays None when the record has no ArticleDate/Year.
            # NOTE(review): if ArticleDate is optional in this schema, a
            # fallback to the journal publication date may be wanted -- confirm.
            'Date': article.findtext('.//ArticleDate/Year') or None,
            'Abstract': _abstract_text(article),
        })

# Convert data to a pandas DataFrame (explicit columns keep the schema stable
# even when no records were parsed)
df = pd.DataFrame(data, columns=['ID', 'Date', 'Abstract'])

# Save to a Parquet file
df.to_parquet("pubmed_baseline_data.parquet", index=False)