In [None]:
import requests
from concurrent.futures import ThreadPoolExecutor
import subprocess
import time
import re
import pandas as pd
import numpy as np
import PyPDF2
import os
import ast
from tqdm import tqdm
from collections import OrderedDict

In [20]:
def get_pdf_links_with_titles(file_path):
    """Collect direct-PDF links from a comma-separated links file, keyed by title.

    Each useful line is expected to look like ``scopus_id,title,<ignored>,link``.
    The id is taken from the left and the last two fields from the right, so
    commas inside the title no longer abort the whole load. This assumes the
    third field and the link themselves contain no commas.
    Lines with fewer than four fields (including blank lines) are skipped.

    A link is kept when it ends with ``pdf`` (case-insensitive) or contains one
    of the known publisher domains listed below.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping the stripped title to ``{'link': ..., 'scopus_id': ...}``.
        When the same title occurs more than once, the last occurrence wins.
    """
    # Publisher hosts whose links are accepted even without a "pdf" suffix.
    pdf_domains = (
        'ieeexplore.ieee.org', 'mdpi-res.com', 'ascelibrary.org',
        'pubs.rsc.org', 'pubs.acs.org', 'iopscience.iop.org',
    )

    pdf_dict = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Need at least id, title, one ignored field and a link.
            if line.count(',') < 3:
                continue

            scopus_id, remainder = line.split(',', 1)
            # Split from the right so extra commas stay inside the title.
            title, _, link = remainder.rsplit(',', 2)
            title = title.strip()
            link = link.strip()

            link_lower = link.lower()
            if link_lower.endswith('pdf') or any(domain in link_lower for domain in pdf_domains):
                pdf_dict[title] = {'link': link, 'scopus_id': scopus_id}
    return pdf_dict

if __name__ == "__main__":
    # Link dumps to scan, in the order they are merged.
    file_paths = [
        './retrieved_links_5000-10000.txt',
        './retrieved_links_10000-15000.txt'
    ]

    # Fold every per-file result into one title-keyed dict; on a repeated
    # title the entry from the later file replaces the earlier one.
    all_pdfs = {}
    for path in file_paths:
        pdf_dict = get_pdf_links_with_titles(path)
        all_pdfs = {**all_pdfs, **pdf_dict}

    print(f"Total unique PDF entries: {len(all_pdfs)}")

Total unique PDF entries: 2320


In [34]:
# (title, url, scopus_id) tuples for every download that did not end in a
# PDF accepted by PyPDF2. Appended to by download_pdf.
failed_pdfs = []


def download_pdf(title, data, output_dir="./pdfs_from_links_5000_15000"):
    """Download one PDF, check that PyPDF2 can open it, and record failures.

    The file is saved as ``<output_dir>/<cleaned title>-<scopus_id>.pdf``. The
    cleaned title has ``<inf>``/``</inf>`` tags removed and every character
    other than ASCII letters, digits and spaces dropped. The output directory
    is created on demand, so a fresh checkout no longer fails every download
    with ``FileNotFoundError``.

    Args:
        title: Article title; used for logging and for the file name.
        data: Mapping with the keys ``'link'`` (URL to fetch) and
            ``'scopus_id'`` (string appended to the file name).
        output_dir: Directory the PDF is written to. Defaults to the folder
            this notebook has always used.

    Returns:
        None. A non-200 response, a file PyPDF2 rejects, or any other
        exception appends ``(title, url, scopus_id)`` to the module-level
        ``failed_pdfs`` list instead of raising.
    """
    try:
        url = data['link']
        scopus_id = data['scopus_id']
        print(title, url, scopus_id)

        # Clean title for filename
        safe_title = re.sub(r'</?inf>', '', title)
        safe_title = re.sub(r'[^A-Za-z0-9 ]+', '', safe_title)
        safe_title = safe_title.strip() + '-' + scopus_id
        filename = os.path.join(output_dir, f"{safe_title}.pdf")

        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        if response.status_code != 200:
            print(f"Failed (Bad status): {url} | Status code: {response.status_code}")
            failed_pdfs.append((title, url, scopus_id))
            return

        # Only create the directory once there is something to write.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(filename, "wb") as f:
            f.write(response.content)

        # Opening the saved file with PyPDF2 serves as the validity check;
        # anything it rejects is deleted and recorded as a failure.
        try:
            with open(filename, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                _ = reader.pages  # check validity
            print(f"Downloaded and Verified: {filename}")
        except Exception as pdf_error:
            print(f"Corrupted PDF detected: {filename} | {pdf_error}")
            os.remove(filename)
            failed_pdfs.append((title, url, scopus_id))
    except Exception as e:
        # Best-effort: one bad entry must not stop the remaining downloads.
        print(f"Error with {title} | {e}")
        failed_pdfs.append((title, data.get('link', 'N/A'), data.get('scopus_id', 'N/A')))


In [38]:
# Fan the downloads out over six worker threads. keys() and values() iterate
# in matching order, so each call receives one (title, data) pair. Leaving the
# `with` block waits for every submitted download to finish.
with ThreadPoolExecutor(max_workers=6) as executor:
    executor.map(download_pdf, all_pdfs.keys(), all_pdfs.values())

Erratum Hydrogen from Sunlight and Water A SidebySide Comparison between Photoelectrochemical and Solar Thermochemical WaterSplitting ACS Energy Letters 2021 69 30963113 DOI 101021acsenergylett1c00758Recent progress in alkali metal LiNaK hybridion batteries pioneering the future of energy storage https://pubs.rsc.org/en/content/articlepdf/2025/ta/d4ta06190j 85214388729
Highly flexible GOpolyurethane solidsolid phase change composite materials for efficient photothermal conversion and thermal energy storage https://pubs.rsc.org/en/content/articlepdf/2025/ta/d4ta07322c 85212928920
 https://pubs.acs.org/doi/pdf/10.1021/acsenergylett.4c03007 85216999297
Exceeding 15 Performance with Energy Level Tuning in TinBased Perovskite Solar Cells https://pubs.acs.org/doi/pdf/10.1021/acsenergylett.4c03172 85211368159
FirstPrinciples Calculations of the Electrical Conductivity of Carbon Nanotubes Functionalized with Copper and Nitrogen Implications for Electronics Energy Storage and Nanodevices https:

In [None]:
# Persist the failed downloads as a Python dict literal with the same shape
# as the download input: {title: {'scopus_id': ..., 'link': ...}, ...}.
with open("failed_pdfs.txt", "w", encoding="utf-8") as f:
    f.write("{\n")
    for title, url, scopus_id in failed_pdfs:
        # repr() quotes and escapes each value, so titles or URLs containing
        # apostrophes or backslashes still produce a valid literal. Values
        # without such characters are written exactly as before.
        f.write(
            f"{str(title)!r}: {{'scopus_id': {str(scopus_id)!r},\n"
            f"  'link': {str(url)!r}}},\n"
        )
    f.write("}")