#### Fetching papers from IEEE Xplore
- Note: the browser driven by the script needs institutional access to Xplore (i.e. it must be logged in)
- I run the script on a local network that lets me retrieve documents from Xplore with my work account

In [None]:
import os
import re
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from PyPDF2 import PdfReader
from fake_useragent import UserAgent

# Base links
# Search results page; query parameters are appended by fetch_ieee_papers.
IEEE_XPLORE_SEARCH_URL = "https://ieeexplore.ieee.org/search/searchresult.jsp"
# PDF endpoint; `{document_number}` is filled in with a paper's document ID.
PDF_DOWNLOAD_URL_TEMPLATE = "https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber={document_number}&ref="


# PDFs are stored in a "downloads" folder under the current working directory.
# The folder is created as soon as this cell/module runs.
DOWNLOAD_DIRECTORY = os.path.join(os.getcwd(), "downloads")
os.makedirs(DOWNLOAD_DIRECTORY, exist_ok=True)

# Selenium driver configuration
def get_chrome_driver():
    """Create a Chrome WebDriver configured for unattended PDF downloads.

    The driver is started with the ``--headless``, ``--disable-gpu`` and
    ``--no-sandbox`` flags plus a random user-agent string, and with Chrome
    preferences that point downloads at ``DOWNLOAD_DIRECTORY`` without a
    save prompt.

    Returns:
        A new ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    chrome_options = Options()
    chrome_options.add_experimental_option(
        "prefs",
        {
            "plugins.always_open_pdf_externally": True,
            "download.default_directory": DOWNLOAD_DIRECTORY,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
        },
    )

    command_line_flags = (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        f"user-agent={UserAgent().random}",
    )
    for flag in command_line_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)

# Fetch IEEE papers using Selenium
def fetch_ieee_papers(query, year_filter, max_pages=10):
    """Scrape IEEE Xplore search results and keep papers from one year.

    Args:
        query: Free-text search string. It is URL-encoded before being put
            into the search URL, so spaces and characters such as ``&`` or
            ``#`` are safe.
        year_filter: Year (int) a paper must match to be kept. The year is
            the first 19xx/20xx number found in the result's
            ``.publisher-info-container`` text.
        max_pages: Number of result pages (25 rows each) to request,
            starting at page 1. Values below 1 return an empty list without
            starting a browser.

    Returns:
        A list of dicts with the keys ``Title``, ``URL``, ``Document ID``,
        ``Authors``, ``Conference/Journal``, ``Year``, ``Code Link`` and
        ``Code Context`` (the last two are ``None`` placeholders).
    """
    # Local import so the block does not depend on a new file-level import.
    from urllib.parse import quote_plus

    if max_pages < 1:
        return []

    encoded_query = quote_plus(query)
    driver = get_chrome_driver()
    papers = []

    try:
        # One waiter is enough: it is bound to the driver, not to a page.
        wait = WebDriverWait(driver, 20)

        for page in range(1, max_pages + 1):
            search_url = (
                f"{IEEE_XPLORE_SEARCH_URL}?queryText={encoded_query}"
                f"&highlight=true&returnFacets=ALL&returnType=SEARCH"
                f"&pageNumber={page}&rowsPerPage=25"
            )
            driver.get(search_url)

            try:
                wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "List-results-items")))
            except TimeoutException:
                # A page that fails to load is skipped; later pages are still tried.
                print("Timeout while loading the results.")
                continue

            results = driver.find_elements(By.CLASS_NAME, "List-results-items")
            if not results:
                break

            for result in results:
                try:
                    title_elem = result.find_element(By.CSS_SELECTOR, "h3 a")
                    title = title_elem.text
                    url = title_elem.get_attribute("href")
                    # `href` may be missing; search an empty string instead of None.
                    doc_number_match = re.search(r"document/(\d+)", url or "")
                    document_number = doc_number_match.group(1) if doc_number_match else None

                    author_elems = result.find_elements(By.CSS_SELECTOR, "p.author span a span")
                    authors = "; ".join([author.text for author in author_elems]) if author_elems else "No Authors Available"

                    # find_element raises when nothing matches, so the fallback
                    # value could never be used; find_elements returns [] instead.
                    conf_journal_elems = result.find_elements(By.CSS_SELECTOR, "div > a")
                    conf_journal = conf_journal_elems[0].text if conf_journal_elems else "Unknown Conference/Journal"

                    info_elem = result.find_element(By.CSS_SELECTOR, ".publisher-info-container")
                    match = re.search(r"\b(19|20)\d{2}\b", info_elem.text)
                    year = int(match.group()) if match else None

                    if year == year_filter and document_number:
                        papers.append({
                            "Title": title,
                            "URL": url,
                            "Document ID": document_number,
                            "Authors": authors,
                            "Conference/Journal": conf_journal,
                            "Year": year,
                            "Code Link": None,  # Placeholder for code links
                            "Code Context": None  # Placeholder for code context
                        })
                except Exception as e:
                    # Best effort: one malformed result must not abort the page.
                    print(f"Error parsing result: {e}")
    finally:
        driver.quit()

    return papers

# Download PDFs and rename them with their document ID
def _list_downloaded_pdfs():
    """Return the set of ``.pdf`` file names currently in DOWNLOAD_DIRECTORY."""
    return {f for f in os.listdir(DOWNLOAD_DIRECTORY) if f.lower().endswith(".pdf")}


def download_pdfs(papers, wait_seconds=10):
    """Download each paper's PDF and store it as ``<Document ID>.pdf``.

    Args:
        papers: Either a pandas DataFrame or an iterable of dict-like
            records; each record must provide a ``"Document ID"`` entry.
        wait_seconds: Fixed time to sleep after requesting a PDF so the
            browser can finish the download.

    Papers whose ``<Document ID>.pdf`` already exists are skipped. After each
    request only a PDF that was not in the directory beforehand is renamed,
    so a failed download can no longer relabel a previously saved file.
    Errors are printed rather than raised.
    """
    # Accept both a DataFrame (rows via iterrows) and a plain list of dicts.
    if hasattr(papers, "iterrows"):
        records = [row for _, row in papers.iterrows()]
    else:
        records = list(papers)

    if not records:
        # Nothing to do; do not start a browser.
        return

    # Document IDs that already have a PDF on disk.
    downloaded_ids = {os.path.splitext(f)[0] for f in _list_downloaded_pdfs()}

    driver = get_chrome_driver()
    try:
        for paper in records:
            # Normalise to str: file stems are strings, IDs read from CSV are ints.
            document_number = str(paper["Document ID"])

            # Skip if the file has already been downloaded
            if document_number in downloaded_ids:
                print(f"PDF for Document ID {document_number} already downloaded. Skipping.")
                continue

            files_before = _list_downloaded_pdfs()

            pdf_url = PDF_DOWNLOAD_URL_TEMPLATE.format(document_number=document_number)
            print(f"Downloading PDF for Document ID {document_number} from {pdf_url}")
            driver.get(pdf_url)
            time.sleep(wait_seconds)  # Wait for the download to complete

            # Rename only a file that appeared because of this request.
            try:
                new_files = _list_downloaded_pdfs() - files_before
                if not new_files:
                    print(f"No new PDF found for Document ID {document_number}; nothing renamed.")
                    continue

                newest = max(
                    new_files,
                    key=lambda name: os.path.getctime(os.path.join(DOWNLOAD_DIRECTORY, name)),
                )
                original_path = os.path.join(DOWNLOAD_DIRECTORY, newest)
                new_path = os.path.join(DOWNLOAD_DIRECTORY, f"{document_number}.pdf")
                if original_path != new_path:
                    os.rename(original_path, new_path)
                downloaded_ids.add(document_number)
                print(f"Renamed downloaded file to {new_path}")
            except OSError as rename_error:
                print(f"Error renaming file for Document ID {document_number}: {rename_error}")
    except Exception as e:
        # Top-level boundary of the download loop: report and stop.
        print(f"Error downloading PDFs: {e}")
    finally:
        driver.quit()


# Extract code links and context from PDFs and integrate into papers
def extract_links_and_update_papers(papers):
    """Scan downloaded PDFs for code-related URLs and record them on `papers`.

    Args:
        papers: Either a pandas DataFrame or a list of dicts; each record
            must provide a ``"Document ID"`` entry. The object is updated in
            place and also returned.

    For every record whose ``<Document ID>.pdf`` exists in
    ``DOWNLOAD_DIRECTORY``, each page's text is searched for ``http(s)://``
    URLs containing one of the code-related keywords. The last matching URL
    is stored under ``"Code Link"`` and the surrounding text (30 characters
    either side) under ``"Code Context"``. PDFs that cannot be read are
    reported and skipped.

    Returns:
        The same `papers` object that was passed in.
    """
    keywords = ['github.com', 'code', 'repository', 'software', 'implementation', 'gitlab', 'bitbucket']
    is_frame = hasattr(papers, "iterrows")

    # (key, document id) pairs; key is the index label for a DataFrame and
    # the list position otherwise.
    if is_frame:
        entries = [(idx, row["Document ID"]) for idx, row in papers.iterrows()]
    else:
        entries = [(pos, paper["Document ID"]) for pos, paper in enumerate(papers)]

    findings = {}
    for key, document_number in entries:
        pdf_filename = os.path.join(DOWNLOAD_DIRECTORY, f"{document_number}.pdf")
        if not os.path.exists(pdf_filename):
            continue

        try:
            reader = PdfReader(pdf_filename)
            for page in reader.pages:
                text = page.extract_text()
                if not text:
                    continue
                for link in re.findall(r'(https?://[^\s]+)', text):
                    if any(keyword in link.lower() for keyword in keywords):
                        index = text.find(link)
                        context = text[max(0, index-30):min(len(text), index+len(link)+30)]
                        # Later matches overwrite earlier ones (last match wins).
                        findings[key] = (link, context)
        except Exception as e:
            # Best effort: keep whatever was found before the failure.
            print(f"Error processing {pdf_filename}: {e}")

    if is_frame:
        if findings:
            # Rows yielded by iterrows() are copies, so results must be
            # written through the frame itself. Make sure the target columns
            # can hold strings (they are float NaN when read back from CSV).
            for column in ("Code Link", "Code Context"):
                if column in papers.columns:
                    papers[column] = papers[column].astype(object)
                else:
                    papers[column] = None
            for key, (link, context) in findings.items():
                papers.at[key, "Code Link"] = link
                papers.at[key, "Code Context"] = context
    else:
        for key, (link, context) in findings.items():
            papers[key]["Code Link"] = link
            papers[key]["Code Context"] = context

    return papers

# General workflow
def fetch_and_download(year=2024, nb_pages=10, query="transportation"):
    """Run the whole pipeline: search, download PDFs, extract links, save CSV.

    Args:
        year: Year passed to ``fetch_ieee_papers`` as its year filter.
        nb_pages: Maximum number of search-result pages to scan.
        query: Search string; defaults to ``"transportation"``, which keeps
            the original output file name.

    Returns:
        A pandas DataFrame with one row per fetched paper. The same data is
        written to ``IEEE_<year>_<query>_papers_with_code_links.csv`` in the
        current working directory.
    """
    print("Fetching papers metadata...")
    papers = fetch_ieee_papers(query, year, nb_pages)
    print(f"Fetched {len(papers)} papers.")

    # The download and extraction steps iterate with DataFrame.iterrows(),
    # so convert the list of dicts before handing it over.
    papers_df = pd.DataFrame(papers)

    print("Downloading PDFs...")
    download_pdfs(papers_df)

    print("Extracting code links from PDFs and updating papers...")
    papers_df = pd.DataFrame(extract_links_and_update_papers(papers_df))

    print("Saving results...")
    # Keep the file name safe for arbitrary queries (spaces, slashes, ...).
    query_slug = re.sub(r"\W+", "_", query).strip("_")
    papers_df.to_csv(f"IEEE_{year}_{query_slug}_papers_with_code_links.csv", index=False)

    print("Processing complete. Data saved to CSV file.")

    return papers_df


# Run the full pipeline with year filter 2022, scanning up to 500 result pages.
papers = fetch_and_download(year=2022, nb_pages=500)


Fetching papers metadata...
Timeout while loading the results.
Timeout while loading the results.
Fetched 886 papers.
Downloading PDFs...
Downloading PDF for Document ID 9974014 from https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=9974014&ref=
Downloading PDF for Document ID 10047675 from https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=10047675&ref=
Downloading PDF for Document ID 10101452 from https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=10101452&ref=
Downloading PDF for Document ID 9921842 from https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=9921842&ref=
Downloading PDF for Document ID 9970786 from https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=9970786&ref=
Downloading PDF for Document ID 9989297 from https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=9989297&ref=
Downloading PDF for Document ID 9873803 from https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=9873803&ref=
Downloading PDF for Docum

In [None]:
papers

Unnamed: 0,Title,URL,Document ID,Authors,Conference/Journal,Year,Code Link,Code Context
0,Transportation Economic Management Strategy of...,https://ieeexplore.ieee.org/document/9974014/,9974014,Tong Wu;,2022 International Conference on Data Analytic...,2022,,
1,Smart City and Intelligent Upgrading of Urban ...,https://ieeexplore.ieee.org/document/10047675/,10047675,Jun Qiao;,2022 Second International Conference on Advanc...,2022,,
2,Resource Allocation Optimization for Airport L...,https://ieeexplore.ieee.org/document/10101452/,10101452,Xinghao Lou; Ailing Huang; Yufei Yuan; Mingjie...,2022 IEEE 7th International Conference on Inte...,2022,,
3,Construction of autonomous transportation syst...,https://ieeexplore.ieee.org/document/9921842/,9921842,Zi-sheng Zhou; Ming Cai; Chen Xiong; Zhuo-lin ...,2022 IEEE 25th International Conference on Int...,2022,,
4,Empty Container Allocation and Transshipment i...,https://ieeexplore.ieee.org/document/9970786/,9970786,Mingzhu Yu; Zhishan Yu; Bo Jin; Junfeng Wu; ; ...,2022 IEEE International Symposium on Product C...,2022,,
...,...,...,...,...,...,...,...,...
881,A hierarchical graph-based accessibility measu...,https://ieeexplore.ieee.org/document/9861179/,9861179,Maryam Maslek Elayam; Cyril Ray; Christophe Cl...,2022 23rd IEEE International Conference on Mob...,2022,,
882,Federated Learning Framework Coping with Hiera...,https://ieeexplore.ieee.org/document/9922064/,9922064,Rui Song; Liguo Zhou; Venkatnarayanan Lakshmin...,2022 IEEE 25th International Conference on Int...,2022,,
883,A Systematic Survey of Driving Fatigue Monitoring,https://ieeexplore.ieee.org/document/9837786/,9837786,Zhimin Zhang; Huansheng Ning; Fang Zhou; ; ;,IEEE Transactions on Intelligent Transportatio...,2022,,
884,Exact and Heuristics Algorithms for Screen Lin...,https://ieeexplore.ieee.org/document/9843893/,9843893,Mahmoud Owais; Ahmed I. Shahin; ;,IEEE Transactions on Intelligent Transportatio...,2022,,
