# Objective I: Automated Parallel Corpus Generation

This notebook will create a parallel corpus for scientific translation by:

1. Collecting article metadata from the [OpenAlex](https://openalex.org/) Database
2. Automatically download the raw PDFs
3. Convert PDFs to markdown (preserving layout)
4. Filter by language


### Configuration


In [1]:
LANGUAGES = {
    "ta": {"name": "Tamil", "max_articles": 250},
    "he": {"name": "Hebrew", "max_articles": 250},
    "lv": {"name": "Latvian", "max_articles": 250},
}

# Pipeline control flags
DOWNLOAD_METADATA = True
DOWNLOAD_PDFS = True
GENERATE_MARKDOWN = True

### Imports and Setup


In [2]:
import os
import time
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

import pymupdf4llm
from lingua import LanguageDetectorBuilder

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from requests.adapters import HTTPAdapter, Retry

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


In [3]:
DATA_DIR = Path.cwd() / "data"
METADATA_DIR = DATA_DIR / "metadata"
PDFS_DIR = DATA_DIR / "pdfs"
EXTRACTED_DIR = DATA_DIR / "extracted"
RESULTS_DIR = Path.cwd().parent / "results"

for dir_path in [DATA_DIR, METADATA_DIR, PDFS_DIR, EXTRACTED_DIR, RESULTS_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)

print("Directories created")
print(f"Data directory: {DATA_DIR}")

Directories created
Data directory: c:\Users\lucas\OneDrive\Desktop\projects\2025-11-17_Averroes-AI\data


### Helper Functions


In [4]:
def reconstruct_abstract(inverted_index):
    if not inverted_index:
        return ""

    word_indeces = []
    for word, indeces in inverted_index.items():
        word_indeces.extend([(idx, word) for idx in indeces])

    sorted_indeces = sorted(word_indeces, key=lambda x: x[0])
    return " ".join([index[1] for index in sorted_indeces])


def download_metadata(lang_code: str, max_articles: int, output_dir: Path):
    """Download article metadata from OpenAlex"""
    url = "https://api.openalex.org/works"
    params = {
        "filter": f"language:{lang_code},type:article",
        "select": "abstract_inverted_index,primary_location,title,doi,publication_date",
        "mailto": "example@email.com",
        "page": 1,
    }

    session = requests.Session()
    article_data = []
    total_articles = 0

    with tqdm(total=max_articles, desc=f"Collecting {lang_code} articles") as pbar:
        while total_articles < max_articles:
            response = session.get(url, params=params)
            results = response.json()["results"]

            for result in results:
                primary_location = result["primary_location"]
                pdf_url = primary_location.get("pdf_url", "")

                if not pdf_url:
                    continue

                abstract = reconstruct_abstract(result["abstract_inverted_index"])

                article_data.append(
                    {
                        "title": result["title"],
                        "abstract": abstract,
                        "pdf_url": pdf_url,
                        "doi": result["doi"],
                        "publication_date": result["publication_date"],
                    }
                )

                pbar.update(1)
                total_articles += 1
                if total_articles >= max_articles:
                    break

            params["page"] += 1
            time.sleep(4)
    df = pd.DataFrame(article_data)
    metadata_path = output_dir / f"{lang_code}_article_data.csv"
    df.to_csv(metadata_path, index=False, encoding="utf-8")
    return df


def setup_pdf_driver(download_dir: Path):
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    )
    options.add_experimental_option(
        "prefs",
        {
            "download.default_directory": str(download_dir.absolute()),
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True,
            "plugins.plugins_disabled": ["Chrome PDF Viewer"],
        },
    )
    return webdriver.Chrome(options=options)


def download_pdf(driver, pdf_url: str):
    driver.get(pdf_url)
    WebDriverWait(driver, 10).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    time.sleep(5)


def detect_language(text: str, detector):
    result = detector.detect_language_of(text)
    return result.iso_code_639_1.name.lower() if result else None

### Download Metadata


In [5]:
if DOWNLOAD_METADATA:
    display(Markdown("#### Step 1: Downloading Metadata from OpenAlex"))
    metadata_stats = []

    for lang_code, config in LANGUAGES.items():
        display(Markdown(f"##### {config['name']} ({lang_code})"))

        df = download_metadata(lang_code, config["max_articles"], METADATA_DIR)

        metadata_stats.append(
            {
                "Language": config["name"],
                "Code": lang_code,
                "Articles": len(df),
                "With Abstracts": df["abstract"].notna().sum(),
            }
        )

        display(df.head(2))
    # Summary
    summary_df = pd.DataFrame(metadata_stats)
    display(Markdown("##### Summary"))
    display(summary_df)

#### Step 1: Downloading Metadata from OpenAlex

##### Tamil (ta)

Collecting ta articles:   0%|          | 0/250 [00:00<?, ?it/s]

Unnamed: 0,title,abstract,pdf_url,doi,publication_date
0,CLIC 2008 Parameters,This note presents the CLIC parameter set as o...,http://cds.cern.ch/record/1132079,,2008-10-01
1,Extra-axial Ependymoma —Case Report—,A 13-year-old boy presented with a very unusua...,https://www.jstage.jst.go.jp/article/nmc1959/3...,https://doi.org/10.2176/nmc.34.295,1994-01-01


##### Hebrew (he)

Collecting he articles:   0%|          | 0/250 [00:00<?, ?it/s]

Unnamed: 0,title,abstract,pdf_url,doi,publication_date
0,"<i>ATHENA</i>,<i>ARTEMIS</i>,<i>HEPHAESTUS</i>...",A software package for the analysis of X-ray a...,http://journals.iucr.org/s/issues/2005/04/00/p...,https://doi.org/10.1107/s0909049505012719,2005-06-15
1,THE<i>NUCLEAR SPECTROSCOPIC TELESCOPE ARRAY</i...,The Nuclear Spectroscopic Telescope Array (NuS...,https://iopscience.iop.org/article/10.1088/000...,https://doi.org/10.1088/0004-637x/770/2/103,2013-05-30


##### Latvian (lv)

Collecting lv articles:   0%|          | 0/250 [00:00<?, ?it/s]

Unnamed: 0,title,abstract,pdf_url,doi,publication_date
0,Elliptic curve cryptosystems,We discuss analogs based on elliptic curves ov...,https://www.ams.org/mcom/1987-48-177/S0025-571...,https://doi.org/10.1090/s0025-5718-1987-0866109-5,1987-01-01
1,Theory of the Role of Covalence in the Perovsk...,The theory of semicovalent exchange is reviewe...,http://link.aps.org/pdf/10.1103/PhysRev.100.564,https://doi.org/10.1103/physrev.100.564,1955-10-15


##### Summary

Unnamed: 0,Language,Code,Articles,With Abstracts
0,Tamil,ta,250,250
1,Hebrew,he,250,250
2,Latvian,lv,250,250


### Download PDFs


In [6]:
if DOWNLOAD_PDFS:
    display(Markdown("#### Step 2: Downloading Article PDFs"))

    download_stats = []

    for lang_code, config in LANGUAGES.items():
        display(Markdown(f"##### {config['name']} ({lang_code})"))

        lang_pdf_dir = PDFS_DIR / lang_code
        lang_pdf_dir.mkdir(parents=True, exist_ok=True)

        # Load metadata
        metadata_path = METADATA_DIR / f"{lang_code}_article_data.csv"
        if not metadata_path.exists():
            print("No metadata found, skipping")
            continue

        articles_df = pd.read_csv(metadata_path)
        driver = setup_pdf_driver(lang_pdf_dir)

        # Download
        success_count = 0
        for idx, article in tqdm(
            articles_df.iterrows(), total=len(articles_df), desc="Downloading"
        ):
            try:
                download_pdf(driver, article["pdf_url"])
                success_count += 1
            except Exception as e:
                pass

        driver.quit()

        download_stats.append(
            {
                "Language": config["name"],
                "Attempted": len(articles_df),
                "Downloaded": success_count,
            }
        )

        print(f"Downloaded {success_count}/{len(articles_df)} PDFs")

    # Summary
    display(Markdown("##### Download Summary"))
    display(pd.DataFrame(download_stats))

#### Step 2: Downloading Article PDFs

##### Tamil (ta)

Downloading:   0%|          | 0/250 [00:00<?, ?it/s]

Downloaded 250/250 PDFs


##### Hebrew (he)

Downloading:   0%|          | 0/250 [00:00<?, ?it/s]

Downloaded 250/250 PDFs


##### Latvian (lv)

Downloading:   0%|          | 0/250 [00:00<?, ?it/s]

Downloaded 250/250 PDFs


##### Download Summary

Unnamed: 0,Language,Attempted,Downloaded
0,Tamil,250,250
1,Hebrew,250,250
2,Latvian,250,250


### Convert to Markdown


In [9]:
if GENERATE_MARKDOWN:
    display(Markdown("#### Step 3: Converting PDFs to Markdown"))

    detector = LanguageDetectorBuilder.from_all_languages().build()
    processing_stats = []

    for lang_code, config in LANGUAGES.items():
        display(Markdown(f"##### {config['name']} ({lang_code})"))

        lang_pdf_dir = PDFS_DIR / lang_code
        lang_extracted_dir = EXTRACTED_DIR / lang_code
        lang_extracted_dir.mkdir(parents=True, exist_ok=True)

        if not lang_pdf_dir.exists():
            print(f"No PDFs found, skipping")
            continue

        pdf_files = list(lang_pdf_dir.glob("*.pdf"))
        kept_count = 0
        wrong_lang_count = 0
        error_count = 0

        for pdf_path in tqdm(
            pdf_files, total=len(pdf_files), desc=f"Processing {lang_code}"
        ):
            try:
                md_text = pymupdf4llm.to_markdown(str(pdf_path))

                detected_code = detect_language(md_text, detector)
                if detected_code == lang_code:
                    new_pdf_path = lang_pdf_dir / f"{kept_count}.pdf"
                    pdf_path.rename(new_pdf_path)

                    md_path = lang_extracted_dir / f"{kept_count}.md"
                    md_path.write_text(md_text, encoding="utf-8")
                    kept_count += 1
                else:
                    pdf_path.unlink()
                    wrong_lang_count += 1

            except Exception as e:
                pdf_path.unlink(missing_ok=True)
                error_count += 1

        processing_stats.append(
            {
                "Language": config["name"],
                "Total PDFs": len(pdf_files),
                "Kept": kept_count,
                "Wrong Language": wrong_lang_count,
                "Errors": error_count,
                "Success Rate": (
                    f"{kept_count/len(pdf_files)*100:.1f}%" if pdf_files else "0%"
                ),
            }
        )

        print(f"Kept {kept_count}/{len(pdf_files)} documents")

    # Summary
    display(Markdown("### Processing Summary"))
    stats_df = pd.DataFrame(processing_stats)
    display(stats_df)

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 6))
    stats_df.plot(
        x="Language", y=["Kept", "Wrong Language", "Errors"], kind="bar", ax=ax
    )
    ax.set_title("PDF Processing Results by Language")
    ax.set_ylabel("Number of Documents")
    plt.tight_layout()
    plt.savefig(RESULTS_DIR / "processing_results.png", dpi=150, bbox_inches="tight")
    plt.show()

#### Step 3: Converting PDFs to Markdown

##### Tamil (ta)

Processing ta:   0%|          | 0/28 [00:00<?, ?it/s]

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'c:\\Users\\lucas\\OneDrive\\Desktop\\projects\\2025-11-17_Averroes-AI\\data\\pdfs\\ta\\E664fbzz7W84SWTKM.pdf'