# Metadata Collection

This notebook downloads article metadata from the OpenAlex API for multiple languages. It collects titles, abstracts, PDF URLs, DOIs, and publication dates for scientific articles in the configured languages.

## Setup

### Imports

In [6]:
import yaml
import time
import random
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm

### Configuration and Paths

In [7]:
# Load configuration.
# Read the file as UTF-8 explicitly instead of relying on the platform's
# default encoding, so non-ASCII values in the YAML load the same everywhere.
with open("../config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Set up project paths.
# The project root is taken to be the parent of the current working directory,
# matching the "../config.yaml" lookup above.
project_root = Path.cwd().parent
METADATA_DIR = project_root / config["METADATA_DIR"]

# Progress bar format passed to tqdm: description left-aligned in 25 columns,
# integer percentage, a 20-character bar, then tqdm's standard right-hand stats.
PROGRESS_BAR_FORMAT = "{desc:<25}{percentage:3.0f}%|{bar:20}{r_bar}"

## Helper Functions

In [8]:
def reconstruct_abstract(inverted_index):
    """Rebuild abstract text from an inverted index.

    Args:
        inverted_index: Mapping of word -> list of positions at which that
            word occurs, or a falsy value (``None`` / empty mapping).

    Returns:
        The words joined by single spaces in ascending position order, or
        ``""`` when ``inverted_index`` is falsy.
    """
    if not inverted_index:
        return ""

    # Flatten the mapping into one (position, word) pair per occurrence.
    positioned_words = [
        (position, word)
        for word, positions in inverted_index.items()
        for position in positions
    ]

    # Order by position only; the sort is stable, so words sharing a position
    # keep the order in which the mapping yielded them.
    positioned_words.sort(key=lambda pair: pair[0])

    return " ".join(word for _, word in positioned_words)

In [9]:
def _extract_article_record(result: dict):
    """Flatten one OpenAlex work into a metadata record, or None if it has no PDF URL."""
    # Guard against a null or missing primary_location so that a single
    # record cannot raise and abort the rest of its page.
    primary_location = result.get("primary_location") or {}
    pdf_url = primary_location.get("pdf_url")
    if not pdf_url:
        return None

    return {
        "title": result.get("title"),
        "abstract": reconstruct_abstract(result.get("abstract_inverted_index")),
        "pdf_url": pdf_url,
        "doi": result.get("doi"),
        "publication_date": result.get("publication_date"),
    }


def download_metadata(
    lang_code: str,
    max_articles: int,
    output_dir: Path,
    max_retries: int = 5,
    request_timeout: float = 30.0,
):
    """Download article metadata from OpenAlex and save it as a CSV file.

    Pages through the ``/works`` endpoint with cursor paging, keeping only
    works that expose a PDF URL, until ``max_articles`` records have been
    collected or the API has no more results.

    Args:
        lang_code: Language code used in the ``language:`` filter and in the
            output file name.
        max_articles: Maximum number of records (with a PDF URL) to collect.
        output_dir: Directory for ``{lang_code}_article_data.csv``; created if
            it does not exist.
        max_retries: Number of consecutive failed requests tolerated before
            giving up; whatever was collected so far is still saved.
        request_timeout: Per-request timeout in seconds.

    Returns:
        DataFrame with columns ``title``, ``abstract``, ``pdf_url``, ``doi``
        and ``publication_date`` (these columns are present even when no
        article was collected).
    """
    url = "https://api.openalex.org/works"
    params = {
        "filter": f"language:{lang_code},type:article",
        "select": "abstract_inverted_index,primary_location,title,doi,publication_date",
        # NOTE(review): placeholder contact address; replace with a real one.
        "mailto": "example@email.com",
        "per-page": 100,
        "cursor": "*",
    }
    columns = ["title", "abstract", "pdf_url", "doi", "publication_date"]

    article_data = []
    consecutive_failures = 0

    with requests.Session() as session:
        with tqdm(
            total=max_articles,
            desc=f"Collecting {lang_code} articles",
            bar_format=PROGRESS_BAR_FORMAT,
        ) as pbar:
            while len(article_data) < max_articles:
                response = None
                try:
                    response = session.get(url, params=params, timeout=request_timeout)
                    response.raise_for_status()
                    # Parse the body once and reuse it.
                    payload = response.json()
                    if not isinstance(payload, dict):
                        raise ValueError("unexpected response payload")
                except (requests.RequestException, ValueError) as e:
                    print(f"Error downloading article info: {e}")
                    if response is not None:
                        # Use the raw text: the body may not be valid JSON.
                        print(f"Response: {response.text[:500]}")

                    consecutive_failures += 1
                    if consecutive_failures > max_retries:
                        print(f"Giving up on {lang_code} after {consecutive_failures} consecutive failures")
                        break

                    # Retry the same cursor after a polite pause.
                    time.sleep(random.randint(2, 4))
                    continue

                consecutive_failures = 0
                next_cursor = (payload.get("meta") or {}).get("next_cursor")
                results = payload.get("results") or []

                for result in results:
                    record = _extract_article_record(result)
                    if record is None:
                        continue

                    article_data.append(record)
                    pbar.update(1)
                    if len(article_data) >= max_articles:
                        break

                # Stop when the quota is met or the API signals the end of the
                # result set (no further cursor, or an empty page).
                if len(article_data) >= max_articles or not next_cursor or not results:
                    break

                params["cursor"] = next_cursor
                time.sleep(random.randint(2, 4))

    # Passing columns explicitly keeps the schema stable for an empty result.
    df = pd.DataFrame(article_data, columns=columns)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    metadata_path = output_dir / f"{lang_code}_article_data.csv"
    df.to_csv(metadata_path, index=False, encoding="utf-8")
    return df


## Download Metadata

Download metadata for all configured languages and display summary statistics.

In [10]:
metadata_stats = []

for lang_code, lang_config in config["LANGUAGES"].items():
    df = download_metadata(lang_code, lang_config["max_articles"], METADATA_DIR)

    # Missing abstracts are stored as "" rather than NaN, so notna() would
    # count every row. Count non-blank strings instead, and tolerate a frame
    # that has no "abstract" column because nothing was collected.
    if "abstract" in df.columns:
        with_abstracts = int(df["abstract"].fillna("").astype(str).str.strip().ne("").sum())
    else:
        with_abstracts = 0

    metadata_stats.append(
        {
            "Language": lang_config["name"],
            "Code": lang_code,
            "Articles": len(df),
            "With Abstracts": with_abstracts,
        }
    )


Collecting ta articles    54%|██████████▊         | 460/850 [02:10<01:50,  3.52it/s]
Collecting bn articles   100%|████████████████████| 850/850 [01:25<00:00,  9.91it/s]
Collecting th articles   100%|████████████████████| 850/850 [23:12<00:00,  1.64s/it]  


In [11]:
# Render the per-language statistics in metadata_stats as a table.
summary_table = pd.DataFrame(metadata_stats)
display(summary_table)

Unnamed: 0,Language,Code,Articles,With Abstracts
0,Tamil,ta,460,460
1,Bengali,bn,850,850
2,Thai,th,850,850
