In [None]:
import boto3
import gzip
import json
import csv
from botocore import UNSIGNED
from botocore.client import Config
from io import BytesIO

In [None]:

# -----------------------------
# Configuration
# -----------------------------
# Where the data lives in S3.
bucket_name = "openalex"
prefix = "data/works/"  # folder in S3

# Destination CSV file.
output_csv = "openalex6.csv"

# Stop conditions for the export.
max_bytes = 10 << 30      # 10 GB (10 * 1024**3 bytes)
max_files = 10 ** 5       # limit to first 100k files

# CSV header, in column order.
fields = [
    "ID",
    "title",
    "abstract",
    "abstract_inverted_index",
    "year",
]

# -

In [None]:
# Build the list of S3 object keys to process from the local manifest.
# The URL roots are derived from the configuration (`bucket_name`, `prefix`)
# so that changing the configuration actually changes what gets selected.
bucket_root = f"s3://{bucket_name}/"
works_root = bucket_root + prefix

with open("manifest.json", encoding="utf-8") as f:
    manifest = json.load(f)

files = []
for entry in manifest.get("entries", []):
    # `or ""` also covers an entry whose "url" is present but null.
    url = entry.get("url") or ""
    if url.startswith(works_root) and url.endswith(".gz"):
        # Strip only the leading "s3://<bucket>/" to obtain the object key;
        # slicing (unlike str.replace) cannot touch later occurrences.
        key = url[len(bucket_root):]
        files.append(key)

# Limit to the first `max_files` entries
files = files[:max_files]

print(f"Processing {len(files)} files from manifest...")

In [None]:
# S3 client that sends unsigned requests (signature_version=UNSIGNED),
# so no AWS credentials are attached to the calls.
s3 = boto3.client(
    service_name="s3",
    config=Config(signature_version=UNSIGNED),
    region_name="eu-central-1",
)

In [None]:
def reconstruct_abstract(inv_index):
    """Rebuild plain text from an inverted index of the form {word: [positions]}.

    Args:
        inv_index: Mapping from a word to the list of integer positions at
            which it occurs, or a falsy value (None, {}) when there is none.

    Returns:
        The words joined by single spaces in position order. Positions that
        no word claims stay as empty strings, so a gap shows up as a doubled
        space. Returns "" when the index is falsy or holds no positions at all.
    """
    if not inv_index:
        return ""
    # default=-1 covers an index whose position lists are all empty; a bare
    # max() over the empty generator would raise ValueError.
    max_pos = max(
        (pos for positions in inv_index.values() for pos in positions),
        default=-1,
    )
    words = [""] * (max_pos + 1)
    for word, positions in inv_index.items():
        for pos in positions:
            words[pos] = word
    return " ".join(words)

In [None]:

# -----------------------------
# Flatten work to CSV row
# -----------------------------
def flatten_work(work):
    """Turn one work record (a dict) into a flat dict keyed by the CSV columns.

    Missing keys fall back to "" for the ID, title and year columns. The
    inverted index is emitted twice: once rebuilt into text through
    reconstruct_abstract, and once serialized as JSON ("{}" when absent).
    """
    inverted = work.get("abstract_inverted_index")

    row = {}
    row["ID"] = work.get("id", "")
    row["title"] = work.get("display_name", "")
    row["abstract"] = reconstruct_abstract(inverted)
    row["abstract_inverted_index"] = json.dumps(inverted if inverted else {})
    row["year"] = work.get("publication_year", "")
    return row



In [None]:
# -----------------------------
# Open CSV and process files
# -----------------------------
# Streams every listed S3 object, keeps the English-language works and appends
# them to `output_csv` until either the file list is exhausted or roughly
# `max_bytes` of cell data has been written.
total_bytes = 0        # approximate size: UTF-8 bytes of the cell values only
                       # (delimiters, quoting and newlines are not counted)
english_count = 0      # number of rows written so far
limit_reached = False  # set once total_bytes hits max_bytes

with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()

    for count, key in enumerate(files, 1):
        # Download the whole object into memory, then decompress from the buffer.
        obj_body = s3.get_object(Bucket=bucket_name, Key=key)["Body"].read()

        # Each non-blank line of the decompressed file is parsed as one JSON record.
        with gzip.GzipFile(fileobj=BytesIO(obj_body)) as gz:
            for line in gz:
                if not line.strip():
                    continue  # tolerate blank lines instead of failing in json.loads

                work = json.loads(line)

                # Filter: works in English. OpenAlex publishes the language
                # under "language"; "lang" is kept as a fallback so records
                # that matched the previous key still match.
                # NOTE(review): the messages below say "dissertations", but no
                # work-type filter is applied here - confirm whether one is intended.
                if (work.get("language") or work.get("lang")) != "en":
                    continue

                row = flatten_work(work)
                writer.writerow(row)
                total_bytes += sum(len(str(v).encode("utf-8")) for v in row.values())
                english_count += 1

                if total_bytes >= max_bytes:
                    print(f"Reached {max_bytes / (1024**3):.1f} GB CSV limit. Stopping.")
                    # Leave the loops with break rather than exit(0): exit()
                    # would skip the final summary and, inside Jupyter, shut
                    # the kernel down.
                    limit_reached = True
                    break

        if limit_reached:
            break

        if count % 1000 == 0:
            print(f"Processed {count} files, English dissertations so far: {english_count}, CSV size: {total_bytes / (1024**3):.2f} GB")

print(f"Finished writing dissertations CSV! Total English dissertations: {english_count}")