Load data/my_rated_books_clean.csv

For each book (title, author):

Search Open Library

Grab a work key and then a description

Collect extra fields (year, language, subjects when available)

Save to data/my_rated_books_enriched.csv

In [None]:
import requests
import pandas as pd
import numpy as np
import time
from pathlib import Path


PROJECT_ROOT = Path(".").resolve()
DATA_DIR = PROJECT_ROOT / "data"

CLEAN_CSV_PATH = DATA_DIR / "my_rated_books_clean.csv"
ENRICHED_CSV_PATH = DATA_DIR / "my_rated_books_enriched.csv"

CLEAN_CSV_PATH, ENRICHED_CSV_PATH


(WindowsPath('C:/Users/brethm01/book-nlp/data/my_rated_books_clean.csv'),
 WindowsPath('C:/Users/brethm01/book-nlp/data/my_rated_books_enriched.csv'))

In [10]:
df = pd.read_csv(CLEAN_CSV_PATH)
print("Columns:", df.columns.tolist())
print("Number of rated books:", len(df))
df.head()


Columns: ['book_id', 'title', 'author', 'isbn', 'isbn13', 'my_rating', 'date_read', 'date_added', 'my_review']
Number of rated books: 88


Unnamed: 0,book_id,title,author,isbn,isbn13,my_rating,date_read,date_added,my_review
0,865,The Alchemist,Paulo Coelho,0061122416,9780061000000.0,2,,02/07/2019,
1,890,Of Mice and Men,John Steinbeck,0142000671,9780142000000.0,4,,02/07/2019,
2,2657,To Kill a Mockingbird,Harper Lee,0060935464,9780061000000.0,4,,02/07/2019,
3,3869,A Brief History of Time,Stephen Hawking,0553380168,9780553000000.0,4,,02/07/2019,
4,4069,Man's Search for Meaning,Viktor E. Frankl,080701429X,9780807000000.0,5,07/02/2021,26/12/2020,


Open Library search helper

In [11]:
BASE_SEARCH_URL = "https://openlibrary.org/search.json"
BASE_WORK_URL = "https://openlibrary.org"

def search_open_library(title: str, author: str, max_retries: int = 3):
    """
    Search Open Library by title + author.
    Returns the top match dict or None.
    """
    if not isinstance(title, str) or not isinstance(author, str):
        return None
    
    params = {
        "q": f"{title} {author}",
        "fields": "title,author_name,key,isbn,first_publish_year,language,subject",
        "limit": 1
    }
    
    for attempt in range(max_retries):
        try:
            resp = requests.get(BASE_SEARCH_URL, params=params, timeout=15)
            if resp.status_code != 200:
                time.sleep(1.0)
                continue
            data = resp.json()
            docs = data.get("docs", [])
            if not docs:
                return None
            return docs[0]  # top doc
        except Exception as e:
            print(f"Error on attempt {attempt+1} for {title} / {author}: {e}")
            time.sleep(1.0)
    return None


Helper to get description from work key

In [12]:
def get_work_details(work_key: str, max_retries: int = 3):
    """
    Given a work key like '/works/OL12345W', fetch work details JSON.
    """
    if not isinstance(work_key, str):
        return None
    
    url = f"{BASE_WORK_URL}{work_key}.json"
    
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, timeout=15)
            if resp.status_code != 200:
                time.sleep(1.0)
                continue
            return resp.json()
        except Exception as e:
            print(f"Error on attempt {attempt+1} for {work_key}: {e}")
            time.sleep(1.0)
    return None


def extract_description(work_json):
    """
    Extract a clean string description from the work JSON.
    """
    if not work_json:
        return None
    
    desc = work_json.get("description")
    if isinstance(desc, str):
        return desc.strip()
    if isinstance(desc, dict):
        # sometimes description is {"type": "/type/text", "value": "..."}
        return str(desc.get("value", "")).strip() or None
    return None


Enrichment loop

In [13]:
enriched_rows = []

for idx, row in df.iterrows():
    title = row.get("title")
    author = row.get("author")
    print(f"[{idx+1}/{len(df)}] Searching for: {title} — {author}")
    
    # 1) search open library
    meta = search_open_library(title, author)
    
    if meta is None:
        print("   -> No result found")
        enriched_rows.append({
            **row.to_dict(),
            "ol_work_key": None,
            "ol_title": None,
            "ol_author_name": None,
            "ol_isbn_any": None,
            "ol_first_publish_year": None,
            "ol_language": None,
            "ol_subjects": None,
            "ol_description": None
        })
        continue
    
    work_key = meta.get("key")    # e.g. "/works/OL12345W"
    ol_title = meta.get("title")
    author_names = meta.get("author_name") or []
    ol_author_name = author_names[0] if author_names else None
    isbn_list = meta.get("isbn") or []
    ol_isbn_any = isbn_list[0] if isbn_list else None
    ol_year = meta.get("first_publish_year")
    languages = meta.get("language") or []
    ol_language = languages[0] if languages else None
    subjects = meta.get("subject") or []
    subjects_str = "; ".join(subjects) if subjects else None
    
    # 2) get work details (description, etc.)
    work_json = get_work_details(work_key) if work_key else None
    description = extract_description(work_json)
    
    enriched_rows.append({
        **row.to_dict(),
        "ol_work_key": work_key,
        "ol_title": ol_title,
        "ol_author_name": ol_author_name,
        "ol_isbn_any": ol_isbn_any,
        "ol_first_publish_year": ol_year,
        "ol_language": ol_language,
        "ol_subjects": subjects_str,
        "ol_description": description,
    })
    
    # Be polite to the API
    time.sleep(0.3)

df_enriched = pd.DataFrame(enriched_rows)
df_enriched.head()


[1/88] Searching for: The Alchemist — Paulo Coelho
[2/88] Searching for: Of Mice and Men — John Steinbeck
[3/88] Searching for: To Kill a Mockingbird — Harper Lee
[4/88] Searching for: A Brief History of Time — Stephen Hawking
[5/88] Searching for: Man's Search for Meaning — Viktor E. Frankl
[6/88] Searching for: Heart of Darkness — Joseph Conrad
[7/88] Searching for: The Hungry Tide — Amitav Ghosh
[8/88] Searching for: Saturday — Ian McEwan
[9/88] Searching for: The Catcher in the Rye — J.D. Salinger
[10/88] Searching for: Ghostwritten — David  Mitchell
[11/88] Searching for: Amsterdam — Ian McEwan
[12/88] Searching for: Lord of the Flies — William Golding
[13/88] Searching for: The Human Stain (The American Trilogy, #3) — Philip Roth
   -> No result found
[14/88] Searching for: One Day in the Life of Ivan Denisovich — Aleksandr Solzhenitsyn
[15/88] Searching for: His Dark Materials (His Dark Materials #1-3) — Philip Pullman
   -> No result found
[16/88] Searching for: The Origin of S

Unnamed: 0,book_id,title,author,isbn,isbn13,my_rating,date_read,date_added,my_review,ol_work_key,ol_title,ol_author_name,ol_isbn_any,ol_first_publish_year,ol_language,ol_subjects,ol_description
0,865,The Alchemist,Paulo Coelho,0061122416,9780061000000.0,2,,02/07/2019,,/works/OL796465W,O Alquimista,Paulo Coelho,61160644,1988.0,eng,Translations into Indonesian; Voyages and trav...,The Alchemist details the journey of a young A...
1,890,Of Mice and Men,John Steinbeck,0142000671,9780142000000.0,4,,02/07/2019,,/works/OL23204W,Of Mice and Men,John Steinbeck,9781537401812,1937.0,swe,contemporary fiction; literary fiction; classi...,The second book in John Steinbeck’s labor tril...
2,2657,To Kill a Mockingbird,Harper Lee,0060935464,9780061000000.0,4,,02/07/2019,,/works/OL8897870W,"To Kill a Mockingbird, Harper Lee",Jill Green,9781560778479,2007.0,,American literature; Study and teaching; Litté...,
3,3869,A Brief History of Time,Stephen Hawking,0553380168,9780553000000.0,4,,02/07/2019,,/works/OL1892617W,A Brief History of Time,Stephen Hawking,9780553176988,1988.0,cze,Cosmologie; Temps (durée); Espace-temps; Vulga...,Stephen Hawking's ‘A Brief History of Time* ha...
4,4069,Man's Search for Meaning,Viktor E. Frankl,080701429X,9780807000000.0,5,07/02/2021,26/12/2020,,/works/OL1268413W,... Trotzdem Ja zum Leben sagen,Viktor E. Frankl,9781416524281,1946.0,eng,Nazi concentration camps; psychotherapy; meani...,Psychiatrist Viktor Frankl's memoir has rivete...


Quick quality checks

In [16]:
print("Total books:", len(df_enriched))
print("With a description:", df_enriched["ol_description"].notna().sum())
print("Example with description:")
df_enriched[df_enriched["ol_description"].notna()].head(3)[
    ["title", "author", "ol_title", "ol_author_name", "ol_description"]
]


Total books: 88
With a description: 45
Example with description:


Unnamed: 0,title,author,ol_title,ol_author_name,ol_description
0,The Alchemist,Paulo Coelho,O Alquimista,Paulo Coelho,The Alchemist details the journey of a young A...
1,Of Mice and Men,John Steinbeck,Of Mice and Men,John Steinbeck,The second book in John Steinbeck’s labor tril...
3,A Brief History of Time,Stephen Hawking,A Brief History of Time,Stephen Hawking,Stephen Hawking's ‘A Brief History of Time* ha...


In [17]:
df_enriched.to_csv(ENRICHED_CSV_PATH, index=False)
ENRICHED_CSV_PATH

WindowsPath('C:/Users/brethm01/book-nlp/data/my_rated_books_enriched.csv')