Imports & paths

In [32]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI

PROJECT_ROOT = Path(".").resolve()
DATA_DIR = PROJECT_ROOT / "data"

CANDIDATES_ENRICHED_PATH = DATA_DIR / "candidates_enriched.csv"
CANDIDATES_EMB_PKL_PATH = DATA_DIR / "candidates_with_embeddings.pkl"
TASTE_VECTOR_NPY_PATH = DATA_DIR / "taste_vector.npy"

CANDIDATES_ENRICHED_PATH, TASTE_VECTOR_NPY_PATH


(WindowsPath('C:/Users/brethm01/book-nlp/data/candidates_enriched.csv'),
 WindowsPath('C:/Users/brethm01/book-nlp/data/taste_vector.npy'))

Load candidates + taste vector

In [33]:
df_cand = pd.read_csv(CANDIDATES_ENRICHED_PATH)
taste_vector = np.load(TASTE_VECTOR_NPY_PATH)

print("Candidates:", df_cand.shape)
df_cand.head()


Candidates: (8, 12)


Unnamed: 0,title_llm,author_llm,isbn13_llm,why_match_llm,ol_work_key,ol_title,ol_author_name,ol_isbn_any,ol_first_publish_year,ol_language,ol_subjects,ol_description
0,The Book Thief,Markus Zusak,9780375842207,This novel set in Nazi Germany explores themes...,/works/OL5819456W,The Book Thief,Markus Zusak,9780399556524,1998,ger,nyt:young-adult-paperback-monthly=2022-09-04; ...,"The extraordinary, beloved novel about the abi..."
1,The Kite Runner,Khaled Hosseini,9781594631931,A powerful story of friendship and redemption ...,/works/OL5781992W,The Kite Runner,Khaled Hosseini,9787542036346,2003,kor,New York Times bestseller; nyt:trade_fiction_p...,"The unforgettable, heartbreaking story of the ..."
2,The Nightingale,Kristin Hannah,9780399170943,This historical novel about two sisters in Naz...,/works/OL17116910W,The Nightingale,Kristin Hannah,9786555650853,2000,spa,Civilians in war; Fiction; FICTION / Contempor...,"Despite their differences, sisters Vianne and ..."
3,The Shadow of the Wind,Carlos Ruiz Zafón,9780143034902,A literary mystery set in post-war Barcelona t...,/works/OL36433603W,The Shadow of the Wind,Carlos Ruiz Zafón,1439569746,2009,,,
4,Life of Pi,Yann Martel,9780156027328,A philosophical adventure about survival and f...,/works/OL2827199W,Life of Pi,Yann Martel,606269312,2000,heb,Teenage boys; Zoo animals; Fiction; Literature...,"After the tragic sinking of a cargo ship, one ..."


Build text_for_embedding for candidates
We’ll be pragmatic:
Prefer ol_description
Fallback to a synthetic sentence using title, author, subjects, why_match

In [34]:
def build_candidate_text(row):
    desc = row.get("ol_description")
    if isinstance(desc, str) and len(desc.strip()) > 30:
        return desc.strip()

    # fallback: synthetic text
    title = row.get("ol_title") or row.get("title_llm") or ""
    author = row.get("ol_author_name") or row.get("author_llm") or ""
    subjects = row.get("ol_subjects") or ""
    why = row.get("why_match_llm") or ""

    parts = [
        f"'{title}' by {author}.",
    ]

    if subjects:
        parts.append(f"Subjects: {subjects}.")
    if why:
        parts.append(f"Recommended because: {why}.")

    return " ".join(parts)

df_cand["text_for_embedding"] = df_cand.apply(build_candidate_text, axis=1)

df_cand[["ol_title", "ol_author_name", "text_for_embedding"]].head()


Unnamed: 0,ol_title,ol_author_name,text_for_embedding
0,The Book Thief,Markus Zusak,"The extraordinary, beloved novel about the abi..."
1,The Kite Runner,Khaled Hosseini,"The unforgettable, heartbreaking story of the ..."
2,The Nightingale,Kristin Hannah,"Despite their differences, sisters Vianne and ..."
3,The Shadow of the Wind,Carlos Ruiz Zafón,'The Shadow of the Wind' by Carlos Ruiz Zafón....
4,Life of Pi,Yann Martel,"After the tragic sinking of a cargo ship, one ..."


OpenAI client + embedding helper (reuse pattern)

In [35]:
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

client = OpenAI(api_key=api_key)
EMBED_MODEL = "text-embedding-3-small"

def embed_texts(texts, model=EMBED_MODEL, batch_size=32):
    all_embeddings = []
    n = len(texts)
    for start in range(0, n, batch_size):
        batch = texts[start:start+batch_size]
        print(f"Embedding batch {start}–{start+len(batch)-1}")
        resp = client.embeddings.create(
            model=model,
            input=batch
        )
        batch_embs = [d.embedding for d in resp.data]
        all_embeddings.extend(batch_embs)
    return np.array(all_embeddings, dtype=np.float32)


Sample test (embed just a few candidates first)

In [36]:
N_SAMPLE = min(3, len(df_cand))
df_cand_sample = df_cand.head(N_SAMPLE).copy()

df_cand_sample[["ol_title", "ol_author_name", "text_for_embedding"]]

Unnamed: 0,ol_title,ol_author_name,text_for_embedding
0,The Book Thief,Markus Zusak,"The extraordinary, beloved novel about the abi..."
1,The Kite Runner,Khaled Hosseini,"The unforgettable, heartbreaking story of the ..."
2,The Nightingale,Kristin Hannah,"Despite their differences, sisters Vianne and ..."


In [37]:
sample_texts = df_cand_sample["text_for_embedding"].tolist()
sample_embeddings = embed_texts(sample_texts)

print("Sample embeddings shape:", sample_embeddings.shape)
sample_embeddings[0][:10]


Embedding batch 0–2
Sample embeddings shape: (3, 1536)


array([-0.00861588,  0.04252238, -0.00882327,  0.02066863,  0.0297704 ,
       -0.03135848, -0.04384972,  0.05494251, -0.03517459, -0.03576715],
      dtype=float32)

Embed all candidates

In [38]:
texts_all = df_cand["text_for_embedding"].tolist()
cand_embeddings = embed_texts(texts_all)

cand_embeddings.shape


Embedding batch 0–7


(8, 1536)

Rank candidates by cosine similarity to your taste

In [39]:
# Compute similarity:

sims = cosine_similarity(
    cand_embeddings,
    taste_vector.reshape(1, -1)
).flatten()

df_cand["sim_to_taste"] = sims


In [40]:
df_ranked = df_cand.sort_values("sim_to_taste", ascending=False).reset_index(drop=True)

df_ranked[
    ["ol_title", "ol_author_name", "ol_first_publish_year", "ol_language", "sim_to_taste", "why_match_llm"]
].head(15)

# pure embedding-based ranking: “closest to my taste vector.”


Unnamed: 0,ol_title,ol_author_name,ol_first_publish_year,ol_language,sim_to_taste,why_match_llm
0,The Book Thief,Markus Zusak,1998,ger,0.584993,This novel set in Nazi Germany explores themes...
1,The Goldfinch,Donna Tartt,2013,eng,0.553718,This Pulitzer Prize-winning novel explores the...
2,The Shadow of the Wind,Carlos Ruiz Zafón,2009,,0.494834,A literary mystery set in post-war Barcelona t...
3,The Help,Kathryn Stockett,2009,eng,0.444609,A compelling story set in the 1960s American S...
4,The Kite Runner,Khaled Hosseini,2003,kor,0.438882,A powerful story of friendship and redemption ...
5,Life of Pi,Yann Martel,2000,heb,0.361335,A philosophical adventure about survival and f...
6,The Immortal Life of Henrietta Lacks,Rebecca Skloot,2009,eng,0.337923,This non-fiction book blends science and perso...
7,The Nightingale,Kristin Hannah,2000,spa,0.328327,This historical novel about two sisters in Naz...


In [41]:
df_cand["embedding"] = list(cand_embeddings)
df_cand.to_pickle(CANDIDATES_EMB_PKL_PATH)

CANDIDATES_EMB_PKL_PATH

WindowsPath('C:/Users/brethm01/book-nlp/data/candidates_with_embeddings.pkl')

In [42]:
df_cand.head()

Unnamed: 0,title_llm,author_llm,isbn13_llm,why_match_llm,ol_work_key,ol_title,ol_author_name,ol_isbn_any,ol_first_publish_year,ol_language,ol_subjects,ol_description,text_for_embedding,sim_to_taste,embedding
0,The Book Thief,Markus Zusak,9780375842207,This novel set in Nazi Germany explores themes...,/works/OL5819456W,The Book Thief,Markus Zusak,9780399556524,1998,ger,nyt:young-adult-paperback-monthly=2022-09-04; ...,"The extraordinary, beloved novel about the abi...","The extraordinary, beloved novel about the abi...",0.584993,"[-0.008604629, 0.042549063, -0.008800189, 0.02..."
1,The Kite Runner,Khaled Hosseini,9781594631931,A powerful story of friendship and redemption ...,/works/OL5781992W,The Kite Runner,Khaled Hosseini,9787542036346,2003,kor,New York Times bestseller; nyt:trade_fiction_p...,"The unforgettable, heartbreaking story of the ...","The unforgettable, heartbreaking story of the ...",0.438882,"[-0.018309388, 0.012103747, -0.008122251, 0.00..."
2,The Nightingale,Kristin Hannah,9780399170943,This historical novel about two sisters in Naz...,/works/OL17116910W,The Nightingale,Kristin Hannah,9786555650853,2000,spa,Civilians in war; Fiction; FICTION / Contempor...,"Despite their differences, sisters Vianne and ...","Despite their differences, sisters Vianne and ...",0.328327,"[-0.008186101, 0.010401164, 0.009373888, -0.01..."
3,The Shadow of the Wind,Carlos Ruiz Zafón,9780143034902,A literary mystery set in post-war Barcelona t...,/works/OL36433603W,The Shadow of the Wind,Carlos Ruiz Zafón,1439569746,2009,,,,'The Shadow of the Wind' by Carlos Ruiz Zafón....,0.494834,"[-0.015731765, 0.04576915, -0.048112888, -0.00..."
4,Life of Pi,Yann Martel,9780156027328,A philosophical adventure about survival and f...,/works/OL2827199W,Life of Pi,Yann Martel,606269312,2000,heb,Teenage boys; Zoo animals; Fiction; Literature...,"After the tragic sinking of a cargo ship, one ...","After the tragic sinking of a cargo ship, one ...",0.361335,"[0.027463343, 0.053714294, 0.0157084, 0.076380..."


filter out anything too close to something you’ve read
If you also computed READ_WORK_KEYS and/or READ_KEYS in Step 4, you can load them in here or recompute.

In [43]:
# If you have df_my loaded here:
df_my = pd.read_csv(DATA_DIR / "my_rated_books_enriched.csv")
READ_WORK_KEYS = set(df_my["ol_work_key"].dropna().unique())

df_ranked = df_ranked[~df_ranked["ol_work_key"].isin(READ_WORK_KEYS)].reset_index(drop=True)

df_ranked[
    ["ol_title", "ol_author_name", "ol_first_publish_year", "sim_to_taste", "why_match_llm"]
].head(15)


Unnamed: 0,ol_title,ol_author_name,ol_first_publish_year,sim_to_taste,why_match_llm
0,The Book Thief,Markus Zusak,1998,0.584993,This novel set in Nazi Germany explores themes...
1,The Goldfinch,Donna Tartt,2013,0.553718,This Pulitzer Prize-winning novel explores the...
2,The Shadow of the Wind,Carlos Ruiz Zafón,2009,0.494834,A literary mystery set in post-war Barcelona t...
3,The Help,Kathryn Stockett,2009,0.444609,A compelling story set in the 1960s American S...
4,The Kite Runner,Khaled Hosseini,2003,0.438882,A powerful story of friendship and redemption ...
5,Life of Pi,Yann Martel,2000,0.361335,A philosophical adventure about survival and f...
6,The Immortal Life of Henrietta Lacks,Rebecca Skloot,2009,0.337923,This non-fiction book blends science and perso...
7,The Nightingale,Kristin Hannah,2000,0.328327,This historical novel about two sisters in Naz...
