From `my_rated_books_enriched.csv`, create:

a good text field per book (for embeddings)

an embedding for each book

a single taste vector representing your reading taste

Imports & paths

In [13]:
import os
import json
from pathlib import Path

import numpy as np
import pandas as pd

from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

# Anchor every path on the resolved current working directory.
PROJECT_ROOT = Path(".").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")

# Input: the enriched ratings export. Outputs: the frame with per-book
# embeddings and the single taste vector.
ENRICHED_CSV_PATH = DATA_DIR.joinpath("my_rated_books_enriched.csv")
EMBEDDINGS_PKL_PATH = DATA_DIR.joinpath("my_rated_books_with_embeddings.pkl")
TASTE_VECTOR_NPY_PATH = DATA_DIR.joinpath("taste_vector.npy")

# Echo the resolved paths as the cell output.
(ENRICHED_CSV_PATH, EMBEDDINGS_PKL_PATH, TASTE_VECTOR_NPY_PATH)


(WindowsPath('C:/Users/brethm01/book-nlp/data/my_rated_books_enriched.csv'),
 WindowsPath('C:/Users/brethm01/book-nlp/data/my_rated_books_with_embeddings.pkl'),
 WindowsPath('C:/Users/brethm01/book-nlp/data/taste_vector.npy'))

Load enriched ratings

In [14]:
# Read the enriched ratings export, then report its schema and row count
# before showing the first rows.
df = pd.read_csv(ENRICHED_CSV_PATH)
print("Columns:", list(df.columns))
print("Number of rated books:", df.shape[0])
df.head(5)


Columns: ['book_id', 'title', 'author', 'isbn', 'isbn13', 'my_rating', 'date_read', 'date_added', 'my_review', 'ol_work_key', 'ol_title', 'ol_author_name', 'ol_isbn_any', 'ol_first_publish_year', 'ol_language', 'ol_subjects', 'ol_description']
Number of rated books: 88


Unnamed: 0,book_id,title,author,isbn,isbn13,my_rating,date_read,date_added,my_review,ol_work_key,ol_title,ol_author_name,ol_isbn_any,ol_first_publish_year,ol_language,ol_subjects,ol_description
0,865,The Alchemist,Paulo Coelho,0061122416,9780061000000.0,2,,02/07/2019,,/works/OL796465W,O Alquimista,Paulo Coelho,61160644,1988.0,eng,Translations into Indonesian; Voyages and trav...,The Alchemist details the journey of a young A...
1,890,Of Mice and Men,John Steinbeck,0142000671,9780142000000.0,4,,02/07/2019,,/works/OL23204W,Of Mice and Men,John Steinbeck,9781537401812,1937.0,swe,contemporary fiction; literary fiction; classi...,The second book in John Steinbeck‚Äôs labor tril...
2,2657,To Kill a Mockingbird,Harper Lee,0060935464,9780061000000.0,4,,02/07/2019,,/works/OL8897870W,"To Kill a Mockingbird, Harper Lee",Jill Green,9781560778479,2007.0,,American literature; Study and teaching; Litt√©...,
3,3869,A Brief History of Time,Stephen Hawking,0553380168,9780553000000.0,4,,02/07/2019,,/works/OL1892617W,A Brief History of Time,Stephen Hawking,9780553176988,1988.0,cze,Cosmologie; Temps (dur√©e); Espace-temps; Vulga...,Stephen Hawking's ‚ÄòA Brief History of Time* ha...
4,4069,Man's Search for Meaning,Viktor E. Frankl,080701429X,9780807000000.0,5,07/02/2021,26/12/2020,,/works/OL1268413W,... Trotzdem Ja zum Leben sagen,Viktor E. Frankl,9781416524281,1946.0,eng,Nazi concentration camps; psychotherapy; meani...,Psychiatrist Viktor Frankl's memoir has rivete...


Build a text field for embeddings
We'll construct a `text_for_embedding` column, trying these sources in order:
- ol_description (Open Library description)
- my_review (your own review)
- fallback: 'Title' by Author + a short generic phrase

In [15]:
def build_text_for_embedding(row, min_chars=30):
    """
    Pick the text that represents one book when it is embedded.

    Sources are tried in order: the Open Library description, then the
    personal review. The first one that is a string strictly longer than
    ``min_chars`` characters (after stripping) is returned, stripped.
    Otherwise a short sentence is built from title, author and rating,
    leaving out any of those that are missing.

    row: mapping-like object exposing ``.get`` (a DataFrame row or a dict)
    min_chars: minimum length a description/review must exceed to be used
    returns: str
    """
    def _present(value):
        # pandas represents missing cells as float NaN, and NaN is the only
        # float that is not equal to itself, so no pandas import is needed.
        if value is None:
            return False
        if isinstance(value, float) and value != value:
            return False
        if isinstance(value, str):
            return bool(value.strip())
        return True

    for field in ("ol_description", "my_review"):
        candidate = row.get(field)
        if isinstance(candidate, str):
            candidate = candidate.strip()
            if len(candidate) > min_chars:
                return candidate

    title = row.get("title")
    author = row.get("author")
    rating = row.get("my_rating")

    # Omit missing pieces rather than letting the literal "nan"/"None" leak
    # into the text that gets embedded.
    subject = f"'{title}'" if _present(title) else "An untitled book"
    byline = f" by {author}" if _present(author) else ""
    rated = f" and rated {rating} out of 5" if _present(rating) else ""
    return f"{subject}{byline}. A book I have read{rated}."

# Build the embedding text for every book (one string per row).
df["text_for_embedding"] = df.apply(build_text_for_embedding, axis=1)

# Small slice of the frame kept aside for a quick sample run.
N_SAMPLE = 3  # you can change this to 5, 10, etc.
df_sample = df.head(N_SAMPLE).copy()

# Preview a few texts that will be embedded. Only the last expression of a
# cell is rendered, so a single preview expression is kept here.
df[["title", "author", "my_rating", "text_for_embedding"]].head()


Unnamed: 0,title,author,my_rating,text_for_embedding
0,The Alchemist,Paulo Coelho,2,The Alchemist details the journey of a young A...
1,Of Mice and Men,John Steinbeck,4,The second book in John Steinbeck‚Äôs labor tril...
2,To Kill a Mockingbird,Harper Lee,4,'To Kill a Mockingbird' by Harper Lee. A book ...
3,A Brief History of Time,Stephen Hawking,4,Stephen Hawking's ‚ÄòA Brief History of Time* ha...
4,Man's Search for Meaning,Viktor E. Frankl,5,Psychiatrist Viktor Frankl's memoir has rivete...


OpenAI client setup

In [16]:
# Read the key from the environment so it never appears in the notebook,
# and stop early with a clear message when it is absent or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

EMBED_MODEL = "text-embedding-3-small"  # good default; cheap & strong
client = OpenAI(api_key=api_key)


Embedding helper (batched)

In [17]:
def embed_texts(texts, model=EMBED_MODEL, batch_size=32):
    """
    Embed a collection of strings with the embeddings endpoint of ``client``.

    texts: iterable of strings (list, pandas Series, ...); a single string is
           treated as a one-element list
    model: embedding model name passed to the API
    batch_size: number of texts sent per request; must be >= 1
    returns: np.ndarray of shape (n, d), dtype float32. For empty input the
             shape is (0, 0), because d is unknown without calling the API.
    raises: ValueError if batch_size < 1; RuntimeError if a response holds a
            different number of embeddings than texts sent
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    # Materialize once so slicing is positional and the API receives a plain list.
    texts = [texts] if isinstance(texts, str) else list(texts)
    n = len(texts)
    if n == 0:
        return np.empty((0, 0), dtype=np.float32)

    all_embeddings = []
    for start in range(0, n, batch_size):
        batch = texts[start:start + batch_size]
        print(f"Embedding batch {start}–{start + len(batch) - 1}")
        response = client.embeddings.create(
            model=model,
            input=batch
        )
        if len(response.data) != len(batch):
            raise RuntimeError(
                f"Expected {len(batch)} embeddings, got {len(response.data)}."
            )
        # Each returned item carries the index of the input it belongs to;
        # sorting on it keeps row i of the result aligned with texts[i].
        ordered = sorted(response.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered)
    return np.array(all_embeddings, dtype=np.float32)


Compute embeddings for your rated books

In [18]:
# --- SAMPLE EMBEDDING TEST ---
# Embed only the small preview frame and inspect the result.

sample_texts = list(df_sample["text_for_embedding"])
sample_embeddings = embed_texts(sample_texts)

print("Sample embeddings shape:", sample_embeddings.shape)
# First ten components of the first embedding.
sample_embeddings[0, :10]


Embedding batch 0‚Äì2
Sample embeddings shape: (3, 1536)


array([ 0.0250668 , -0.04028007, -0.00154473,  0.00803963, -0.01899319,
       -0.01955491,  0.02736049,  0.07798558, -0.0492442 , -0.05364435],
      dtype=float32)

In [19]:
# --- FULL EMBEDDING RUN ---
# Embed the text of every rated book; row i of the result belongs to row i of df.

texts = list(df["text_for_embedding"])
book_embeddings = embed_texts(texts)

print("All embeddings shape:", book_embeddings.shape)


Embedding batch 0‚Äì31
Embedding batch 32‚Äì63
Embedding batch 64‚Äì87
All embeddings shape: (88, 1536)


In [20]:
# Attach one embedding vector per row (iterating the 2-D array yields its rows).
df["embedding"] = [row_vector for row_vector in book_embeddings]

# Persist with pickle so each cell keeps its array structure.
df.to_pickle(EMBEDDINGS_PKL_PATH)

EMBEDDINGS_PKL_PATH


WindowsPath('C:/Users/brethm01/book-nlp/data/my_rated_books_with_embeddings.pkl')

Build your taste vector

We'll compute a weighted average of all book embeddings, using `my_rating` as the weight, but centered on the mean rating so that "meh" ratings don't dominate (books rated below the mean get zero weight).

### Question: what if my tastes are actually quite diverse?

In [21]:
# Ensure ratings are numeric; values that cannot be parsed become NaN
df["my_rating"] = pd.to_numeric(df["my_rating"], errors="coerce")

# Use ratings as weights (you can tweak this logic)
ratings = df["my_rating"].to_numpy(dtype=np.float32)

print("Ratings:", ratings)

# Center ratings around their mean (ignoring NaN) so neutrals contribute less
mean_rating = np.nanmean(ratings)
weights = ratings - mean_rating

# Keep only above-average books. This is written as "keep where > 0" rather
# than "zero where < 0" because NaN fails every comparison: a book without a
# usable rating gets weight 0 instead of a NaN that would spread through the
# sum and turn every weight (and the taste vector) into NaN.
weights = np.where(weights > 0, weights, 0.0).astype(np.float32)

print("Weights after centering & clipping:", weights)

# Normalize so the weights sum to 1; the branch itself avoids division by zero
total = weights.sum()
if total > 0:
    weights = weights / total
else:
    # fallback: uniform weights
    weights = np.ones_like(weights) / len(weights)

print("Normalized weights:", weights, "sum:", weights.sum())


Ratings: [2. 4. 4. 4. 5. 3. 4. 4. 5. 4. 3. 5. 3. 3. 5. 4. 3. 2. 4. 4. 5. 3. 2. 3.
 4. 3. 5. 4. 3. 4. 3. 4. 3. 3. 5. 3. 5. 4. 4. 4. 3. 4. 3. 3. 4. 3. 4. 3.
 4. 4. 3. 3. 3. 3. 4. 3. 3. 4. 4. 3. 4. 2. 4. 4. 3. 3. 3. 3. 3. 5. 2. 3.
 2. 4. 4. 3. 4. 3. 4. 3. 3. 4. 4. 5. 2. 3. 2. 3.]
Weights after centering & clipping: [0.         0.48863626 0.48863626 0.48863626 1.4886363  0.
 0.48863626 0.48863626 1.4886363  0.48863626 0.         1.4886363
 0.         0.         1.4886363  0.48863626 0.         0.
 0.48863626 0.48863626 1.4886363  0.         0.         0.
 0.48863626 0.         1.4886363  0.48863626 0.         0.48863626
 0.         0.48863626 0.         0.         1.4886363  0.
 1.4886363  0.48863626 0.48863626 0.48863626 0.         0.48863626
 0.         0.         0.48863626 0.         0.48863626 0.
 0.48863626 0.48863626 0.         0.         0.         0.
 0.48863626 0.         0.         0.48863626 0.48863626 0.
 0.48863626 0.         0.48863626 0.48863626 0.         0.
 0.         0.

Now compute the weighted average across embeddings:

In [22]:
# Scale each row of book_embeddings by that book's weight (the weights are
# reshaped into a column so they broadcast across the row), then add the rows.
taste_vector = np.sum(book_embeddings * weights.reshape(-1, 1), axis=0)

# Rescale to unit length; the small epsilon keeps the division defined even
# for an all-zero vector.
norm = np.linalg.norm(taste_vector) + 1e-9
taste_vector = taste_vector / norm

# Show the vector's shape and its length after normalization.
(taste_vector.shape, np.linalg.norm(taste_vector))


((1536,), np.float32(1.0))

Save the taste vector

In [23]:
# Write the taste vector to disk in NumPy's .npy format, then echo the path.
np.save(file=TASTE_VECTOR_NPY_PATH, arr=taste_vector)
TASTE_VECTOR_NPY_PATH


WindowsPath('C:/Users/brethm01/book-nlp/data/taste_vector.npy')

Quick sanity check: similarity to your books
Just to see if it "makes sense", we can compute the cosine similarity between your taste vector and each book:

In [12]:
# cosine_similarity is imported in the notebook's import cell.
# The taste vector is passed as a single-row matrix, so the result has one
# column; flatten() turns it into one score per book.
sims = cosine_similarity(
    book_embeddings,
    taste_vector.reshape(1, -1)
).flatten()

df["sim_to_taste"] = sims

# Books closest to the taste vector first.
df_sorted = df.sort_values("sim_to_taste", ascending=False)
df_sorted[["title", "author", "my_rating", "sim_to_taste"]].head(10)


Unnamed: 0,title,author,my_rating,sim_to_taste
89,The Door,Magda Szab√≥,4,0.725915
50,Abig√©l,Magda Szab√≥,4,0.7227
33,The Diary of a Young Girl,Anne Frank,5,0.722481
9,Un amour d√©vastateur,Eileen Chang,5,0.721168
81,Hotel World,Ali Smith,4,0.721081
28,Magnus,Sylvie Germain,5,0.716125
249,Free Love and Other Stories,Ali Smith,2,0.711535
10,A Pigeon and a Boy,Meir Shalev,5,0.710745
91,A Book of American Martyrs,Joyce Carol Oates,4,0.708998
5,Six Stories (Penguin Modern Classics),Stefan Zweig,5,0.708183
