In [1]:
import pandas as pd
import numpy as np

import os
import time

from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

In [2]:
# Load API key from .env file
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key = OPENAI_API_KEY)
if not OPENAI_API_KEY:
    raise ValueError("OpenAI API key not found! Make sure it's in the .env file.")

MODEL = "text-embedding-3-small"

## Loading the dataset

In [3]:
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATA_DIR = os.path.join(BASE_DIR, "data")
DATA_PATH = os.path.join(DATA_DIR, "books_data.csv")

# Load the CSV file
df = pd.read_csv(DATA_PATH)


In [4]:
df.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


## Generate a sample of the dataset

In [5]:
df_sample = df.sample(n=100, random_state=123)
# Save to a new CSV file
df_sample.to_csv(os.path.join(DATA_DIR, "books_data_sample.csv"), index=False)

## Generate embeddigns for this sample

In [6]:
def get_embedding(text, model= MODEL):
    return client.embeddings.create(input = [text], model=model).data[0].embedding

In [7]:
# Sample text
text = df["description"][1]
print(f"Sample description: {text}")
# Start timer
start_time = time.time()
# Generate embedding
embedding = get_embedding(text)
# End timer
end_time = time.time()
# Print time taken
print(f"Time taken to generate one embedding: {end_time - start_time:.4f} seconds")

Sample description: Philip Nel takes a fascinating look into the key aspects of Seuss's career - his poetry, politics, art, marketing, and place in the popular imagination." "Nel argues convincingly that Dr. Seuss is one of the most influential poets in America. His nonsense verse, like that of Lewis Carroll and Edward Lear, has changed language itself, giving us new words like "nerd." And Seuss's famously loopy artistic style - what Nel terms an "energetic cartoon surrealism" - has been equally important, inspiring artists like filmmaker Tim Burton and illustrator Lane Smith. --from back cover
Time taken to generate one embedding: 0.3476 seconds


In [8]:

# Convert descriptions to a list, replacing NaNs with placeholders
df_sample["description"] = df_sample["description"].fillna("MISSING")  # Use "MISSING" as a placeholder

texts = df_sample["description"].tolist()

# Set batch size (adjust based on rate limits)
batch_size = 20
all_embeddings = []

# Start timer
start_time = time.time()

try:
    # Process embeddings in batches
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        batch_start_time = time.time()

        # Generate embeddings for the batch
        response = client.embeddings.create(input=batch, model=MODEL)
        batch_embeddings = [item.embedding for item in response.data]

        all_embeddings.extend(batch_embeddings)

        batch_end_time = time.time()
        batch_time = batch_end_time - batch_start_time

        # Raise an error if batch takes too long
        if batch_time > 10:
            raise TimeoutError(f"Batch {i // batch_size + 1} took too long: {batch_time:.2f} sec")

        print(f"Processed batch {i // batch_size + 1} / {len(texts) // batch_size + 1} "
              f"({batch_time:.2f} sec per batch)")

        time.sleep(1)
except Exception as e:
    print(f"Error: {e}")

# End timer
end_time = time.time()
total_time = end_time - start_time

# Add embeddings to DataFrame
df_sample["embedding"] = all_embeddings

# Ensure NaNs are retained in the final dataset
df_sample.replace("MISSING", np.nan, inplace=True)

print(f"Completed embedding generation for {len(df_sample)} rows!")
print(f"Total time taken: {total_time:.2f} seconds")

Processed batch 1 / 6 (0.50 sec per batch)
Processed batch 2 / 6 (3.11 sec per batch)
Processed batch 3 / 6 (0.68 sec per batch)
Processed batch 4 / 6 (0.39 sec per batch)
Processed batch 5 / 6 (0.30 sec per batch)
Completed embedding generation for 100 rows!
Total time taken: 9.99 seconds


In [9]:
df_sample.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,embedding
122732,Manual of instruction for the volunteers and m...,,['William Gilham'],http://books.google.com/books/content?id=xmw9k...,http://books.google.nl/books?id=xmw9kXUyOlcC&p...,,1861,https://play.google.com/store/books/details?id...,,,"[0.0034267951268702745, 0.015257937833666801, ..."
14560,Building Wealth from the Ground Up,"""Building Wealth from the Ground Up"" is a stra...",['Mikel Brown'],http://books.google.com/books/content?id=0n8KA...,http://books.google.nl/books?id=0n8KAAAACAAJ&d...,Cjc Publishing Company,2004-04-01,http://books.google.nl/books?id=0n8KAAAACAAJ&d...,['Business & Economics'],,"[-0.038205672055482864, 0.038760989904403687, ..."
84305,"The Gay Guy's Guide to Love: The Dos, Don'ts, ...",,,,,,,,,,"[0.003398788394406438, 0.015259557403624058, 0..."
193371,Sweet Talkers (Richard Kasak Books),"""An anthology of writing by women about their ...",['Shar Rednour'],http://books.google.com/books/content?id=qfNKS...,http://books.google.com/books?id=qfNKSwjWEzMC&...,Richard Kasak Books,1996,http://books.google.com/books?id=qfNKSwjWEzMC&...,"['Erotic stories, American']",,"[0.034600213170051575, 0.007466230075806379, 0..."
176381,Trails of the Heart,,,,,,,,,,"[0.003398788394406438, 0.015259557403624058, 0..."


In [10]:
df_sample.to_csv(os.path.join(DATA_DIR, "books_data_sample_embeddings.csv"), index = False)