In [1]:
import pandas as pd
from sqlalchemy import create_engine

In [2]:
engine = create_engine('mysql+pymysql://root:root@127.0.0.1:3306/AICHALLENGE')

# Replace 'your_table' with the actual name of your table
table_name = 'Book'

In [3]:
query = f'SELECT * FROM {table_name}'

# Query the data into a pandas DataFrame
df = pd.read_sql(query, con=engine)
df = df.head(100)
df

Unnamed: 0,bookId,title,series,author,rating,description,pages,publisher,publishDate,coverImg,price,language_2,language_3,language_4,language_5,language_1
0,book_1,The Hunger Games,The Hunger Games #1,Suzanne Collins,0.0,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,374,Scholastic Press,09/14/08,https://i.gr-assets.com/images/S/compressed.ph...,5.09,portuguese,german,russian,japanese,english
1,book_2,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",0.0,There is a door at the end of a silent corrido...,870,Scholastic Inc.,09/28/04,https://i.gr-assets.com/images/S/compressed.ph...,7.38,hindi,portuguese,arabic,german,english
2,book_3,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,0.0,The unforgettable novel of a childhood in a sl...,324,Harper Perennial Modern Classics,05/23/06,https://i.gr-assets.com/images/S/compressed.ph...,10.00,chinese,hindi,arabic,japanese,english
3,book_4,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",0.0,Alternate cover edition of ISBN 9780679783268S...,279,Modern Library,10/10/00,https://i.gr-assets.com/images/S/compressed.ph...,10.00,chinese,bengali,japanese,italian,english
4,book_6,The Book Thief,,Markus Zusak (Goodreads Author),0.0,Librarian's note: An alternate cover edition c...,552,Alfred A. Knopf,03/14/06,https://i.gr-assets.com/images/S/compressed.ph...,3.80,russian,german,bengali,chinese,english
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,book_97,Winnie-the-Pooh,Winnie-the-Pooh #1,"A.A. Milne, Ernest H. Shepard (Illustrator)",0.0,The adventures of Christopher Robin and his fr...,145,Dutton Juvenile,10/01/01,https://i.gr-assets.com/images/S/compressed.ph...,5.30,spanish,spanish,russian,german,english
96,book_98,The Complete Stories and Poems,,Edgar Allan Poe,0.0,This single volume brings together all of Poe'...,821,"Doubleday & Company, Inc.",08/15/84,https://i.gr-assets.com/images/S/compressed.ph...,5.27,japanese,hindi,portuguese,arabic,english
97,book_99,Interview with the Vampire,The Vampire Chronicles #1,Anne Rice,0.0,"This is the story of Louis, as told in his own...",342,Ballantine Books,08/31/04,https://i.gr-assets.com/images/S/compressed.ph...,2.66,russian,russian,bengali,german,english
98,book_100,A Prayer for Owen Meany,,John Irving (Goodreads Author),0.0,"Eleven-year-old Owen Meany, playing in a Littl...",637,Black Swan,10/28/90,https://i.gr-assets.com/images/S/compressed.ph...,2.86,italian,italian,italian,portuguese,english


In [4]:
import pickle
import openai
from embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)

# constants
EMBEDDING_MODEL = "text-embedding-ada-002"

# caching

In [21]:
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()
api_key=os.getenv("OPENAI_KEY",None)

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=api_key,
)

In [56]:
from typing import List


def get_embedding_local(text: str, engine="text-embedding-ada-002") -> List[float]:

    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")

    res = client.embeddings.create(
        model=engine,
        input=[text],
        encoding_format="float"
    )
    return res.data[0].embedding

In [77]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# set path to embedding cache
embedding_cache_path = "data/description_embeddings_cache.pkl"

# load the cache if it exists, and save a copy to disk
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    id: str,
    string: str,
    model: str = EMBEDDING_MODEL,
    embedding_cache=embedding_cache
) -> list:
    print((id, model))
    """Return embedding of given string, using a cache to avoid recomputing."""
    if (id, model) not in embedding_cache.keys():
        embedding_cache[(id, model)] = get_embedding_local(string, model)
        with open(embedding_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
    return (id, embedding_cache[(id, model)])


In [81]:
def recommendations_from_strings(
   ids: List[str],
   strings: List[str],
   index_of_source_string: str,
   model=EMBEDDING_MODEL,
   k_nearest_neighbors: int = 5,
) -> List[int]:
   """Return nearest neighbors of a given string."""

   # get embeddings for all strings
   embeddings = [embedding_from_string(id, string, model=model) for id, string in zip(ids, strings)]

   # get the embedding of the source string
   query_embedding = embeddings[index_of_source_string]

   # get distances between the source embedding and other embeddings (function from embeddings_utils.py)
   distances = distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")

   # get indices of nearest neighbors (function from embeddings_utils.py)
   indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

   # print out source string
   query_string = strings[index_of_source_string]

   # print out its k nearest neighbors
   k_counter = 0

   # list of the first k distances, ordered
   k_distances = []
   for i in indices_of_nearest_neighbors:
      # skip any strings that are identical matches to the starting string
      if query_string != strings[i]:
         
         # stop after printing out k articles
         if k_counter >= k_nearest_neighbors:
            return k_distances
         
         k_counter += 1
         

         # print out the similar strings and their distances
         print(
            f"""
         --- Recommendation #{k_counter}: (nearest neighbor {k_counter} of {k_nearest_neighbors}) ---
         String: {strings[i]}
         Distance: {distances[i]:0.3f}"""
         )

         k_distances.append(i)

condition = df["bookId"]=='book_98'
index_book = df[condition].index.to_list()[0]
recommendations_from_strings(df["bookId"], df["description"], index_book)

('book_1', 'text-embedding-ada-002')
('book_2', 'text-embedding-ada-002')
('book_3', 'text-embedding-ada-002')
('book_4', 'text-embedding-ada-002')
('book_6', 'text-embedding-ada-002')
('book_7', 'text-embedding-ada-002')
('book_8', 'text-embedding-ada-002')
('book_9', 'text-embedding-ada-002')
('book_10', 'text-embedding-ada-002')
('book_11', 'text-embedding-ada-002')
('book_12', 'text-embedding-ada-002')
('book_13', 'text-embedding-ada-002')
('book_14', 'text-embedding-ada-002')
('book_15', 'text-embedding-ada-002')
('book_16', 'text-embedding-ada-002')
('book_17', 'text-embedding-ada-002')
('book_18', 'text-embedding-ada-002')
('book_19', 'text-embedding-ada-002')
('book_20', 'text-embedding-ada-002')
('book_21', 'text-embedding-ada-002')
('book_22', 'text-embedding-ada-002')
('book_23', 'text-embedding-ada-002')
('book_24', 'text-embedding-ada-002')
('book_25', 'text-embedding-ada-002')
('book_26', 'text-embedding-ada-002')
('book_27', 'text-embedding-ada-002')
('book_28', 'text-em

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [55]:
# print out source string
query_string = df["description"][0]
print(f"Source string: {query_string}")

Source string: WINNING MEANS FAME AND FORTUNE.LOSING MEANS CERTAIN DEATH.THE HUNGER GAMES HAVE BEGUN. . . .In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and once girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature.
