In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import matplotlib.pylab as plt
import scipy.sparse as sparse
import numpy as np

In [2]:
from dotenv import dotenv_values

config = dotenv_values("../.env")


In [3]:
book_df = pd.read_parquet('../data/book_eng.parquet')
book_df

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,...,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,PagesNumber,Description,pagesNumber,Count of text reviews
0,1900574,The Political and Social Ideas of Jules Verne,Jean Chesneaux,0500010846,0.00,1972,10,2,Thames and Hudson,5:0,...,3:0,2:0,1:0,total:0,0,eng,224.0,Translated from Une lecture politique de Jules...,,
1,1900581,Risky Business,Nora Roberts,0786253843,3.87,2003,8,1,Thorndike Press,5:1826,...,3:1483,2:392,1:89,total:5503,3,eng,344.0,Liz Palmer is raising her daughter alone becau...,,
2,1900585,"His Brain, Her Brain: How Divinely Designed Di...",Walt Larimore,031024028X,4.19,2008,1,27,Zondervan,5:115,...,3:30,2:8,1:8,total:240,30,en-US,233.0,"She reads people, and he reads manualsHe doesn...",,
3,1900601,The Secrets of Rosslyn,Roddy Martine,1841585904,3.78,2008,1,1,Birlinn Publishers,5:6,...,3:11,2:1,1:0,total:32,5,eng,224.0,For generations the tiny chapel of Rosslyn has...,,
4,1900644,Oranges from Spain,David Park,0747571627,3.31,2004,4,1,Bloomsbury Publishing PLC,5:8,...,3:11,2:6,1:4,total:39,6,eng,192.0,A collection of stories about of the trials of...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209513,1499848,Makeover Magic,Kelly McKain,0746066899,4.07,2005,26,8,Usborne,5:89,...,3:29,2:16,1:5,total:182,12,en-GB,,"A fun and funky new series charting the hopes,...",160.0,12.0
209514,1499849,The Goddess Society,Kelly McKain,0439963699,3.24,2005,21,1,Scholastic Point,5:8,...,3:24,2:11,1:4,total:66,1,eng,,,288.0,1.0
209515,1499854,The Moonstone Mass and Others,Harriet Prescott Spofford,1553100085,3.50,2000,15,9,Ash-Tree Press,5:1,...,3:1,2:1,1:0,total:4,0,eng,,"In 1859, a twenty-four year old author burst u...",183.0,0.0
209516,1499880,Boy,James Hanley,1847490069,3.61,2007,1,4,Oneworld Classics,5:31,...,3:54,2:11,1:5,total:152,17,eng,,Acclaimed by luminaries such as William Faulkn...,220.0,17.0


In [4]:
book_df_cleaned = book_df.dropna(subset = ['Description'])
book_df_cleaned.reset_index(drop = True, inplace = True)

In [5]:
book_df.sort_values('RatingDistTotal', ascending = False)[:1000]

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,...,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,PagesNumber,Description,pagesNumber,Count of text reviews
78636,2339477,The Last Report on the Miracles at Little No H...,Louise Erdrich,0061577626,4.19,2016,8,16,Harper Perennial,5:4531,...,3:1463,2:410,1:131,total:9999,50,eng,361.0,<strong>A <em>New York Times</em> Notable Book...,,
135434,310258,"The Snowy Day (Peter, #1)",Ezra Jack Keats,0140501827,4.16,1976,28,10,Puffin Books,5:49230,...,3:17356,2:4842,1:2193,total:99986,3220,eng,,,40.0,
193122,30189,"Cast in Courtlight (Chronicles of Elantra, #2)",Michelle Sagara,0373802447,4.11,2006,18,7,Luna,5:3737,...,3:1810,2:298,1:71,total:9998,296,eng,,,488.0,
129955,549129,"Therapy (Alex Delaware, #18)",Jonathan Kellerman,0739313304,3.82,2004,20,4,Random House Audio,5:2346,...,3:3069,2:424,1:78,total:9997,4,eng,,,0.0,
111340,1829655,The Demon's Lexicon,Sarah Rees Brennan,1416963790,3.73,2009,6,2,Margaret K. McElderry Books,5:2770,...,3:2512,2:846,1:438,total:9997,1011,eng,336.0,"Sixteen-year-old Nick and his brother, Alan, a...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65853,898167,A History of the Ancient World: Volume II Rome,Michael Rostovtzeff,0195002245,3.95,1960,31,12,"Oxford University Press, USA",5:29,...,3:24,2:5,1:0,total:98,2,eng,,First published in 1927 this monumental book h...,368.0,2.0
177936,4225565,The Talleyrand Maxim,J.S. Fletcher,1406941115,3.72,2006,11,3,Hard Press,5:20,...,3:31,2:5,1:2,total:98,1,eng,180.0,"Nesta, left alone, gave herself up to deep tho...",,
176098,4066740,Introductory Quantum Mechanics,Richard L. Liboff,0201878798,3.66,1997,6,23,Addison Wesley Publishing Company,5:21,...,3:26,2:11,1:2,total:98,0,eng,874.0,,,
130870,559401,Blood + Water,Judd Winick,140120175X,3.43,2009,20,10,Vertigo,5:11,...,3:43,2:9,1:2,total:98,7,en-US,,,128.0,


In [6]:
sample_book_cleaned = book_df_cleaned.sample(100, random_state=42)

### Generate Description Embeddings

#### TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words='english')
tf_matrix = tf.fit_transform(sample_book_cleaned['Description'])
tf.get_feature_names_out()
tf_matrix.shape

(100, 16729)

In [8]:
columns = tf.get_feature_names_out()  # New method
df = pd.DataFrame(tf_matrix.todense(), columns=columns, index=sample_book_cleaned.Name)


#### OpenAI Embeddings

In [9]:
import openai

openai.api_key = config['OPENAI_API_KEY']

In [10]:
# %%time

import multiprocessing
import pandas as pd
import numpy as np
from multiprocessing import Pool
import scipy.sparse as sp
import tiktoken

In [11]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [12]:
tokens = []
for desc in sample_book_cleaned['Description']:
    tokens.append(num_tokens_from_string(desc, "cl100k_base"))

tokens


[366,
 54,
 105,
 57,
 157,
 311,
 127,
 105,
 190,
 204,
 162,
 40,
 68,
 19,
 103,
 116,
 52,
 304,
 345,
 214,
 235,
 69,
 136,
 275,
 206,
 316,
 144,
 66,
 71,
 322,
 221,
 209,
 136,
 114,
 379,
 18,
 150,
 67,
 46,
 203,
 324,
 69,
 70,
 350,
 163,
 130,
 173,
 89,
 42,
 247,
 50,
 149,
 239,
 58,
 100,
 76,
 138,
 188,
 470,
 93,
 303,
 28,
 265,
 266,
 139,
 144,
 310,
 43,
 91,
 369,
 67,
 125,
 121,
 81,
 257,
 138,
 7,
 121,
 224,
 158,
 134,
 220,
 120,
 474,
 49,
 101,
 62,
 33,
 177,
 195,
 309,
 116,
 168,
 13,
 86,
 35,
 42,
 381,
 95,
 366]

In [13]:
print(np.mean(tokens))
print(np.max(tokens))

161.33
474


In [14]:
sample_book_cleaned.iloc[:20]['Description'].values.tolist()

['Meet Isabel "Izzy" Spellman, private investigator. This twenty-eight-year-old may have a checkered past littered with romantic mistakes, excessive drinking, and creative vandalism; she may be addicted to Get Smart reruns and prefer entering homes through windows rather than doors -- but the upshot is she\'s good at her job as a licensed private investigator with her family\'s firm, Spellman Investigations. Invading people\'s privacy comes naturally to Izzy. In fact, it comes naturally to all the Spellmans. If only they could leave their work at the office. To be a Spellman is to snoop on a Spellman; tail a Spellman; dig up dirt on, blackmail, and wiretap a Spellman. <br /><br />Part Nancy Drew, part Dirty Harry, Izzy walks an indistinguishable line between Spellman family member and Spellman employee. Duties include: completing assignments from the bosses, aka Mom and Dad (preferably without scrutiny); appeasing her chronically perfect lawyer brother (often under duress); setting an 

In [15]:
%%time

def create_embeddings_in_batches(documents, batch_size, model):
    all_embeddings = []
    
    # Split the documents into batches of the specified size
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        response = openai.embeddings.create(
            input=batch,
            model=model
        )
        # Extract the embeddings from the response and add them to the list
        batch_embeddings = response.data[0].embedding
        all_embeddings.append(batch_embeddings)
    
    return all_embeddings

# Your list of 1000 documents
documents = sample_book_cleaned.iloc[:2]['Description'].values.tolist()  # Replace with your actual documents

# Call the function with your documents, desired batch size, and the model you want to use
embeddings = create_embeddings_in_batches(documents, 1, "text-embedding-ada-002")

CPU times: total: 234 ms
Wall time: 3.34 s


In [16]:
len(embeddings[0])

1536

In [17]:
item_feature_matrix_array = np.array(embeddings)

item_feature_matrix_array.shape

# Print the item feature matrix
print("Item Feature Matrix:\n", item_feature_matrix_array)

Item Feature Matrix:
 [[ 0.0017763  -0.00845794  0.00289332 ...  0.00891568 -0.01742144
  -0.02986922]
 [-0.00147497  0.01578262  0.01989394 ... -0.00914905  0.00187478
  -0.0350815 ]]


#### Embeddings Langchain

In [18]:
data = sample_book_cleaned

In [19]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = "text-embedding-ada-002"
api_key = config["OPENAI_API_KEY"]  

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=api_key 
)


  embed = OpenAIEmbeddings(


In [20]:
data

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,...,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,PagesNumber,Description,pagesNumber,Count of text reviews
54015,2167181,"The Spellman Files (The Spellmans, #1)",Lisa Lutz,1847820069,3.88,2007,12,1,Charnwood (U.K.),5:9141,...,3:7346,2:1783,1:653,total:31453,9,eng,434.0,"Meet Isabel ""Izzy"" Spellman, private investiga...",,
96931,3069617,Insight Day and Night Guide Boston,Insight Guides,9812468013,4.00,2006,7,15,Insight Guides,5:0,...,3:0,2:0,1:0,total:1,1,eng,1.0,Insight Day &amp; Night Guides let you plan yo...,,
105559,3430781,"Sixteen Candles (Terror Academy, #3)",Nicholas Pine,0749716886,3.60,1994,6,13,Mammoth,5:12,...,3:18,2:9,1:1,total:67,0,eng,174.0,<b>TERROR ACADEMY - A KILLER IS ON THE LOOSE.....,,
75202,2971345,His Wedding Ring of Revenge,Julia James,0263185966,3.52,2005,12,2,Thorndike Press,5:52,...,3:71,2:30,1:9,total:221,1,eng,288.0,Rachel Vail is still haunted by Vito Farneste'...,,
89816,1108028,All for the Union: The Civil War Diary & Lette...,Robert Hunt Rhodes,0679738282,4.12,1992,28,7,Vintage,5:303,...,3:129,2:32,1:9,total:720,30,eng,,All for the Union is the eloquent and moving d...,270.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121574,1484139,"Spinning Spells, Weaving Wonders: Modern Magic...",Patricia J. Telesco,0895948036,3.70,1996,1,3,Crossing Press,5:13,...,3:10,2:2,1:3,total:33,0,eng,,This essential book of over 300 spells tells h...,256.0,0.0
82239,1886108,Through the Medicine Cabinet (The Zack Files #2),Dan Greenburg,0448412624,3.96,1996,8,6,Grosset & Dunlap,5:115,...,3:80,2:17,1:4,total:296,26,eng,64.0,One minute I was looking for my retainer in th...,,
88653,989125,In Pursuit of His Glory,R.T. Kendall,1591854547,3.73,2004,5,5,Charisma House,5:4,...,3:7,2:0,1:1,total:22,2,eng,,Pursuing the glory of God doesn't happen overn...,310.0,2.0
20356,686352,His Own Words: Translation and Analysis of the...,Laura Mansfield,1847288804,3.70,2006,17,7,Lulu.com,5:2,...,3:3,2:1,1:0,total:10,1,eng,,Al Qaeda second in command Dr. Ayman Zawahiri ...,364.0,


In [21]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_size = 100

texts = []
metadatas = []

for i in tqdm(range(0, len(data), batch_size)):
    # get end of batch
    i_end = min(len(data), i+batch_size)
    batch = data.iloc[i:i_end]
    # first get metadata fields for this record
    
    # get the list of contexts / documents
    documents = batch['Description']
    # create document embeddings
    embeds = embed.embed_documents(documents)
    # get IDs
    ids = batch['Id'].astype(str)

  0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
ids.values

array(['2167181', '3069617', '3430781', '2971345', '1108028', '3400827',
       '1974029', '2238834', '4838239', '4413662', '2115046', '809171',
       '1367775', '2053903', '4293954', '1536012', '935995', '2334212',
       '4538410', '4055855', '2543380', '2590535', '1690826', '1708357',
       '862355', '1915342', '2827276', '2833823', '2558374', '1843325',
       '3326999', '4375030', '1327580', '873206', '1966495', '4191602',
       '3372529', '702384', '1195113', '1500830', '2993119', '1793726',
       '3142245', '3107622', '2669088', '1896340', '1131591', '1293595',
       '802241', '2092973', '2069496', '3092139', '720525', '3010560',
       '3581328', '2988400', '1639088', '3110677', '920766', '1082847',
       '849880', '1725399', '1802466', '672335', '2523448', '997117',
       '2393511', '1378138', '1802563', '1334382', '1890078', '1232729',
       '1012959', '3281091', '1962492', '3052564', '1165710', '1280983',
       '3254309', '1646536', '776434', '642169', '1375925', '6

In [23]:
item_feature_matrix_array = np.array(embeds)

# # Print the item feature matrix
print("Item Feature Matrix:\n", item_feature_matrix_array)

Item Feature Matrix:
 [[ 1.87404467e-03 -8.45967984e-03  2.94004639e-03 ...  8.97218107e-03
  -1.74796936e-02 -2.98480468e-02]
 [-1.49515339e-03  1.58131475e-02  1.99218632e-02 ... -9.10945413e-03
   1.81614676e-03 -3.51403277e-02]
 [-2.84401896e-02 -2.31814378e-02 -3.79922192e-05 ... -1.11278953e-02
  -2.65218179e-02 -9.36379815e-03]
 ...
 [ 9.45164315e-04 -1.18867824e-02  7.61414472e-03 ... -2.70094115e-02
   7.02640942e-03 -3.38641241e-02]
 [-3.27086822e-02 -8.52923471e-03 -1.40410197e-03 ... -1.12128873e-02
  -1.41572673e-03 -1.74171758e-02]
 [ 2.97835429e-03 -1.80370085e-02 -8.89759936e-04 ...  1.23561688e-02
  -2.61972570e-02 -1.95491760e-03]]


In [24]:
item_feature_matrix = pd.DataFrame(item_feature_matrix_array)
item_feature_matrix.insert(0, 'ID', ids.values)

item_feature_matrix

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,2167181,0.001874,-0.008460,0.002940,-0.018682,-0.011473,0.025652,0.016250,-0.018423,-0.011740,...,0.023630,-0.016181,-0.007038,-0.018573,-0.039415,0.014213,0.015553,0.008972,-0.017480,-0.029848
1,3069617,-0.001495,0.015813,0.019922,-0.037438,0.021530,0.025058,-0.019841,-0.017719,0.005788,...,0.030951,0.011245,0.022449,-0.016273,-0.036384,-0.029707,-0.008008,-0.009109,0.001816,-0.035140
2,3430781,-0.028440,-0.023181,-0.000038,-0.037670,0.001409,0.007365,-0.020633,-0.031928,-0.020445,...,0.017332,0.004014,0.003501,-0.019023,-0.046121,-0.003012,-0.003934,-0.011128,-0.026522,-0.009364
3,2971345,-0.027397,-0.026718,0.004749,-0.013765,-0.018491,0.007655,-0.002666,-0.026212,-0.019196,...,0.013026,-0.008586,0.004729,-0.020288,-0.021073,0.002741,-0.002880,0.006693,-0.013911,-0.008314
4,1108028,-0.025645,-0.001861,0.015499,-0.028049,-0.011846,0.008705,-0.010910,-0.025552,-0.012623,...,-0.009542,-0.023613,0.021820,-0.010239,-0.027106,0.013340,0.013639,-0.009980,0.009682,0.007915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1484139,0.017113,0.016916,-0.003973,-0.031465,-0.006657,0.010870,-0.004505,-0.058935,-0.008346,...,0.003440,-0.010502,0.034935,-0.014431,0.011612,-0.010856,0.004475,-0.020990,0.013091,-0.045555
96,1886108,-0.015918,0.000268,0.022646,-0.020857,-0.018256,0.006621,0.022370,-0.027060,0.009828,...,0.032161,0.004145,0.036688,-0.017044,-0.008753,-0.016793,-0.049392,0.008359,-0.000471,-0.010666
97,989125,0.000945,-0.011887,0.007614,-0.018108,0.010005,0.028977,-0.011913,-0.013458,-0.031592,...,-0.001390,-0.009589,-0.016906,0.002779,-0.019864,0.013551,0.003995,-0.027009,0.007026,-0.033864
98,686352,-0.032709,-0.008529,-0.001404,-0.017577,0.008669,0.021536,-0.012468,-0.017670,-0.010230,...,0.049714,-0.000745,0.009486,-0.029228,-0.042646,-0.018666,-0.013259,-0.011213,-0.001416,-0.017417


In [25]:
# Get the column names from the 2nd column until the last column
columns_to_rename = item_feature_matrix.columns[1:]

# Create a dictionary of new column names
new_column_names = {column: f'New_{column}' for column in columns_to_rename}

In [26]:
item_feature_matrix

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,2167181,0.001874,-0.008460,0.002940,-0.018682,-0.011473,0.025652,0.016250,-0.018423,-0.011740,...,0.023630,-0.016181,-0.007038,-0.018573,-0.039415,0.014213,0.015553,0.008972,-0.017480,-0.029848
1,3069617,-0.001495,0.015813,0.019922,-0.037438,0.021530,0.025058,-0.019841,-0.017719,0.005788,...,0.030951,0.011245,0.022449,-0.016273,-0.036384,-0.029707,-0.008008,-0.009109,0.001816,-0.035140
2,3430781,-0.028440,-0.023181,-0.000038,-0.037670,0.001409,0.007365,-0.020633,-0.031928,-0.020445,...,0.017332,0.004014,0.003501,-0.019023,-0.046121,-0.003012,-0.003934,-0.011128,-0.026522,-0.009364
3,2971345,-0.027397,-0.026718,0.004749,-0.013765,-0.018491,0.007655,-0.002666,-0.026212,-0.019196,...,0.013026,-0.008586,0.004729,-0.020288,-0.021073,0.002741,-0.002880,0.006693,-0.013911,-0.008314
4,1108028,-0.025645,-0.001861,0.015499,-0.028049,-0.011846,0.008705,-0.010910,-0.025552,-0.012623,...,-0.009542,-0.023613,0.021820,-0.010239,-0.027106,0.013340,0.013639,-0.009980,0.009682,0.007915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1484139,0.017113,0.016916,-0.003973,-0.031465,-0.006657,0.010870,-0.004505,-0.058935,-0.008346,...,0.003440,-0.010502,0.034935,-0.014431,0.011612,-0.010856,0.004475,-0.020990,0.013091,-0.045555
96,1886108,-0.015918,0.000268,0.022646,-0.020857,-0.018256,0.006621,0.022370,-0.027060,0.009828,...,0.032161,0.004145,0.036688,-0.017044,-0.008753,-0.016793,-0.049392,0.008359,-0.000471,-0.010666
97,989125,0.000945,-0.011887,0.007614,-0.018108,0.010005,0.028977,-0.011913,-0.013458,-0.031592,...,-0.001390,-0.009589,-0.016906,0.002779,-0.019864,0.013551,0.003995,-0.027009,0.007026,-0.033864
98,686352,-0.032709,-0.008529,-0.001404,-0.017577,0.008669,0.021536,-0.012468,-0.017670,-0.010230,...,0.049714,-0.000745,0.009486,-0.029228,-0.042646,-0.018666,-0.013259,-0.011213,-0.001416,-0.017417


In [27]:
item_feature_matrix.columns = [str(x) for x in range(-1, 1536)]

In [28]:
item_feature_matrix

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,2167181,0.001874,-0.008460,0.002940,-0.018682,-0.011473,0.025652,0.016250,-0.018423,-0.011740,...,0.023630,-0.016181,-0.007038,-0.018573,-0.039415,0.014213,0.015553,0.008972,-0.017480,-0.029848
1,3069617,-0.001495,0.015813,0.019922,-0.037438,0.021530,0.025058,-0.019841,-0.017719,0.005788,...,0.030951,0.011245,0.022449,-0.016273,-0.036384,-0.029707,-0.008008,-0.009109,0.001816,-0.035140
2,3430781,-0.028440,-0.023181,-0.000038,-0.037670,0.001409,0.007365,-0.020633,-0.031928,-0.020445,...,0.017332,0.004014,0.003501,-0.019023,-0.046121,-0.003012,-0.003934,-0.011128,-0.026522,-0.009364
3,2971345,-0.027397,-0.026718,0.004749,-0.013765,-0.018491,0.007655,-0.002666,-0.026212,-0.019196,...,0.013026,-0.008586,0.004729,-0.020288,-0.021073,0.002741,-0.002880,0.006693,-0.013911,-0.008314
4,1108028,-0.025645,-0.001861,0.015499,-0.028049,-0.011846,0.008705,-0.010910,-0.025552,-0.012623,...,-0.009542,-0.023613,0.021820,-0.010239,-0.027106,0.013340,0.013639,-0.009980,0.009682,0.007915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1484139,0.017113,0.016916,-0.003973,-0.031465,-0.006657,0.010870,-0.004505,-0.058935,-0.008346,...,0.003440,-0.010502,0.034935,-0.014431,0.011612,-0.010856,0.004475,-0.020990,0.013091,-0.045555
96,1886108,-0.015918,0.000268,0.022646,-0.020857,-0.018256,0.006621,0.022370,-0.027060,0.009828,...,0.032161,0.004145,0.036688,-0.017044,-0.008753,-0.016793,-0.049392,0.008359,-0.000471,-0.010666
97,989125,0.000945,-0.011887,0.007614,-0.018108,0.010005,0.028977,-0.011913,-0.013458,-0.031592,...,-0.001390,-0.009589,-0.016906,0.002779,-0.019864,0.013551,0.003995,-0.027009,0.007026,-0.033864
98,686352,-0.032709,-0.008529,-0.001404,-0.017577,0.008669,0.021536,-0.012468,-0.017670,-0.010230,...,0.049714,-0.000745,0.009486,-0.029228,-0.042646,-0.018666,-0.013259,-0.011213,-0.001416,-0.017417


In [29]:
item_feature_matrix.to_parquet('../data/item_feature_matrix_sample.parquet')

#### Generate Embeddings Spark

In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.functions import PandasUDFType
from pyspark.sql.types import StructType, StructField, LongType, ArrayType, FloatType
import pandas as pd
import openai
import numpy as np

In [31]:
spark = SparkSession.builder.appName("OpenAIEmbeddings").getOrCreate()

In [32]:
schema = StructType([
    StructField("book_id", LongType(), False),
    StructField("vector_embeddings", ArrayType(FloatType()), False)
])

In [33]:
@pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)
def generate_embeddings(doc):
    # List to hold embeddings
    embeddings_list = []

    # Batch request to OpenAI embed API
    # Please note that OpenAI has a max batch size, so consider that here.
    responses = openai.embeddings.create(
        model="text-embedding-ada-002",  
        input=doc['Description'].tolist()
    )

    # Retrieve the embeddings from OpenAI responses
    for response in responses.data[0]:
        embeddings_list.append(response.embedding)

    # Return the new DataFrame
    return pd.DataFrame({
        # 'book_id': doc['book_id'],
        # 'vector_embeddings': embeddings_list
        'test': input
    })


In [34]:

# sample_book_cleaned.to_csv('../data/sample_books_cleaned.csv')

df = spark.read.csv("../data/sample_books_cleaned.csv", header=True, inferSchema=True)



In [35]:
repartitioned_df = df.repartition(10)

In [36]:
spark.stop()

#### Embeddings FlagModel

In [37]:
import dask.dataframe as dd
from dask.multiprocessing import get

In [38]:
df = sample_book_cleaned.copy()

In [39]:
ddata = dd.from_pandas(df[['Id', 'Description']], npartitions=30)
ddata

Unnamed: 0_level_0,Id,Description
npartitions=30,Unnamed: 1_level_1,Unnamed: 2_level_1
632,int64,string
2685,...,...
...,...,...
115572,...,...
121574,...,...


In [40]:
from FlagEmbedding import FlagModel

model = FlagModel('BAAI/bge-large-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                  use_fp16=False)




In [41]:
def embed_text(doc):
    return model.encode(doc)

In [42]:
ddata['DescEmbeddings'] = ddata['Description'].apply(embed_text, meta = ('Description', 'object'))

In [43]:
embeddings_df = ddata.compute()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [44]:
embeddings_df

Unnamed: 0,Id,Description,DescEmbeddings
632,1915342,"Never before has there been a comprehensive, i...","[0.019671878, -0.010680034, 0.026023319, -0.01..."
1749,1945041,"This trilogy tells the story of Orlando King, ...","[0.027494779, -0.029625587, 0.006587275, 0.007..."
2327,1962492,This anthology contains: Introduction by Rober...,"[0.016765222, -0.034488436, 0.016900204, 0.038..."
2448,1966495,History and fiction collide with deadly conseq...,"[0.03885193, -0.04640678, 0.0014486656, 0.0191..."
2685,1974029,Australia has been invaded. Nothing is as it w...,"[0.03403786, -0.005640195, -0.02139525, 0.0319..."
...,...,...,...
113806,4458655,"This anthology contains a wide range of poems,...","[-0.038541503, -0.00971655, 0.06514756, -0.008..."
114648,4538410,Explore the daily lives of ancient Egyptians i...,"[-0.007341509, 0.01686493, 0.00946882, -0.0080..."
115572,4619663,"This unforgettable tale, illustrated by Caldec...","[0.016770002, 0.0072702975, 0.0013750584, 0.01..."
117988,4838239,"Out of Print<br /><br /><i>""A book in The Huss...","[0.033698123, 0.015909018, -0.022079704, 0.050..."


In [45]:
embeddings = pd.DataFrame(embeddings_df['DescEmbeddings'].tolist())
embeddings.columns = [str(x) for x in range(embeddings.shape[1])]
embeddings.insert(0, 'Id', embeddings_df['Id'].tolist())
embeddings

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,1915342,0.019672,-0.010680,0.026023,-0.012980,-0.011049,-0.042468,-0.027985,-0.046368,0.013330,...,0.039828,0.017217,-0.034920,0.008253,0.021344,0.012759,-0.014850,-0.001685,0.029440,0.026171
1,1945041,0.027495,-0.029626,0.006587,0.007185,-0.002153,-0.036707,-0.075638,-0.008425,0.027270,...,0.030011,0.028950,-0.026864,-0.002844,0.017709,0.022555,-0.029902,-0.012736,-0.017359,0.018013
2,1962492,0.016765,-0.034488,0.016900,0.038029,0.000625,-0.050714,-0.033872,-0.014772,0.014385,...,0.035867,0.005906,0.002038,-0.013409,-0.002867,0.022738,-0.001784,-0.009821,0.041732,-0.028875
3,1966495,0.038852,-0.046407,0.001449,0.019194,0.012761,-0.079849,-0.044601,-0.027145,-0.005161,...,0.058807,-0.001943,-0.023606,-0.014104,0.037743,0.029775,-0.005517,0.018269,0.012411,-0.004323
4,1974029,0.034038,-0.005640,-0.021395,0.031956,0.015899,-0.051512,-0.046775,0.019667,0.008174,...,0.016996,-0.014675,-0.025072,0.005797,-0.014186,0.036789,0.014259,-0.022536,-0.013623,0.002711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4458655,-0.038542,-0.009717,0.065148,-0.008847,-0.005102,-0.016512,0.027890,0.031256,0.025674,...,0.042025,-0.032761,-0.027181,-0.039687,-0.005633,0.033426,0.018797,-0.007476,0.016977,-0.040620
96,4538410,-0.007342,0.016865,0.009469,-0.008030,-0.013015,-0.001679,-0.056378,0.017555,0.044251,...,0.055220,-0.012609,-0.042055,0.003586,0.043074,0.036913,0.009439,-0.027818,-0.020053,-0.007549
97,4619663,0.016770,0.007270,0.001375,0.019624,0.012343,-0.048060,-0.019651,0.016987,0.033504,...,0.053097,-0.016903,0.026127,0.003713,0.005002,0.037359,0.023257,0.008849,0.017564,0.004834
98,4838239,0.033698,0.015909,-0.022080,0.050326,0.008085,-0.007000,-0.007634,0.031579,0.029642,...,0.049828,-0.026999,-0.011276,0.016474,0.026631,0.024221,0.026163,0.017199,0.049599,-0.024829


In [46]:
embeddings.to_parquet('../data/item_feature_matrix_sample_dask.parquet')

### Generate Title Embeddings

In [47]:
ddata = dd.from_pandas(df[['Id', 'Name']], npartitions=30)
ddata

Unnamed: 0_level_0,Id,Name
npartitions=30,Unnamed: 1_level_1,Unnamed: 2_level_1
632,int64,string
2685,...,...
...,...,...
115572,...,...
121574,...,...


In [48]:
embeddings = df['Description'].apply(embed_text)
embeddings

54015     [0.016871104, -0.0142891975, 0.006068613, 0.00...
96931     [0.011660191, 0.015110547, -0.031430822, -0.02...
105559    [0.044182006, -0.015399105, -0.0057496214, 0.0...
75202     [0.01951505, -0.032270074, -0.008384888, 0.033...
89816     [0.025072854, -0.027432684, -0.016972603, 0.01...
                                ...                        
121574    [-0.039292198, 0.023619933, -0.014403005, -0.0...
82239     [0.0017016948, -0.0034102353, -0.036388166, 0....
88653     [0.018419713, -0.033249404, -0.007858549, 0.01...
20356     [0.042169802, 0.004726366, 0.012943066, -0.026...
34548     [0.008475209, 0.012764636, -0.041651383, 0.013...
Name: Description, Length: 100, dtype: object

In [49]:
embeddings_df = pd.DataFrame(embeddings.tolist())
embeddings_df.columns = [str(x) for x in range(embeddings_df.shape[1])]
embeddings_df.insert(0, 'Id', df['Id'].tolist())
embeddings_df

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,2167181,0.016871,-0.014289,0.006069,0.006901,-0.000575,-0.028419,-0.031998,0.006701,-0.006108,...,0.044029,-0.006014,-0.015839,0.001971,0.007722,0.019793,-0.040538,0.027163,0.012334,0.028178
1,3069617,0.011660,0.015111,-0.031431,-0.029204,-0.023611,-0.040591,-0.004589,0.048419,0.017929,...,0.013596,-0.044199,-0.024734,-0.050729,0.038620,0.031805,0.026043,-0.002245,-0.015313,0.005503
2,3430781,0.044182,-0.015399,-0.005750,0.054954,0.000253,-0.021884,0.004243,0.023303,-0.072284,...,0.038711,-0.027899,0.026158,0.014439,0.062457,0.030255,-0.004003,0.022742,-0.028293,-0.064190
3,2971345,0.019515,-0.032270,-0.008385,0.033657,-0.025756,-0.023265,-0.046986,0.032251,0.018391,...,0.026283,0.009768,-0.036358,-0.040202,0.054221,0.023555,0.027178,-0.010965,0.004345,0.005805
4,1108028,0.025073,-0.027433,-0.016973,0.018087,-0.019807,-0.019906,0.004375,0.021954,0.034171,...,0.021489,0.024228,-0.027435,-0.031106,0.009198,0.007005,0.010375,-0.012862,0.011688,0.014282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1484139,-0.039292,0.023620,-0.014403,-0.006401,-0.004513,-0.038431,0.038959,-0.014227,-0.002381,...,0.025358,-0.020934,0.049738,-0.020004,0.036521,-0.001323,0.059343,0.016928,0.026968,0.025706
96,1886108,0.001702,-0.003410,-0.036388,0.054165,-0.033801,-0.044523,0.005675,0.019056,-0.013114,...,0.041248,-0.039679,-0.002356,-0.013519,0.000028,-0.014913,-0.000805,-0.007656,0.033420,-0.034740
97,989125,0.018420,-0.033249,-0.007859,0.010590,-0.012348,0.018414,-0.007415,0.011025,0.026387,...,0.044350,0.005563,-0.028686,-0.068962,0.016592,0.031268,0.002999,0.002450,-0.036882,0.036958
98,686352,0.042170,0.004726,0.012943,-0.026190,0.006399,-0.037676,-0.004517,0.016114,-0.011290,...,0.024825,-0.033248,0.018162,-0.048114,0.004966,0.032374,-0.065015,0.022777,-0.002493,-0.014655


In [50]:
ddata['TitleEmbeddings'] = ddata['Name'].apply(embed_text, meta = ('Name', 'object'))

In [51]:
embeddings_df = ddata.compute()

In [52]:
embeddings = pd.DataFrame(embeddings_df['TitleEmbeddings'].tolist())
embeddings.columns = [str(x) for x in range(embeddings.shape[1])]
embeddings.insert(0, 'Id', embeddings_df['Id'].tolist())
embeddings

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,1915342,0.031399,-0.024493,0.012370,-0.005739,-0.020743,-0.059919,-0.036419,-0.042084,0.004719,...,0.024711,-0.001506,-0.006059,0.007519,0.027024,0.010831,-0.031846,0.011259,0.021093,0.027407
1,1945041,0.021745,-0.016952,0.020417,-0.011403,-0.010153,-0.018079,-0.024769,-0.007973,0.029864,...,0.030106,0.032769,-0.001990,-0.008776,0.013326,0.001546,-0.034584,-0.018282,-0.000191,-0.004799
2,1962492,0.025225,-0.000295,0.012156,0.021322,0.011494,-0.034446,-0.041135,-0.026693,-0.007239,...,0.002216,0.001882,0.005547,-0.026220,0.022226,0.010203,0.041053,-0.012110,0.033955,-0.021945
3,1966495,0.007741,-0.015637,0.016903,0.004039,-0.011639,-0.096463,-0.013699,-0.000347,-0.014831,...,-0.001014,-0.012654,-0.033562,-0.036246,0.027374,0.019149,-0.010276,0.008718,-0.002202,0.000216
4,1974029,0.022673,0.011059,-0.042224,0.008642,-0.015433,-0.048572,-0.060135,0.021256,-0.009187,...,0.008759,-0.006051,0.009320,-0.003143,-0.022458,0.049964,0.016951,-0.028461,-0.040786,-0.018901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4458655,-0.031841,-0.023764,0.045918,-0.046154,0.019710,-0.036336,0.028953,0.024803,0.021356,...,0.036628,-0.007497,0.018323,-0.039291,0.035235,0.039669,-0.013801,-0.007832,0.004409,0.004863
96,4538410,0.010488,0.020760,0.010961,-0.020280,-0.020682,0.006781,-0.055386,-0.002088,0.041425,...,0.065677,-0.009558,0.012826,0.000350,0.037277,0.017887,-0.009633,-0.018048,-0.024166,0.011467
97,4619663,0.010956,-0.012829,0.034672,0.012926,0.021252,-0.019998,-0.011179,0.040224,0.054586,...,0.049847,0.002667,0.014616,-0.001504,0.043358,0.017657,0.044806,-0.018140,0.025231,-0.001165
98,4838239,0.020996,0.021444,-0.001163,0.008398,-0.021590,-0.029452,0.008113,0.044567,0.019881,...,0.013262,-0.034436,0.016029,0.008735,0.014723,0.070592,0.010350,-0.019286,0.000652,-0.007991


In [53]:
embeddings.to_parquet('../data/item_feature_matrix_sample_dask_title.parquet')

### Item Similarity

In [None]:
from math import pow, sqrt
from decimal import Decimal
import multiprocessing as mp
import pandas as pd
import numpy as np
from itertools import repeat


#### Description

In [56]:
item_feature_matrix_desc = pd.read_parquet('../data/item_feature_matrix_sample_dask.parquet')

In [57]:
embeddings_1 = item_feature_matrix_desc.iloc[:, 1:]
embeddings_2 = item_feature_matrix_desc.iloc[:, 1:]

In [58]:
similarity_desc = embeddings_1 @ embeddings_2.T
similarity_desc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.526837,0.632033,0.695036,0.609768,0.432785,0.603111,0.553024,0.586080,0.629477,...,0.522749,0.443133,0.566124,0.513378,0.491066,0.565221,0.555311,0.440486,0.587951,0.451447
1,0.526837,1.000000,0.492372,0.584184,0.546634,0.520349,0.472886,0.479329,0.565793,0.536692,...,0.389790,0.382277,0.400010,0.468905,0.371932,0.443957,0.454804,0.472686,0.444640,0.357570
2,0.632033,0.492372,1.000000,0.562633,0.582954,0.423598,0.555454,0.503359,0.591041,0.566976,...,0.491422,0.434630,0.477383,0.452821,0.414232,0.564213,0.533176,0.445573,0.574720,0.380326
3,0.695036,0.584184,0.562633,1.000000,0.662237,0.473375,0.594471,0.564176,0.618939,0.591548,...,0.476985,0.472241,0.495778,0.481265,0.414493,0.448498,0.557470,0.503486,0.609517,0.397962
4,0.609768,0.546634,0.582954,0.662237,1.000000,0.541586,0.602264,0.572614,0.651075,0.570457,...,0.463764,0.503837,0.522584,0.566721,0.449153,0.512617,0.552613,0.492092,0.629950,0.385611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.565221,0.443957,0.564213,0.448498,0.512617,0.393384,0.577754,0.576664,0.511505,0.587230,...,0.458717,0.459695,0.474869,0.503496,0.454622,1.000000,0.496634,0.470338,0.484930,0.430949
96,0.555311,0.454804,0.533176,0.557470,0.552613,0.455830,0.524547,0.523626,0.557235,0.524120,...,0.601306,0.465644,0.579993,0.547912,0.578587,0.496634,1.000000,0.394735,0.549727,0.497715
97,0.440486,0.472686,0.445573,0.503486,0.492092,0.400361,0.510945,0.496534,0.501005,0.476173,...,0.328709,0.510738,0.421188,0.420240,0.294488,0.470338,0.394735,1.000000,0.444225,0.429590
98,0.587951,0.444640,0.574720,0.609517,0.629950,0.481019,0.560960,0.545550,0.671934,0.552092,...,0.443256,0.387623,0.500391,0.442378,0.420971,0.484930,0.549727,0.444225,1.000000,0.404850


#### Title

In [59]:
item_feature_matrix_title = pd.read_parquet('../data/item_feature_matrix_sample_dask_title.parquet')

In [60]:
item_feature_matrix_title

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,1915342,0.031399,-0.024493,0.012370,-0.005739,-0.020743,-0.059919,-0.036419,-0.042084,0.004719,...,0.024711,-0.001506,-0.006059,0.007519,0.027024,0.010831,-0.031846,0.011259,0.021093,0.027407
1,1945041,0.021745,-0.016952,0.020417,-0.011403,-0.010153,-0.018079,-0.024769,-0.007973,0.029864,...,0.030106,0.032769,-0.001990,-0.008776,0.013326,0.001546,-0.034584,-0.018282,-0.000191,-0.004799
2,1962492,0.025225,-0.000295,0.012156,0.021322,0.011494,-0.034446,-0.041135,-0.026693,-0.007239,...,0.002216,0.001882,0.005547,-0.026220,0.022226,0.010203,0.041053,-0.012110,0.033955,-0.021945
3,1966495,0.007741,-0.015637,0.016903,0.004039,-0.011639,-0.096463,-0.013699,-0.000347,-0.014831,...,-0.001014,-0.012654,-0.033562,-0.036246,0.027374,0.019149,-0.010276,0.008718,-0.002202,0.000216
4,1974029,0.022673,0.011059,-0.042224,0.008642,-0.015433,-0.048572,-0.060135,0.021256,-0.009187,...,0.008759,-0.006051,0.009320,-0.003143,-0.022458,0.049964,0.016951,-0.028461,-0.040786,-0.018901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4458655,-0.031841,-0.023764,0.045918,-0.046154,0.019710,-0.036336,0.028953,0.024803,0.021356,...,0.036628,-0.007497,0.018323,-0.039291,0.035235,0.039669,-0.013801,-0.007832,0.004409,0.004863
96,4538410,0.010488,0.020760,0.010961,-0.020280,-0.020682,0.006781,-0.055386,-0.002088,0.041425,...,0.065677,-0.009558,0.012826,0.000350,0.037277,0.017887,-0.009633,-0.018048,-0.024166,0.011467
97,4619663,0.010956,-0.012829,0.034672,0.012926,0.021252,-0.019998,-0.011179,0.040224,0.054586,...,0.049847,0.002667,0.014616,-0.001504,0.043358,0.017657,0.044806,-0.018140,0.025231,-0.001165
98,4838239,0.020996,0.021444,-0.001163,0.008398,-0.021590,-0.029452,0.008113,0.044567,0.019881,...,0.013262,-0.034436,0.016029,0.008735,0.014723,0.070592,0.010350,-0.019286,0.000652,-0.007991


In [61]:
embeddings_1 = item_feature_matrix_title.iloc[:, 1:]
embeddings_2 = item_feature_matrix_title.iloc[:, 1:]

In [62]:
similarity_title = embeddings_1 @ embeddings_2.T
similarity_title

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.603575,0.632853,0.625821,0.571715,0.540333,0.617729,0.547926,0.434458,0.489653,...,0.490651,0.585662,0.407597,0.489123,0.385287,0.560283,0.517048,0.553936,0.442694,0.603432
1,0.603575,1.000000,0.583939,0.539317,0.536228,0.549631,0.568717,0.549124,0.523107,0.443252,...,0.469722,0.524559,0.447283,0.453682,0.453342,0.534956,0.488407,0.528275,0.415996,0.544937
2,0.632853,0.583939,1.000000,0.549669,0.640465,0.525406,0.593067,0.608517,0.517557,0.439012,...,0.556982,0.696019,0.428358,0.514378,0.443148,0.525795,0.545870,0.561426,0.486109,0.578829
3,0.625821,0.539317,0.549669,1.000000,0.523907,0.553211,0.533510,0.547866,0.526313,0.508759,...,0.499189,0.627802,0.456117,0.493426,0.413888,0.489729,0.487010,0.511205,0.480534,0.558408
4,0.571715,0.536228,0.640465,0.523907,1.000000,0.614154,0.468622,0.503808,0.484233,0.483190,...,0.427203,0.511247,0.450502,0.426020,0.401861,0.480122,0.487839,0.486194,0.437768,0.479307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.560283,0.534956,0.525795,0.489729,0.480122,0.408746,0.591846,0.583034,0.475705,0.499321,...,0.491275,0.581280,0.427657,0.530307,0.387216,1.000000,0.509033,0.695385,0.437482,0.565132
96,0.517048,0.488407,0.545870,0.487010,0.487839,0.460519,0.486506,0.515377,0.446156,0.436572,...,0.484255,0.566871,0.457026,0.491910,0.421373,0.509033,1.000000,0.544015,0.396011,0.598004
97,0.553936,0.528275,0.561426,0.511205,0.486194,0.513256,0.607761,0.598682,0.511941,0.455125,...,0.470722,0.570665,0.383905,0.503044,0.436734,0.695385,0.544015,1.000000,0.430655,0.593926
98,0.442694,0.415996,0.486109,0.480534,0.437768,0.463487,0.542893,0.402456,0.444004,0.463148,...,0.490637,0.456227,0.451801,0.375590,0.437331,0.437482,0.396011,0.430655,1.000000,0.421810


##### Non-parallel

In [63]:
embeddings = embeddings_1
similarity_matrix = np.matrix(np.zeros((embeddings.shape[0], embeddings.shape[0],)))
for i in range(0,embeddings.shape[0]):
    for j in range(0,embeddings.shape[0]):
        similarity_matrix[i, j] = np.sum(embeddings.iloc[i, :] * embeddings.iloc[j, :])

In [66]:
final = pd.DataFrame(similarity_matrix)
final.columns = item_feature_matrix_title['Id'].tolist()
final.index = item_feature_matrix_title['Id'].tolist()
final.index = final_similarity_matrix.columns.astype('str')
final.index = final_similarity_matrix.index.astype('str')
final


Unnamed: 0,1915342,1945041,1962492,1966495,1974029,1629841,1630607,1639088,1646536,1690826,...,4055855,4191602,4293954,4375030,4413662,4458655,4538410,4619663,4838239,1484139
0,1.000000,0.603575,0.632853,0.625821,0.571715,0.540333,0.617729,0.547926,0.434458,0.489653,...,0.490651,0.585662,0.407597,0.489123,0.385287,0.560283,0.517048,0.553936,0.442694,0.603432
1,0.603575,1.000000,0.583939,0.539317,0.536228,0.549631,0.568717,0.549124,0.523107,0.443252,...,0.469722,0.524559,0.447283,0.453682,0.453342,0.534956,0.488407,0.528275,0.415996,0.544937
2,0.632853,0.583939,1.000000,0.549668,0.640465,0.525406,0.593067,0.608517,0.517557,0.439012,...,0.556982,0.696019,0.428358,0.514378,0.443148,0.525795,0.545870,0.561426,0.486109,0.578829
3,0.625821,0.539317,0.549668,1.000000,0.523907,0.553211,0.533510,0.547866,0.526313,0.508760,...,0.499189,0.627802,0.456117,0.493426,0.413888,0.489729,0.487010,0.511205,0.480534,0.558408
4,0.571715,0.536228,0.640465,0.523907,1.000000,0.614154,0.468622,0.503808,0.484233,0.483190,...,0.427203,0.511247,0.450502,0.426020,0.401861,0.480122,0.487839,0.486194,0.437768,0.479307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.560283,0.534956,0.525795,0.489729,0.480122,0.408746,0.591846,0.583034,0.475705,0.499321,...,0.491275,0.581280,0.427657,0.530307,0.387216,1.000000,0.509033,0.695385,0.437482,0.565132
96,0.517048,0.488407,0.545870,0.487010,0.487839,0.460519,0.486506,0.515377,0.446156,0.436572,...,0.484255,0.566871,0.457025,0.491910,0.421373,0.509033,1.000000,0.544015,0.396011,0.598004
97,0.553936,0.528275,0.561426,0.511205,0.486194,0.513256,0.607761,0.598682,0.511941,0.455125,...,0.470722,0.570665,0.383904,0.503044,0.436735,0.695385,0.544015,1.000000,0.430655,0.593926
98,0.442694,0.415996,0.486109,0.480534,0.437768,0.463487,0.542894,0.402456,0.444004,0.463149,...,0.490637,0.456227,0.451801,0.375590,0.437331,0.437482,0.396011,0.430655,1.000000,0.421810


#### Final Similarity Matrix

In [69]:
final_similarity_matrix = 0.4*similarity_title + 0.6*similarity_desc
final_similarity_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.557532,0.632361,0.667350,0.594547,0.475804,0.608958,0.550985,0.525431,0.573547,...,0.509910,0.500145,0.502713,0.503676,0.448755,0.563246,0.540006,0.485866,0.529848,0.512241
1,0.557532,1.000000,0.528999,0.566237,0.542471,0.532062,0.511218,0.507247,0.548719,0.499316,...,0.421763,0.439190,0.418919,0.462816,0.404496,0.480357,0.468245,0.494922,0.433183,0.432516
2,0.632361,0.528999,1.000000,0.557447,0.605958,0.464321,0.570499,0.545422,0.561647,0.515791,...,0.517646,0.539186,0.457773,0.477443,0.425798,0.548846,0.538254,0.491914,0.539276,0.459727
3,0.667350,0.566237,0.557447,1.000000,0.606905,0.505309,0.570087,0.557652,0.581889,0.558433,...,0.485867,0.534465,0.479913,0.486130,0.414251,0.464990,0.529286,0.506574,0.557924,0.462141
4,0.594547,0.542471,0.605958,0.606905,1.000000,0.570613,0.548807,0.545092,0.584338,0.535550,...,0.449139,0.506801,0.493751,0.510440,0.430236,0.499619,0.526703,0.489733,0.553077,0.423089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.563246,0.480357,0.548846,0.464990,0.499619,0.399529,0.583391,0.579212,0.497185,0.552067,...,0.471740,0.508329,0.455984,0.514220,0.427659,1.000000,0.501593,0.560357,0.465951,0.484622
96,0.540006,0.468245,0.538254,0.529286,0.526703,0.457705,0.509331,0.520326,0.512803,0.489101,...,0.554485,0.506135,0.530806,0.525511,0.515702,0.501593,1.000000,0.454447,0.488241,0.537831
97,0.485866,0.494922,0.491914,0.506574,0.489733,0.445519,0.549672,0.537394,0.505379,0.467754,...,0.385514,0.534709,0.406275,0.453362,0.351386,0.560357,0.454447,1.000000,0.438797,0.495324
98,0.529848,0.433183,0.539276,0.557924,0.553077,0.474006,0.553733,0.488312,0.580762,0.516515,...,0.462208,0.415065,0.480955,0.415663,0.427515,0.465951,0.488241,0.438797,1.000000,0.411634


In [70]:
final_similarity_matrix.columns = item_feature_matrix_title['Id'].tolist()
final_similarity_matrix.index = item_feature_matrix_title['Id'].tolist()
final_similarity_matrix.columns = final_similarity_matrix.columns.astype('str')
final_similarity_matrix.index = final_similarity_matrix.index.astype('str')

In [86]:
final_similarity_matrix.to_parquet('../data/final_item_similarity_matrix_sample.parquet')

### Create User Item Matrix (Ratings)

In [71]:
existing_sample_books = final_similarity_matrix.columns.tolist()
existing_sample_books = [str(x) for x in existing_sample_books]

In [72]:
user_rating_df = pd.read_parquet('../data/user_rating_total.parquet')
user_rating_df

Unnamed: 0,ID,Name,book_id,Rating
0,1,Agile Web Development with Rails: A Pragmatic ...,45.0,5
1,1,The Restaurant at the End of the Universe (Hit...,862825.0,5
2,1,Siddhartha,1943716.0,5
3,1,"The Hunger Games (The Hunger Games, #1)",2767052.0,5
4,1,"The Clue in the Embers (Hardy Boys, #35)",76932.0,5
...,...,...,...,...
231419,5403,"The MacGregors: Alan & Grant (The MacGregors, ...",26055.0,2
231420,5403,The MacGregors: Serena & Caine (The MacGregors...,26054.0,2
231421,5403,Time and Again: Time Was / Times Change,40530.0,2
231422,5403,"Dance Upon The Air (Three Sisters Island, #1)",685379.0,4


In [73]:
user_rating_df_cleaned = user_rating_df.drop_duplicates()
user_rating_df_cleaned.dropna(inplace=True)
user_rating_df_cleaned.reset_index(drop = True, inplace = True)


In [74]:
user_rating_df_cleaned.book_id = user_rating_df_cleaned.book_id.astype('int').astype('str')
user_rating_df_cleaned

Unnamed: 0,ID,Name,book_id,Rating
0,1,Agile Web Development with Rails: A Pragmatic ...,45,5
1,1,The Restaurant at the End of the Universe (Hit...,862825,5
2,1,Siddhartha,1943716,5
3,1,"The Hunger Games (The Hunger Games, #1)",2767052,5
4,1,"The Clue in the Embers (Hardy Boys, #35)",76932,5
...,...,...,...,...
231419,5403,"The MacGregors: Alan & Grant (The MacGregors, ...",26055,2
231420,5403,The MacGregors: Serena & Caine (The MacGregors...,26054,2
231421,5403,Time and Again: Time Was / Times Change,40530,2
231422,5403,"Dance Upon The Air (Three Sisters Island, #1)",685379,4


In [75]:
sample_user_rating = user_rating_df_cleaned[user_rating_df_cleaned['book_id'].isin(existing_sample_books)]

In [76]:
sample_user_rating

Unnamed: 0,ID,Name,book_id,Rating
7094,192,The Best of Fritz Leiber,606215,4
7430,201,"The Well of Ascension (Mistborn, #2)",2115046,4
10570,284,"Incubus Dreams (Anita Blake, Vampire Hunter, #12)",1080513,1
18443,464,"Incubus Dreams (Anita Blake, Vampire Hunter, #12)",1080513,3
26325,678,"Incubus Dreams (Anita Blake, Vampire Hunter, #12)",1080513,5
28744,768,"Wild Cards (Wild Cards, #1)",776434,4
30132,853,"The Well of Ascension (Mistborn, #2)",2115046,3
30545,855,"The Well of Ascension (Mistborn, #2)",2115046,4
76619,6845,"The Well of Ascension (Mistborn, #2)",2115046,4
76620,7396,"The Well of Ascension (Mistborn, #2)",2115046,3


In [77]:
sample_user_rating.book_id = sample_user_rating.book_id.astype('int').astype('str')
sample_user_rating.ID = sample_user_rating.ID.astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_user_rating.book_id = sample_user_rating.book_id.astype('int').astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_user_rating.ID = sample_user_rating.ID.astype('str')


In [78]:
sample_user_rating.groupby(['ID', 'book_id']).size().reset_index().rename(columns={0:'count'}).sort_values('count')

Unnamed: 0,ID,book_id,count
0,1045,2115046,1
28,4878,2993119,1
29,5111,1375925,1
30,5192,2115046,1
31,5527,1375925,1
32,5869,1080513,1
33,678,1080513,1
34,6845,2115046,1
35,7268,1080513,1
36,7396,2115046,1


In [79]:
ddata = dd.from_pandas(sample_user_rating[['ID', 'book_id', 'Rating']], npartitions=30)
ddata_aggregated = ddata.groupby(['ID', 'book_id']).aggregate('mean')

In [80]:
ddata_aggregated

Unnamed: 0_level_0,Rating
npartitions=1,Unnamed: 1_level_1
,float64
,...


In [81]:
df_aggregated = ddata_aggregated.compute().reset_index()
user_item_matrix = df_aggregated.pivot(index='ID', columns='book_id', values='Rating').fillna(0)

In [83]:
# Apply mean only to the 'Rating' column
df_aggregated = sample_user_rating.groupby(['ID', 'book_id'])['Rating'].mean().reset_index()

In [84]:
user_item_matrix = df_aggregated.pivot(index='ID', columns='book_id', values='Rating').fillna(0)
user_item_matrix

book_id,1080513,1375925,1646536,1690826,1843325,2115046,2238834,2993119,606215,776434,849880,920766
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1045,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
10560,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
10857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
1326,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1327,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
1455,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1784,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
1829,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
201,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
user_item_matrix.to_parquet('../data/user_item_matrix_sample.parquet')