## Review embeddings

This notebook computes an SBERT embedding for each individual review and then averages the review embeddings per book.

In [1]:
import pandas as pd
import numpy as np

#### Read and explore the dataset

In [2]:
# Load the review table; the path is relative to the notebook's working directory.
goodreads_df = pd.read_csv('goodreads_reviews.csv')

In [3]:
goodreads_df.shape  # (rows, columns): 1,378,033 reviews, 6 columns

(1378033, 6)

In [4]:
# Preview the first rows to see what each column holds.
goodreads_df.head()

Unnamed: 0,user_id,book_id,rating,review_text,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,18245960,5,This is a special book. It started slow for ab...,28,1
1,8842281e1d1347389f2ab93d60773d4d,16981,3,Recommended by Don Katz. Avail for free in Dec...,1,0
2,8842281e1d1347389f2ab93d60773d4d,28684704,3,"A fun, fast paced science fiction thriller. I ...",22,0
3,8842281e1d1347389f2ab93d60773d4d,27161156,0,Recommended reading to understand what is goin...,5,1
4,8842281e1d1347389f2ab93d60773d4d,25884323,4,"I really enjoyed this book, and there is a lot...",9,1


In [32]:
# Count the reviews written for one example book (book_id 16981).
# Series.sum() adds up the boolean mask in vectorized code; the built-in sum()
# would iterate over every row of the frame in pure Python.
goodreads_df['book_id'].eq(16981).sum()

90

In [5]:
# Inspect column dtypes: ids, ratings and counts are int64; user_id and review_text are object.
goodreads_df.dtypes

user_id        object
book_id         int64
rating          int64
review_text    object
n_votes         int64
n_comments      int64
dtype: object

In [None]:
goodreads_df['book_id'].nunique()  # number of distinct books reviewed: 25,475

25475

## Apply the embeddings per book

#### Load the embedding model (We will use SBERT)

Code adapted from https://www.sbert.net/

In [35]:
from sentence_transformers import SentenceTransformer




In [36]:
# Instantiate the sentence-embedding model from the "all-MiniLM-L6-v2" checkpoint name.
# NOTE(review): presumably fetched from the model hub on first use if not cached — confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")



### Get the embeddings

Create a dictionary that maps each book ID to the list of its review texts.

In [26]:
# Map each book_id to the list of its review texts.
# Rows with a missing review_text are dropped first: NaN is a float, not text,
# so it must not be handed to the sentence encoder or averaged in as a "review".
# A book whose reviews are all missing therefore gets no entry in the mapping.
book2review = (
    goodreads_df
    .dropna(subset=['review_text'])
    .groupby('book_id')['review_text']
    .apply(list)
    .to_dict()
)

In [87]:
# Start from an empty mapping of book_id -> averaged review embedding.
book2embeddings = dict()

In [None]:
# For every book: embed each of its reviews, then collapse the resulting
# per-review vectors into one vector by averaging along axis 0.
for book_id in book2review:
    review_vectors = model.encode(book2review[book_id])  # one embedding per review
    book2embeddings[book_id] = review_vectors.mean(axis=0)  # mean over the book's reviews

#### Save the object to disk

In [91]:
import pickle

# Serialize the book_id -> embedding mapping so the encoding step does not
# have to be repeated in later sessions.
with open('book_embeddings.pkl', 'wb') as file:
    pickle.Pickler(file).dump(book2embeddings)

In [1]:
# Read the saved book_id -> embedding mapping back from disk.
# NOTE: unpickling can execute arbitrary code; only load files you created or trust.
import pickle

with open('book_embeddings.pkl', 'rb') as file:
    loaded_book_embeddings = pickle.Unpickler(file).load()

In [4]:
# Sanity check: the averaged vector stored under key 1 should be one-dimensional.
loaded_book_embeddings[1].shape

(384,)