Imports

In [6]:
import numpy as np
import pickle
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Embedding, Flatten, Dot, Dense
from keras.optimizers import Adam, RMSprop, SGD
from keras.callbacks import ModelCheckpoint
from keras.losses import MeanAbsoluteError

# Load data
We read the CSV file and load it into a pandas DataFrame. 

In [7]:
data = pd.read_csv('../Data/training/ratings.csv')

books_df = pd.read_csv('../Data/training/books.csv')
book_id_to_name = pd.Series(books_df.title.values, index = books_df.index).to_dict()

# Get unique user IDs from the ratings data
user_ids = data['user_id'].unique()
user_ids = sorted(user_ids)

# Create a DataFrame with user IDs
users = pd.DataFrame({'user_id': user_ids})

# Save the users DataFrame to a users.csv file
users.to_csv('../Data/users.csv', index=False)


We print the first few records and a summary of the data for a quick examination.

In [8]:
print(data.head())
print(data.describe())

   book_id  user_id  rating
0        1      314       5
1        1      439       3
2        1      588       5
3        1     1169       4
4        1     1185       4
             book_id        user_id         rating
count  981756.000000  981756.000000  981756.000000
mean     4943.275636   25616.759933       3.856534
std      2873.207415   15228.338826       0.983941
min         1.000000       1.000000       1.000000
25%      2457.000000   12372.000000       3.000000
50%      4921.000000   25077.000000       4.000000
75%      7414.000000   38572.000000       5.000000
max     10000.000000   53424.000000       5.000000


Check for missing values

In [9]:
print(data.isnull().sum())

book_id    0
user_id    0
rating     0
dtype: int64


# Create user-id and book-id mapping
We're creating two mapping dictionaries for users and books - from id to index and from index to id.  
This will help in embedding layer where we'll be dealing with indices.

In [10]:
user_ids = data['user_id'].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
book_ids = data['book_id'].unique().tolist()
book2book_encoded = {x: i for i, x in enumerate(book_ids)}
book_encoded2book = {i: x for i, x in enumerate(book_ids)}

In [11]:
user2user_encoded

{314: 0,
 439: 1,
 588: 2,
 1169: 3,
 1185: 4,
 2077: 5,
 2487: 6,
 2900: 7,
 3662: 8,
 3922: 9,
 5379: 10,
 5461: 11,
 5885: 12,
 6630: 13,
 7563: 14,
 9246: 15,
 10140: 16,
 10146: 17,
 10246: 18,
 10335: 19,
 10610: 20,
 10944: 21,
 11854: 22,
 11927: 23,
 12471: 24,
 13282: 25,
 13544: 26,
 15494: 27,
 16377: 28,
 16913: 29,
 17434: 30,
 17663: 31,
 17984: 32,
 18031: 33,
 18313: 34,
 18361: 35,
 20076: 36,
 20467: 37,
 20848: 38,
 21228: 39,
 21487: 40,
 21713: 41,
 22602: 42,
 23576: 43,
 23612: 44,
 24326: 45,
 24389: 46,
 24499: 47,
 24834: 48,
 24845: 49,
 25164: 50,
 25182: 51,
 25214: 52,
 26145: 53,
 26629: 54,
 26661: 55,
 28158: 56,
 28767: 57,
 29123: 58,
 29703: 59,
 30681: 60,
 31001: 61,
 32055: 62,
 32305: 63,
 32592: 64,
 32635: 65,
 32748: 66,
 32923: 67,
 33065: 68,
 33697: 69,
 33716: 70,
 33872: 71,
 33890: 72,
 37284: 73,
 37834: 74,
 38080: 75,
 38082: 76,
 38475: 77,
 39423: 78,
 41074: 79,
 42404: 80,
 43985: 81,
 44243: 82,
 44397: 83,
 45269: 84,
 45493: 8

In [24]:
# Now save your dictionaries
with open('../Data/processed/user2user_encoded.pkl', 'wb') as f:
    pickle.dump(user2user_encoded, f)

with open('../Data/processed/book2book_encoded.pkl', 'wb') as f:
    pickle.dump(book2book_encoded, f)

# save book_id to name mapping
with open('../Data/processed/book_id_to_name.pkl', 'wb') as f:
    pickle.dump(book_id_to_name, f)

# Map user-id and book-ids to user and book indices
We're creating two new columns in our DataFrame to hold the indices of users and books.

In [12]:
data['user'] = data['user_id'].map(user2user_encoded)
data['book'] = data['book_id'].map(book2book_encoded)

# Split data into training and testing set
We split our data into a training set (80%) and a test set (20%).

In [26]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Get the number of users and books
We calculate the total number of unique users and books in our data.

In [27]:
num_users = len(user2user_encoded)
num_books = len(book_encoded2book)

# Set embedding dimension
This is a hyperparameter for our model representing the size of the embedding vectors.

In [28]:
embedding_size=10

# Build model
We're using Keras Functional API to build a model with Embedding layers for users and books.  
These embeddings will learn to represent user preferences and book properties during training.

In [29]:
user_input = Input(shape=[1])
user_embedding = Embedding(num_users, embedding_size)(user_input)
user_vec = Flatten()(user_embedding)

book_input = Input(shape=[1])
book_embedding = Embedding(num_books, embedding_size)(book_input)
book_vec = Flatten()(book_embedding)

We then calculate the dot product of these vectors to predict the user's rating of the book.

In [30]:
product = Dot(axes=1)([book_vec, user_vec])

Our model takes as input the user and book indices, and outputs the predicted rating.

In [31]:
model = Model(inputs=[user_input, book_input], outputs=product)

We compile our model with a mean squared error loss function, perfect for regression problem, and an Adam optimizer.

In [17]:
# Define the path where you want to save the best model
mae_checkpoint_path = '../Data/mae_best_model.h5'
mse_checkpoint_path = '../Data/mse_best_model.h5'

# Define a callback for model checkpointing
mae_checkpoint = ModelCheckpoint(mae_checkpoint_path, monitor='val_loss', save_best_only=True, verbose=1)
mse_checkpoint = ModelCheckpoint(mse_checkpoint_path, monitor='val_loss', save_best_only=True, verbose=1)

mae_initial_weights=model.get_weights()
mse_initial_weights=model.get_weights()

# Train model
We train our model for 5 epochs, with a batch size of 64. We also specify our validation data for validation accuracy.

In [None]:
model.compile(loss=MeanAbsoluteError(), optimizer=Adam())
print('loss function=MeanAbsoluteError()')
print('optimizer=Adam()')
print('batch_size=8')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=8, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=16')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=16, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=32')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=32, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=64')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=64, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=128')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=128, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('')
print('')
model.compile(loss=MeanAbsoluteError(), optimizer=RMSprop())
print('optimizer=RMSprop()')
print('batch_size=8')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=8, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=16')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=16, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=32')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=32, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=64')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=64, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=128')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=128, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('')
print('')
model.compile(loss=MeanAbsoluteError(), optimizer=SGD())
print('optimizer=SGD()')
print('batch_size=8')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=8, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=16')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=16, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=32')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=32, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=64')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=64, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('batch_size=128')
model.set_weights(mae_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=128, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mae_checkpoint])
print('')
print('')
print('')

In [None]:
model.compile(loss='mean_squared_error', optimizer=Adam())
print('loss function=mean_squared_error')
print('optimizer=Adam()')
print('batch_size=8')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=8, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=16')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=16, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=32')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=32, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=64')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=64, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=128')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=128, epochs=20, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('')
print('')
model.compile(loss='mean_squared_error', optimizer=RMSprop())
print('optimizer=RMSprop()')
print('batch_size=8')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=8, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=16')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=16, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=32')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=32, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=64')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=64, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=128')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=128, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('')
print('')
model.compile(loss='mean_squared_error', optimizer=SGD())
print('optimizer=SGD()')
print('batch_size=8')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=8, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=16')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=16, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=32')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=32, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=64')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=64, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('batch_size=128')
model.set_weights(mse_initial_weights)
history = model.fit(x=[train.user.values, train.book.values], y=train.rating.values,
                    batch_size=128, epochs=100, verbose=1,
                    validation_data=([test.user.values, test.book.values], test.rating.values),
                    callbacks=[mse_checkpoint])
print('')
print('')
print('')

# Evaluate model
We evaluate our trained model on the test data to see how well it generalizes.

In [18]:
model=load_model('../book/mse_best_model.h5')
mse = model.evaluate([test.user.values, test.book.values], test.rating.values)
print(f'Test MSE: {mse}')

Test MSE: 0.9375975728034973


In [19]:
model=load_model('../book/mae_best_model.h5')
mae = model.evaluate([test.user.values, test.book.values], test.rating.values)
print(f'Test MSE: {mae}')

Test MSE: 0.7050154209136963


In [34]:
book_ids = data['book_id'].unique().tolist()
book_id_to_name_keys = set(book_id_to_name.keys())

# Now find the difference
missing_ids = book_ids - book_id_to_name_keys

if missing_ids:
    print("The following book IDs are not included in the dictionary:")
    for book_id in missing_ids:
        print(book_id)
else:
    print("All book IDs are included in the dictionary.")


TypeError: unsupported operand type(s) for -: 'list' and 'set'

In [None]:
# Iterate over each row in the DataFrame
for index, row in books_df.iterrows():
    image_url = row['image_url']
    
    
    if image_url.startswith('https://s.gr-assets.com/'):
        print(index,' startswith s.gr')
        book_title = row['title']
        search_title = transform_to_search_engine_friendly(book_title)
        search_term = f"{search_title}+book+cover+amazon"
        
        # Construct the search URL
        search_url = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={search_engine_id}&q={search_term}"
        print(search_url)
        # Perform the search and retrieve the image URLs
        response = requests.get(search_url)
        search_results = response.json()
        items = search_results.get("items", [])  # Get the list of items from the search results

        index = 0
        image_url = None

        while index < len(items) and image_url is None:
            item = items[index]
            pagemap = item.get("pagemap", {})  # Get the pagemap dictionary of the item
            scraped = pagemap.get("scraped", [])  # Get the list of scraped items
    
            if scraped:
                image_link = scraped[0].get("image_link")  # Get the image link from the scraped item
                if image_link:
                    image_url = image_link  # Found an image link, assign it to image_url
    
        index += 1

        print(image_url)

        books_df.at[index, 'image_url'] = image_url