Imports

In [2]:
import numpy as np
import pickle
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Embedding, Flatten, Dot, Dense
from keras.optimizers import Adam, RMSprop, SGD
from keras.callbacks import ModelCheckpoint
from keras.losses import MeanAbsoluteError

2023-06-12 12:35:21.260866: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load data
We read the ratings and books CSV files into pandas DataFrames and build a lookup from each book's row index to its title.

In [3]:
# Raw interaction data: one row per (book_id, user_id, rating).
ratings = pd.read_csv('../Data/training/ratings.csv')

# Book metadata; the `title` column is used to label recommendations.
books = pd.read_csv('../Data/training/books.csv')
# Lookup from the DataFrame's row index to the book title.
# NOTE(review): the keys are books.index (row positions), not a book-id
# column -- confirm this matches the ids used for look-ups later on.
book_id_to_name = books['title'].to_dict()

We print the first few records and a summary of the data for a quick examination.

In [43]:
# Quick sanity check: the first rows followed by summary statistics.
for preview in (ratings.head(), ratings.describe()):
    print(preview)

   book_id  user_id  rating
0        1      314       5
1        1      439       3
2        1      588       5
3        1     1169       4
4        1     1185       4
             book_id        user_id         rating
count  981756.000000  981756.000000  981756.000000
mean     4943.275636   25616.759933       3.856534
std      2873.207415   15228.338826       0.983941
min         1.000000       1.000000       1.000000
25%      2457.000000   12372.000000       3.000000
50%      4921.000000   25077.000000       4.000000
75%      7414.000000   38572.000000       5.000000
max     10000.000000   53424.000000       5.000000


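# Ratings per user
We count how many ratings each user has submitted and tabulate how many users fall into each "fewer than X ratings" bucket, to help choose a cut-off for filtering out low-activity users.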

In [4]:
# Number of ratings submitted by each user (index: user_id).
user_ratings = ratings.groupby('user_id')['rating'].count()
user_rating_counts = ratings['user_id'].value_counts()
# Count the number of users for each number of ratings
users_with_ratings = user_rating_counts.groupby(user_ratings).count()
# Exclusive upper bounds of the buckets: 5, 10, ..., 95
rating_thresholds = list(range(5, 100, 5))

# Cumulative number of users with fewer than X ratings ...
cumulative_counts = [int((user_ratings < threshold).sum()) for threshold in rating_thresholds]
# ... turned into per-bucket counts, i.e. excluding the users already
# counted under a lower threshold.
count_per_threshold = np.diff([0] + cumulative_counts).tolist()

# Derive the size of the user base from the data instead of hard-coding it,
# so the percentages stay correct if the ratings file changes.
total_users = len(user_ratings)

# Calculate the percentage of the whole user base
percent_per_threshold = [round((count / total_users) * 100) for count in count_per_threshold]

# Create the DataFrame
df = pd.DataFrame({"fewer than X": rating_thresholds, "count": count_per_threshold, "percent": percent_per_threshold})

# Display the table as the cell output
df


Unnamed: 0,fewer than X,count,percent
0,5,17714,33
1,10,11305,21
2,15,5859,11
3,20,3907,7
4,25,2759,5
5,30,2082,4
6,35,1671,3
7,40,1305,2
8,45,1020,2
9,50,875,2


In [7]:
# Minimum number of ratings a user must have to be kept.
filter_out = 10

# Count ratings per user from the current `ratings` frame rather than relying
# on a variable from another cell: that keeps the cell re-runnable after
# `ratings` has been replaced by its filtered version below.
ratings_per_user = ratings['user_id'].value_counts()
low_activity_users = ratings_per_user.index[ratings_per_user < filter_out]

# .copy() makes the filtered frame independent of `ratings`, so the column
# assignment below does not hit pandas' SettingWithCopy ambiguity.
filtered_ratings = ratings[~ratings['user_id'].isin(low_activity_users)].copy()

# Re-number the remaining users consecutively from 0 (group order of user_id).
filtered_ratings['user_id'] = filtered_ratings.groupby('user_id').ngroup()

# Count the number of ratings per user
rating_counts = filtered_ratings.groupby('user_id').size().reset_index(name='rating_count')

# Create the users DataFrame (an independent copy, so adding `new_data`
# leaves `rating_counts` untouched).
users = rating_counts.copy()
users['new_data'] = False

# From here on `ratings` refers to the filtered, re-numbered data.
ratings = filtered_ratings
users.head()

Unnamed: 0,user_id,rating_count,new_data
0,0,76,False
1,1,16,False
2,2,24,False
3,3,19,False
4,4,42,False


In [11]:
# Persist the filtered ratings and the derived users table.
# Both files go to '../data/', the directory the later cells read them back
# from; the ratings file was previously written to '../Data/', which is a
# different directory on case-sensitive filesystems.
ratings.to_csv('../data/ratings.csv',index=False)
users.to_csv('../data/users.csv',index=False)

# Create user-id and book-id mapping
We create two mapping dictionaries each for users and books: one from id to index and one from index to id.  
The embedding layers work with these consecutive indices rather than with the raw ids.

In [12]:
# Distinct user ids in order of first appearance; the position in this list
# is the user's encoded index.
user_ids = ratings['user_id'].unique().tolist()
userencoded2user = dict(enumerate(user_ids))
user2user_encoded = {raw_id: position for position, raw_id in userencoded2user.items()}

# Same construction for books.
book_ids = ratings['book_id'].unique().tolist()
book_encoded2book = dict(enumerate(book_ids))
book2book_encoded = {raw_id: position for position, raw_id in book_encoded2book.items()}

# Map user-id and book-ids to user and book indices
We're creating two new columns in our DataFrame to hold the indices of users and books.

In [13]:
# Add the encoded index columns: (new column, source id column, id -> index map).
for encoded_column, id_column, encoder in (
    ('user', 'user_id', user2user_encoded),
    ('book', 'book_id', book2book_encoded),
):
    ratings[encoded_column] = ratings[id_column].map(encoder)

# Split data into training and testing set
We split our data into a training set (80%) and a test set (20%).

In [14]:
# 80/20 split with a fixed seed so the partition is reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
train, test = train_test_split(ratings, **split_options)

# Get the number of users and books
We calculate the total number of unique users and books in our data.

In [15]:
# Sizes of the two vocabularies, used as the embedding input dimensions.
num_users, num_books = len(user2user_encoded), len(book_encoded2book)

# Set embedding dimension
This is a hyperparameter for our model representing the size of the embedding vectors.

In [16]:
# Length of each user / book embedding vector.
embedding_size = 10

# Build model
We're using Keras Functional API to build a model with Embedding layers for users and books.  
These embeddings will learn to represent user preferences and book properties during training.

In [17]:
# User tower: one value per sample (the encoded user index supplied by the
# training cells), looked up in a num_users x embedding_size table and
# flattened to a vector.
user_input = Input(shape=[1])
user_embedding = Embedding(num_users, embedding_size)(user_input)
user_vec = Flatten()(user_embedding)

# Book tower: same structure, with a num_books x embedding_size table.
book_input = Input(shape=[1])
book_embedding = Embedding(num_books, embedding_size)(book_input)
book_vec = Flatten()(book_embedding)

We then calculate the dot product of these vectors to predict the user's rating of the book.

In [18]:
# Dot product of the book and user vectors along the embedding axis; this
# value is used as the model's predicted rating.
product = Dot(axes=1)([book_vec, user_vec])

Our model takes as input the user and book indices, and outputs the predicted rating.

In [19]:
# Input order matters: every fit/evaluate/predict call must pass
# [user indices, book indices] in this order.
model = Model(inputs=[user_input, book_input], outputs=product)

In [20]:
# Quick single-epoch run with MAE loss and Adam.
# Inputs are passed as [user indices, book indices], matching the model's input order.
quick_train_inputs = [train.user.values, train.book.values]
quick_validation = ([test.user.values, test.book.values], test.rating.values)

model.compile(optimizer=Adam(), loss=MeanAbsoluteError())
history = model.fit(
    x=quick_train_inputs,
    y=train.rating.values,
    epochs=1,
    batch_size=64,
    verbose=1,
    validation_data=quick_validation,
)
# Persist the model produced by this run.
model.save("../models/mae_best_model.h5")




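We compare two regression loss functions, mean absolute error and mean squared error, each with several optimizers. The cell below prepares one checkpoint per loss function, so that the model with the lowest validation loss is saved, and records the current weights so that every run can start from the same point.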

In [None]:
# Where the best model for each loss function is written.
mae_checkpoint_path, mse_checkpoint_path = (
    '../Data/mae_best_model.h5',
    '../Data/mse_best_model.h5',
)

# Both callbacks share the same settings: watch the validation loss and
# overwrite the file only when it improves.
checkpoint_settings = dict(monitor='val_loss', save_best_only=True, verbose=1)
mae_checkpoint = ModelCheckpoint(mae_checkpoint_path, **checkpoint_settings)
mse_checkpoint = ModelCheckpoint(mse_checkpoint_path, **checkpoint_settings)

# Snapshot of the weights the model holds when this cell runs; the training
# cells restore it before every run.
mae_initial_weights = model.get_weights()
mse_initial_weights = model.get_weights()

# Train model
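We train the model for every combination of optimizer (Adam, RMSprop, SGD) and batch size (8, 16, 32, 64, 128), restoring the same initial weights before each run: 20 epochs with Adam and 100 epochs with RMSprop and SGD. The held-out split is passed as validation data so that the checkpoint callback can track the validation loss.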

In [None]:
# Hyper-parameter sweep for the mean-absolute-error loss.
# Each entry: (label printed in the log, optimizer class, number of epochs).
mae_optimizer_runs = [
    ('Adam()', Adam, 20),
    ('RMSprop()', RMSprop, 100),
    ('SGD()', SGD, 100),
]
mae_batch_sizes = [8, 16, 32, 64, 128]

# Inputs follow the model's input order: [user indices, book indices].
mae_train_inputs = [train.user.values, train.book.values]
mae_validation_data = ([test.user.values, test.book.values], test.rating.values)

# Per-epoch metrics of every run, keyed by (optimizer label, batch size), so
# the runs can be compared afterwards instead of keeping only the last one.
mae_histories = {}

print('loss function=MeanAbsoluteError()')
for optimizer_label, optimizer_class, epochs in mae_optimizer_runs:
    print(f'optimizer={optimizer_label}')
    for batch_size in mae_batch_sizes:
        print(f'batch_size={batch_size}')
        # Re-compile with a fresh optimizer for every run: restoring the
        # weights alone would leave the optimizer's internal state from the
        # previous run in place and make the runs depend on each other.
        model.compile(loss=MeanAbsoluteError(), optimizer=optimizer_class())
        model.set_weights(mae_initial_weights)
        history = model.fit(x=mae_train_inputs, y=train.rating.values,
                            batch_size=batch_size, epochs=epochs, verbose=1,
                            validation_data=mae_validation_data,
                            callbacks=[mae_checkpoint])
        mae_histories[(optimizer_label, batch_size)] = history.history
        print('')
    print('')
    print('')

In [None]:
# Hyper-parameter sweep for the mean-squared-error loss.
# Each entry: (label printed in the log, optimizer class, number of epochs).
mse_optimizer_runs = [
    ('Adam()', Adam, 20),
    ('RMSprop()', RMSprop, 100),
    ('SGD()', SGD, 100),
]
mse_batch_sizes = [8, 16, 32, 64, 128]

# Inputs follow the model's input order: [user indices, book indices].
mse_train_inputs = [train.user.values, train.book.values]
mse_validation_data = ([test.user.values, test.book.values], test.rating.values)

# Per-epoch metrics of every run, keyed by (optimizer label, batch size), so
# the runs can be compared afterwards instead of keeping only the last one.
mse_histories = {}

print('loss function=mean_squared_error')
for optimizer_label, optimizer_class, epochs in mse_optimizer_runs:
    print(f'optimizer={optimizer_label}')
    for batch_size in mse_batch_sizes:
        print(f'batch_size={batch_size}')
        # Re-compile with a fresh optimizer for every run: restoring the
        # weights alone would leave the optimizer's internal state from the
        # previous run in place and make the runs depend on each other.
        model.compile(loss='mean_squared_error', optimizer=optimizer_class())
        model.set_weights(mse_initial_weights)
        history = model.fit(x=mse_train_inputs, y=train.rating.values,
                            batch_size=batch_size, epochs=epochs, verbose=1,
                            validation_data=mse_validation_data,
                            callbacks=[mse_checkpoint])
        mse_histories[(optimizer_label, batch_size)] = history.history
        print('')
    print('')
    print('')

# Evaluate model
We evaluate our trained model on the test data to see how well it generalizes.

In [None]:
# Load the best MSE model from the file the mse_checkpoint callback writes
# to (mse_checkpoint_path); the previous hard-coded '../book/...' path is not
# written anywhere in this notebook.
model = load_model(mse_checkpoint_path)
# With no extra metrics compiled in, evaluate() returns the loss (here: MSE).
mse = model.evaluate([test.user.values, test.book.values], test.rating.values)
print(f'Test MSE: {mse}')

In [None]:
# Load the best MAE model from the file the mae_checkpoint callback writes
# to (mae_checkpoint_path); the previous hard-coded '../book/...' path is not
# written anywhere in this notebook.
model = load_model(mae_checkpoint_path)
# With no extra metrics compiled in, evaluate() returns the loss (here: MAE).
mae = model.evaluate([test.user.values, test.book.values], test.rating.values)
# The value is a mean absolute error, so label it as such.
print(f'Test MAE: {mae}')

In [None]:
# Replace placeholder cover images (URLs served from s.gr-assets.com) with an
# image found through a Google Custom Search for the book title.
# NOTE(review): books_df, transform_to_search_engine_friendly, api_key,
# search_engine_id and the `requests` module are not defined in the visible
# part of this notebook -- they must be provided before this cell runs.
for row_index, row in books_df.iterrows():
    current_url = row['image_url']

    # Skip missing values and rows that already have a real cover image.
    if not isinstance(current_url, str) or not current_url.startswith('https://s.gr-assets.com/'):
        continue

    print(row_index,' startswith s.gr')
    search_title = transform_to_search_engine_friendly(row['title'])
    search_term = f"{search_title}+book+cover+amazon"

    # Construct the search URL
    search_url = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={search_engine_id}&q={search_term}"
    # Log only the search term: the full URL contains the API key and would
    # end up stored in the notebook output.
    print(search_term)

    # Perform the search; the timeout keeps one stalled request from hanging
    # the whole loop.
    response = requests.get(search_url, timeout=10)
    items = response.json().get("items", [])  # Get the list of items from the search results

    # Take the image link of the first result that carries one.
    image_url = None
    for item in items:
        scraped = item.get("pagemap", {}).get("scraped", [])  # List of scraped items
        if scraped:
            image_link = scraped[0].get("image_link")
            if image_link:
                image_url = image_link
                break

    print(image_url)

    # Write the result back to the row being processed, and keep the existing
    # placeholder when the search found nothing.
    if image_url is not None:
        books_df.at[row_index, 'image_url'] = image_url

In [3]:

# Recommend the top-rated books for one user with the saved model.
user_id = 123
# Number of recommendations to return.
# NOTE(review): this reuses the name an earlier cell assigns to the count of
# distinct books; re-running the model-building cell after this one would
# size the book embedding with 5.
num_books = 5

model = load_model('../models/mae_best_model.h5')
ratings = pd.read_csv('../data/ratings.csv')
books = pd.read_csv('../data/books.csv')
# Built only after `books` has been loaded, so the cell works on a fresh kernel.
# NOTE(review): the keys are row positions (books.index) while the look-up
# below uses book ids -- confirm the two line up (possible off-by-one).
book_id_to_name = pd.Series(books.title.values, index = books.index).to_dict()

# Rebuild the id -> encoded index maps the same way as for training.
user_ids = ratings['user_id'].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
book_ids = ratings['book_id'].unique().tolist()
book2book_encoded = {x: i for i, x in enumerate(book_ids)}

if user_id not in user2user_encoded:
    raise KeyError(f"user_id {user_id} has no ratings in '../data/ratings.csv'")
user_encoded = user2user_encoded[user_id]

# Book ids in encoding order, and the encoded index the model expects for
# each of them. Using the mapping avoids assuming that index == book_id - 1.
book_ids = np.array(list(book2book_encoded.keys()))
book_indices = np.array([book2book_encoded[b] for b in book_ids])
# Repeating the encoded user index to match the shape of the book indices
user_array = np.full(len(book_indices), user_encoded)

# Making the prediction (input order: [user indices, book indices])
pred_ratings = model.predict([user_array, book_indices])

# Getting the positions of the top num_books ratings, best first
top_indices = pred_ratings.flatten().argsort()[-num_books:][::-1]

# Collecting title, cover image and shop link for each recommended book
recommended_books = []
for i in top_indices:
    book_id = book_ids[i]
    book_title = book_id_to_name[book_id]
    book_image_url = books.loc[books['title'] == book_title, 'image_url'].values[0]
    amazon_link = "https://www.amazon.com/"
    recommended_books.append({
        "title": book_title,
        "image": book_image_url,
        "amazon_link": amazon_link
    })
print(recommended_books)

[{'title': 'The Cabinet of Curiosities (Pendergast, #3)', 'image': 'https://images.gr-assets.com/books/1169235779m/39031.jpg', 'amazon_link': 'https://www.amazon.com/'}, {'title': 'Losing It (Losing It, #1)', 'image': 'https://images.gr-assets.com/books/1348459319m/16034964.jpg', 'amazon_link': 'https://www.amazon.com/'}, {'title': 'The Cat Who Could Read Backwards (Cat Who..., #1)', 'image': 'https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png', 'amazon_link': 'https://www.amazon.com/'}, {'title': 'In Flight (Up in the Air, #1)', 'image': 'https://images.gr-assets.com/books/1397321579m/16134782.jpg', 'amazon_link': 'https://www.amazon.com/'}, {'title': "Ender's Game, Volume 1: Battle School (Ender's Saga)", 'image': 'https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png', 'amazon_link': 'https://www.amazon.com/'}]


In [11]:
# Scratch check of the branching on a known / unknown user.
user_id, book_id, rating = 1, 1, 1
users = pd.read_csv('../data/users.csv')

is_known_user = user_id in users['user_id'].values
if not is_known_user:
    # user id absent from the users table
    print('good')
elif user_id and rating:
    print('lsa')
elif user_id:
    print('didnt find any raiting for that user')

lsa
