## Importing Libraries & Data Preparation

In [1]:
import os
import numpy as np
import pandas as pd
import sys
import openai
import re
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Add, Concatenate, Dense, Dropout
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.regularizers import l2
from sklearn.utils import resample
from kerastuner import RandomSearch
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error


Using TensorFlow backend


  from kerastuner import RandomSearch


In [27]:
DATA_DIR =  os.path.join(os.path.dirname(os.path.realpath('__file__')), 'dev-data')

# Load goodreads datasets
ratings_df = pd.read_csv(f"{DATA_DIR}/data/ratings.csv")
to_read_df = pd.read_csv(f"{DATA_DIR}/data/to_read.csv")
books_df = pd.read_csv(f"{DATA_DIR}/data/books.csv")
book_tags_df = pd.read_csv(f"{DATA_DIR}/data/book_tags.csv")
tags_df = pd.read_csv(f"{DATA_DIR}/data/tags.csv")


In [5]:
# Convert 'to_read' interactions into implicit ratings (e.g., 1)
to_read_df['implicit_rating'] = 1

# Normalize explicit ratings to range 0-1
ratings_df['rating'] = (ratings_df['rating'] - ratings_df['rating'].min()) / (ratings_df['rating'].max() - ratings_df['rating'].min())

# Now combine explicit and implicit feedback
combined_ratings_df = pd.concat([ratings_df[['user_id', 'book_id', 'rating']],
                                 to_read_df[['user_id', 'book_id', 'implicit_rating']]])


In [6]:
# Normalize book IDs and user IDs since Neural Collaborative Filtering models expect contiguous IDs starting from 0
user_ids = combined_ratings_df['user_id'].unique().tolist()
user_id_map = {user_id: idx for idx, user_id in enumerate(user_ids)}
book_ids = combined_ratings_df['book_id'].unique().tolist()
book_id_map = {book_id: idx for idx, book_id in enumerate(book_ids)}

combined_ratings_df['user_id'] = combined_ratings_df['user_id'].map(user_id_map)
combined_ratings_df['book_id'] = combined_ratings_df['book_id'].map(book_id_map)


In [7]:
# Process book metadata
books_df = books_df[['book_id', 'authors', 'title', 'average_rating', 'image_url']]
books_df['book_id'] = books_df['book_id'].map(book_id_map)

# Process book tags
book_tags_df = book_tags_df.merge(tags_df, on='tag_id', how='left')
book_tags_df = book_tags_df[['goodreads_book_id', 'tag_name']]
book_tags_df['goodreads_book_id'] = book_tags_df['goodreads_book_id'].map(book_id_map)
book_tags_df.dropna(inplace=True)


## Model Training

In [8]:
# Balance the ratings distribution
# Assign more weight to less frequent ratings during training
ratings_weights = {0.75: 1, 1.00: 1, 0.50: 1.5, 0.25: 2, 0.00: 2.5}
combined_ratings_df['sample_weight'] = combined_ratings_df['rating'].map(ratings_weights)


# Balance the book interactions
# Undersample books with an extremely high number of interactions
book_interaction_counts = combined_ratings_df['book_id'].value_counts()
high_interaction_books = book_interaction_counts[book_interaction_counts > book_interaction_counts.quantile(0.9)].index
undersampled_df = combined_ratings_df[~combined_ratings_df['book_id'].isin(high_interaction_books)]

# Oversample books with very few interactions
low_interaction_books = book_interaction_counts[book_interaction_counts < book_interaction_counts.quantile(0.1)].index
oversampled_interactions = resample(combined_ratings_df[combined_ratings_df['book_id'].isin(low_interaction_books)],
                                    replace=True, n_samples=len(high_interaction_books), random_state=42)
balanced_df = pd.concat([undersampled_df, oversampled_interactions])


In [9]:
# Prepare the dataset
X = balanced_df[['user_id', 'book_id']].values
y = balanced_df['rating'].values
sample_weight = balanced_df['sample_weight'].values

# Convert to DataFrames for easier manipulation
X_df = pd.DataFrame(X, columns=['user_id', 'book_id'])
y_df = pd.DataFrame(y, columns=['rating'])
sample_weight_df = pd.DataFrame(sample_weight, columns=['sample_weight'])

# Combine the DataFrames
combined_df = pd.concat([X_df, y_df, sample_weight_df], axis=1)

# Drop rows with NaN values
cleaned_df = combined_df.dropna()

# Split the cleaned DataFrame back into X, y, and sample_weight
X_cleaned = cleaned_df[['user_id', 'book_id']].values
y_cleaned = cleaned_df['rating'].values
sample_weight_cleaned = cleaned_df['sample_weight'].values

# Data splitting
X_train, X_test, y_train, y_test, sample_weight_train, _ = train_test_split(X_cleaned, y_cleaned, sample_weight_cleaned, test_size=0.3, random_state=42)


In [10]:
# Define NeuMF model architecture
def get_model(num_users, num_items, latent_dim=10, reg=0.0):
    # Input layers
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')

    # Embedding layers
    MF_Embedding_User = Embedding(input_dim=num_users, output_dim=latent_dim, name='user_embedding',
                                  embeddings_regularizer=l2(reg), input_length=1)
    MF_Embedding_Item = Embedding(input_dim=num_items, output_dim=latent_dim, name='item_embedding',
                                  embeddings_regularizer=l2(reg), input_length=1)

    # Flatten the embedding layers
    user_latent = Flatten()(MF_Embedding_User(user_input))
    item_latent = Flatten()(MF_Embedding_Item(item_input))

    # Element-wise product of user and item embeddings (GMF part)
    gmf_vector = Dot(axes=1)([user_latent, item_latent])

    # MLP part
    mlp_user_latent = Flatten()(MF_Embedding_User(user_input))
    mlp_item_latent = Flatten()(MF_Embedding_Item(item_input))
    mlp_vector = Concatenate()([mlp_user_latent, mlp_item_latent])
    mlp_vector = Dense(64, activation='relu')(mlp_vector)
    mlp_vector = Dropout(0.2)(mlp_vector)
    mlp_vector = Dense(32, activation='relu')(mlp_vector)
    mlp_vector = Dropout(0.2)(mlp_vector)

    # Concatenate GMF and MLP parts
    predict_vector = Concatenate()([gmf_vector, mlp_vector])

    # Final prediction layer
    prediction = Dense(1, activation='sigmoid', name='prediction')(predict_vector)

    model = Model(inputs=[user_input, item_input], outputs=prediction)

    return model


In [11]:
# Get number of unique users and items
num_users = combined_ratings_df['user_id'].nunique()
num_items = combined_ratings_df['book_id'].nunique()

print(num_users)
print(num_items)


53424
10000


## Hyperparameter Tuning

In [13]:
# Define the model with hyperparameters
def build_model(hp):
    # Hyperparameters
    latent_dim = hp.Choice('latent_dim', [10, 20, 30])
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')

    # Model architecture
    model = get_model(num_users, num_items, latent_dim=latent_dim)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error', weighted_metrics=['mean_squared_error'])

    return model

# Set up the tuner
tuner = RandomSearch(
    build_model,
    objective='val_mean_squared_error',
    max_trials=10,
    executions_per_trial=1,
    directory='my_dir',
    project_name='keras_tuner_demo'
)


Reloading Tuner from my_dir/keras_tuner_demo/tuner0.json


In [15]:
# Define the early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', # Monitors the validation loss
                               patience=3,         # Number of epochs with no improvement after which training will be stopped
                               restore_best_weights=True) # Restores model weights from the epoch with the best value of the monitored quantity


In [None]:
# Start the hyperparameter search
tuner.search([X_train[:, 0], X_train[:, 1]], y_train, sample_weight=sample_weight_train,
             validation_split=0.15, epochs=10, verbose=2, callbacks=[early_stopping])


In [16]:
# Get the best hyperparameters
# Best latent dimension: 20
# Best learning rate: 0.00036724670443596247

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best latent dimension: {best_hps.get('latent_dim')}")
print(f"Best learning rate: {best_hps.get('learning_rate')}")


Best latent dimension: 20
Best learning rate: 0.00036724670443596247


In [16]:
# Build the model with the best hyperparameters
model = tuner.hypermodel.build(best_hps)

# Train the best model determined
history = model.fit([X_train[:, 0], X_train[:, 1]], y_train, sample_weight=sample_weight_train,
                    validation_split=0.15, epochs=10, verbose=2, callbacks=[early_stopping])


Epoch 1/10
52186/52186 - 185s - loss: 0.0686 - mean_squared_error: 0.0569 - val_loss: 0.0632 - val_mean_squared_error: 0.0524 - 185s/epoch - 4ms/step
Epoch 2/10
52186/52186 - 193s - loss: 0.0609 - mean_squared_error: 0.0505 - val_loss: 0.0620 - val_mean_squared_error: 0.0514 - 193s/epoch - 4ms/step
Epoch 3/10
52186/52186 - 185s - loss: 0.0579 - mean_squared_error: 0.0480 - val_loss: 0.0615 - val_mean_squared_error: 0.0510 - 185s/epoch - 4ms/step
Epoch 4/10
52186/52186 - 182s - loss: 0.0548 - mean_squared_error: 0.0455 - val_loss: 0.0617 - val_mean_squared_error: 0.0511 - 182s/epoch - 3ms/step
Epoch 5/10
52186/52186 - 172s - loss: 0.0517 - mean_squared_error: 0.0429 - val_loss: 0.0625 - val_mean_squared_error: 0.0518 - 172s/epoch - 3ms/step
Epoch 6/10
52186/52186 - 169s - loss: 0.0483 - mean_squared_error: 0.0401 - val_loss: 0.0635 - val_mean_squared_error: 0.0527 - 169s/epoch - 3ms/step


In [17]:
# Evaluate the best model
train_loss = history.history['loss']
val_loss = history.history['val_loss']
print(f'Training Loss: {train_loss[-1]}')
print(f'Validation Loss: {val_loss[-1]}')

# Predict on the test set
y_pred = model.predict([X_test[:, 0], X_test[:, 1]])

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')
print(f'MSE: {mse}')
print(f'MAE: {mae}')


Training Loss: 0.04831725358963013
Validation Loss: 0.06348511576652527
RMSE: 0.21027621019903137
MSE: 0.04421608457566722
MAE: 0.16300669935920345


In [22]:
# Save the model
# Create a directory for models if it doesn't exist
MODEL_DIR = os.path.join(os.path.dirname(os.path.realpath('__file__')), 'dev-data', 'models')
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)

# Save the model
model.save(os.path.join(MODEL_DIR, 'my_model.h5'))


  saving_api.save_model(


In [17]:
# Path to the saved model
MODEL_DIR = os.path.join(os.path.dirname(os.path.realpath('__file__')), 'dev-data', 'models')
model_path = os.path.join(MODEL_DIR, 'my_model.h5')

# Load the model
loaded_model = load_model(model_path)


## Generate Recommendations 

In [20]:
# Defining the directory where predictions will be saved
DATA_DIR = os.path.join(os.path.dirname(os.path.realpath('__file__')), 'dev-data')
PREDICTIONS_DIR = os.path.join(DATA_DIR, 'predictions')


In [21]:
# Function to perform batch predictions and save incrementally
def batch_predict_and_save(model, num_users, num_items, batch_size=10000, output_file=os.path.join(PREDICTIONS_DIR, 'predictions.csv')):
    # Generate all possible user-item pairs
    user_ids = np.repeat(np.arange(num_users), num_items)
    item_ids = np.tile(np.arange(num_items), num_users)

    # Ensure the output directory exists
    if not os.path.exists(PREDICTIONS_DIR):
        os.makedirs(PREDICTIONS_DIR)

    # Determine the starting point in case of a crash during training
    start_index = 0
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                pass
            last_line = line.split(',')
            start_index = int(last_line[0]) * num_items + int(last_line[1]) + 1

    # Total number of batches
    total_batches = int(np.ceil((len(user_ids) - start_index) / batch_size))

    # Open the output file in append mode
    with open(output_file, 'a' if start_index > 0 else 'w') as f:
        # Write header if file is new
        if start_index == 0:
            f.write("user_id,item_id,prediction\n")

        # Perform batch predictions
        for i in range(start_index, len(user_ids), batch_size):
            # Extract the batch of user and item IDs
            batch_user_ids = user_ids[i:i + batch_size]
            batch_item_ids = item_ids[i:i + batch_size]

            # Predict ratings for the current batch
            batch_predictions = model.predict([batch_user_ids, batch_item_ids]).flatten()

            # Save predictions incrementally
            for user_id, item_id, prediction in zip(batch_user_ids, batch_item_ids, batch_predictions):
                f.write(f"{user_id},{item_id},{prediction}\n")

            # Log progress
            batch_num = (i - start_index) // batch_size + 1
            sys.stdout.write(f"\rProgress: Batch {batch_num}/{total_batches} completed")
            sys.stdout.flush()

    print(f"\nPredictions have been saved to {output_file}")


In [25]:
# Adjust the batch_size based on system
batch_size = 100000

# Call the function to perform batch predictions and save results
batch_predict_and_save(model, num_users, num_items, batch_size)


Progress: Batch 5343/5343 completed
Predictions have been saved to /Users/hassan.alrabea/Code/McGill/goodreads_rec_sys/dev-data/predictions/predictions.csv


In [33]:
N = 10  # Number of top recommendations

# Initialize an empty DataFrame to hold the top N recommendations
top_n_recommendations = pd.DataFrame()

# Process the CSV in chunks
for chunk in pd.read_csv(f"{PREDICTIONS_DIR}/predictions.csv", chunksize=100000):
    # Sort and select top N within each chunk
    top_n_chunk = chunk.sort_values(['user_id', 'prediction'], ascending=[True, False]).groupby('user_id').head(N)
    top_n_recommendations = pd.concat([top_n_recommendations, top_n_chunk])

# After processing all chunks, sort again and group to ensure only the top N are selected overall
top_n_recommendations.sort_values(['user_id', 'prediction'], ascending=[True, False], inplace=True)
top_n_recommendations = top_n_recommendations.groupby('user_id').head(N).reset_index(drop=True)

# Save the top N recommendations to a CSV in the same directory
top_n_recommendations.to_csv(f"{PREDICTIONS_DIR}/top_n_recommendations.csv", index=False)

print(f"Top 10 recommendations have been saved to {PREDICTIONS_DIR}/top_n_recommendations.csv")


Top N recommendations have been saved to /Users/hassan.alrabea/Code/McGill/goodreads_rec_sys/dev-data/predictions/top_n_recommendations.csv


In [22]:
# Load the top N recommendations
top_n_recommendations = pd.read_csv(f"{PREDICTIONS_DIR}/top_n_recommendations.csv")

# Filter for user_id 123 and sort by prediction score
user_123_recommendations = top_n_recommendations[top_n_recommendations['user_id'] == 123].sort_values(by='prediction', ascending=False)

# Merge with books_df to get book titles
user_123_recommendations_with_titles = user_123_recommendations.merge(books_df[['book_id', 'title']], left_on='item_id', right_on='book_id')

# Print the top recommendations for user_id 123 with book titles
user_123_recommendations_with_titles


Unnamed: 0,user_id,item_id,prediction,book_id,title
0,123,1534,0.962891,1534,The Complete Calvin and Hobbes
1,123,973,0.931827,973,The Calvin and Hobbes Tenth Anniversary Book
2,123,975,0.929871,975,The Authoritative Calvin and Hobbes: A Calvin ...
3,123,976,0.926608,976,The Indispensable Calvin and Hobbes
4,123,972,0.925207,972,The Days Are Just Packed: A Calvin and Hobbes ...
5,123,10,0.923539,10,"Harry Potter Collection (Harry Potter, #1-6)"
6,123,8197,0.923263,8197,"The Way of Kings, Part 1 (The Stormlight Archi..."
7,123,977,0.921298,977,There's Treasure Everywhere: A Calvin and Hobb...
8,123,3261,0.918531,3261,The Revenge of the Baby-Sat
9,123,1830,0.917838,1830,Attack of the Deranged Mutant Killer Monster S...


## Make a Smaller Version of predictions

In [45]:
output_file = f"{PREDICTIONS_DIR}/smaller_predictions.csv"
chunk_size = 1000000
desired_file_size = 1 * 1024**3  # Aim is to output 1 gigabyte file

# Get total number of rows in the file
total_rows = sum(1 for _ in open(f"{PREDICTIONS_DIR}/predictions.csv"))

# Estimate the total file size assuming uniform row size throughout the file
with pd.read_csv(f"{PREDICTIONS_DIR}/predictions.csv", chunksize=1) as reader:
    for sample_row in reader:
        row_size = sys.getsizeof(sample_row.to_csv(header=False, index=False))
        break
estimated_total_file_size = total_rows * row_size

# Calculate sample fraction based on the estimated total file size
sample_fraction = desired_file_size / estimated_total_file_size

# Ensure the sample fraction is between 0 and 1
if sample_fraction <= 0 or sample_fraction >= 1:
    raise ValueError(f"Calculated sample fraction {sample_fraction} is outside valid range. Adjust your desired file size.")

# Now sample the file
with pd.read_csv(f"{PREDICTIONS_DIR}/predictions.csv", chunksize=chunk_size) as reader:
    for i, chunk in enumerate(reader):
        # Sample the chunk
        chunk_sample = chunk.sample(frac=sample_fraction)
        mode = 'w' if i == 0 else 'a'
        header = True if i == 0 else False
        chunk_sample.to_csv(output_file, mode=mode, header=header, index=False)
        print(f"Processed chunk {i+1}")

print(f"Finished sampling. The output is saved in {output_file}")


Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5
Processed chunk 6
Processed chunk 7
Processed chunk 8
Processed chunk 9
Processed chunk 10
Processed chunk 11
Processed chunk 12
Processed chunk 13
Processed chunk 14
Processed chunk 15
Processed chunk 16
Processed chunk 17
Processed chunk 18
Processed chunk 19
Processed chunk 20
Processed chunk 21
Processed chunk 22
Processed chunk 23
Processed chunk 24
Processed chunk 25
Processed chunk 26
Processed chunk 27
Processed chunk 28
Processed chunk 29
Processed chunk 30
Processed chunk 31
Processed chunk 32
Processed chunk 33
Processed chunk 34
Processed chunk 35
Processed chunk 36
Processed chunk 37
Processed chunk 38
Processed chunk 39
Processed chunk 40
Processed chunk 41
Processed chunk 42
Processed chunk 43
Processed chunk 44
Processed chunk 45
Processed chunk 46
Processed chunk 47
Processed chunk 48
Processed chunk 49
Processed chunk 50
Processed chunk 51
Processed chunk 52
Processed chunk 53
Pr

In [92]:
# Specify the path to the smaller_predictions.csv file
file_path = f"{PREDICTIONS_DIR}/smaller_predictions.csv"

# Load the smaller predictions DataFrame
smaller_predictions_df = pd.read_csv(file_path)

# Rename 'item_id' column to 'book_id'
smaller_predictions_df.rename(columns={'item_id': 'book_id'}, inplace=True)

# Sort the DataFrame first by 'user_id' in ascending order, then by 'prediction' in descending order
sorted_predictions_df = smaller_predictions_df.sort_values(by=['user_id', 'prediction'], ascending=[True, False])

# Save the sorted DataFrame back to a CSV file
sorted_predictions_df.to_csv(f'{PREDICTIONS_DIR}/sorted_smaller_predictions.csv', index=False)


## GenAI Implementation

In [23]:
# Load environment variables from .env file
load_dotenv()

# Retrieve API key from environment variables
openai.api_key = os.getenv('OPENAI_API_KEY')
sorted_smaller_predictions_df = pd.read_csv(f"{PREDICTIONS_DIR}/sorted_smaller_predictions.csv")


In [29]:
# Define the filter_predictions function
def filter_predictions(user_id, predictions_df, books_df, tags_df, book_tags_df, user_request):
    # Define the system message to establish the assistant's role
    system_message = "You are a helpful assistant. Provide a response in a simple and structured format suitable for processing by a program.\
        Only return a list of book genres, themes, authors, and tags that are relevant to a user's request, as comma-separated values.\
            No need to return the response like so: Genres: Fantasy\nThemes: Formula One, Racing, Motorsport\nAuthors: N/A\nTags: Sports Fantasy, Racing, Magical Realism, Car Racing, Motorsport Fantasy\
                Bur rather like so: Fantasy, Formula One, Racing, Motorsport, Sports Fantasy, Racing, Magical Realism, Car Racing, Motorsport Fantasy.\
                    Do not return N/A or Not available, simply return nothing"

    # Define the user message with the actual prompt
    user_message = user_request

    # Call GPT API
    book_response = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ]
    )

    # Return the relevant attributes
    relevant_attributes = book_response['choices'][0]['message']['content'].split(',')

    # Replace newlines and multiple subsequent spaces (if any) with a single comma, then split
    formatted_response = book_response['choices'][0]['message']['content'].replace('\n', ',').replace('  ', ' ')

    # Split by comma and strip whitespace
    all_attributes = [item.strip() for item in formatted_response.split(',') if item.strip() != '']

    # Filter out category names like 'Genres:', 'Themes:', 'Authors:', and 'Tags:'
    category_headers = ['Genres:', 'book genres:', 'Genres:', 'Themes:', 'themes:', 'Authors:', 'authors:', 'Tags:', 'tags:']
    relevant_attributes = [attr for attr in all_attributes if attr not in category_headers]

    # Filter tags that match the relevant attributes
    relevant_tags = tags_df[tags_df['tag_name'].apply(lambda x: any(re.search(attr.strip(), x, re.IGNORECASE) for attr in relevant_attributes))]
    relevant_tag_ids = relevant_tags['tag_id'].unique()

    # Filter book_tags that match the relevant tag IDs
    relevant_book_tags = book_tags_df[book_tags_df['tag_id'].isin(relevant_tag_ids)]
    relevant_book_ids = relevant_book_tags['goodreads_book_id'].unique()

    # Filter books that match the relevant book IDs and attributes
    filtered_books = books_df[books_df['goodreads_book_id'].isin(relevant_book_ids) |
                              books_df['authors'].apply(lambda x: any(re.search(attr.strip(), x, re.IGNORECASE) for attr in relevant_attributes)) |
                              books_df['title'].apply(lambda x: any(re.search(attr.strip(), x, re.IGNORECASE) for attr in relevant_attributes))]



    # Filter predictions for the user that match the relevant book IDs
    filtered_predictions = predictions_df[(predictions_df['user_id'] == user_id) &
                                          (predictions_df['book_id'].isin(filtered_books['book_id']))]


    return filtered_predictions


In [30]:
# Define the rerank_predictions function
def rerank_predictions(filtered_predictions, books_df, user_request):

    # After filtering predictions, merge with books data to include book details
    filtered_books = books_df[books_df['goodreads_book_id'].isin(filtered_predictions['book_id'])]
    filtered_predictions_with_details = filtered_predictions.merge(filtered_books[['goodreads_book_id', 'title', 'authors']], left_on='book_id', right_on='goodreads_book_id')

    # Reranking prompt sent to GPT API
    reranking_prompt = f"""
    Re-rank these books based on the user's preference, only return the top 10.
    output fomrat should follow this example:
    "1. Book ID: 5038, Title: The Pillars of Creation (Sword of Truth, #7), Authors: Terry Goodkind, Prediction: 0.8330672979354858 (Fantasy)
    2. Book ID: 9567, Title: Half Asleep in Frog Pajamas, Authors: Tom Robbins, Prediction: 0.8262556195259094 (Romance)
    3. Book ID: 5368, Title: Forever Amber, Authors: Kathleen Winsor, Prediction: 0.7773409485816956 (Romance)
    Reasoning: Note that I prioritized books that are known for romance or have strong romantic elements, and disregarded
    those that focus on other genres like fantasy or thrillers, unless they are known to blend romance into the narrative
    substantially. If the user is strictly looking for pure romance novels, some books such as "Harry Potter and the
    Prisoner of Azkaban" and non-romance focused thrillers have been left out of the top 10.
    "
    Filtered Recommendations with Details:
    """

    for index, row in filtered_predictions_with_details.iterrows():
        reranking_prompt += f"Book ID: {row['book_id']}, Title: {row['title']}, Authors: {row['authors']}, Prediction: {row['prediction']}\n"

    reranking_prompt += f"User Preference: {user_request}"

    # Sort by prediction score in descending order and take the top 10
    filtered_predictions_with_details = filtered_predictions_with_details.sort_values(by='prediction', ascending=False).head(10)

    # Call the chat completion API for reranking
    reranking_response = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": reranking_prompt}
        ]
    )

    # Extract re-ranked recommendations from the response
    reranked_recommendations = reranking_response['choices'][0]['message']['content'].strip()

    return reranked_recommendations


## Example 1: Genre Specific

In [31]:
# Example of a user's textual request and filtering predictions
user_request = "I'm looking for romance-fantasy books"
user_id = 123  # Example user ID

# Call the modified filter_predictions function
filtered_predictions = filter_predictions(user_id, sorted_smaller_predictions_df, books_df, tags_df, book_tags_df, user_request)
# Call the rerank_predictions function
reranked_recommendations = rerank_predictions(filtered_predictions, books_df, user_request)

print(reranked_recommendations)


1. Book ID: 9567, Title: Half Asleep in Frog Pajamas, Authors: Tom Robbins, Prediction: 0.8262556195259094 (Romance)
2. Book ID: 5190, Title: Open House, Authors: Elizabeth Berg, Prediction: 0.7504738569259644 (Romance)
3. Book ID: 3478, Title: Message in a Bottle, Authors: Nicholas Sparks, Prediction: 0.5736340284347534 (Romance)
4. Book ID: 5038, Title: The Pillars of Creation (Sword of Truth, #7), Authors: Terry Goodkind, Prediction: 0.8330672979354858 (Fantasy/Romance)
5. Book ID: 4656, Title: The Fourth Hand, Authors: John Irving, Prediction: 0.6586891412734985 (Romance?)
6. Book ID: 446, Title: The Brooklyn Follies, Authors: Paul Auster, Prediction: 0.6763553619384766 (Romance?)
7. Book ID: 7531, Title: The Idiot Girls' Action-Adventure Club: True Tales from a Magnificent and Clumsy Life, Authors: Laurie Notaro, Prediction: 0.7395410537719727 (Romance?)
8. Book ID: 3832, Title: Cover Her Face (Adam Dalgliesh #1), Authors: P.D. James, Prediction: 0.6382019519805908 (Romance?)
9. B

## Example 2: Author Specifc or Similar to an Author

In [35]:
# Example of a user's textual request and filtering predictions
user_request = "I'm looking for books similar to ones written by Terry Goodkind"
user_id = 123  # Example user ID

# Call the modified filter_predictions function
filtered_predictions = filter_predictions(user_id, sorted_smaller_predictions_df, books_df, tags_df, book_tags_df, user_request)
# Call the rerank_predictions function
reranked_recommendations = rerank_predictions(filtered_predictions, books_df, user_request)

print(reranked_recommendations)


1. Book ID: 5038, Title: The Pillars of Creation (Sword of Truth, #7), Authors: Terry Goodkind, Prediction: 0.8330672979354858 (Fantasy)
2. Book ID: 9567, Title: Half Asleep in Frog Pajamas, Authors: Tom Robbins, Prediction: 0.8262556195259094
3. Book ID: 2657, Title: To Kill a Mockingbird, Authors: Harper Lee, Prediction: 0.7711403965950012
4. Book ID: 5190, Title: Open House, Authors: Elizabeth Berg, Prediction: 0.7504738569259644
5. Book ID: 5182, Title: Songs in Ordinary Time, Authors: Mary McGarry Morris, Prediction: 0.6870437860488892
6. Book ID: 4656, Title: The Fourth Hand, Authors: John Irving, Prediction: 0.6586891412734985
7. Book ID: 3832, Title: Cover Her Face (Adam Dalgliesh #1), Authors: P.D. James, Prediction: 0.6382019519805908
8. Book ID: 7728, Title: Antigone (The Theban Plays, #3), Authors: Sophocles, J.E. Thomas, Prediction: 0.6334075331687927
9. Book ID: 3478, Title: Message in a Bottle, Authors: Nicholas Sparks, Prediction: 0.5736340284347534

Reasoning: The top 

## Example 3: Similar to a Specific Book

In [37]:
# Example of a user's textual request and filtering predictions
user_request = "I'm looking for books that are similar to Dune"
user_id = 123  # Example user ID

# Call the modified filter_predictions function
filtered_predictions = filter_predictions(user_id, sorted_smaller_predictions_df, books_df, tags_df, book_tags_df, user_request)
# Call the rerank_predictions function
reranked_recommendations = rerank_predictions(filtered_predictions, books_df, user_request)

print(reranked_recommendations)


Given that the user's preference is for books similar to "Dune," I will prioritize science fiction novels, especially those with sprawling universes, political intrigue, complex societies, and deep philosophical underpinnings. Unfortunately, since the provided filtered recommendations list only includes two books, and neither seems to fit the science fiction genre or appear to be similar to "Dune," I can't accurately re-rank a top 10 list based on the user's preferences.

However, if I had a larger list of books from which to choose, I'd look for titles by authors like Frank Herbert (who wrote "Dune"), Isaac Asimov, Arthur C. Clarke, Orson Scott Card, Philip K. Dick, and other authors known for their work within the science fiction genre, especially those that deal with epic storytelling and complex worlds. If more appropriate recommendations were available in the larger list, I’d be able to compile a top 10 list that would likely satisfy the user's preference for books similar to "Dun

In [39]:
# Example of a user's textual request and filtering predictions
user_request = "I'm looking for books that are similar to Harry Potter"
user_id = 123  # Example user ID

# Call the modified filter_predictions function
filtered_predictions = filter_predictions(user_id, sorted_smaller_predictions_df, books_df, tags_df, book_tags_df, user_request)
# Call the rerank_predictions function
reranked_recommendations = rerank_predictions(filtered_predictions, books_df, user_request)

print(reranked_recommendations)


1. Book ID: 2657, Title: To Kill a Mockingbird, Authors: Harper Lee, Prediction: 0.7711403965950012
2. Book ID: 4948, Title: The Very Hungry Caterpillar Board Book, Authors: Eric Carle, Prediction: 0.7600183486938477
3. Book ID: 5190, Title: Open House, Authors: Elizabeth Berg, Prediction: 0.7504738569259644
4. Book ID: 7531, Title: The Idiot Girls' Action-Adventure Club: True Tales from a Magnificent and Clumsy Life, Authors: Laurie Notaro, Prediction: 0.7395410537719727
5. Book ID: 5182, Title: Songs in Ordinary Time, Authors: Mary McGarry Morris, Prediction: 0.6870437860488892
6. Book ID: 446, Title: The Brooklyn Follies, Authors: Paul Auster, Prediction: 0.6763553619384766
7. Book ID: 4656, Title: The Fourth Hand, Authors: John Irving, Prediction: 0.6586891412734985
8. Book ID: 3832, Title: Cover Her Face (Adam Dalgliesh #1), Authors: P.D. James, Prediction: 0.6382019519805908
9. Book ID: 7728, Title: Antigone (The Theban Plays, #3), Authors: Sophocles, J.E. Thomas, Prediction: 0.6