In [None]:

import pandas as pd
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the ratings file and keep only rows whose Book-Rating is above zero
df = pd.read_csv('Ratings.csv').loc[lambda ratings: ratings['Book-Rating'] > 0]
df

In [None]:
# Count the rows that are exact repeats of an earlier row
duplicate_rows = df.duplicated()
duplicate_rows.sum()

In [None]:

# How many ratings exist for each rating value, listed in rating order
rating_frequencies = df['Book-Rating'].value_counts()
rating_frequencies.sort_index()

In [None]:
# Bar chart of how many reviews were given at each rating value
fig, ax = plt.subplots(figsize=(8, 4))
rating_levels = sorted(df["Book-Rating"].unique())
sns.countplot(data=df, x="Book-Rating", order=rating_levels, ax=ax)
ax.set_title("Distribution of Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Number of Reviews")
plt.show()

In [None]:
# Number of ratings per book (ISBN); reused by the threshold analysis further down
book_counts = df['ISBN'].value_counts()

print("Unique books:", df['ISBN'].nunique())
print("Total number of reviews:", len(df))


In [None]:
# Plain-text view of the ratings-per-book counts
book_counts_text = str(book_counts)
print(book_counts_text)

In [None]:
# Summary statistics of the number of ratings per book
book_count_summary = book_counts.describe()
book_count_summary

In [None]:
# Number of reviews written by each user
user_ids = df['User-ID']
user_counts = user_ids.value_counts()

In [None]:
# Summary statistics of the number of reviews per user
user_count_summary = user_counts.describe()
user_count_summary

In [None]:
# In item–item recommendations, each book should have at least a few reviews
# so that similarities can be calculated reliably.
#
# Below we investigate how many books and how many reviews are retained at
# different limits (1, 2, 4, 6, 8, 10 reviews per book).

thresholds = [1, 2, 4, 6, 8, 10]

# Denominators for the percentages; they do not change between thresholds
total_books = df['ISBN'].nunique()
total_ratings = len(df)

results = {}

for t in thresholds:
    # Per-book rating counts for the books that have at least t reviews
    kept_counts = book_counts[book_counts >= t]

    # Books remaining, and the reviews that belong to them.
    # Summing the per-book counts gives the same number as filtering df
    # by ISBN and taking its length, without scanning df for every threshold.
    num_books_remaining = len(kept_counts)
    num_ratings_remaining = int(kept_counts.sum())

    results[t] = {
        "num_books_remaining": num_books_remaining,
        "pct_books_remaining": 100 * num_books_remaining / total_books,
        "num_ratings_remaining": num_ratings_remaining,
        "pct_ratings_remaining": 100 * num_ratings_remaining / total_ratings
    }

# Build the summary table once, after every threshold has been evaluated
# (one row per threshold)
results_df = pd.DataFrame(results).T

results_df["pct_books_remaining"] = results_df["pct_books_remaining"].round(2)
results_df["pct_ratings_remaining"] = results_df["pct_ratings_remaining"].round(2)

print("=== Results of Threshold Filtering ===")
display(results_df)



In [None]:
# Share of books and of ratings that survive each minimum-review threshold
fig, ax = plt.subplots(figsize=(8, 5))

ax.plot(results_df.index, results_df["pct_books_remaining"],
        marker="o", label="% books remaining")

ax.plot(results_df.index, results_df["pct_ratings_remaining"],
        marker="o", label="% ratings remaining")

ax.set_title("Threshold analysis: impact on data volume")
# The threshold is a minimum *number of ratings* per book, not a rating value
ax.set_xlabel("Threshold (min ratings per book)")
ax.set_ylabel("Percent remaining (%)")

# One tick per evaluated threshold
ax.set_xticks(results_df.index)
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# Minimum number of ratings a book needs in order to be kept for the model
MIN_RATINGS_PER_BOOK = 3

# Normalise ISBNs (text, trimmed, upper-case) BEFORE counting, so that
# spelling variants of the same ISBN (case / surrounding whitespace) are
# counted as one book instead of being thresholded separately.
# Rows without an ISBN are dropped first; otherwise astype(str) would turn
# the missing values into a literal string that could be mistaken for a book.
df_filtered = df.dropna(subset=['ISBN']).copy()
df_filtered["ISBN"] = (
    df_filtered["ISBN"]
        .astype(str)
        .str.strip()
        .str.upper()
)

# Keep only books with enough ratings, counted on the normalised ISBNs
normalized_counts = df_filtered["ISBN"].value_counts()
books_to_keep = normalized_counts[normalized_counts >= MIN_RATINGS_PER_BOOK].index
df_filtered = df_filtered[df_filtered["ISBN"].isin(books_to_keep)].copy()

print("Shape after filtering:", df_filtered.shape)

Rename the columns to the format that Surprise expects (user, item, rating)

In [191]:
# Short column names (user, item, rating) used by every later cell
surprise_column_names = {
    'User-ID': 'user',
    'ISBN': 'item',
    'Book-Rating': 'rating',
}
df_filtered = df_filtered.rename(columns=surprise_column_names)

df_filtered.head()

Unnamed: 0,user,item,rating
6,276736,3257224281,8
8,276744,038550120X,7
16,276747,0060517794,9
19,276747,0671537458,9
20,276747,0679776818,8


Verify/check the rating scale (minimum and maximum) so that Surprise can interpret the ratings correctly.

In [None]:
# Lowest and highest rating present in the filtered data; used as the Reader's rating scale
min_rating = df_filtered['rating'].min()
max_rating = df_filtered['rating'].max()

print(f"Rating scale: {min_rating} to {max_rating}")

The filtered data is loaded into the Surprise library's Dataset object.

In [None]:
# Dataset and Reader are already imported in the first cell of the notebook.

# The Reader tells Surprise which rating range to expect
reader = Reader(rating_scale=(min_rating, max_rating))

# Columns are passed in the order user, item, rating
data = Dataset.load_from_df(
    df_filtered[['user', 'item', 'rating']],
    reader
)

80/20 train/test split, so the model can be evaluated on data it didn't see during training

In [None]:
# train_test_split is already imported in the first cell of the notebook.

# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split reproducible
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# 3. Training the item–item KNN model

We train an item–item based KNNBasic model using cosine similarity. This corresponds to the typical "since you liked this, you will probably like these too" logic.

In [None]:
# KNNBasic is already imported in the first cell of the notebook.

sim_options = {
    'name': 'cosine',
    'user_based': False  # False -> item-item similarity
}

algo = KNNBasic(sim_options=sim_options)

algo.fit(trainset)

# 4. Model evaluation

In [None]:
# Run the fitted model over every rating held out in the test set
predictions = algo.test(testset)

# Peek at the first five predictions
predictions[0:5]

In [None]:
# accuracy is already imported in the first cell of the notebook.

# Root-mean-square error and mean absolute error of the test-set predictions
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

In [None]:
# Flatten the predictions into a table, one row per test-set rating.
# error = predicted - true, so positive values are overestimates.
prediction_rows = []
for pred in predictions:
    prediction_rows.append({
        "user": pred.uid,
        "item": pred.iid,
        "true_rating": pred.r_ui,
        "predicted_rating": pred.est,
        "error": pred.est - pred.r_ui,
    })

pred_df = pd.DataFrame(prediction_rows)

pred_df.head()

In [None]:
# Histogram (with density curve) of the prediction errors
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(pred_df["error"], bins=50, kde=True, ax=ax)
ax.set_title("Error distribution (predicted - true)")
ax.set_xlabel("Error")
ax.set_ylabel("Count")
# Dashed reference line at zero error
ax.axvline(0, color='black', linestyle='--')
plt.show()

The error distribution is roughly symmetric around zero, which suggests that the model does not have strong systematic under- or overestimation.

Most of the errors are close to zero, but the distribution has wide tails: some predictions are off by several points.

In [None]:
# One box of prediction errors per true rating value
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=pred_df, x="true_rating", y="error", ax=ax)
ax.set_title("Errors across different rating values")
ax.set_xlabel("True rating")
ax.set_ylabel("Error")
# Dashed reference line at zero error
ax.axhline(0, color='black', linestyle='--')
plt.show()

The boxplot shows how the errors are distributed across the actual rating values.

The model seems to overestimate the lowest ratings badly, but the estimates between 7 and 9 are relatively accurate.

# 5. Book data (Books.csv) and recommendation functions

We load Books.csv to get book titles and other metadata (e.g. author, year of publication). We then combine the predictions with the book data and build recommendation functions:

- user-specific recommendations
- similar-book recommendations, looked up by book title.
We import Books.csv to display the recommended book titles and other metadata.

In [None]:
# Read every column as text (presumably so ISBNs are not parsed as numbers)
book_metadata_path = "Books_1.csv"
books = pd.read_csv(book_metadata_path, dtype=str)
books.head()

We will combine the predictions with Books.csv to include the book title and other information in the recommendations.

In [None]:
# Clean the metadata ISBNs the same way as the rating ISBNs
# (text, trimmed, upper-case) so the two tables can be joined on ISBN
isbn_text = books['ISBN'].astype(str)
books['ISBN'] = isbn_text.str.strip().str.upper()

Recommend books to the user based on predicted ratings

In [None]:
def recommend_books_for_user(algo, df_filtered, books, user_id, n=10):
    """Return the n unrated books with the highest predicted rating for a user.

    Parameters
    ----------
    algo : object with a ``predict(user_id, item_id)`` method whose result
        exposes ``.iid`` and ``.est`` (e.g. the fitted model from this notebook).
    df_filtered : DataFrame with at least the columns ``user`` and ``item``.
    books : DataFrame with at least the columns ``ISBN`` and ``Book-Title``.
    user_id : id of the user; it must have the same type as the values in
        ``df_filtered['user']``, otherwise none of the user's ratings match.
    n : maximum number of recommendations to return.

    Returns
    -------
    DataFrame with ``ISBN``, ``predicted_rating`` and the columns of ``books``,
    sorted by predicted rating (highest first). Books without a title in
    ``books`` are skipped, and the next-best candidates take their place, so
    up to n rows are returned whenever enough candidates have metadata.
    """
    # Books that the user has already rated are never recommended
    rated_items = set(df_filtered.loc[df_filtered['user'] == user_id, 'item'])
    candidates = [iid for iid in df_filtered['item'].unique() if iid not in rated_items]

    # Predict every remaining book and rank by estimated rating (stable for ties)
    predictions = sorted(
        (algo.predict(user_id, iid) for iid in candidates),
        key=lambda pred: pred.est,
        reverse=True,
    )

    # Explicit column names keep the merge working even with no candidates
    recs = pd.DataFrame(
        [(pred.iid, pred.est) for pred in predictions],
        columns=["ISBN", "predicted_rating"],
    )

    # Attach metadata, drop books missing from `books`, and only THEN cut to
    # n, so missing metadata does not shrink the result below n
    recs = recs.merge(books, on="ISBN", how="left")
    recs = recs.dropna(subset=["Book-Title"])

    return recs.head(n).reset_index(drop=True)

In [None]:
# Ratings.csv is read without dtype=str, so User-ID values are presumably
# integers; a string id such as "188100" would then match no rated books and
# be unknown to the model. NOTE(review): confirm the dtype of df_filtered['user'].
result = recommend_books_for_user(algo, df_filtered, books, user_id=188100, n=4)
result

Creating book recommendations based on a given book

In [None]:
# Function to search for books by name

def find_book_by_title(books, title):
    """Return the rows of `books` whose 'Book-Title' contains `title`.

    The search is case-insensitive and treats `title` as literal text, so
    characters such as '(', '+', '?' or '.' are matched as written instead of
    being interpreted as a regular expression. Rows with a missing title
    never match.
    """
    title_matches = books['Book-Title'].str.contains(title, case=False, na=False, regex=False)
    return books[title_matches]

In [None]:
# Function for recommending similar books, looked up by title, using the KNN model

def recommend_by_title(algo, books, title, n=10):
    """Recommend books similar to the first book whose title contains `title`.

    Parameters
    ----------
    algo : fitted model exposing ``trainset`` and ``get_neighbors``.
    books : DataFrame with at least the columns ``ISBN`` and ``Book-Title``.
    title : text to search for in the book titles.
    n : number of neighbours to request from the model.

    Returns
    -------
    DataFrame with the rows of ``books`` for the neighbouring books, in the
    order in which ``algo.get_neighbors`` returned them. A message string is
    returned instead when no title matches or when the matched book is not
    part of the training data.
    """
    # Find books whose title contains the search keyword
    matches = find_book_by_title(books, title)

    if matches.empty:
        return f"No matches found for book title '{title}'."

    # Use the first match
    target = matches.iloc[0]
    target_isbn = target['ISBN']

    print(f"Using ISBN {target_isbn} for book '{target['Book-Title']}'")

    trainset = algo.trainset

    # Convert the ISBN into Surprise's internal item id. Surprise raises
    # ValueError for ids that are not in the trainset; other errors are not hidden.
    try:
        target_inner_id = trainset.to_inner_iid(target_isbn)
    except ValueError:
        return "This book was not included in the training data, so similarities cannot be calculated."

    # Nearest neighbours of the target book, converted back to ISBNs
    neighbor_isbns = [
        trainset.to_raw_iid(neighbor_id)
        for neighbor_id in algo.get_neighbors(target_inner_id, k=n)
    ]

    # Metadata rows of the neighbours, re-ordered to follow the neighbour
    # order (a plain isin filter would keep the row order of `books` instead)
    neighbor_rank = {isbn: rank for rank, isbn in enumerate(neighbor_isbns)}
    recs = books[books['ISBN'].isin(neighbor_isbns)]
    row_order = recs['ISBN'].map(neighbor_rank).to_numpy().argsort(kind='stable')

    return recs.iloc[row_order]

In [None]:
# Example: the three books closest to the first title containing "lord of the rings"
recommend_by_title(algo=algo, books=books, title="lord of the rings", n=3)

In [None]:
def recommend_books_for_user(algo, df_filtered, books, user_id, n=10):
    """Return the n unrated books with the highest predicted rating for a user.

    NOTE(review): this cell redefines a function that is already defined
    earlier in the notebook, and the later definition replaces the earlier
    one. Keep only one of the two definitions.

    Parameters
    ----------
    algo : object with a ``predict(user_id, item_id)`` method whose result
        exposes ``.iid`` and ``.est``.
    df_filtered : DataFrame with at least the columns ``user`` and ``item``.
    books : DataFrame with at least the columns ``ISBN`` and ``Book-Title``.
    user_id : id of the user; it must have the same type as the values in
        ``df_filtered['user']``, otherwise none of the user's ratings match.
    n : maximum number of recommendations to return.

    Returns
    -------
    DataFrame with ``ISBN``, ``predicted_rating`` and the columns of ``books``,
    sorted by predicted rating (highest first). Books without a title in
    ``books`` are skipped, and the next-best candidates take their place.
    """
    # Books that the user has already rated are never recommended
    rated_items = set(df_filtered.loc[df_filtered['user'] == user_id, 'item'])
    candidates = [iid for iid in df_filtered['item'].unique() if iid not in rated_items]

    # Predict every remaining book and rank by estimated rating (stable for ties)
    predictions = sorted(
        (algo.predict(user_id, iid) for iid in candidates),
        key=lambda pred: pred.est,
        reverse=True,
    )

    # Explicit column names keep the merge working even with no candidates
    recs = pd.DataFrame(
        [(pred.iid, pred.est) for pred in predictions],
        columns=["ISBN", "predicted_rating"],
    )

    # Attach metadata, drop books missing from `books`, and only then cut to n
    recs = recs.merge(books, on="ISBN", how="left")
    recs = recs.dropna(subset=["Book-Title"])

    # The result must be returned; without it every call evaluates to None
    return recs.head(n).reset_index(drop=True)


In [None]:
!python -m pip install nbconvert

import nbformat
from nbconvert import HTMLExporter

# Load notebook
with open("book_reco_improved.ipynb") as f:
    nb = nbformat.read(f, as_version=4)

# Convert to HTML
html_exporter = HTMLExporter()
(body, resources) = html_exporter.from_notebook_node(nb)

# Save HTML
with open("book_reco_improved.html", "w", encoding="utf-8") as f:
    f.write(body)