# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
from IPython.display import display, clear_output
from contextlib import contextmanager
import warnings

# Suppressing warnings
warnings.filterwarnings('ignore')

# Load data
# All three tables are semicolon-separated, latin-1 encoded CSV files.
# `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'` is
# its documented replacement: malformed rows are skipped (with a warning)
# instead of aborting the load.
books = pd.read_csv('data/books.csv', sep=';', on_bad_lines='warn', encoding="latin-1", dtype=object)
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher']

users = pd.read_csv('data/users.csv', sep=';', on_bad_lines='warn', encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']

# Ratings are read as strings (dtype=object) and converted to integers later.
ratings = pd.read_csv('data/ratings.csv', sep=';', on_bad_lines='warn', encoding="latin-1", dtype=object)
ratings.columns = ['userID', 'ISBN', 'bookRating']

# Data preprocessing
# Handling invalid years in the 'yearOfPublication' column
# The column was loaded as strings (dtype=object), so it is converted to
# numbers first; anything that is not numeric becomes NaN and is imputed
# together with the other invalid years instead of crashing the cast.
publication_year = pd.to_numeric(books['yearOfPublication'], errors='coerce')
invalid_year = (publication_year > 2006) | (publication_year == 0)
publication_year = publication_year.mask(invalid_year)
# Impute with the rounded mean of the remaining valid years. The result is
# assigned back explicitly: an in-place fillna on `books.yearOfPublication`
# may act on a temporary and leave `books` unchanged.
books['yearOfPublication'] = publication_year.fillna(round(publication_year.mean())).astype(np.int32)

# Handling invalid ages in the 'Age' column
# Ages above 90 or below 5 are treated as missing and replaced by the mean age.
invalid_age = (users.Age > 90) | (users.Age < 5)
users.loc[invalid_age, 'Age'] = np.nan
users['Age'] = users.Age.fillna(users.Age.mean()).astype(np.int32)

# Filtering ratings based on available books and users
ratings['userID'] = ratings.userID.astype(np.int64)
ratings['bookRating'] = ratings.bookRating.astype(np.int64)
known_book = ratings.ISBN.isin(books.ISBN)
known_user = ratings.userID.isin(users.userID)
# `ratings_new` keeps only ratings whose book AND user are both known;
# `ratings` keeps every rating made by a known user (as before).
ratings_new = ratings[known_book & known_user]
ratings = ratings[known_user]

# Displaying some information about the dataset
# Sparsity = share of the (users x books) grid that has no rating at all.
n_users, n_books = len(users), len(books)
sparsity = 1.0 - len(ratings_new) / float(n_users * n_books)
print('The sparsity level of Book Crossing dataset is {:.2%}'.format(sparsity))

# Separating explicit and implicit ratings
# A rating of 0 is treated as implicit; any other value is explicit.
is_implicit = ratings_new.bookRating == 0
ratings_explicit = ratings_new[~is_implicit]
ratings_implicit = ratings_new[is_implicit]

# Exploring and handling explicit ratings data
# Users that appear in each of the two rating subsets.
users_exp_ratings = users[users.userID.isin(ratings_explicit.userID)]
users_imp_ratings = users[users.userID.isin(ratings_implicit.userID)]

# Keep users with at least one explicit rating ...
counts1 = ratings_explicit['userID'].value_counts()
kept_users = counts1.loc[counts1 >= 1].index
ratings_explicit = ratings_explicit[ratings_explicit['userID'].isin(kept_users)]

# ... and rating values that occur at least once.
counts = ratings_explicit['bookRating'].value_counts()
kept_rating_values = counts.loc[counts >= 1].index
ratings_explicit = ratings_explicit[ratings_explicit['bookRating'].isin(kept_rating_values)]

# Generating ratings matrix from explicit ratings table
# One row per userID, one column per ISBN; missing (user, book) pairs become 0.
ratings_matrix = (
    ratings_explicit
    .pivot(index='userID', columns='ISBN', values='bookRating')
    .fillna(0)
    .astype(np.int32)
)

# Sparsity measured against users with explicit ratings and all known books.
explicit_grid_size = float(users_exp_ratings.shape[0] * n_books)
sparsity = 1.0 - len(ratings_explicit) / explicit_grid_size
print('The sparsity level of Book Crossing dataset is {:.2%}'.format(sparsity))

# Setting global variables
# Module-level defaults picked up by the `metric=metric, k=k` parameters of
# the neighbour-search and prediction functions. A `global` statement has no
# effect at module scope, so plain assignments are enough.
k = 3  # number of neighbours to use
metric = 'cosine'  # distance metric handed to NearestNeighbors

# Function to find k similar users given the user_id and ratings matrix
def findksimilarusers(user_id, ratings, metric=metric, k=k):
    """Find the rows of `ratings` that are nearest to the row of `user_id`.

    Args:
        user_id: Label of the query user; must be present in `ratings.index`.
        ratings: DataFrame whose rows are users (looked up via its index).
        metric: Distance metric passed to `NearestNeighbors`.
        k: Number of neighbours wanted. `k + 1` are requested because the
            query row is itself part of the fitted data and is normally
            returned as its own nearest neighbour.

    Returns:
        A tuple `(similarities, indices)`: `similarities` is a 1-D array of
        `1 - distance` values and `indices` is the 2-D array of row positions
        returned by `kneighbors` (shape `(1, n_neighbors)`).

    Raises:
        KeyError: If `user_id` is not in `ratings.index`.
    """
    # Resolve the label first so an unknown user fails before the model is fit.
    loc = ratings.index.get_loc(user_id)

    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(ratings)

    query = ratings.iloc[loc, :].values.reshape(1, -1)
    # kneighbors raises if asked for more neighbours than fitted rows.
    n_neighbors = min(k + 1, len(ratings))
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)

    similarities = 1 - distances.flatten()
    return similarities, indices

# Function to predict rating for a specified user-item combination based on user-based approach
def predict_userbased(user_id, item_id, ratings, metric=metric, k=k):
    """Predict the rating of `item_id` by `user_id` from similar users.

    The prediction is the user's own row mean plus the similarity-weighted
    average of each neighbour's deviation from that neighbour's row mean,
    rounded to an integer and clamped to the range 1..10. Row means are taken
    over every column of `ratings` as stored.

    Args:
        user_id: Label of the target user in `ratings.index`.
        item_id: Label of the target item in `ratings.columns`.
        ratings: User x item ratings DataFrame.
        metric: Distance metric forwarded to `findksimilarusers`.
        k: Number of neighbours to use.

    Returns:
        The predicted rating as an int between 1 and 10. The prediction is
        also printed.

    Raises:
        KeyError: If `user_id` or `item_id` is not present in `ratings`.
    """
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric, k)
    neighbour_locs = indices.flatten()

    # Drop the query user from the neighbour list and keep at most k others.
    # The weight sum is taken over exactly these neighbours: subtracting a
    # fixed 1 for "self" is wrong when tied distances push the query user out
    # of the returned set, and is subject to floating-point error otherwise.
    is_other_user = neighbour_locs != user_loc
    neighbour_locs = neighbour_locs[is_other_user][:k]
    neighbour_sims = similarities[is_other_user][:k]

    mean_rating = ratings.iloc[user_loc, :].mean()
    neighbour_rows = ratings.iloc[neighbour_locs, :]
    ratings_diff = neighbour_rows.iloc[:, item_loc].values - neighbour_rows.mean(axis=1).values
    wtd_sum = np.sum(ratings_diff * neighbour_sims)

    sum_wt = np.sum(neighbour_sims)
    if sum_wt == 0.0:
        # Avoid dividing by zero when no neighbour carries any weight.
        sum_wt = 0.1

    prediction = int(round(mean_rating + (wtd_sum / sum_wt)))
    # Clamp to the valid rating scale.
    prediction = min(max(prediction, 1), 10)

    print('\nPredicted rating for user {} -> item {}: {}'.format(user_id, item_id, prediction))
    return prediction

# Function to find k similar items given the item_id and ratings matrix
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    """Find the columns of `ratings` that are nearest to the column `item_id`.

    The matrix is transposed so that every item becomes a row, then the same
    nearest-neighbour search as for users is run on those rows.

    Args:
        item_id: Label of the query item; must be present in `ratings.columns`.
        ratings: DataFrame whose columns are items.
        metric: Distance metric passed to `NearestNeighbors`.
        k: Number of neighbours wanted. `k + 1` are requested because the
            query item is itself part of the fitted data and is normally
            returned as its own nearest neighbour.

    Returns:
        A tuple `(similarities, indices)`: `similarities` is a 1-D array of
        `1 - distance` values and `indices` is the 2-D array of item
        positions returned by `kneighbors` (shape `(1, n_neighbors)`).

    Raises:
        KeyError: If `item_id` is not in `ratings.columns`.
    """
    item_vectors = ratings.T
    loc = item_vectors.index.get_loc(item_id)

    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(item_vectors)

    query = item_vectors.iloc[loc, :].values.reshape(1, -1)
    # kneighbors raises if asked for more neighbours than fitted rows.
    n_neighbors = min(k + 1, len(item_vectors))
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)

    similarities = 1 - distances.flatten()
    return similarities, indices

# Function to predict rating for a specified user-item combination based on item-based approach
def predict_itembased(user_id, item_id, ratings, metric=metric, k=k):
    prediction = wtd_sum = 0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilaritems(item_id, ratings)
    sum_wt = np.sum(similarities) - 1
    product = 1
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == item_loc:
            continue
        else:
            product = ratings.iloc[user_loc, indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product

    if sum_wt == 0.0:
        sum_wt = 0.1

    prediction = int(round(wtd_sum / sum_wt))

    if prediction <= 0:
        prediction = 1
    elif prediction > 10:
       
