In [2]:
import pandas as pd 
import re
def clean_title(title):
    return re.sub(r"\s*\(.*?\)", "", title)
pd.set_option('display.max_columns', None)

def clean_book_titles(book_list):
    cleaned_books = []
    for book in book_list:
        match = re.match(r'^(.*?)\s\((.*?),\s#\d+\)$', book)
        if match:
            title = match.group(1).strip()
            series = match.group(2).strip()
            if title.lower() != series.lower():
                cleaned_books.append(f"{series} {title}")
            else:
                cleaned_books.append(title)
        else:
            cleaned_books.append(book)
    return cleaned_books

In [3]:
ratings = pd.read_csv("C:\\Users\\JaredPeck\\Documents\\reccomendation_system\\app\\csv\\ratings.csv")
books = pd.read_csv("C:\\Users\\JaredPeck\\Documents\\reccomendation_system\\app\\csv\\books.csv")

In [4]:
combined_df = books.merge(ratings,  on = "book_id")

In [5]:
cleaned_df =  combined_df[["book_id", 'original_publication_year', "title",   'authors', 'language_code', 'average_rating',
                     'ratings_count', 'work_ratings_count', 'work_text_reviews_count','user_id', 'rating', "image_url"]]



In [6]:
cleaned_df["title"][:5]

0    The Hunger Games (The Hunger Games, #1)
1    The Hunger Games (The Hunger Games, #1)
2    The Hunger Games (The Hunger Games, #1)
3    The Hunger Games (The Hunger Games, #1)
4    The Hunger Games (The Hunger Games, #1)
Name: title, dtype: object

In [7]:
x = cleaned_df["user_id"].value_counts() > 100
y= x[x].index

In [8]:
ratings_with_books = cleaned_df[cleaned_df['user_id'].isin(y)]
ratings_with_books

Unnamed: 0,book_id,original_publication_year,title,authors,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,user_id,rating,image_url
0,1,2008.0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,eng,4.34,4780653,4942365,155254,2886,5,https://images.gr-assets.com/books/1447303603m...
1,1,2008.0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,eng,4.34,4780653,4942365,155254,6158,5,https://images.gr-assets.com/books/1447303603m...
2,1,2008.0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,eng,4.34,4780653,4942365,155254,3991,4,https://images.gr-assets.com/books/1447303603m...
4,1,2008.0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,eng,4.34,4780653,4942365,155254,5721,5,https://images.gr-assets.com/books/1447303603m...
5,1,2008.0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,eng,4.34,4780653,4942365,155254,5034,5,https://images.gr-assets.com/books/1447303603m...
...,...,...,...,...,...,...,...,...,...,...,...,...
5976472,10000,1998.0,The First World War,John Keegan,,4.00,9162,9700,364,51328,1,https://images.gr-assets.com/books/1403194704m...
5976474,10000,1998.0,The First World War,John Keegan,,4.00,9162,9700,364,35336,4,https://images.gr-assets.com/books/1403194704m...
5976476,10000,1998.0,The First World War,John Keegan,,4.00,9162,9700,364,49007,4,https://images.gr-assets.com/books/1403194704m...
5976477,10000,1998.0,The First World War,John Keegan,,4.00,9162,9700,364,43319,5,https://images.gr-assets.com/books/1403194704m...


In [9]:
number_rating = ratings_with_books.groupby('title')['rating'].count().reset_index()

In [10]:
number_rating.rename(columns={'rating':'num_of_rating'},inplace=True)

In [11]:
number_rating.head()

Unnamed: 0,title,num_of_rating
0,"Angels (Walsh Family, #3)",199
1,#GIRLBOSS,120
2,'Salem's Lot,3804
3,"'Tis (Frank McCourt, #2)",600
4,"1,000 Places to See Before You Die",317


In [12]:
final_rating = ratings_with_books.merge(number_rating, on='title')
final_rating["title"] = clean_book_titles(final_rating["title"])
final_rating["title"]

0             The Hunger Games
1             The Hunger Games
2             The Hunger Games
3             The Hunger Games
4             The Hunger Games
                  ...         
4527132    The First World War
4527133    The First World War
4527134    The First World War
4527135    The First World War
4527136    The First World War
Name: title, Length: 4527137, dtype: object

In [13]:
final_rating = final_rating[final_rating['num_of_rating'] >= 50]

In [28]:
final_rating = final_rating.drop_duplicates(['user_id','title'])
final_rating["title"] = final_rating["title"].apply(lambda x: x.lower())

In [29]:
final_rating

Unnamed: 0,book_id,original_publication_year,title,authors,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,user_id,rating,image_url,num_of_rating
0,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,2886,5,https://images.gr-assets.com/books/1447303603m...,16667
1,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,6158,5,https://images.gr-assets.com/books/1447303603m...,16667
2,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,3991,4,https://images.gr-assets.com/books/1447303603m...,16667
3,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,5721,5,https://images.gr-assets.com/books/1447303603m...,16667
4,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,5034,5,https://images.gr-assets.com/books/1447303603m...,16667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4527132,10000,1998.0,the first world war,John Keegan,,4.00,9162,9700,364,51328,1,https://images.gr-assets.com/books/1403194704m...,73
4527133,10000,1998.0,the first world war,John Keegan,,4.00,9162,9700,364,35336,4,https://images.gr-assets.com/books/1403194704m...,73
4527134,10000,1998.0,the first world war,John Keegan,,4.00,9162,9700,364,49007,4,https://images.gr-assets.com/books/1403194704m...,73
4527135,10000,1998.0,the first world war,John Keegan,,4.00,9162,9700,364,43319,5,https://images.gr-assets.com/books/1403194704m...,73


In [30]:
final_rating.to_csv("final_rating_df.csv")

In [31]:
test_df  = pd.read_csv("final_rating_df.csv")
test_df

Unnamed: 0.1,Unnamed: 0,book_id,original_publication_year,title,authors,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,user_id,rating,image_url,num_of_rating
0,0,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,2886,5,https://images.gr-assets.com/books/1447303603m...,16667
1,1,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,6158,5,https://images.gr-assets.com/books/1447303603m...,16667
2,2,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,3991,4,https://images.gr-assets.com/books/1447303603m...,16667
3,3,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,5721,5,https://images.gr-assets.com/books/1447303603m...,16667
4,4,1,2008.0,the hunger games,Suzanne Collins,eng,4.34,4780653,4942365,155254,5034,5,https://images.gr-assets.com/books/1447303603m...,16667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4502745,4527132,10000,1998.0,the first world war,John Keegan,,4.00,9162,9700,364,51328,1,https://images.gr-assets.com/books/1403194704m...,73
4502746,4527133,10000,1998.0,the first world war,John Keegan,,4.00,9162,9700,364,35336,4,https://images.gr-assets.com/books/1403194704m...,73
4502747,4527134,10000,1998.0,the first world war,John Keegan,,4.00,9162,9700,364,49007,4,https://images.gr-assets.com/books/1403194704m...,73
4502748,4527135,10000,1998.0,the first world war,John Keegan,,4.00,9162,9700,364,43319,5,https://images.gr-assets.com/books/1403194704m...,73


In [32]:
book_pivot = final_rating.pivot_table(columns='user_id', index='title', values= 'rating')

In [33]:
# always want to fill NA values with 0's before sparse matrix this bitch :) 
book_pivot = book_pivot.fillna(0)

In [34]:
len(book_pivot)

9175

In [35]:
# pass in the csr_matrix to make it computationally faster
from scipy.sparse import csr_matrix

In [36]:
# add pivot table to the sparse matrix 
book_sparse = csr_matrix(book_pivot)

In [37]:
import pickle 
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm= 'brute')
model.fit(book_sparse)

with open('nearest_neighbors_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [42]:

from typing import List
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

def get_book_vectors(df: pd.DataFrame, book_titles: List[str]) -> np.ndarray:
    book_vectors = []
    for title in book_titles:
        if title in df.index:
            book_vectors.append(df.loc[title].values)
        else:
            raise ValueError(f"The book '{title}' is not in the DataFrame index.")
    return np.array(book_vectors)

def books_recommendation(user_liked_books: List[str], model: NearestNeighbors, df: pd.DataFrame, n_neighbors: int = 8) -> List[str]:
    book_vectors = get_book_vectors(df, user_liked_books)
    average_vector = np.mean(book_vectors, axis=0).reshape(1, -1)
    distance, suggestion = model.kneighbors(average_vector, n_neighbors=n_neighbors + len(user_liked_books))
    
    suggested_books = [df.index[suggestion[0][i]] for i in range(len(suggestion[0]))]
    
    filtered_suggestions = [book for book in suggested_books if book not in user_liked_books][:n_neighbors]
    
    return filtered_suggestions

user_liked_books = [
  
    'Twilight'.lower()
]

suggested_books = books_recommendation(user_liked_books, model, book_pivot, 9)
for book in suggested_books:
    print(book.capitalize())


Twilight new moon
Twilight eclipse
Twilight breaking dawn
The host
Twilight: the complete illustrated movie companion
Fifty shades fifty shades of grey
House of night marked
Sookie stackhouse dead until dark
House of night untamed


In [41]:
with open("nearest_neighbors_model.pkl", "rb") as file:
    pickled_model = pickle.load(file)
    
suggested_books = books_recommendation(user_liked_books, model, book_pivot, 9)
suggested_books



['twilight new moon',
 'twilight eclipse',
 'twilight breaking dawn',
 'the host',
 'twilight: the complete illustrated movie companion',
 'fifty shades fifty shades of grey',
 'house of night marked',
 'sookie stackhouse dead until dark',
 'house of night untamed']