# EDA

In [2]:
import pandas as pd
import numpy as np

book_data = pd.read_csv('bookData.csv', encoding='Windows-1252')

user_data = pd.read_csv('UserData.csv', encoding='Windows-1252')

user_historical_view = pd.read_csv('UserHistoricalView.csv', encoding='Windows-1252')

test_user_answers = pd.read_csv('TestUserAnswers.csv', encoding='Windows-1252')

In [3]:
# remove X for isbn
for df in [user_historical_view, test_user_answers, book_data]:
    df['isbn'] = df['isbn'].str.replace('X', '')

In [4]:
# Find missing values

print("Book Data Missing Values:")
print(book_data.isna().sum())

print("\nUser Data Missing Values:")
print(user_data.isna().sum())

print("\nUser Historical View Missing Values:")
print(user_historical_view.isna().sum())

print("\nTest User Answers Missing Values:")
print(test_user_answers.isna().sum())

Book Data Missing Values:
isbn                 0
booktitle            0
bookauthor           0
yearofpublication    0
publisher            0
Synopsis             1
dtype: int64

User Data Missing Values:
userid      0
location    0
age         3
dtype: int64

User Historical View Missing Values:
userid    0
isbn      0
dtype: int64

Test User Answers Missing Values:
userid    0
isbn      0
dtype: int64


In [5]:
# check duplicates
print('book_data duplicates')
print(book_data.duplicated().sum())

print('user_historical view duplicates')
print(user_historical_view.duplicated().sum())

print('user_data duplicates')
print(user_data.duplicated().sum())

book_data duplicates
0
user_historical view duplicates
0
user_data duplicates
0


In [6]:
# dropping missing synopsis as we dont know the info about book
book_data.dropna(subset=['Synopsis'], inplace=True)


In [7]:
# find duplicates in synopsis
duplicated_indices = book_data[book_data['Synopsis'].duplicated(keep=False)].index
print(duplicated_indices)

Index([36, 68, 89, 94], dtype='int64')


In [8]:
print(book_data.loc[68])
print(book_data.loc[94])

isbn                                                         312278586
booktitle                                            The Nanny Diaries
bookauthor                                             Emma McLaughlin
yearofpublication                                                 2002
publisher                                           St. Martin s Press
Synopsis             A college graduate gets employed as a nanny fo...
Name: 68, dtype: object
isbn                                                         312291639
booktitle                                   The Nanny Diaries: A Novel
bookauthor                                             Emma McLaughlin
yearofpublication                                                 2003
publisher                                         St. Martin s Griffin
Synopsis             A college graduate gets employed as a nanny fo...
Name: 94, dtype: object


In [9]:
# find duplicates in booktitle
duplicated_indices = book_data[book_data['booktitle'].duplicated(keep=False)].index
print(duplicated_indices)

Index([42, 50, 51, 100, 108, 109], dtype='int64')


In [10]:
print(book_data.loc[100])
print(book_data.loc[108])

isbn                                                         439136350
booktitle            Harry Potter and the Prisoner of Azkaban (Book 3)
bookauthor                                               J. K. Rowling
yearofpublication                                                 1999
publisher                                                   Scholastic
Synopsis             Harry Potter and the Prisoner of Azkaban is a ...
Name: 100, dtype: object
isbn                                                         439139597
booktitle                 Harry Potter and the Goblet of Fire (Book 4)
bookauthor                                               J. K. Rowling
yearofpublication                                                 2000
publisher                                                   Scholastic
Synopsis             In this thrilling installment Harry finds hims...
Name: 108, dtype: object


# Part 1 creating user profiles

In [12]:
def create_boolean_df(column):
    return pd.get_dummies(book_data[column])

book_titles_df = create_boolean_df('booktitle')
book_authors_df = create_boolean_df('bookauthor')
publishers_df = create_boolean_df('publisher')
year_df = create_boolean_df('yearofpublication')
# Concatenate boolean dataframes with ISBN
boolean_book_data = pd.concat([book_data['isbn'], book_titles_df, book_authors_df, publishers_df,year_df], axis=1)

boolean_book_data

Unnamed: 0,isbn,1984,1st to Die,A Heartbreaking Work of Staggering Genius,A Painted House,A Prayer for Owen Meany,A Time to Kill,A Walk in the Woods: Rediscovering America on the Appalachian Trail (Official Guides to the Appalachian Trail),A Walk to Remember,ANGELA S ASHES,...,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
0,440234743,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,971880107,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,345417623,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,446310786,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,671027360,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,439064864,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
110,043935806,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
111,440220602,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
112,671001795,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [13]:
# Merge user data with historical views and boolean book data
merged = pd.merge(user_data, user_historical_view, on='userid').merge(boolean_book_data, on='isbn', how='inner')
merged.drop(columns=['location', 'age'], inplace=True)
merged

Unnamed: 0,userid,isbn,1984,1st to Die,A Heartbreaking Work of Staggering Genius,A Painted House,A Prayer for Owen Meany,A Time to Kill,A Walk in the Woods: Rediscovering America on the Appalachian Trail (Official Guides to the Appalachian Trail),A Walk to Remember,...,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
0,11676,60938455,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,11676,316096199,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,11676,316569321,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,11676,312195516,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
4,35859,312195516,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,204864,812550706,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
84,271448,60928336,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
85,271448,312291639,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
86,271448,044023722,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [14]:
agg = merged.groupby('userid')['isbn'].apply(list).reset_index()
profile_df = merged.groupby('userid').any().reset_index()
profile_df = profile_df.assign(userid=agg['userid'], isbn=agg['isbn'])

profile_df

Unnamed: 0,userid,isbn,1984,1st to Die,A Heartbreaking Work of Staggering Genius,A Painted House,A Prayer for Owen Meany,A Time to Kill,A Walk in the Woods: Rediscovering America on the Appalachian Trail (Official Guides to the Appalachian Trail),A Walk to Remember,...,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
0,11676,"[60938455, 316096199, 316569321, 312195516, 34...",False,True,False,False,True,False,False,False,...,True,True,False,True,True,True,True,True,False,False
1,16795,"[142001740, 316666343, 446605239, 385504209, 6...",False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,True,False
2,22625,"[439064864, 059035342, 067976402, 316666343, 3...",False,False,False,False,False,False,False,False,...,True,False,True,True,True,False,False,True,True,False
3,35859,"[312195516, 059035342, 61009059, 345342968, 04...",False,False,False,False,False,False,False,False,...,True,False,True,True,True,False,False,False,False,False
4,95359,"[345361792, 449212602, 316601950, 043935806, 3...",False,False,False,False,True,False,False,False,...,False,False,True,True,True,False,False,True,True,False
5,104636,"[439139600, 446610038, 446672211, 059035342, 0...",False,True,False,False,False,True,False,False,...,False,False,False,True,True,True,True,True,False,False
6,110912,"[345361792, 439064864, 385484518, 068484477, 6...",False,False,False,False,True,False,False,False,...,False,False,True,False,True,True,False,True,True,False
7,204864,"[312195516, 449212602, 067976402, 316769487, 1...",False,False,False,False,False,False,False,False,...,True,True,False,True,False,True,False,True,True,False
8,271448,"[439139600, 446672211, 452282829, 316666343, 0...",False,False,False,True,False,False,False,False,...,False,True,True,True,False,False,True,True,True,False


# calcualting similarity matrix

In [15]:
def cosine_similarity(user_row, book_data):
    dot_product = np.dot(user_row, book_data.T)
    magnitude_user = np.linalg.norm(user_row)
    magnitude_book = np.linalg.norm(book_data, axis=1)
    return dot_product / (magnitude_user * magnitude_book)

User_Calc = profile_df.iloc[:, 2:]
book_calc = boolean_book_data.iloc[:, 1:]

cosine_sim_matrix = np.array([cosine_similarity(user_row, book_calc.values) for _, user_row in User_Calc.iterrows()])
cosine_similarity_df = pd.DataFrame(cosine_sim_matrix, index=profile_df['userid'], columns=book_data['booktitle'])

cosine_similarity_df

booktitle,The Testament,Wild Animus,Timeline,To Kill a Mockingbird,Angels and Demons,Little Altars Everywhere,The Firm,Fast Food Nation: The Dark Side of the AllAmerican Meal,Where the Heart Is,Icy Sparks,...,House of Sand and Fog,Silence of the Lambs,Angela s Ashes (MMP) : A Memoir,The Pilot s Wife : A Novel,Harry Potter and the Goblet of Fire (Book 4),Harry Potter and the Chamber of Secrets (Book 2),Harry Potter and the Order of the Phoenix (Book 5),The Chamber,Two for the Dough,The Horse Whisperer
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11676,0.066815,0.0,0.066815,0.0,0.066815,0.066815,0.0,0.066815,0.066815,0.066815,...,0.066815,0.0,0.066815,0.066815,0.066815,0.066815,0.066815,0.066815,0.066815,0.066815
16795,0.0,0.0,0.0,0.0,0.117851,0.0,0.0,0.117851,0.117851,0.117851,...,0.0,0.0,0.0,0.0,0.0,0.0,0.117851,0.0,0.117851,0.0
22625,0.082199,0.0,0.082199,0.0,0.0,0.0,0.0,0.082199,0.082199,0.0,...,0.0,0.082199,0.082199,0.082199,0.082199,0.082199,0.082199,0.082199,0.0,0.0
35859,0.102062,0.0,0.0,0.0,0.0,0.0,0.102062,0.0,0.102062,0.0,...,0.0,0.0,0.102062,0.102062,0.102062,0.102062,0.102062,0.102062,0.102062,0.0
95359,0.085749,0.0,0.085749,0.085749,0.0,0.0,0.0,0.085749,0.085749,0.0,...,0.0,0.0,0.085749,0.085749,0.085749,0.085749,0.085749,0.0,0.0,0.0
104636,0.092848,0.0,0.092848,0.0,0.092848,0.0,0.092848,0.092848,0.092848,0.092848,...,0.092848,0.0,0.092848,0.092848,0.092848,0.092848,0.092848,0.092848,0.0,0.092848
110912,0.081111,0.0,0.081111,0.0,0.0,0.0,0.0,0.081111,0.0,0.081111,...,0.081111,0.0,0.081111,0.081111,0.081111,0.081111,0.081111,0.081111,0.0,0.0
204864,0.0,0.0,0.080064,0.0,0.0,0.080064,0.0,0.080064,0.080064,0.0,...,0.080064,0.080064,0.080064,0.0,0.080064,0.0,0.080064,0.080064,0.080064,0.080064
271448,0.088388,0.0,0.0,0.0,0.088388,0.088388,0.088388,0.088388,0.088388,0.088388,...,0.0,0.0,0.0,0.0,0.088388,0.088388,0.088388,0.088388,0.088388,0.088388


# Displaying top books

In [16]:
# Map book titles to isbns
isbn_title_map = book_data.set_index('booktitle')['isbn'].to_dict()

def get_user_read_books(user_id):
    read_books_isbns = user_historical_view[user_historical_view['userid'] == user_id]['isbn'].tolist()
    return book_data[book_data['isbn'].isin(read_books_isbns)]['booktitle'].tolist()

def get_user_similarity_scores(user_id, max_threshold=1.0):
    similarity_scores = cosine_similarity_df.loc[user_id]
    return similarity_scores.clip(upper=max_threshold)

def filter_unread_books(user_id, read_books_titles):
    similarity_scores = get_user_similarity_scores(user_id)
    return similarity_scores.drop(index=read_books_titles, errors='ignore')

def get_top_recommendations(unread_books):
    return unread_books.sort_values(ascending=False).head(5)

def create_recommendation_list(user_id, top_5_books):
    return [{
        'User ID': user_id,
        'Book’s ISBN': isbn_title_map[title],
        'Book’s Title': title,
        'Similarity Value': similarity
    } for title, similarity in top_5_books.items()]

# Generate recommendations
recommendations_list = []
for user_id in user_data['userid']:
    read_books_titles = get_user_read_books(user_id)
    unread_books = filter_unread_books(user_id, read_books_titles)
    top_5_books = get_top_recommendations(unread_books)
    recommendations_list.extend(create_recommendation_list(user_id, top_5_books))

# Convert to DataFrame and add rank
recommendations_df = pd.DataFrame(recommendations_list)
recommendations_df['Rank'] = recommendations_df.groupby('User ID').cumcount() + 1

# Save to file


recommendations_df

Unnamed: 0,User ID,Book’s ISBN,Book’s Title,Similarity Value,Rank
0,11676,440234743,The Testament,0.066815,1
1,11676,440225701,The Street Lawyer,0.066815,2
2,11676,380789035,American Gods,0.066815,3
3,11676,743237188,Fall On Your Knees (Oprah #45),0.066815,4
4,11676,316776963,Me Talk Pretty One Day,0.066815,5
5,16795,590353403,Harry Potter and the Sorcerer s Stone (Book 1),0.117851,1
6,16795,439139597,Harry Potter and the Goblet of Fire (Book 4),0.117851,2
7,16795,312291639,The Nanny Diaries: A Novel,0.117851,3
8,16795,385484518,Tuesdays with Morrie: An Old Man a Young Man a...,0.117851,4
9,16795,60392452,Stupid White Men ...and Other Sorry Excuses fo...,0.117851,5


# Part 2 create user profiles

In [17]:
# Calculate tf idf for books
boolean_book_no_isbn = boolean_book_data.iloc[:, 1:]  # removed isbn 
tf_book = np.log1p(boolean_book_no_isbn.div(boolean_book_no_isbn.sum(axis=1), axis=0).values)  # logarithmic TF

# Calculate idf
N = tf_book.shape[0]  # total documents
doc_frequency = np.sum(tf_book > 0, axis=0)  # document frequency 
idf_book = np.log(N / (doc_frequency + 1))  # smoothed IDF

# Calculate tf idf
tfidf_book = tf_book * idf_book
tfidf_book = tfidf_book / np.linalg.norm(tfidf_book, axis=1, keepdims=True)  # L2 normalization
tfidf_df = pd.DataFrame(tfidf_book, columns=boolean_book_no_isbn.columns)
tfidf_df.insert(0, 'isbn', boolean_book_data['isbn'])

# Calculate tfidf profiles
tf_profile = np.log1p(profile_df.iloc[:, 2:].div(profile_df.iloc[:, 2:].sum(axis=1), axis=0).values)  # TF with log1p
tfidf_profile = tf_profile * idf_book
tfidf_profile = tfidf_profile / np.linalg.norm(tfidf_profile, axis=1, keepdims=True)  # L2 normalization


tfidf_profile_df = pd.DataFrame(tfidf_profile, columns=profile_df.columns[2:])
tfidf_profile_df = pd.concat([profile_df.iloc[:, :2], tfidf_profile_df], axis=1)

tfidf_profile_df


Unnamed: 0,userid,isbn,1984,1st to Die,A Heartbreaking Work of Staggering Genius,A Painted House,A Prayer for Owen Meany,A Time to Kill,A Walk in the Woods: Rediscovering America on the Appalachian Trail (Official Guides to the Appalachian Trail),A Walk to Remember,...,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
0,11676,"[60938455, 316096199, 316569321, 312195516, 34...",0.0,0.1486,0.0,0.0,0.1486,0.0,0.0,0.0,...,0.114849,0.089317,0.0,0.085806,0.069771,0.089317,0.085806,0.063785,0.0,0.0
1,16795,"[142001740, 316666343, 446605239, 385504209, 6...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.157923,0.0,0.0,0.0,0.117394,0.179511,0.0
2,22625,"[439064864, 059035342, 067976402, 316666343, 3...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.145258,0.0,0.136764,0.108526,0.088245,0.0,0.0,0.080674,0.123362,0.0
3,35859,"[312195516, 059035342, 61009059, 345342968, 04...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.176941,0.0,0.166594,0.132197,0.107493,0.0,0.0,0.0,0.0,0.0
4,95359,"[345361792, 449212602, 316601950, 043935806, 3...",0.0,0.0,0.0,0.0,0.19145,0.0,0.0,0.0,...,0.0,0.0,0.139314,0.110549,0.08989,0.0,0.0,0.082178,0.125662,0.0
5,104636,"[439139600, 446610038, 446672211, 059035342, 0...",0.0,0.222287,0.0,0.0,0.0,0.222287,0.0,0.0,...,0.0,0.0,0.0,0.128355,0.104369,0.133607,0.128355,0.095414,0.0,0.0
6,110912,"[345361792, 439064864, 385484518, 068484477, 6...",0.0,0.0,0.0,0.0,0.182436,0.0,0.0,0.0,...,0.0,0.0,0.132754,0.0,0.085658,0.109654,0.0,0.078309,0.119745,0.0
7,204864,"[312195516, 449212602, 067976402, 316769487, 1...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.137455,0.106898,0.0,0.102696,0.0,0.106898,0.0,0.07634,0.116735,0.0
8,271448,"[439139600, 446672211, 452282829, 316666343, 0...",0.0,0.0,0.0,0.207055,0.0,0.0,0.0,0.0,...,0.0,0.124452,0.150669,0.11956,0.0,0.0,0.11956,0.088876,0.135904,0.0


# similarity matrix

In [62]:
User_Calc = tfidf_profile_df.iloc[:, 2:]
book_calc = boolean_book_data.iloc[:, 1:]
euclidean_distance_matrix = np.array([np.linalg.norm(user_row.values.astype(float) - book_calc.values.astype(float), axis=1) for _, user_row in User_Calc.iterrows()])

# distance to similarity 
euclidean_sim_matrix = 1 / (1 + euclidean_distance_matrix)


euclidean_similarity2_df = pd.DataFrame(euclidean_sim_matrix, index=profile_df['userid'], columns=book_data['isbn'])
euclidean_similarity2_df


isbn,440234743,971880107,345417623,446310786,671027360,60976845,044021145,60938455,446672211,142000205,...,375727345,312924585,684872153,316601950,439139597,439064864,043935806,440220602,671001795,440222656
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11676,0.312047,0.309017,0.331186,0.309017,0.312758,0.317371,0.309017,0.331056,0.332842,0.312758,...,0.312915,0.309017,0.312047,0.317267,0.328452,0.327426,0.317371,0.31406,0.333625,0.312915
16795,0.309017,0.309017,0.309017,0.309017,0.320184,0.309017,0.309017,0.314175,0.325701,0.318527,...,0.309017,0.309017,0.309017,0.309017,0.309017,0.309017,0.317028,0.309017,0.320184,0.309017
22625,0.312867,0.309017,0.313975,0.309017,0.309017,0.309017,0.309017,0.31253,0.313775,0.309017,...,0.309017,0.316542,0.312867,0.339612,0.319718,0.33298,0.336017,0.315444,0.309017,0.309017
35859,0.32014,0.309017,0.309017,0.309017,0.309017,0.309017,0.350634,0.309017,0.314847,0.309017,...,0.309017,0.309017,0.313729,0.313729,0.315094,0.32014,0.315094,0.323554,0.318269,0.309017
95359,0.31294,0.309017,0.31407,0.348108,0.309017,0.309017,0.309017,0.312597,0.313866,0.309017,...,0.309017,0.309017,0.31294,0.340289,0.31993,0.324363,0.336602,0.309017,0.309017,0.309017
104636,0.327092,0.309017,0.314911,0.309017,0.314672,0.309017,0.349131,0.313187,0.347343,0.314672,...,0.314911,0.309017,0.313588,0.313588,0.33983,0.338107,0.321816,0.321816,0.309017,0.315474
110912,0.312752,0.309017,0.318904,0.309017,0.309017,0.309017,0.309017,0.312425,0.309017,0.315249,...,0.313826,0.309017,0.312752,0.312752,0.333509,0.332182,0.325303,0.314282,0.309017,0.309017
204864,0.309017,0.309017,0.313702,0.309017,0.309017,0.313702,0.309017,0.312338,0.313513,0.309017,...,0.313702,0.323825,0.32319,0.309017,0.313702,0.309017,0.314145,0.32058,0.313702,0.313702
271448,0.314495,0.309017,0.309017,0.309017,0.314273,0.32962,0.314495,0.319165,0.344163,0.314273,...,0.309017,0.309017,0.309017,0.309017,0.330412,0.32088,0.339252,0.32088,0.314495,0.314495


# top 10 not-yet-read books

In [63]:
recommendations = {}
for user_id in user_data['userid'].unique():
    # get sim scores for user and filter read books
    user_similarities = euclidean_similarity2_df.loc[user_id]
    read_books = set(merged[merged['userid'] == user_id]['isbn'])
    not_read_similarities = user_similarities.loc[~user_similarities.index.isin(read_books)].nlargest(10)
    
    # store recommendations
    recommendations[user_id] = pd.DataFrame({
        'User ID': user_id,
        'Book\'s ISBN': not_read_similarities.index,
        'Book\'s Title': book_data.set_index('isbn')['booktitle'].loc[not_read_similarities.index].values,
        'Similarity result': not_read_similarities.values
    })

# combine reccomendations
all_recommendations = pd.concat(recommendations.values(), ignore_index=True)


all_recommendations['Rank'] = all_recommendations.groupby('User ID')['Similarity result'].rank(method='first', ascending=False).astype(int)

all_recommendations

Unnamed: 0,User ID,Book's ISBN,Book's Title,Similarity result,Rank
0,11676,439064872,Harry Potter and the Chamber of Secrets (Book 2),0.328452,1
1,11676,439139597,Harry Potter and the Goblet of Fire (Book 4),0.328452,2
2,11676,316284955,White Oleander : A Novel (Oprah s Book Club),0.324728,3
3,11676,316666343,The Lovely Bones,0.323854,4
4,11676,590353403,Harry Potter and the Sorcerer s Stone (Book 1),0.321480,5
...,...,...,...,...,...
85,271448,440241073,The Summons,0.325308,6
86,271448,068484477,STONES FROM THE RIVER,0.324681,7
87,271448,312278586,The Nanny Diaries,0.321601,8
88,271448,316096199,Lucky : A Memoir,0.321601,9


# Evaluation

In [67]:
a = all_recommendations["Book's ISBN"].isin(test_user_answers['isbn']).sum()

p = a / 90
R = a / test_user_answers.shape[0]
F = 2 * (p * R) / (p + R) if p + R > 0 else 0

evaluation_results = pd.DataFrame({'Metric': ['Precision', 'Recall', 'F-measure'], 'Value': [p, R, F]})

evaluation_results


Unnamed: 0,Metric,Value
0,Precision,0.788889
1,Recall,0.440994
2,F-measure,0.565737
