# Cosine Similarity Content Based Recommender System 


##### Import Libraries and Dataset 

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
books = pd.read_csv (r'C:\Users\user\OneDrive\Desktop\Artificial Intelligence\clean dataset\books.csv',usecols = ['book_id','authors','original_publication_year','title','average_rating','image_url'])
ratings = pd.read_csv (r'C:\Users\user\OneDrive\Desktop\Artificial Intelligence\clean dataset\ratings.csv')
tags = pd.read_csv (r'C:\Users\user\OneDrive\Desktop\Artificial Intelligence\dataset folder\tags.csv')
bookTags = pd.read_csv (r'C:\Users\user\OneDrive\Desktop\Artificial Intelligence\dataset folder\book_tags.csv')

### Recommend books using the book authors


In [3]:
#Convert a collection of raw documents to a matrix of TF-IDF features
#analyzer = Whether the feature should be made of word or character n-grams
#ngram_range = The lower and upper boundary of the range of n-values for different n-grams to be extracted.
#min_df = When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold
#stop_words = If a string, it is passed to _check_stop_list and the appropriate stop list is returned. ‘english’ is currently the only supported string value
#more info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

#Learn vocabulary and idf, return document-term matrix. This is equivalent to fit followed by transform with more efficiently implemented.
author_matrix = tf.fit_transform(books['authors'])

#Compute the linear kernel between author matrix and itself
author_cosine_sim = linear_kernel(author_matrix, author_matrix)

In [4]:
author_cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [5]:
#Building 1-dimensional array with book titles
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Function that get book recommendations based on the cosine similarity score of book authors
def authors_recommendations(title):
    #index will equals to position at the title pass in 
    idx = indices[title]
    
    #list = convert it to a list 
    #enumerate = to count item in a loop
    sim_scores = list(enumerate(author_cosine_sim[idx]))
    
    #sorting the list of similarity scores in reversed form
    #key = adding every list's 2nd element (i.e x[1]) to the sort function
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #recommend only first 6 books 
    sim_scores = sim_scores[1:7]
    
    #getting the book index for 6 books 
    book_indices = [i[0] for i in sim_scores]
    
    #return the book title with the book index in data frame format 
    return titles.iloc[book_indices].to_frame()

In [6]:
authors_recommendations('The Hobbit')

Unnamed: 0,title
18,The Fellowship of the Ring (The Lord of the Ri...
152,"The Two Towers (The Lord of the Rings, #2)"
158,"The Return of the King (The Lord of the Rings,..."
184,"The Lord of the Rings (The Lord of the Rings, ..."
941,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4721,Unfinished Tales of Númenor and Middle-Earth


In [7]:
authors_recommendations('Romeo and Juliet').head(6)

Unnamed: 0,title
819,The Merchant of Venice
151,Macbeth
344,Othello
701,King Lear
758,The Taming of the Shrew
787,The Tempest


In [8]:
authors_recommendations("Twilight (Twilight, #1)").head(6)

Unnamed: 0,title
48,"New Moon (Twilight, #2)"
51,"Eclipse (Twilight, #3)"
55,"Breaking Dawn (Twilight, #4)"
72,"The Host (The Host, #1)"
718,The Short Second Life of Bree Tanner: An Eclip...
1575,The Twilight Saga Complete Collection (Twilig...


In [9]:
authors_recommendations("Harry Potter and the Sorcerer's Stone (Harry Potter, #1)")

Unnamed: 0,title
20,Harry Potter and the Order of the Phoenix (Har...
22,Harry Potter and the Chamber of Secrets (Harry...
23,Harry Potter and the Goblet of Fire (Harry Pot...
24,Harry Potter and the Deathly Hallows (Harry Po...
26,Harry Potter and the Half-Blood Prince (Harry ...
2029,"The Harry Potter Collection 1-4 (Harry Potter,..."


### Recommend books using the tags provided to the books.

In [10]:
#joining tag(tag_id, tag_name) dataset with bookTags(book_id, tag_id, count) dataset
tags_join_DF = pd.merge(bookTags, tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join_DF.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [11]:
#joining books with the tagname details 
books_with_tags = pd.merge(books, tags_join_DF, left_on='book_id', right_on='goodreads_book_id', how='inner')

In [12]:
books_with_tags.head()

Unnamed: 0,book_id,authors,original_publication_year,title,average_rating,image_url,goodreads_book_id,tag_id,count,tag_name
0,2767052,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...,2767052,30574,11314,to-read
1,2767052,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...,2767052,11305,10836,fantasy
2,2767052,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...,2767052,11557,50755,favorites
3,2767052,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...,2767052,8717,35418,currently-reading
4,2767052,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...,2767052,33114,25968,young-adult


In [13]:
tagname_matrix = tf.fit_transform(books_with_tags['tag_name'].head(10000))
booktag_cosine_sim = linear_kernel(tagname_matrix, tagname_matrix)

In [14]:
booktag_cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [15]:
pd.DataFrame(booktag_cosine_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.168681,0.0,0.000000,0.000000,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.000000,0.545242,0.000000,0.0,1.000000,0.339996,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.339996,0.000000,0.000000,0.0,0.339996,1.000000,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,1.0,0.0


In [16]:
# Function that get book recommendations based on the cosine similarity score of books tags
def tags_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(booktag_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:7]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices].to_frame()

In [17]:
tags_recommendations('The Hobbit')

Unnamed: 0,title
16,"Catching Fire (The Hunger Games, #2)"
31,Of Mice and Men
107,Les Misérables
125,Steve Jobs
149,Dear John
206,Atonement


In [18]:
tags_recommendations('Romeo and Juliet')

Unnamed: 0,title
142,Deception Point
227,Love in the Time of Cholera
1128,The One Minute Manager
1626,Galápagos
1741,"Edge of Eternity (The Century Trilogy, #3)"
1928,A Storm of Swords: Steel and Snow (A Song of I...


In [19]:
tags_recommendations('Twilight (Twilight, #1)')

Unnamed: 0,title
33,"Fifty Shades of Grey (Fifty Shades, #1)"
102,The Count of Monte Cristo
156,The Battle of the Labyrinth (Percy Jackson and...
202,The Immortal Life of Henrietta Lacks
234,"Inferno (Robert Langdon, #4)"
301,Can You Keep a Secret?


### Recommendation of books using the authors and tags attributes

Recommendation of books using the authors and tags attributes for better results. Creating corpus of features and calculating the TF-IDF on the corpus of attributes for gettings better recommendations.

In [20]:
#creating a temporary dataset that joing the tag_name of a book together
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...
2,3,to-read fantasy favorites currently-reading yo...
3,5,to-read fantasy favorites currently-reading yo...
4,6,to-read fantasy young-adult fiction harry-pott...


In [21]:
#create a new column named tag_name in book dataset
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')

In [22]:
books.head()

Unnamed: 0,book_id,authors,original_publication_year,title,average_rating,image_url,tag_name
0,2767052,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...,to-read fantasy favorites currently-reading yo...
1,3,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,https://images.gr-assets.com/books/1474154022m...,to-read fantasy favorites currently-reading yo...
2,41865,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,https://images.gr-assets.com/books/1361039443m...,to-read fantasy favorites currently-reading yo...
3,2657,Harper Lee,1960.0,To Kill a Mockingbird,4.25,https://images.gr-assets.com/books/1361975680m...,to-read favorites currently-reading young-adul...
4,4671,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,https://images.gr-assets.com/books/1490528560m...,to-read favorites currently-reading young-adul...


In [23]:
#column corpus which is authors and tag_name
books['corpus'] = (pd.Series(books[['authors', 'tag_name']]
                .fillna('')
                .values.tolist()
                ).str.join(' '))

In [24]:
tfidf_matrix_corpus = tf.fit_transform(books['corpus'])
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)

# Build a 1-dimensional array with book titles
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Function that get book recommendations based on the cosine similarity score of books tags
def corpus_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_corpus[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:7]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices].to_frame()

In [25]:
corpus_recommendations("The Hobbit")

Unnamed: 0,title
184,"The Lord of the Rings (The Lord of the Rings, ..."
152,"The Two Towers (The Lord of the Rings, #2)"
158,"The Return of the King (The Lord of the Rings,..."
18,The Fellowship of the Ring (The Lord of the Ri...
599,The Silmarillion (Middle-Earth Universe)
4721,Unfinished Tales of Númenor and Middle-Earth


In [26]:
corpus_recommendations("Twilight (Twilight, #1)")

Unnamed: 0,title
51,"Eclipse (Twilight, #3)"
48,"New Moon (Twilight, #2)"
969,"The Twilight Saga (Twilight, #1-4)"
718,The Short Second Life of Bree Tanner: An Eclip...
1575,The Twilight Saga Complete Collection (Twilig...
3901,The Twilight Saga: The Official Illustrated Gu...


In [27]:
corpus_recommendations("Romeo and Juliet")

Unnamed: 0,title
344,Othello
755,Julius Caesar
122,Hamlet
151,Macbeth
242,A Midsummer Night's Dream
819,The Merchant of Venice


## Cosine Similarity Content Based Rec System Program

In [28]:
#converting it into a function or class
#items to pre generate 

#data to generate and export
#1. books_with_tags 
#2. books_with_corpus

#declaration of tfidVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

#different tfidf matrix use to generate the cosine score 
tfidf_authormatrix = tf.fit_transform(books['authors'])
author_cosinesim = linear_kernel(tfidf_authormatrix, tfidf_authormatrix)

tfidf_booktagmatrix = tf.fit_transform(books_with_tags['tag_name'].head(10000))
booktag_cosinesim = linear_kernel(tfidf_booktagmatrix, tfidf_booktagmatrix)

tfidf_corpusmatrix = tf.fit_transform(books['corpus'])
corpus_cosinesim = linear_kernel(tfidf_corpusmatrix, tfidf_corpusmatrix)

print("Starting Rec System..")

#A function that returns the 20 most similar books based on the cosine similarity score.
# Build a 1-dimensional array with book titles
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Function that get book recommendations based on the cosine similarity score of book authors
def cosine_sim_recommendation(rec_type,title):
    #rec_type either (author,booktag,corpus)
    #title = book title
    
    #the cosine_sim value based on type 
    if (rec_type == "author"):
        cosine_sim = author_cosinesim
    if (rec_type == "booktag"):
        cosine_sim = booktag_cosinesim
    if (rec_type == "corpus"):
        cosine_sim = corpus_cosinesim
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:7]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices].to_frame()

Starting Rec System..


In [29]:
cosine_sim_recommendation(rec_type = "booktag",title="The Hobbit" )

Unnamed: 0,title
16,"Catching Fire (The Hunger Games, #2)"
31,Of Mice and Men
107,Les Misérables
125,Steve Jobs
149,Dear John
206,Atonement


In [30]:
cosine_sim_recommendation(rec_type = "author",title="The Hobbit" )

Unnamed: 0,title
18,The Fellowship of the Ring (The Lord of the Ri...
152,"The Two Towers (The Lord of the Rings, #2)"
158,"The Return of the King (The Lord of the Rings,..."
184,"The Lord of the Rings (The Lord of the Rings, ..."
941,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4721,Unfinished Tales of Númenor and Middle-Earth


In [31]:
cosine_sim_recommendation(rec_type = "author",title="The Hobbit" )

Unnamed: 0,title
18,The Fellowship of the Ring (The Lord of the Ri...
152,"The Two Towers (The Lord of the Rings, #2)"
158,"The Return of the King (The Lord of the Rings,..."
184,"The Lord of the Rings (The Lord of the Rings, ..."
941,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4721,Unfinished Tales of Númenor and Middle-Earth


In [32]:
cosine_sim_recommendation(rec_type = "author",title="The Hobbit" )

Unnamed: 0,title
18,The Fellowship of the Ring (The Lord of the Ri...
152,"The Two Towers (The Lord of the Rings, #2)"
158,"The Return of the King (The Lord of the Rings,..."
184,"The Lord of the Rings (The Lord of the Rings, ..."
941,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4721,Unfinished Tales of Númenor and Middle-Earth


In [33]:
cosine_sim_recommendation(rec_type = "author",title="The Hobbit" )

Unnamed: 0,title
18,The Fellowship of the Ring (The Lord of the Ri...
152,"The Two Towers (The Lord of the Rings, #2)"
158,"The Return of the King (The Lord of the Rings,..."
184,"The Lord of the Rings (The Lord of the Rings, ..."
941,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4721,Unfinished Tales of Númenor and Middle-Earth


In [34]:
cosine_sim_recommendation(rec_type = "author",title="The Hobbit" )

Unnamed: 0,title
18,The Fellowship of the Ring (The Lord of the Ri...
152,"The Two Towers (The Lord of the Rings, #2)"
158,"The Return of the King (The Lord of the Rings,..."
184,"The Lord of the Rings (The Lord of the Rings, ..."
941,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4721,Unfinished Tales of Númenor and Middle-Earth


In [35]:
cosine_sim_recommendation(rec_type = "author",title="The Hobbit" )

Unnamed: 0,title
18,The Fellowship of the Ring (The Lord of the Ri...
152,"The Two Towers (The Lord of the Rings, #2)"
158,"The Return of the King (The Lord of the Rings,..."
184,"The Lord of the Rings (The Lord of the Rings, ..."
941,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4721,Unfinished Tales of Númenor and Middle-Earth


In [36]:
cosine_sim_recommendation(rec_type = "corpus",title="The Hobbit" )

Unnamed: 0,title
184,"The Lord of the Rings (The Lord of the Rings, ..."
152,"The Two Towers (The Lord of the Rings, #2)"
158,"The Return of the King (The Lord of the Rings,..."
18,The Fellowship of the Ring (The Lord of the Ri...
599,The Silmarillion (Middle-Earth Universe)
4721,Unfinished Tales of Númenor and Middle-Earth


In [37]:
cosine_sim_recommendation(rec_type = "author",title="The First World War")

Unnamed: 0,title
3286,The Opposite of Loneliness: Essays and Stories
85,A Time to Kill
120,"The Firm (Penguin Readers, Level 5)"
221,The Client
274,The Pelican Brief
335,The Runaway Jury


In [38]:
cosine_sim_recommendation(rec_type = "author",title="The Hunger Games (The Hunger Games, #1)")

Unnamed: 0,title
16,"Catching Fire (The Hunger Games, #2)"
19,"Mockingjay (The Hunger Games, #3)"
496,The Hunger Games Trilogy Boxset (The Hunger Ga...
1489,"Gregor the Overlander (Underland Chronicles, #1)"
2815,Gregor and the Code of Claw (Underland Chronic...
3050,Gregor and the Curse of the Warmbloods (Underl...


In [40]:
cosine_sim_recommendation(rec_type = "author",title="Romeo and Juliet")

Unnamed: 0,title
819,The Merchant of Venice
151,Macbeth
344,Othello
701,King Lear
758,The Taming of the Shrew
787,The Tempest
