# sigmoid Kernel

In [7]:
import pandas as pd
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

#Get the data
columns=['Id','Book_id','Best_Book_Id','Original_Title','Description']
# reading file
book_description = pd.read_excel('/home/hina/Downloads/Dataset/book_data.xlsx',sep='',usecols=columns)
# checking if we have the right data
book_description.head()



Unnamed: 0,Id,Book_id,Best_Book_Id,Original_Title,Description
0,1,13624209,210834,Kim,Kim is set in an imperialistic world; a world ...
1,2,846427,846429,Classical Mythology,"Featuring the authors' extensive, clear, and f..."
2,3,969335,241664,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
3,4,3767482,1508654,Decision in Normandy,"Here, for the first time in paperback, is an o..."
4,5,893390,763331,Flu: The Story of the Great Influenza Pandemic,"The fascinating, true story of the world's dea..."


In [8]:
# removing the stop words
books_tfidf = TfidfVectorizer(stop_words='english')
# filling the missing values with empty string
book_description['Description'] = book_description['Description'].fillna('')
# computing TF-IDF matrix required for calculating cosine similarity
book_description_matrix = books_tfidf.fit_transform(book_description['Description'])
book_description_matrix.shape

(5001, 44582)

In [9]:
# computing cosine similarity matrix using linear_kernal of sklearn
cosine_similarity = sigmoid_kernel(book_description_matrix, book_description_matrix)
# In[6]:
indices = pd.Series(book_description['Original_Title'].index)
# In[7]:
# Function to get the most similar books
def recommend(index, cosine_sim=cosine_similarity):
    id = indices[index]
    # Get the pairwsie similarity scores of all books compared to that book, 
    # sorting them and getting top 5
    similarity_scores = list(enumerate(cosine_sim[id]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:6]
    for i in similarity_scores:
        print(i)
    # Get the books index
    books_index = [i[0] for i in similarity_scores]

    # Return the top 5 most similar books using integer-location based indexing (iloc)
    return book_description['Original_Title'].iloc[books_index]
# In[8]:
# getting recommendation for book at index 2
recommend(121)

(2161, 0.7615955032164635)
(1762, 0.7615951119181229)
(4663, 0.7615951037539098)
(4013, 0.7615950929815997)
(1515, 0.7615949745749758)


2161                                           Sea Glass
1762    Cerulean Sins (Anita Blake, Vampire Hunter, #11)
4663                                 The Laughing Corpse
4013                                              Malice
1515                                         Alias Grace
Name: Original_Title, dtype: object

# Linear Kernel 

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

columns=['Id','Book_id','Best_Book_Id','Original_Title','Description']

ds = pd.read_excel("/home/hina/Downloads/Dataset/book_data.xlsx",sep='',usecols=columns) #you can plug in your own list of products or movies or books here as csv file
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
######ngram (1,3) can be explained as follows#####
#ngram(1,3) encompasses uni gram, bi gram and tri gram
#consider the sentence "The ball fell"
#ngram (1,3) would be the, ball, fell, the ball, ball fell, the ball fell

tfidf_matrix = tf.fit_transform(ds["Description"].values.astype('U'))
cosine_similarities = cosine_similarity(tfidf_matrix,tfidf_matrix)

results = {} # dictionary created to store the result in a dictionary format (ID : (Score,item_id))

for idx, row in ds.iterrows(): #iterates through all the rows
    # the below code 'similar_indice' stores similar ids based on cosine similarity. sorts them in ascending order. [:-5:-1] is then used so that the indices with most similarity are got. 0 means no similarity and 1 means perfect similarity
    similar_indices = cosine_similarities[idx].argsort()[:-5:-1] #stores 5 most similar books, you can change it as per your needs
    similar_items = [(cosine_similarities[idx][i], ds['Id'][i]) for i in similar_indices]
    results[row['Id']] = similar_items[1:]
    
#below code 'function item(id)' returns a row matching the id along with Book Title. Initially it is a dataframe, then we convert it to a list
def item(id):
    return ds.loc[ds['Id'] == id]['Original_Title'].tolist()[0]
def recommend(id, num):
    if (num == 0):
        print("Unable to recommend any book as you have not chosen the number of book to be recommended")
    elif (num==1):
        print("Recommending " + str(num) + " book similar to " + item(id))
        
    else :
        print("Recommending " + str(num) + " books similar to " + item(id))
        
    print("----------------------------------------------------------")
    recs = results[id][:num]
    for rec in recs:
        print("You may also like to read: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

#the first argument in the below function to be passed is the id of the book, second argument is the number of books you want to be recommended
recommend(5,10)

Recommending 10 books similar to Flu: The Story of the Great Influenza Pandemic
----------------------------------------------------------
You may also like to read: Darwin's Radio (score:0.05116896834391612)
You may also like to read: The Tipping Point: How Little Things Can Make a Big Difference (score:0.0320391497044297)
You may also like to read: The Amber Spyglass (score:0.024156569159928126)
