## Content_based RS and its evaluation

In [1]:
from google.colab import drive
drive.mount('/content/drive')
#change directory to current location
%cd /content/drive/My Drive/DMC

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/DMC


In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
!pip install rank-bm25
from rank_bm25 import BM25Okapi
import math
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import random
import matplotlib.pyplot as plt
import scipy.sparse as sparse
import pickle
%matplotlib inline



In [3]:
# read in dataset
with open('publisher_author_cluster_added_new.csv', 'rb') as file:
    df = pickle.load(file)

In [4]:
eval_data = pd.read_csv('evaluation.csv')
eval_data['predictions'] = [list() for x in range(len(eval_data.index))] 
display(eval_data.head(1))

Unnamed: 0,itemID,predictions
0,12,[]


### Model Building

In [5]:
def searchquery(df, itemID, num_recommendation = 5):
    #Find the query item
    query = df.loc[df['itemID'] == itemID]
    return query

In [6]:
#function required for title simililarity calculation
def calculate_bm25(query, corpus):
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    return doc_scores

In [7]:
#The Linsimilarityfunction gets two strings (two main topics), Based on Lin similarity formula, then a floating number will
#return as a measure for similarity of two books.
def linsimilarityfunction (maintopic1 , maintopic2):
    #if two topics are identical the Lin similarity measure should be 1:
    if (maintopic1 == maintopic2):
        return 1
    #if they are not identical, we should do further analysis.
    #we compare the first elements of both lists. If they differ from each other, the Lin similarity will be zero.
    elif (maintopic1[0] != maintopic2[0]):
        return 0
    #otherwise we need further steps!
    else:
        #here we simply find the frequency of each topic in our dataset as it is required by Lin similarity
        totalitems = df['main topic'].count()
        minlength=min(len(maintopic1),len(maintopic2))
        df1=df[df['main topic']==maintopic1]
        df2=df[df['main topic']==maintopic2]
        count1=df1['main topic'].count()
        count2=df2['main topic'].count()
        #Now, we can focus on finding the closest mutual ancestor of both topics. By close, we refer to number of
        #node traverse in the tree of all topics.
        i=minlength
        while (i>0):
            if (maintopic1[:i]==maintopic2[:i]):
                dfmutual=df[df['main topic'].str[:i]==maintopic1[:i]]
                countmutual=dfmutual['main topic'].count()
                sim = 2*math.log10(countmutual/totalitems)/(math.log10(count1/totalitems)+math.log10(count2/totalitems))
                return sim
            else:
                i=i-1
                
# example
print(linsimilarityfunction ('YB', 'YFB'))

0.10116165919446315


In [8]:
#preprocessing for interest age
#if interest_age = 'None', we assigned the item value 20 (as one of the member in Group others)
df['interest_age'] = [20 if i == 'None' else i for i in df['interest_age']]

#change data type from string to float
df['interest_age'] = df['interest_age'].astype(float)

In [9]:
#we simply define the cosine similarity marices for controids of publisher clusters, according to what we calculated before.
#we will use these matrices, later in this notebook.
cosine_sim_pub_en= [[1.0, 0.8330066502646305,0.6297570399979626,0.4861936761165772,0.7453411410708859],
                    [0.8330066502646305, 1.0, 0.7554188417173869, 0.7118815933864038, 0.9210669994486579],
                    [0.6297570399979626, 0.7554188417173869, 1.0, 0.6235440834287432, 0.6375244958925325],
                    [0.4861936761165772, 0.7118815933864038, 0.6235440834287432, 1.0, 0.6478558391290395],
                    [0.7453411410708859, 0.9210669994486579, 0.6375244958925325, 0.6478558391290395, 1.0]]
cosine_sim_pub_de= [[1.0, 0.6980035928496665, 0.6652253308854318, 0.7618509407590829, 0.24346291180752616],
                    [0.6980035928496665, 1.0, 0.5094143512344115, 0.6684881408627634, 0.34110548671081775],
                    [0.6652253308854318, 0.5094143512344115, 1.0, 0.5009705224224626, 0.1478384568079113],
                    [0.7618509407590829, 0.6684881408627634, 0.5009705224224626, 1.0, 0.21427313660101593],
                    [0.24346291180752616, 0.34110548671081775, 0.1478384568079113, 0.21427313660101593, 1.0]]
cosine_sim_pub_es= [[1.0000000000000002, 0.160890554358734, 0.6464809047245317, 0.2896861543901769, 0.21631925136172767],
                    [0.160890554358734, 1.0, 0.15004107600273364, 0.15892212224517466, 0.08412392024908043], 
                    [0.6464809047245317, 0.15004107600273364, 1.0, 0.260231004956859, 0.35929479439712453], 
                    [0.2896861543901769, 0.15892212224517466, 0.260231004956859, 1.0000000000000002, 0.10176734676342591],
                    [0.21631925136172767, 0.08412392024908043, 0.35929479439712453, 0.10176734676342591, 1.0000000000000002]]
cosine_sim_pub_it= [[0.9999999999999999, 0.2561779725373971, 0.25311718656092913, 0.22072994629183765, 0.21016282367168967],
                    [0.2561779725373971, 1.0000000000000002, 0.2929791491742338, 0.060380627840515635, 0.10947151505277775],
                    [0.25311718656092913, 0.2929791491742338, 1.0000000000000002, 0.0947600541304488, 0.06665679929131706],
                    [0.22072994629183765, 0.060380627840515635, 0.0947600541304488, 1.0, 0.08102817035798306],
                    [0.21016282367168967, 0.10947151505277775, 0.06665679929131706, 0.08102817035798306, 1.0]]
cosine_sim_pub_fr= [[1.0, 0.0648745051530169, 0.10092346226021406, 0.12995150974842662,  0.11242176980768166],
                    [0.0648745051530169, 0.9999999999999998, 0.11951700602218226, 0.1598015065807482, 0.11855046701545281],
                    [0.10092346226021406, 0.11951700602218226, 1.0000000000000002, 0.4525789852904426, 0.45974607409831275],
                    [0.12995150974842662, 0.1598015065807482, 0.4525789852904426, 1.0000000000000002, 0.6061514610077626],
                    [0.11242176980768166, 0.11855046701545281, 0.45974607409831275, 0.6061514610077626, 0.9999999999999998]]

In [10]:
#change data type from string to integer
df['publisher_cluster'] = df['publisher_cluster'].astype(int)

In [11]:
#function for publisher cluster similarity calculation. 

def calculate_pub_cluster_similarity (publisher_clusster_i, publisher_clusster_j, language):
    if (language == 'en'):
        return cosine_sim_pub_en [publisher_clusster_i][publisher_clusster_j]
    elif (language == 'de'):
        return cosine_sim_pub_de [publisher_clusster_i][publisher_clusster_j]
    elif (language == 'es'):
        return cosine_sim_pub_es [publisher_clusster_i][publisher_clusster_j]
    elif (language == 'it'):
        return cosine_sim_pub_it [publisher_clusster_i][publisher_clusster_j]
    elif (language == 'fr'):
        return cosine_sim_pub_fr [publisher_clusster_i][publisher_clusster_j]
    else:
        return 1

In [12]:
#required preprocessing for author cluster feature. 
#Also we import the cosine similarity matrix that we have computed for this feature.

df1 = df[df['author'].notnull()]
df2 = df[df['author'].isna()]

#if author = NaN we assigned the item value 20 as a number for author cluster attribute. it represent unknown! :D
df2['author_cluster'] = 20

frames = [df1, df2]
df = pd.concat(frames)

#change data type from string to integer
df['author_cluster'] = df['author_cluster'].astype(int)


#we calculated cosine similarity of author cluster centroids, and we just import the matrix save in a nparray
from numpy import load
# load array
cosine_sim_author = load('cosine_sim_author.npy', allow_pickle=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
#function for author cluster similarity calculation. 
def calculate_author_cluster_similarity (author_cluster_i, author_cluster_j):
    if ((author_cluster_i == 20) or (author_cluster_j == 20)):
        return 0.5
    else:
        return cosine_sim_author [author_cluster_i][author_cluster_j]

In [14]:
#required preprocessing for number of pages feature. 

df1 = df[df['number_pages'].notnull()]
df2 = df[df['number_pages'].isna()]
#if number_pages = NaN we assigned the item value 0 as a number for number_pages attribute. it represent unknown! :D
df2['number_pages'] = 0
frames = [df1, df2]
df = pd.concat(frames)
#change data type from string to integer
df['author_cluster'] = df['author_cluster'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
def calculate_num_pages_similarity (num_pages_i, num_pages_j):
    if (num_pages_i <= 24):
        num_pages_i = 24
    elif (num_pages_i >= 640):
        num_pages_i = 640
    if (num_pages_j <= 24):
        num_pages_j = 24
    elif (num_pages_j >= 640):
        num_pages_j = 640
    return  1-(abs(num_pages_i-num_pages_j)/(640-24))

In [16]:
def recommend(df, itemID, num_recommendation = 5):
    
    #Find the item index
    item_index = int(df[df['itemID'] == itemID].index.values)

    #metadata for the itemID
    item_metadata = df[df['itemID'] == itemID]
    item_metadata = item_metadata.squeeze(axis=0)
    
    #0. Language: Keep data with same langauge. + remove itemID (this is the query item) from df dataframe
    df = df.loc[df['itemID'] != itemID]
    df = df.loc[df['language'] == item_metadata.language]
    
    #1. Title
    item_title = item_metadata.title #this is the query sentence
    if pd.isnull(item_title)== False:
        df['title_bm25_scores'] = calculate_bm25(item_title, df.title)
    else: 
        df['title_bm25_scores'] = 0

    #2. Description
    item_description = item_metadata.processed_description #this is the query sentence
    if pd.isnull(item_description)== False:
        df['description_bm25_scores'] = calculate_bm25(item_description, df.processed_description)
    else: 
        df['description_bm25_scores'] = 0
    
    #3. Title-Author combination best selections
    item_author = item_metadata.author
    df1 = df.loc[df['author'] == item_author]
    df2 = df.loc[df['author'] != item_author]
    df1['author_scores'] = 1
    df2['author_scores'] = 0
       
 
    #4.combining all we have for final evaluation
    df2 = df2.sort_values(by=['description_bm25_scores'], ascending=False)
    if len(df2.index) >= 1000:
        df2= df2[:1000]
    frames = [df1, df2]
    df = pd.concat(frames)

 
    #5. Calculate the closeness of age between query item and candidate items. At the end, create an new column called "age_closeness"
    #age distance
    df['age_distance'] = df['interest_age'] - item_metadata['interest_age'] #candidate's age minus query item's age
    #age Normalization: use Max-min Normalization
    df['age_distance'] = (df['age_distance'] - df['age_distance'].min()) / (df['age_distance'].max() - df['age_distance'].min())
    #How close the age distance is? (Closeness = 1 - age_distance)
    df['age_closeness'] = 1 - abs(df['age_distance']) #the larger value the closer age is
    
    
    #6. Publisher clusters. since we have different clusters for different languages: first, we capture the language of
    #both books. Then, we should look for each specific language. so, if both books are written in English, German, Spanish,
    #Italian, or French, then, we simply use the cosine similarity matrix that we extracted during the clustering.
    #Otherwise, we set the measure to 1, which means that for all infrequent languages, we basically use the same score
    #equal to 1 for all books.
    item_publisher_clusster = item_metadata['publisher_cluster']
    item_language = item_metadata['language']
    df['publisher_cluster_scores'] = [calculate_pub_cluster_similarity (item_publisher_clusster, publisher_cluster, item_language) for publisher_cluster in df['publisher_cluster']]
    
    
    #7. author clusters
    item_author_clusster = item_metadata['author_cluster']
    df['author_cluster_scores'] = [calculate_author_cluster_similarity (item_author_clusster, author_cluster) for author_cluster in df['author_cluster']]

    
    #8. number of pages
    item_num_pages = item_metadata['number_pages']
    if (item_num_pages == 0):
        df['num_pages_scores'] = 0
    else:
        df1 = df[df['number_pages']!= 0]
        df2 = df[df['number_pages']== 0]
        df1['num_pages_scores'] = [calculate_num_pages_similarity (item_num_pages, number_pages) for number_pages in df1['number_pages']]
        df2['num_pages_scores'] = 0
        frames = [df1, df2]
        df = pd.concat(frames)
    
    #9. Main topic: use Linsimilarity
    item_maintopic = item_metadata['main topic']
    if pd.isnull(item_maintopic)==False:
        df1 = df[df['main topic'].notna()]
        df2 = df[df['main topic'].isna()]
        df1['maintopic_scores'] = [linsimilarityfunction (item_maintopic, topic) for topic in df1['main topic']]
        df2['maintopic_scores'] = 0
        #combining all we have for final evaluation
        frames = [df1, df2]
        df = pd.concat(frames)
    else:
        df['maintopic_scores'] = 0
    
    #normalize all scores.
    df['maintopic_scores'] = (df['maintopic_scores'] - df['maintopic_scores'].min()) / (df['maintopic_scores'].max() - df['maintopic_scores'].min())
    Total = df['title_bm25_scores'].sum()
    if (Total > 0):
        df['title_bm25_scores'] = (df['title_bm25_scores'] - df['title_bm25_scores'].min()) / (df['title_bm25_scores'].max() - df['title_bm25_scores'].min())   
  
    #10. final score and limit the number of recommendations to 10
    
    #since we had many NaN values in the num_pages column, we should find a way to handle the issue.
    #thus, for all items with NaN values (previously replacced by zero), we omit this criterion from our evaluation.
    if (item_num_pages == 0):
        df['average_score'] = 0.22265 *  df['author_scores'] + 0.27749 * df['maintopic_scores'] + 0.08067 * df['age_closeness'] + 0.07591 * df['title_bm25_scores'] + 0.03389 * df['publisher_cluster_scores'] + 0.30938 * df['author_cluster_scores']
    else:
        df1 = df[df['number_pages']!= 0]
        df2 = df[df['number_pages']== 0]
        df1['average_score'] = 0.10731 *  df1['author_scores'] + 0.12744 * df1['maintopic_scores'] + 0.04142 * df1['age_closeness'] + 0.03289 * df1['title_bm25_scores'] + 0.01880 * df1['publisher_cluster_scores'] + 0.19771 * df1['author_cluster_scores'] + 0.47442 * df1['num_pages_scores']
        df2['average_score'] = 0.22265 *  df2['author_scores'] + 0.27749 * df2['maintopic_scores'] + 0.08067 * df2['age_closeness'] + 0.07591 * df2['title_bm25_scores'] + 0.03389 * df2['publisher_cluster_scores'] + 0.30938 * df2['author_cluster_scores']
        frames = [df1, df2]
        df = pd.concat(frames)

    df = df.drop_duplicates(subset=['itemID'], keep='first')
    df = df.drop_duplicates(subset=['title'], keep='first')
    result = df.sort_values(by=['average_score'], ascending=False)
    #result.drop(columns=['level_0','maintopic_text','subtopics_text','release_date', 'processed_description', 'age_distance'], axis= 1, inplace=True)
    result.drop(columns=['maintopic_text','subtopics_text','release_date', 'processed_description', 'age_distance'], axis= 1, inplace=True)
    return result.iloc[:num_recommendation]

In [17]:
query_item = searchquery(df, 445, 5) #k = 5
query_item.head()

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,interest_age,description,publisher_cluster,author_cluster,number_pages,maintopic_text,subtopics_text,release_date,processed_description
7353,445,DIE PHANTASTISCHE REISE,Isaac Asimov,epubli,FL,[],de,1.0,Um einen ins Koma gefallenen Wissenschaftler z...,3,3,0.0,Science fiction,,{Timestamp('2021-03-09 00:00:00')},koma gefallenen wissenschaftl retten rzte team...


In [18]:
rec_books = recommend(df, 445, 5) #k = 5
rec_books.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,interest_age,description,publisher_cluster,author_cluster,number_pages,title_bm25_scores,description_bm25_scores,author_scores,age_closeness,publisher_cluster_scores,author_cluster_scores,num_pages_scores,maintopic_scores,average_score
59436,961,Geliebter Roboter,Isaac Asimov,Heyne Taschenbuch,FL,[],de,1.0,"Ein Blick in die Zukunft Claire Belmont, Ehefr...",3,3,208.0,0.0,23.698676,1,0.95,1.0,1.0,0,1.0,0.920046
7355,26868,Gold,Isaac Asimov,Harper Voyager,FL,[],de,1.0,eine willkommene abwechslung fuer die drei al...,2,3,0.0,0.0,0.0,1,0.95,0.500971,1.0,0,1.0,0.903134
59435,5553,Die Foundation-Trilogie,Isaac Asimov,Heyne Taschenbuch,FLC,[FLS],de,1.0,Die Geschichte unserer Zukunft Mittels der sog...,3,3,880.0,0.0,24.594998,1,0.95,1.0,1.0,0,0.560347,0.798047
59448,78055,Ein Sandkorn am Himmel,Isaac Asimov,Heyne Taschenbuch,FLC,[FLS],de,1.0,Von einem Schritt auf den nächsten ... Der pen...,3,3,288.0,0.0,0.0,1,0.95,1.0,1.0,0,0.560347,0.798047
59437,45658,Das galaktische Imperium,Isaac Asimov,Heyne Taschenbuch,FLC,[],de,1.0,im 32 jahrhundert hat ein teil der menschheit ...,3,3,544.0,0.0,18.182994,1,0.95,1.0,1.0,0,0.560347,0.798047


In [None]:
query_item = searchquery(df, 12564, 5) #k = 5
query_item.head()

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,interest_age,description,publisher_cluster,author_cluster,number_pages,maintopic_text,subtopics_text,release_date,processed_description
18455,12564,Ignite Me,Tahereh Mafi,Egmont UK Limited,YFCB,"[5AQ,YFG,YFHR,YFM]",en,1.5,X-Men meets The Handmaid's Tale in the third i...,1,10,416.0,Children’s / Teenage fiction: Thrillers,"['Interest age: from c 14 years', 'Children’s ...",{Timestamp('2014-02-04 00:00:00')},x men meet handmaid tale third instal epic rom...


In [None]:
rec_books = recommend(df, 12564, 5) #k = 5
display(rec_books)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['author_scores'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['author_scores'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['num_pages_scores'] = [calculate_num_pages_similarity (item_num_pages, number_pages) for number_pages in df1['number_pages']]
A value is trying to be s

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,interest_age,description,publisher_cluster,...,number_pages,title_bm25_scores,description_bm25_scores,author_scores,age_closeness,publisher_cluster_scores,author_cluster_scores,num_pages_scores,maintopic_scores,average_score
18457,9024,Imagine Me,Tahereh Mafi,Egmont UK Limited,YFCB,"[YFG,YFHR,YFM]",en,1.0,The book that all SHATTER ME fans have been wa...,1,...,464.0,0.532072,278.238908,1,0.95,1.0,1.0,0.922078,1.0,0.945561
18452,50498,Defy Me,Tahereh Mafi,Egmont UK Limited,YFCB,"[5AN,5AQ,YFG,YFHR,YFM]",en,1.333333,The breath-taking and heart-pounding fifth ins...,1,...,368.0,0.532072,180.957221,1,0.933333,1.0,1.0,0.922078,1.0,0.944871
22648,20000,Ever the Hunted,Erin Summerill,HOUGHTON MIFFLIN,YFCB,"[YFH,YFM]",en,1.0,"In this epic fantasy adventure, a teen girl em...",1,...,416.0,0.0,48.948056,0,0.95,1.0,1.0,1.0,1.0,0.857719
18448,21222,Restore Me,Tahereh Mafi,Harper Collins Publ. USA,YFHR,"[YFE,YFG]",en,1.0,An instant New York Times bestseller! Juliette...,1,...,448.0,0.532072,109.221401,1,0.95,1.0,1.0,0.948052,0.172344,0.852407
18446,59588,Shatter Me,Tahereh Mafi,Harper Collins Publ. USA,YFE,"[5AK,YFG,YFM]",en,1.0,X-Men meets The Handmaid's Tale in this first ...,1,...,448.0,0.532072,267.16129,1,0.95,1.0,1.0,0.948052,0.147314,0.849217


In [18]:
import warnings
warnings.filterwarnings("ignore")

In [19]:
print(len(eval_data))
display(eval_data.head())

1000


Unnamed: 0,itemID,predictions
0,12,[]
1,45274,[]
2,10104,[]
3,41371,[]
4,14015,[]


In [None]:
%%time

# generate recommendations based on book's features
for idx,row in eval_data.iterrows():
    item_data = row['itemID']
    if(idx % 10 == 0): print(idx)
    try: 
        rec_list =recommend(df, item_data, 5)['itemID'].tolist()
        for x in rec_list:
            row['predictions'].append(str(x))
    except:
        print('unable to make recommendations for the {}th book'.format(idx))

# export the dataset for rating the RS
evaluation_data=eval_data[['itemID','predictions']]
evaluation_data.to_csv('final_evaluation_data.csv')

0
