In [1]:
import pandas as pd
import numpy as np
import re 
import math
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer 

In [2]:
df_items = pd.read_csv('items.csv', sep='|')
df_items.head(20)


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH]
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]"
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"[5AP,FBA]"
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]"
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"[WD,WFTM,YBG,YBL,YBLD,YBLN1]"
5,34217,Ewig geliebt,J. R. Ward,Heyne Taschenbuch,FMR,"[1KBB-US-NAK,FMX,FRX,3MRBF]"
6,31436,Meine Sticker-Tiere,,Ars Edition GmbH,YBG,"[5AD,YBG,YBLL]"
7,14576,Unsterblich 01 - Tor der Dämmerung,Julie Kagawa,Heyne Taschenbuch,YFE,"[5AQ,FM,YFE,YFH]"
8,17731,Unsterblich 02 - Tor der Nacht,Julie Kagawa,Heyne Taschenbuch,YFH,"[5AQ,FM,YFE,YFH]"
9,58723,Pedro und die Bettler von Cartagena,Ursula Hasler,dtv Verlagsgesellschaft,YFB,"[5AM,1KLSC]"


In [3]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78030 entries, 0 to 78029
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   itemID      78030 non-null  int64 
 1   title       78030 non-null  object
 2   author      74790 non-null  object
 3   publisher   78021 non-null  object
 4   main topic  77772 non-null  object
 5   subtopics   78029 non-null  object
dtypes: int64(1), object(5)
memory usage: 3.6+ MB


In [4]:
df_items.loc[df_items['author'].isna()]

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]"
6,31436,Meine Sticker-Tiere,,Ars Edition GmbH,YBG,"[5AD,YBG,YBLL]"
10,73124,Freundebuch - Einhorn-Paradies - Meine Freunde,,Coppenrath F,YZG,"[5JA,YBG,YBL,YFH,YZS]"
28,63166,Fingerstempel-Spaß Kunterbunt,,Ars Edition GmbH,YBLD,"[5AD,YBG,YBLD]"
29,44220,Fingerstempel-Spaß Tiere,,Ars Edition GmbH,YBG,"[5AD,YBG,YBLD,YBLL]"
...,...,...,...,...,...,...
77788,37942,Perry Rhodan 109. Das Loch im Universum,,MOEWIG,FL,[]
77864,4022,Iron Man Armoured Avenger,,Panini Publishing Ltd,XADC,[YF]
77869,30445,Find Out,,Books on Demand,FM,[FL]
77876,45554,The Eyes of Despero!,,GROSSET DUNLAP,YFZZ,[YFC]


In [5]:
df_items.loc[df_items['author'].isna(),'author'] = "other"
df_items.loc[df_items['publisher'].isna(),'publisher'] = "other"
df_items.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78030 entries, 0 to 78029
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   itemID      78030 non-null  int64 
 1   title       78030 non-null  object
 2   author      78030 non-null  object
 3   publisher   78030 non-null  object
 4   main topic  77772 non-null  object
 5   subtopics   78029 non-null  object
dtypes: int64(1), object(5)
memory usage: 3.6+ MB


In [6]:
df_items.loc[df_items['publisher'].isna()].shape

(0, 6)

In [7]:
df_items.loc[df_items['subtopics'] == '[]'].shape
# 36904

(36904, 6)

In [8]:
df_items.loc[(df_items['main topic'].isna()) ]
# 258

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
182,6798,Kernstaub,Marie Graßhoff,Drachenmond Verlag,,[FM]
211,67006,Mein erstes Feuerwehr-Bastelbuch,other,"Coppenrath, Münster",,[5AC]
536,38541,Näh mit! Die Kindernähschule,Ina Andresen,Frech Verlag GmbH,,[5AH]
647,25076,Star Trek - New Frontier: Mehr als Götter,Peter David,Cross Cult,,"[FLS,FLU]"
1236,27457,Retreat: Stirb lachend!,"Joe McKinney, Craig DiLouie, Steven Knight",Luzifer,,"[1KBB,FKM,FLQ]"
...,...,...,...,...,...,...
64281,44390,Eleha - Aufbruch,Sonja Girisch,Books on Demand,,[FRM]
75807,21505,Vamps and the City,Kerrelyn Sparks,MIRA Taschenbuch,,[FMR]
76628,40959,Wie das Schwein zum Tanze ging,Michael Köhlmeier,DTV,,[5AM]
77261,64864,Die Eingeborenen,Karsten Kruschel,Wurdack,,"[FLS,FLU]"


In [9]:
# drop nan publisher\
df_items['publisher'].dropna(axis=0, inplace=True)

df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78030 entries, 0 to 78029
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   itemID      78030 non-null  int64 
 1   title       78030 non-null  object
 2   author      78030 non-null  object
 3   publisher   78030 non-null  object
 4   main topic  77772 non-null  object
 5   subtopics   78029 non-null  object
dtypes: int64(1), object(5)
memory usage: 3.6+ MB


In [10]:
df_items.loc[df_items['subtopics'].isna(),'subtopics']='[]'

In [11]:
for i in range(0,len(df_items)):
    if df_items['subtopics'][i] != '[]':
        subtopics_value_new = re.sub(r"[\([{})\]]", "", df_items['subtopics'][i])
        subtopics_value_new_2 = subtopics_value_new.split(',')
        df_items.at[i, 'subtopics'] = subtopics_value_new_2
   
    if pd.isna(df_items.loc[i ,'main topic']):
        df_items['main topic'][i] =  df_items['subtopics'][i][0]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_items['main topic'][i] =  df_items['subtopics'][i][0]


In [12]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78030 entries, 0 to 78029
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   itemID      78030 non-null  int64 
 1   title       78030 non-null  object
 2   author      78030 non-null  object
 3   publisher   78030 non-null  object
 4   main topic  78030 non-null  object
 5   subtopics   78030 non-null  object
dtypes: int64(1), object(5)
memory usage: 3.6+ MB


In [13]:
colunms = ['title', 'author', 'publisher', 'main topic','subtopics']

In [14]:
#combine features:

def combine_features(data):
    features = []
    for i in range(0, len(data)):
        sub_topic = ''
        if df_items['subtopics'][i] != '[]':
            for subtopic in df_items['subtopics'][i]:
                sub_topic+= ' '+ subtopic
        features.append(data['title'][i] + ' '  + data['author'][i]+' ' + data['main topic'][i] + ' ' + sub_topic)
    return features



In [15]:
# create new colum and save combined_features
df_items['combined_features'] = combine_features(df_items)

In [16]:
df_items = df_items[0:20000]
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   itemID             20000 non-null  int64 
 1   title              20000 non-null  object
 2   author             20000 non-null  object
 3   publisher          20000 non-null  object
 4   main topic         20000 non-null  object
 5   subtopics          20000 non-null  object
 6   combined_features  20000 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [17]:
# convert combined features to count matrix
cm = CountVectorizer().fit_transform(df_items['combined_features'])
# print(cm)

In [18]:
# calculate similarity - cosine

cs = cosine_similarity(cm)

In [19]:
print(cs)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.13867505]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.13867505 0.         1.        ]]


In [20]:
Title = df_items[df_items['itemID'] == 54013]['title'].values[0]
Title

'Harry Potter and the Cursed Child - Parts I & II'

In [21]:
book_id = df_items[df_items['title'] == Title]['itemID'].index[0]
book_id

2619

In [22]:
scores = list(enumerate(cs[book_id]))
scores

[(0, 0.1257389226923863),
 (1, 0.0),
 (2, 0.15762208124782012),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.11145564251507058),
 (8, 0.16718346377260587),
 (9, 0.06286946134619315),
 (10, 0.06019292654288461),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0659380473395787),
 (23, 0.5029556907695452),
 (24, 0.0),
 (25, 0.06950480468569159),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.24077170617153845),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0659380473395787),
 (35, 0.12038585308576923),
 (36, 0.11145564251507058),
 (37, 0.11145564251507058),
 (38, 0.07881104062391006),
 (39, 0.0),
 (40, 0.14744195615489714),
 (41, 0.09325048082403138),
 (42, 0.14744195615489714),
 (43, 0.0),
 (44, 0.06286946134619315),
 (45, 0.06286946134619315),
 (46, 0.1257389226923863),
 (47, 0.07881104062391006),
 (48, 0.0),
 (49, 0.07881104062391006),
 (50, 0.0),
 (51, 0.049147318718299055),
 (52, 

In [23]:
sorted_scores = sorted(scores , key = lambda x:x[1] , reverse = True)
sorted_scores = sorted_scores[1:]
sorted_scores

[(1118, 0.6361464251628643),
 (1150, 0.5922200922639821),
 (565, 0.5783149319662403),
 (1183, 0.5783149319662403),
 (265, 0.5734146386569556),
 (532, 0.5658251521157383),
 (2021, 0.556293911166591),
 (2940, 0.546018906231948),
 (4604, 0.546018906231948),
 (9250, 0.546018906231948),
 (10510, 0.546018906231948),
 (1301, 0.5417363388859615),
 (1602, 0.5417363388859615),
 (3341, 0.5383819020581655),
 (430, 0.5363989048891136),
 (3680, 0.5262013605584338),
 (1590, 0.5212860351426869),
 (2995, 0.5204834387696162),
 (23, 0.5029556907695452),
 (6993, 0.5029556907695452),
 (2717, 0.5015503913178176),
 (7462, 0.5015503913178176),
 (10635, 0.5015503913178176),
 (14144, 0.5015503913178176),
 (17671, 0.5015503913178176),
 (19821, 0.5015503913178176),
 (458, 0.4890096469218258),
 (892, 0.4845437118523489),
 (913, 0.4845437118523489),
 (1087, 0.4845437118523489),
 (1803, 0.4845437118523489),
 (1944, 0.4845437118523489),
 (1949, 0.4845437118523489),
 (2465, 0.4845437118523489),
 (297, 0.48154341234307

In [24]:
for i in sorted_scores[:6]:
    book_title = df_items.loc[i[0]].title
    print(book_title)

Harry Potter 7 and the Deathly Hallows
Harry Potter 4 and the Goblet of Fire
Harry Potter 6 and the Half-Blood Prince
Harry Potter 1 and the Philosopher's Stone
Harry Potter 5 and the Order of the Phoenix
Harry Potter 1 and the Philosopher's Stone
