In [2]:
from langdetect import detect
import time
from statistics import mean
from collections import Counter
import numpy as np
from pandas import DataFrame
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

In [11]:
# Load the raw input tables (pipe-separated unless noted otherwise).
items = pd.read_csv("items.csv", sep="|")
transactions = pd.read_csv("transactions.csv", sep="|")
# NOTE(review): evaluation.csv is read with the default "," separator, unlike the
# other files -- confirm this matches the file's actual layout.
evaluation = pd.read_csv("evaluation.csv")
popularity = pd.read_csv("items_popularity.csv", sep="|")

# Work on a copy: the preprocessing cells assign columns in place, and a plain
# `df = items` alias would silently modify the raw `items` frame as well.
df = items.copy()

### Determining languages
Detecting the language of every title takes about 10 minutes. The result is not used by the recommendation algorithm, so this section can be skipped.

In [39]:
# Detect the language of every item title.
#   lang  : one [itemID, language_code] pair per row of df ("??" when detection fails)
#   notok : the (index, row) pairs from df.iterrows() whose title could not be classified
#   ttime : seconds spent on each row
# NOTE(review): langdetect documents its results as non-deterministic unless
# DetectorFactory.seed is set beforehand -- consider seeding for reproducible labels.
lang = []
notok = []
ttime = []

for idx, row in df.iterrows():
    tic = time.perf_counter()
    try:
        lang.append([row["itemID"], detect(row["title"])])
    except Exception:
        # detect() raises when a title yields no usable text. Catching Exception
        # (instead of a bare except) keeps Ctrl-C / kernel interrupt working
        # during this long-running loop.
        lang.append([row["itemID"], "??"])
        notok.append((idx, row))
    ttime.append(time.perf_counter() - tic)

# mean() raises on an empty list, so only report timings when rows were processed
if ttime:
    print("mean time elapsed: ",mean(ttime)," sum time: ", sum(ttime))


mean time elapsed:  0.008733441819598335  sum time:  681.470465183258


In [40]:
# Inspect the rows whose language could not be detected:
# first how many there are, then the title of each one.
print(len(notok))
for row_index, failed_row in notok:
    print(failed_row["title"])

51
144000
2312
1984
381
1984
2034
2021
2037
43
2069
2121
2394
2121
17
2048
2121
1632
Ferris@Bruns_LLC
1814
2012
11
!!
Tajo@Bruns_LLC
1,2,3
110
5:55
1906
1906
2030
2047
444
444
2501
2060
5028
2084
2053
1520-1522
2145
1523-1526
1712
2084
2625
2084
>
2156
6984
12
71%
4
2049


In [41]:
# Keep only the language code from each [itemID, language] pair.
langpure = [pair[1] for pair in lang]

# Count how often each language occurs and list the most frequent first.
freqs = Counter(langpure)
freqsdf = pd.DataFrame(
    list(freqs.items()), columns=['language', 'frequency']
).sort_values("frequency", ascending=False)
print(freqsdf)

   language  frequency
0        en      40223
1        de      19991
8        es       3105
6        it       1503
12       fr       1266
16       nl       1144
2        af       1103
3        id        882
19       tl        860
4        pt        806
5        no        754
7        da        724
9        ca        716
29       cy        589
11       ro        571
10       sv        567
22       so        440
24       et        427
14       fi        374
27       sw        337
23       pl        276
17       tr        242
15       lt        223
28       vi        164
25       hu        158
20       hr        142
26       sl        133
21       sq         86
18       sk         83
13       ??         51
30       lv         47
31       cs         43


In [42]:
# Attach the detected languages as a new column. Going through a plain array
# assigns the values by row position, in the order the rows were iterated.
df["language"] = np.array(langpure)

### Preprocessing

In [12]:
# Count the missing values in each column
df.isnull().sum()

itemID           0
title            0
author        3247
publisher        9
main topic     259
subtopics        0
dtype: int64

In [13]:
# Replace missing values with a column-specific placeholder string
for column_name, placeholder in [
    ('title', "no_title"),
    ('author', "no_author"),
    ('publisher', "no_publisher"),
    ('main topic', "no_main_topic"),
    ('subtopics', "no_subtopics"),
]:
    df[column_name] = df[column_name].fillna(placeholder)

In [14]:
# Verify that no missing values remain after filling
df.isnull().sum()

itemID        0
title         0
author        0
publisher     0
main topic    0
subtopics     0
dtype: int64

In [15]:
# Processing "subtopics" column:
# strip the enclosing "[" and "]", then label entries that end up empty
# (i.e. the original "[]") as "no_subtopics".
# The empty-string replacement is restricted to this column: applied to the
# whole frame it would also overwrite blank values in every other text column
# (title, author, ...) with "no_subtopics".
df = df.assign(
    subtopics=df['subtopics']
    .str.lstrip('[')
    .str.rstrip(']')
    .replace(r'^\s*$', "no_subtopics", regex=True)
)

In [16]:
# Concatenate every descriptive text field into one space-separated string
# per item; this combined column is what gets vectorised later.
df['all_content'] = df['title'].str.cat(
    [df['author'], df['publisher'], df['main topic'], df['subtopics']],
    sep=" ",
)

#### Adding the popularity score (based on transaction data) to the item data

In [18]:
# Left join keeps every item; items without a popularity entry get NaN
df = pd.merge(df, popularity, how="left", on="itemID")

In [19]:
# Preview the first five rows of the merged frame
df.head(5)

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,all_content,pop,rank_main_topic
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,5AH,Princess Poppy: The Big Mix Up Janey Louise Jo...,3.0,595.0
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"5AJ,AGZ,WFA,YBG,YBL,YNA,YPA",Einfach zeichnen! Step by Step Wiebke Krabbe S...,100.0,1.0
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"5AP,FBA",Red Queen 1 Victoria Aveyard Orion Publishing ...,268.0,32.0
3,40250,Meine Kindergarten-Freunde (Pirat),no_author,Ars Edition GmbH,YB,"5AC,5AD,YBG,YBL,YF",Meine Kindergarten-Freunde (Pirat) no_author A...,380.0,2.0
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"WD,WFTM,YBG,YBL,YBLD,YBLN1",Mein großes Schablonen-Buch - Wilde Tiere Eliz...,61.0,22.0


In [20]:
# Persist the preprocessed items (default CSV settings, index included)
df.to_csv(path_or_buf="items_prepdone.csv")

### Making recommendations

In [22]:
# Fit a word-level TF-IDF model on the combined text column and
# display the resulting matrix dimensions (rows = items, columns = terms).
vectorizer = TfidfVectorizer(analyzer="word")
tfidf_all_content = vectorizer.fit_transform(df["all_content"])
tfidf_all_content.shape

(78334, 71421)

In [27]:
# function for making a recommendation
def recommend(itemID, tfidf_matrix, reorder_top_5 = False):
    """Return the five items most similar to ``itemID``.

    Reads the module-level ``df`` and assumes row ``i`` of ``tfidf_matrix``
    describes row ``i`` (by position) of ``df``.

    Parameters
    ----------
    itemID : value compared against ``df["itemID"]`` to locate the query item.
    tfidf_matrix : matrix with one row per row of ``df``. Scores are plain dot
        products, which equal cosine similarity only if the rows are
        L2-normalised (TfidfVectorizer's default) -- verify against the caller.
    reorder_top_5 : if True, the five recommendations are returned sorted by
        the ``pop`` column (descending) instead of by similarity.

    Returns
    -------
    DataFrame with the rows of ``df`` for the (up to) five recommendations.

    Raises
    ------
    IndexError
        If ``itemID`` does not occur in ``df``.

    Also prints the similarity scores of the selected items.
    """
    # Locate the query item by row *position*: positions, not index labels,
    # are what line up with the matrix rows and with df.iloc below.
    matches = np.flatnonzero((df["itemID"] == itemID).to_numpy())
    if matches.size == 0:
        raise IndexError("itemID {!r} not found in df".format(itemID))
    item_pos = int(matches[0])

    # similarity of the query item to every item (one score per row)
    scores = linear_kernel(tfidf_matrix[item_pos, :], tfidf_matrix).ravel()
    ssdf = DataFrame({'score': scores})

    # Remove the query item explicitly instead of assuming it sorts first:
    # an exact duplicate also scores 1.0 and may be ordered ahead of it.
    # mergesort is stable, so tied scores keep their row order.
    ssdf = ssdf.drop(index=item_pos)
    ssdf = ssdf.sort_values(by='score', ascending=False, kind='mergesort').head(5)

    # printing similarity scores
    print("Similarity scores:\n", ssdf)

    # row positions of the top 5 recs
    index_ssdf = ssdf.index.tolist()
    recommendations = df.iloc[index_ssdf]

    if reorder_top_5:
        # Reorder top 5 recommendations based on their popularity score
        # (rows without a score sort last)
        return recommendations.sort_values(by = "pop", ascending = False)
    return recommendations

In [29]:
# Recommend items similar to item 938, ordered by similarity
recommend(itemID=938, tfidf_matrix=tfidf_all_content)

Similarity scores:
           score
18579  0.852327
14627  0.818909
18575  0.805497
17896  0.739951
5669   0.369355


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,all_content,pop,rank_main_topic
18579,50651,Engelssturm 02 - Gabriel,Heather Killough-Walden,Heyne Taschenbuch,FMR,"1KBB,FRX,FYT",Engelssturm 02 - Gabriel Heather Killough-Wald...,3.0,484.0
14627,39268,Engelssturm 01 - Uriel,Heather Killough-Walden,Heyne Taschenbuch,FMR,"1KBB,FRX,FYT",Engelssturm 01 - Uriel Heather Killough-Walden...,7.0,384.0
18575,14215,Engelssturm - Azrael,Heather Killough-Walden,Heyne Verlag,FMR,"1KBB,FRX,FYT",Engelssturm - Azrael Heather Killough-Walden H...,1.0,561.0
17896,28785,Engelssturm - Samael,Heather Killough-Walden,Heyne Taschenbuch,FMR,"1KBB,3MRB",Engelssturm - Samael Heather Killough-Walden H...,3.0,484.0
5669,41460,Black Dagger 04. Bruderkrieg,J. R. Ward,Heyne Taschenbuch,FMR,"1KBB-US-NAK,FB,FRX,FYT",Black Dagger 04. Bruderkrieg J. R. Ward Heyne ...,11.0,309.0


In [None]:
# Show the query item itself (the book the recommendations were made for)
df.loc[df["itemID"] == 938]

#### Example with reordering

In [37]:
# Recommend items similar to item 10, with the top 5 re-sorted by popularity
recommend(itemID=10, tfidf_matrix=tfidf_all_content, reorder_top_5=True)

Similarity scores:
           score
35404  0.794510
6485   0.703675
4131   0.691640
28989  0.681556
77946  0.674210


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,all_content,pop,rank_main_topic
4131,27703,The Adventures of Tom Sawyer,Mark Twain,Penguin Random House Children's UK,YFA,5AK,The Adventures of Tom Sawyer Mark Twain Pengui...,48.0,21.0
6485,61994,Adventures of Huckleberry Finn,Mark Twain,Race Point Publishing,YFA,no_subtopics,Adventures of Huckleberry Finn Mark Twain Race...,4.0,145.0
35404,6098,The Adventures of Huckleberry Finn,Mark Twain,ALADDIN,YFA,YFC,The Adventures of Huckleberry Finn Mark Twain ...,,
28989,52671,The Adventures of Huckleberry Finn,Mark Twain,Vintage Publishing,FBC,no_subtopics,The Adventures of Huckleberry Finn Mark Twain ...,,
77946,73006,The Adventures of Huckleberry Finn,Mark Twain,VINTAGE,DCA,no_subtopics,The Adventures of Huckleberry Finn Mark Twain ...,,


In [35]:
# Show the query item itself (the book the recommendations were made for)
df.loc[df["itemID"] == 10]

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,all_content,pop,rank_main_topic
14926,10,The Adventures of Huckleberry Finn,Mark Twain,Penguin Random House Children's UK,YFA,5AK,The Adventures of Huckleberry Finn Mark Twain ...,19.0,45.0
