In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn import preprocessing 
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import RegexpTokenizer
from scipy.spatial.distance import cdist
import re
import string
import random
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Function for removing NonAscii characters
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

# Function for converting into lower case
def make_lower_case(text):
    return text.lower()

# Function for removing stop words
def remove_stop_words(text, item_lang):
    if item_lang in lang_in_nltk:
        text = text.split()
        stops = set(stopwords.words(item_lang)) #change to other language #"english"
        text = [w for w in text if not w in stops]
        text = " ".join(text)
    return text

# Function for removing punctuation
def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

# Function for removing the html tags
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [3]:
#Import dataset
books_data = pd.read_csv('D:/Uni Mannheim/Semester 2/Data Mining 2/DMC/publisher_cluster_added.csv', sep=',')
df = books_data.copy()
df.drop(['Unnamed: 0'], axis= 1, inplace=True)

In [4]:
df.head(1)

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,interest_age,number_pages,release_date,ISBN_13,language,description,publisher_cluster
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH],1,{''},{Timestamp('2010-11-30 00:00:00')},"[{'type': 'ISBN_13', 'identifier': '9781409048...",en,A brand-new story in the best-selling Princess...,1


In [5]:
thema = pd.read_csv('D:/Uni Mannheim/Semester 2/Data Mining 2/DMC/Others datasets/Nazanin/annotated_item_dataset_thema_text.csv', sep=',')
thema.drop(['Unnamed: 0'], axis= 1, inplace=True)
thema.head(1)

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,description,ISBN_13,interest_age,LangDetect,maintopic_text,subtopics_text
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH],en,A brand-new story in the best-selling Princess...,"[{'type': 'ISBN_13', 'identifier': '9781409048...",1,en,Children’s / Teenage fiction: General fiction,['Interest age: from c 7 years']


In [6]:
thema.drop(['title','author', 'publisher','main topic', 'subtopics', 'language', 'description', 'ISBN_13', 'interest_age', 'LangDetect'], axis= 1, inplace=True)

In [7]:
thema.count()

itemID            78030
maintopic_text    77772
subtopics_text    41126
dtype: int64

In [8]:
df = df.merge(thema, how= 'left', on='itemID')

In [9]:
df['maintopic_text'] = df['maintopic_text'].fillna('')
df['subtopics_text'] = df['subtopics_text'].fillna('')

In [10]:
for i in range (len(df.index)):
    main_topic = str(df.at[i,'maintopic_text'])
    sub_topic = str(df.at[i,'subtopics_text'])
    total_topic = main_topic + sub_topic
    df.loc[i, 'total_topic'] = total_topic

In [11]:
#df.drop(['maintopic_text','subtopics_text'], axis= 1, inplace=True)
df.count()

itemID               78030
title                78030
author               75824
publisher            78030
main topic           77772
subtopics            78029
interest_age         78030
number_pages         78021
release_date         78021
ISBN_13              77112
language             78030
description          73749
publisher_cluster    78030
maintopic_text       78030
subtopics_text       78030
total_topic          78030
dtype: int64

In [12]:
df2 = df[df['author'].isna()]
df = df[df['author'].notnull()]

In [13]:
df2.count()

itemID               2206
title                2206
author                  0
publisher            2206
main topic           2181
subtopics            2206
interest_age         2206
number_pages         2205
release_date         2205
ISBN_13              2186
language             2206
description          1834
publisher_cluster    2206
maintopic_text       2206
subtopics_text       2206
total_topic          2206
dtype: int64

In [14]:
df.count()

itemID               75824
title                75824
author               75824
publisher            75824
main topic           75591
subtopics            75823
interest_age         75824
number_pages         75816
release_date         75816
ISBN_13              74926
language             75824
description          71915
publisher_cluster    75824
maintopic_text       75824
subtopics_text       75824
total_topic          75824
dtype: int64

In [15]:
df_concat = df.groupby(['author'])['total_topic'].apply(','.join).reset_index()

In [16]:
for i in range (len(df_concat.index)):
    processesed_description = df_concat.at[i,'total_topic']
    processesed_description = _removeNonAscii(processesed_description)
    processesed_description = make_lower_case(processesed_description)
    processesed_description = remove_punctuation(processesed_description)
    processesed_description = remove_html(processesed_description)
    df_concat.loc[i, 'total_topic'] = processesed_description

In [17]:
vec = TfidfVectorizer(stop_words="english")
vec.fit(df_concat.total_topic.values)
features = vec.transform(df_concat.total_topic.values)

In [18]:
features.shape

(36669, 2073)

In [19]:
features = preprocessing.normalize(features)

In [20]:
kmeans = KMeans(n_clusters=20, random_state=0).fit(features)
labels = kmeans.labels_
centers = kmeans.cluster_centers_

In [21]:
#We collect cosine similarity of the centroids in a 2D array to be used by our RS.
cosine_sim_author = np.ndarray(shape=(20,20), dtype=np.ndarray, order='F')
for i in range (20):
    for j in range (20):
        # Dot and norm
        dot = sum(a*b for a, b in zip(centers[i], centers[j]))
        norm_a = sum(a*a for a in centers[i]) ** 0.5
        norm_b = sum(b*b for b in centers[j]) ** 0.5
        # Cosine similarity
        cos_sim = dot / (norm_a*norm_b)
        cosine_sim_author[i,j] = cos_sim

In [22]:
print(cosine_sim_author)

[[0.9999999999999998 0.3407845335442216 0.1535973459726355
  0.34815918045310745 0.3409330622863031 0.1900915666550329
  0.39081625551249566 0.21970868375191574 0.42497466694220665
  0.292232423052543 0.37519902298233976 0.3351886266046458
  0.2460399709163743 0.31150166088038406 0.2236967132939758
  0.38864138850181695 0.2087169836385856 0.2614619781825115
  0.1704869174571438 0.5099246408811111]
 [0.3407845335442216 1.0 0.04808148321997798 0.18225042204321065
  0.4869244583061165 0.09597009550364076 0.5683778390219959
  0.11397635642487272 0.5159180304089604 0.3595207331279233
  0.5017985986077885 0.5445793673503294 0.3380046858535409
  0.5290834299995082 0.2080261406996985 0.6443495587695554
  0.23287254964027634 0.39896452964094975 0.05942726039559365
  0.5645095845998672]
 [0.1535973459726355 0.04808148321997798 1.0 0.03052043474855843
  0.017021092481113866 0.04204616009964359 0.10045240240888648
  0.6206444439133468 0.628092590280627 0.03204182845928035
  0.06399608176402721 0.0

In [23]:
print(type(cosine_sim_author))

<class 'numpy.ndarray'>


In [24]:
from numpy import asarray
from numpy import save
# save to npy file
save('cosine_sim_author.npy', cosine_sim_author)

In [25]:
from numpy import load
# load array
data = load('cosine_sim_author.npy', allow_pickle=True)
# print the array
print(data)

[[0.9999999999999998 0.3407845335442216 0.1535973459726355
  0.34815918045310745 0.3409330622863031 0.1900915666550329
  0.39081625551249566 0.21970868375191574 0.42497466694220665
  0.292232423052543 0.37519902298233976 0.3351886266046458
  0.2460399709163743 0.31150166088038406 0.2236967132939758
  0.38864138850181695 0.2087169836385856 0.2614619781825115
  0.1704869174571438 0.5099246408811111]
 [0.3407845335442216 1.0 0.04808148321997798 0.18225042204321065
  0.4869244583061165 0.09597009550364076 0.5683778390219959
  0.11397635642487272 0.5159180304089604 0.3595207331279233
  0.5017985986077885 0.5445793673503294 0.3380046858535409
  0.5290834299995082 0.2080261406996985 0.6443495587695554
  0.23287254964027634 0.39896452964094975 0.05942726039559365
  0.5645095845998672]
 [0.1535973459726355 0.04808148321997798 1.0 0.03052043474855843
  0.017021092481113866 0.04204616009964359 0.10045240240888648
  0.6206444439133468 0.628092590280627 0.03204182845928035
  0.06399608176402721 0.0

In [26]:
df_concat['author_cluster'] = labels.tolist()

In [27]:
df_concat.drop(['total_topic'], axis= 1, inplace=True)
df = df.merge(df_concat, on='author')

In [28]:
aut_type = df.groupby('author_cluster',sort = False).agg({'itemID': 'count'}).reset_index()
aut_type.sort_values(by=['itemID'], ascending=False, inplace=True, ignore_index= True)
print(aut_type)

    author_cluster  itemID
0                0   13599
1                3    7040
2               19    6575
3                4    6296
4                2    4718
5               18    4447
6                6    4063
7               15    3559
8                7    3423
9               10    2938
10               1    2926
11              11    2629
12               8    2550
13               9    2320
14              13    1917
15              17    1912
16              12    1847
17               5    1242
18              14    1136
19              16     687


In [29]:
frames = [df, df2]
df = pd.concat(frames)

In [30]:
df.count()

itemID               78030
title                78030
author               75824
publisher            78030
main topic           77772
subtopics            78029
interest_age         78030
number_pages         78021
release_date         78021
ISBN_13              77112
language             78030
description          73749
publisher_cluster    78030
maintopic_text       78030
subtopics_text       78030
total_topic          78030
author_cluster       75824
dtype: int64

In [31]:
df.head(1)

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,interest_age,number_pages,release_date,ISBN_13,language,description,publisher_cluster,maintopic_text,subtopics_text,total_topic,author_cluster
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH],1,{''},{Timestamp('2010-11-30 00:00:00')},"[{'type': 'ISBN_13', 'identifier': '9781409048...",en,A brand-new story in the best-selling Princess...,1,Children’s / Teenage fiction: General fiction,['Interest age: from c 7 years'],Children’s / Teenage fiction: General fiction[...,15.0


In [32]:
df.drop(['total_topic' , 'ISBN_13'], axis= 1, inplace=True)
df = df[['itemID', 'title', 'author', 'publisher', 'main topic', 'subtopics', 'language', 'interest_age', 'description', 'publisher_cluster', 'author_cluster','number_pages', 'maintopic_text', 'subtopics_text','release_date']]

In [33]:
df.to_csv('publisher_author_cluster_added.csv')