In [1]:
!pip install sentence_transformers -q

In [2]:
import re
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from scipy.sparse import csr_matrix, coo_matrix, load_npz, save_npz, hstack, identity
from sentence_transformers import SentenceTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

In [3]:
# Shared Porter stemmer; MostCommonWords calls ps.stem() on every token when stem=True.
ps = PorterStemmer()

<a href='https://mengtingwan.github.io/data/goodreads.html'>Dataset link</a>

<div>
    <p>Citations</p> 
    <ul>
        <li>Mengting Wan, Julian McAuley, "<a href="https://mengtingwan.github.io/paper/recsys18_mwan.pdf">Item Recommendation on Monotonic Behavior Chains</a>", in RecSys'18. [<a href="https://dblp.uni-trier.de/rec/conf/recsys/WanM18.html?view=bibtex">bibtex</a>]
        </li>
        <li>Mengting Wan, Rishabh Misra, Ndapa Nakashole, Julian McAuley, "<a href="https://mengtingwan.github.io/paper/acl19_mwan.pdf">Fine-Grained Spoiler Detection from Large-Scale Review Corpora</a>", in ACL'19. [<a href="https://dblp.uni-trier.de/rec/conf/acl/WanMNM19.html?view=bibtex">bibtex</a>]
        </li>
    </ul>
</div>

# Create MostCommonWords Transformer

In [4]:
class MostCommonWords(BaseEstimator, TransformerMixin):
    """Binary bag-of-tokens encoder for a single column.

    ``fit`` learns the vocabulary of tokens whose total count over all rows is at
    least ``min_count``. ``transform`` returns a sparse integer matrix of shape
    ``(n_rows, n_vocabulary)`` holding a 1 wherever a row contains a vocabulary token.

    Works on one column only: pass a ``pd.Series`` or a one-column ``pd.DataFrame``.

    Parameters
    ----------
    min_count : int, default=5
        Minimum total number of occurrences for a token to enter the vocabulary.
    lower : bool, default=True
        Lower-case the text / tokens first.
    replace_punctuation : str or None, default=''
        Replacement for every character matching ``[^\\w\\s]``; ``None`` disables it.
    replace_number : str or None, default=None
        Replacement for every run of digits; ``None`` disables it.
    stem : bool, default=True
        Stem every token with the module-level Porter stemmer ``ps``.
    is_input_array : bool, default=False
        ``False``: each row is a string that is split on whitespace.
        ``True``: each row is already an iterable of tokens.

    Attributes
    ----------
    common_words_ : set
        The learned vocabulary.
    encoder_ : LabelEncoder
        Fitted on the vocabulary; the position of a token in ``encoder_.classes_``
        is its column in the transformed matrix.
    """

    def __init__(self, min_count=5, lower=True, replace_punctuation='', replace_number=None, stem=True, is_input_array=False):
        self.min_count = min_count
        self.lower = lower
        self.replace_punctuation = replace_punctuation
        self.replace_number = replace_number
        self.stem = stem
        self.is_input_array = is_input_array

    def _preprocess(self, X):
        """Reduce ``X`` to a Series and return one ``Counter`` of tokens per row.

        Raises
        ------
        ValueError
            If ``X`` is a DataFrame that does not have exactly one column.
        """
        if isinstance(X, pd.DataFrame):
            if X.shape[1] != 1:
                raise ValueError(
                    f"MostCommonWords expects a single column, got {X.shape[1]} columns"
                )
            # iloc keeps a Series even for a one-row frame, where squeeze() would
            # collapse the frame to a scalar.
            X = X.iloc[:, 0]
        if not self.is_input_array:
            return self._preprocess_str(X)
        return self._preprocess_arr(X)

    def _preprocess_str(self, X):
        """Tokenise a Series of strings into per-row token Counters."""
        # Missing text is treated as an empty string instead of failing further down.
        X = X.fillna('')
        if self.lower:
            X = X.str.lower()
        if self.replace_punctuation is not None:
            X = X.str.replace(r'[^\w\s]', self.replace_punctuation, regex=True)
        if self.replace_number is not None:
            X = X.str.replace(r'\d+', self.replace_number, regex=True)
        X = X.str.split(r'\s+', regex=True)
        if self.stem:
            X = X.apply(lambda tokens: [ps.stem(token) for token in tokens])
        return X.apply(Counter)

    def _preprocess_arr(self, X):
        """Normalise a Series of token lists into per-row token Counters.

        The steps run in the same order as in ``_preprocess_str``
        (lower, punctuation, numbers, stem).
        """
        if self.lower:
            X = X.apply(lambda tokens: [token.lower() for token in tokens])
        if self.replace_punctuation is not None:
            X = X.apply(lambda tokens: [re.sub(r'[^\w\s]', self.replace_punctuation, token) for token in tokens])
        if self.replace_number is not None:
            X = X.apply(lambda tokens: [re.sub(r'\d+', self.replace_number, token) for token in tokens])
        if self.stem:
            X = X.apply(lambda tokens: [ps.stem(token) for token in tokens])
        return X.apply(Counter)

    def fit(self, X, y=None):
        """Learn the vocabulary of tokens occurring at least ``min_count`` times in total."""
        totals = Counter()
        for row_counts in self._preprocess(X):
            totals.update(row_counts)
        self.common_words_ = {token for token, count in totals.items() if count >= self.min_count}
        # '' gets added if a string has leading or trailing spaces
        self.common_words_.discard('')
        self.encoder_ = LabelEncoder().fit(list(self.common_words_))
        return self

    def transform(self, X):
        """Encode ``X`` as a sparse 0/1 matrix of shape ``(n_rows, n_vocabulary)``.

        Tokens outside the learned vocabulary are ignored.
        """
        token_counts = self._preprocess(X)
        n_rows = token_counts.shape[0]
        # One dict lookup per token is much cheaper than calling
        # LabelEncoder.transform once per row, and yields the same column indices.
        column_of = {token: column for column, token in enumerate(self.encoder_.classes_)}

        col_chunks = [
            np.fromiter((column_of[token] for token in counts if token in column_of), dtype=np.intp)
            for counts in token_counts
        ]
        chunk_sizes = np.fromiter((len(chunk) for chunk in col_chunks), dtype=np.intp, count=n_rows)
        row_indices = np.repeat(np.arange(n_rows, dtype=np.intp), chunk_sizes)
        # np.concatenate rejects an empty list, which happens for zero input rows.
        col_indices = np.concatenate(col_chunks) if col_chunks else np.empty(0, dtype=np.intp)
        values = np.ones(len(row_indices), dtype=int)

        csr_matrix_shape = (n_rows, len(column_of))
        return csr_matrix((values, (row_indices, col_indices)), shape=csr_matrix_shape, dtype=int)


# Load data

In [5]:
# Read the line-delimited JSON book records (one JSON object per line).
books = pd.read_json('/kaggle/input/goodreads-book-graph-datasets-download/books.json', lines=True)
print('books loaded')

books loaded


In [6]:
# Read the line-delimited JSON author records; author_id and name are used further down.
book_authors = pd.read_json('/kaggle/input/goodreads-book-graph-datasets-download/authors.json', lines=True)

In [7]:
# Read the line-delimited JSON user-book interactions (user_id, book_id, is_read, rating are used below).
interactions = pd.read_json('/kaggle/input/goodreads-book-graph-datasets-download/interactions.json', lines=True)
print('interactions loaded')

interactions loaded


In [8]:
# Target category names: the first column of the CSV, kept as a Series.
categories = pd.read_csv('/kaggle/input/book-categories/categories.csv').iloc[:,0]

# Filter english books

(Some books don't have a `language_code`, so they are assumed to be English.)

In [9]:
# Keep books tagged 'eng' plus those whose language_code is an empty string
# (treated as English), restricted to the columns used downstream.
filt = ((books.language_code == 'eng') | (books.language_code.str.len() == 0))
books_eng_important_columns = ["book_id", "title", "description", "authors", "url", "image_url", "popular_shelves"]
# A single .loc selection followed by .copy() gives an independent frame, so the
# later column assignments on books_eng never write into a slice of `books`.
books_eng = books.loc[filt, books_eng_important_columns].copy()

# Map book categories to common categories

In [10]:
# Sentence-embedding model used below to encode both the target categories and the shelf names.
model = SentenceTransformer('all-mpnet-base-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# Embed every target category name.
categories_enc = model.encode(categories.values)

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
# Per book: the names of all shelves whose count is at least 1.
book_categories = books_eng.popular_shelves.map(
    lambda shelves: [shelf['name'] for shelf in shelves if int(shelf['count']) >= 1]
)

In [13]:
# Tally how often each shelf name occurs across all books.
book_categories_counter = Counter()
for shelf_names in book_categories:
    book_categories_counter.update(shelf_names)

In [14]:
# Distinct shelf names as an array; the trailing expression displays how many there are.
book_categories_all = np.array(list(book_categories_counter.keys()))
book_categories_all.shape

(138714,)

In [15]:
# Embed every distinct shelf name with the same model as the target categories.
book_categories_enc = model.encode(book_categories_all)

Batches:   0%|          | 0/4335 [00:00<?, ?it/s]

In [16]:
# Similarity of every shelf-name embedding (rows) to every target-category embedding (columns).
similarity = model.similarity(book_categories_enc, categories_enc)
# Index of the best-matching category for each shelf name, and that best score.
row_arg_max = np.argmax(similarity, axis=1)
row_max = similarity[np.arange(similarity.shape[0]), row_arg_max]
# Keep only the shelf names whose best score reaches the 0.7 threshold.
row_indices = np.where(row_max >= 0.7)[0]
row_arg_max = row_arg_max[row_indices]

In [17]:
# Lookup table: index = retained shelf name, column 'v' = its best-matching target category.
mapper = pd.DataFrame({'v' : categories.iloc[row_arg_max].values}, index=book_categories_all[row_indices])

In [18]:
# A plain dict replaces the per-shelf `mapper.loc[...]` / `in mapper.index` lookups,
# which are slow when repeated for every shelf of every book. The index holds
# distinct shelf names, so the dict carries exactly the same mapping.
shelf_to_category = mapper['v'].to_dict()
# Per book: the sorted, de-duplicated target categories of its mapped shelves.
book_categories_mapped = book_categories.apply(
    lambda x : np.unique([shelf_to_category[v] for v in x if v in shelf_to_category]).tolist()
)

In [19]:
# Replace the raw shelf data with the mapped categories. The explicit column check
# keeps the cell re-runnable without a bare `except:` that would also hide real errors.
if 'popular_shelves' in books_eng.columns:
    books_eng = books_eng.drop(columns='popular_shelves')
books_eng['categories'] = book_categories_mapped

# Map author ids to 0..n-1

In [20]:
# Per book: {author_id (as int): role}.
authors = books_eng.authors.apply(
    lambda entries : {int(entry['author_id']) : entry['role'] for entry in entries}
)
# Per book: just the author ids, then all of them flattened into one array.
authors_id = authors.apply(lambda roles_by_id : list(roles_by_id))
authors_id_all = np.concatenate(authors_id.to_numpy())

In [21]:
# Fit the encoder on every author id that appears in the English books.
authors_encoder = LabelEncoder().fit(authors_id_all)
# Series equivalent of the encoder (index = original author id, value = encoded id);
# used because LabelEncoder is slower on individual mappings (single values, not arrays).
authors_mapper = pd.Series(np.arange(authors_encoder.classes_.shape[0]), index=authors_encoder.classes_)

In [22]:
# Author records restricted to the authors of the English books, re-keyed by the
# encoded author id, sorted by it, and written to disk.
book_authors_in = book_authors.loc[
    book_authors.author_id.isin(authors_encoder.classes_.astype(int)),
    ['author_id', 'name'],
].copy()
book_authors_in['author_id'] = authors_encoder.transform(book_authors_in['author_id'])
book_authors_in = book_authors_in.sort_values('author_id').set_index('author_id')
book_authors_in.to_csv('book_authors.csv')

In [23]:
# Replace each book's author list by {encoded author id: role}.
books_eng['authors'] = authors.apply(lambda x : {authors_mapper.loc[k] : v for k, v in x.items()})

# Convert interactions to implicit

In [24]:
# Keep interactions that are read or carry a non-zero rating, then only those
# whose book is one of the English books.
interactions_rated_or_read_filt = interactions['is_read'].eq(True) | interactions['rating'].ne(0)
interactions_rated = (
    interactions
    .loc[interactions_rated_or_read_filt]
    .loc[lambda frame: frame['book_id'].isin(books_eng['book_id'])]
)

# Create y sparse matrix

In [25]:
def transform_rating_and_read_to_implicit(x):
    """Collapse one interaction into an implicit like/dislike flag.

    ``x`` is indexable by ``'is_read'`` and ``'rating'``. A non-zero rating decides
    on its own (liked when it is 4 or higher); a zero rating falls back to the
    ``is_read`` flag.
    """
    is_read, rating = x['is_read'], x['rating']
    return is_read if rating == 0 else rating >= 4
# Vectorised equivalent of applying transform_rating_and_read_to_implicit row by row:
# a non-zero rating counts as positive when it is >= 4, otherwise is_read decides.
# This avoids a Python-level call per interaction.
values = np.where(
    interactions_rated['rating'] != 0,
    interactions_rated['rating'] >= 4,
    interactions_rated['is_read'],
).astype(int)

In [26]:
# Encode book ids over every English book (not only those with interactions).
book_id_encoder = LabelEncoder()
book_id_encoder.fit(books_eng.book_id)

# Encode user ids over the users present in the filtered interactions.
user_id_encoder = LabelEncoder()
user_id_encoder.fit(interactions_rated.user_id)

In [27]:
# Reorder the books by encoded book id so that row position i holds the book with encoded id i.
p_books_eng = books_eng.iloc[np.argsort(book_id_encoder.transform(books_eng.book_id))]
# Sanity check: the reordered ids line up one-to-one with the encoder classes.
assert (p_books_eng.book_id == book_id_encoder.classes_).all()
# The row position now stands in for the id, so the column is dropped.
p_books_eng = p_books_eng.drop(columns='book_id')

In [28]:
# Matrix coordinates of every interaction: column = encoded book id, row = encoded user id.
cols = book_id_encoder.transform(interactions_rated.book_id)
rows = user_id_encoder.transform(interactions_rated.user_id)

In [29]:
# Fix the shape explicitly. Without it scipy infers the shape from the largest
# row/column index present, so trailing books without interactions (or trailing
# users without a positive interaction) would be cut off and y would no longer
# line up with the item/user feature matrices.
y_shape = (len(user_id_encoder.classes_), len(book_id_encoder.classes_))
# Duplicate (user, book) entries are summed when the CSR matrix is built.
y = csr_matrix((values, (rows, cols)), shape=y_shape, dtype=int)
# Binarise: every non-zero entry becomes exactly 1.
y = csr_matrix((np.ones(y.count_nonzero()), y.nonzero()), shape=y_shape, dtype=int)

# Save y and the sorted English books

In [30]:
# Persist the interaction matrix and the books in encoded-id order (row i = encoded book id i).
save_npz('y.npz', y)
p_books_eng.to_csv('books_processed.csv', index=False)

# Create item features

In [31]:
# Inputs for the item feature pipeline: free text, author-id lists and category lists.
books_columns_to_transform = pd.DataFrame({
    'content': p_books_eng['title'] + ' ' + p_books_eng['description'],
    'authors': p_books_eng.authors.apply(lambda roles_by_id : list(roles_by_id.keys())),
    'categories': p_books_eng['categories'],
})

In [32]:
# Settings shared by the token-list columns: tokens are taken verbatim and every
# token seen at least once enters the vocabulary.
token_list_options = dict(min_count=1, lower=False, replace_punctuation=None,
                          replace_number=None, stem=False, is_input_array=True)
# Free text is lower-cased, stripped of punctuation and digits, stemmed, and
# restricted to tokens occurring at least 100 times.
content_options = dict(min_count=100, lower=True, replace_punctuation='',
                       replace_number='', stem=True, is_input_array=False)

item_preprocessing = ColumnTransformer([
    ('content', MostCommonWords(**content_options), ['content']),
    ('authors', MostCommonWords(**token_list_options), ['authors']),
    ('categories', MostCommonWords(**token_list_options), ['categories']),
])

item_features = item_preprocessing.fit_transform(books_columns_to_transform)

In [33]:
# Square identity matrix with one row/column per book.
identity_items = identity(p_books_eng.shape[0], dtype=int)

In [34]:
# Append the per-book identity columns to the right of the content/author/category features.
p_item_features = hstack([item_features, identity_items], dtype=int)

In [35]:
# Persist the item feature matrix and the fitted preprocessing pipeline.
save_npz('item_features.npz', p_item_features)
joblib.dump(item_preprocessing, 'item_preprocessing.pkl')

['item_preprocessing.pkl']

# Create User features

In [36]:
# Interactions expressed in encoded ids, with the implicit 0/1 label as 'rating'.
p_interactions_rated = pd.DataFrame(
    {
        'user_id': user_id_encoder.transform(interactions_rated['user_id']),
        'book_id': book_id_encoder.transform(interactions_rated['book_id']),
        'rating': values,
    },
    index=interactions_rated.index,
)

In [37]:
# Categories per book, keyed by the encoded book id (= row position in p_books_eng).
p1_books_eng = p_books_eng[['categories']].copy()
p1_books_eng['book_id'] = np.arange(p1_books_eng.shape[0])

In [38]:
# Attach each interaction's book categories via the encoded book id.
interactions_with_categories = p_interactions_rated.merge(p1_books_eng, on='book_id')

In [39]:
def process_user_categories(user_id, categories, rating):
    """Add ``categories`` to the user's tally in the global ``user_categories``.

    A rating of 1 counts towards the user's 'positive' counter, anything else
    towards the 'negative' counter.
    """
    bucket = 'positive' if rating == 1 else 'negative'
    user_categories[user_id][bucket].update(categories)

In [40]:
# One {'positive': Counter, 'negative': Counter} dict per encoded user id, filled in
# place by process_user_categories for every interaction row.
"""user_categories = ['positive' : counter, 'negative' : counter] of length n_users"""
user_categories = np.array([{'positive' : Counter(), 'negative': Counter()} for _ in range(user_id_encoder.classes_.shape[0])])

_ = interactions_with_categories.apply(lambda x : process_user_categories(x['user_id'], x['categories'], x['rating']), axis=1)

In [41]:
# Per user, a score for every category the user interacted with:
#   - category in both counters -> positive count / negative count
#   - category only in positive -> the positive count
#   - category only in negative -> 0
user_categories_ratio = np.array([{k : (v['positive'][k] / v['negative'][k]) 
                                    if k in v['positive'].keys() and k in v['negative'].keys() 
                                    else v['positive'][k] if k in v['positive'].keys() else 0 
                                    for k in v['positive'].keys() | v['negative'].keys()} 
                                    for v in user_categories])

In [42]:
# Per user, keep the categories whose score is at least 1 and in which the user
# has at least 2 positive interactions.
# NOTE(review): the original note justified the `>= 2` cut-off by saying some categories
# have only 2 ratings, but then spoke of users liking "2 or less" books; the condition
# below requires 2 or more - confirm the intended threshold.
user_categories_filtered = [[k for k in user_categories_ratio[i].keys() 
                             if user_categories_ratio[i][k] >= 1 and user_categories[i]['positive'][k] >= 2]
                             for i in range(len(user_categories))]

In [43]:
# Wrap the per-user category lists in a Series, the input type MostCommonWords works on.
user_categories_filtered_series = pd.Series(user_categories_filtered)

In [44]:
# Token-list encoder: categories are taken verbatim and every category seen at least once is kept.
user_preprocessing = MostCommonWords(min_count=1, lower=False, 
                                replace_punctuation=None, replace_number=None, stem=False, is_input_array=True)

In [45]:
# Sparse 0/1 matrix: one row per user, one column per liked category.
user_features = user_preprocessing.fit_transform(user_categories_filtered_series)

In [46]:
# Square identity matrix with one row/column per user.
identity_users = identity(user_features.shape[0], dtype=int)

In [47]:
# Append the per-user identity columns to the right of the category features.
p_user_features = hstack([user_features, identity_users], dtype=int)

In [48]:
# Persist the user feature matrix and the fitted user encoder.
# NOTE(review): the file is named 'user_processing.pkl' while the item counterpart is
# 'item_preprocessing.pkl' - confirm which name downstream loaders expect.
save_npz('user_features.npz', p_user_features)
joblib.dump(user_preprocessing, 'user_processing.pkl')

['user_processing.pkl']