# Networking

This file contains all the code necessary to design, train, and test both the feature-based and Singular Value Decomposition (SVD) recommendation models

In [1]:
# import *a lot* of stuff
import numpy as np
import pandas as pd
import surprise
import scipy
import html
import csv
import sys
import os
import re

from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from surprise import Reader, Dataset, SVD, dump, accuracy
from utils.xml_to_dict import dict_from_xml_file

sys.path.append('./utils')

In [3]:
# create mapping mechanism
gr_to_id = {}
with open('./data/goodbooks-10k/books.csv', "r", encoding='utf8') as f:
    reader = csv.reader(f, delimiter=",")
    for i, line in enumerate(reader):
        gr_to_id[line[1]] = line[0]

gr_to_id.items

<function dict.items>

### Feature Matrix

In [4]:
# prepare tags feature
undesireables = {
    'to-read',
    'currently-reading',
    'books-i-own',
    'on-hold',
    'favorite',
    'favorites',
    'owned',
    'owned-books',
    'read',
    'favourites',
    'default',
    'kindle',
    'my-books',
    'to-buy',
    'all-time-favorites',
    're-read',
    'i-own',
    'ebook'
}

tag_defs = {}
with open('./data/goodbooks-10k/tags.csv', "r", encoding='utf8') as f:
    reader = csv.reader(f, delimiter=",")
    for i, line in enumerate(reader):
        tag_defs[line[0]] = line[1]

    book_tags = {}
    with open('./data/goodbooks-10k/book_tags_with_bookid.csv', "r", encoding='utf8') as f:
        reader = csv.reader(f, delimiter=",")
        for i, line in enumerate(reader):
            gr_id = line[1]
            tag_id = line[2]
            count = line[3]
            if gr_id not in book_tags:
                book_tags[gr_id] = {}

            tag_name = tag_defs[tag_id]
            if tag_name not in undesireables:
                book_tags[gr_id][tag_name] = count

book_tags.items

<function dict.items>

In [6]:
# util to format string in easily readable way
def clean_string(s):
    if not s:
        return s

    TAG_RE = re.compile(r'<[^>]+>')
    s = html.unescape(s)
    s = TAG_RE.sub('', s)
    # s = s.lower()
    return s

In [7]:
# create additional features for the content-based matrix
books = []

for file in os.listdir('./data/goodbooks-10k/books_xml/books_xml'):
    raw, popular_shelves = dict_from_xml_file(
        './data/goodbooks-10k/books_xml/books_xml/' + os.fsdecode(file))

    book = {}
    goodreads_id = raw['book']['id']
    book['id'] = gr_to_id[goodreads_id]
    book['title'] = raw['book']['title']
    book['image_url'] = raw['book']['image_url']
    book['url'] = raw['book']['url']
    book['author'] = raw['book']['authors']['author']

    # only use first author
    if isinstance(book['author'], dict):
        book['author'] = book['author']['name']
    else:
        book['author'] = book['author'][0]['name']

    book['description'] = raw['book']['description']
    book['description'] = clean_string(book['description'])

    # add pop shelves & tags features
    book['popular_shelves'] = ''
    normalizing_value = 5
    for key, value in popular_shelves.items():
        for i in range(int(value) // normalizing_value):
            book['popular_shelves'] += ' ' + key

    book['tags'] = ''
    tags = book_tags[goodreads_id]
    for key, value in tags.items():
        for i in range(int(value) // normalizing_value):
            book['tags'] += ' ' + key

    books.append(book)

books_df = pd.DataFrame(books)
books_df['id'] = books_df['id'].astype(int)
books_df = books_df.sort_values(by=['id'])
books_df = books_df.set_index('id')

books_df['description'] = books_df['description'].fillna('')
books_df.to_pickle('pickled/books.pkl')

books_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            10000 non-null  object
 1   image_url        10000 non-null  object
 2   url              10000 non-null  object
 3   author           10000 non-null  object
 4   description      10000 non-null  object
 5   popular_shelves  10000 non-null  object
 6   tags             10000 non-null  object
dtypes: object(7)
memory usage: 625.0+ KB


In [26]:
# fetch combined dataset
combined_ratings = scipy.sparse.load_npz('models/ratings_combined.npz')
combined_ratings.shape

(210612, 10000)

In [33]:
# util function to reduce a matrix into its components using SVD
def reduce_matrix(X, n_components=1000, n_iter=7, random_state=None):
    """
    Props:
        X:              matrix to reduce
        n_components:   num singular values to limit to
        n_iter:         num iterations for SVD
        random_state:   random initial state SVD

    Returns:
        U: row representations
        S: singular values
        V: column representations
    """
    svd = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=random_state)
    reduced_matrix = svd.fit_transform(X)
    return reduced_matrix, svd.singular_values_, svd.components_

In [29]:
# feature matrix
books = pd.read_pickle('pickled/books.pkl')
vectorizer = TfidfVectorizer(stop_words='english')

In [30]:
# fit feature matrix on features
tfidf_m_description = vectorizer.fit_transform(books['description'])
print(tfidf_m_description.shape)

tfidf_m_shelves = vectorizer.fit_transform(books['popular_shelves'])
print(tfidf_m_shelves.shape)

tfidf_m_tags = vectorizer.fit_transform(books['tags'])
print(tfidf_m_tags.shape)


(10000, 11245)

In [31]:
# Weight smaller matrices by ratio to largest column matrix
shelves_weight = tfidf_m_description.shape[1] / tfidf_m_shelves.shape[1]
tags_weight = tfidf_m_description.shape[1] / tfidf_m_tags.shape[1]

tfidf_m_shelves = tfidf_m_shelves.multiply(shelves_weight)
tfidf_m_tags = tfidf_m_tags.multiply(tags_weight)

feature_m = scipy.sparse.hstack([tfidf_m_description, tfidf_m_shelves, tfidf_m_tags])

scipy.sparse.save_npz('models/feature_m', feature_m)

In [34]:
# SVD on full features to calculate sum of eigen values (special set of scalars associated with a linear system of equation)
U, E, V = reduce_matrix(feature_m, n_components=3000)

total_eigen_values = 0
for e in E:
    total_eigen_values += (e*e)
total_eigen_values


567667.5952636951

In [35]:
features_U, E_reduced, _ = reduce_matrix(feature_m, n_components=1000)

total_eigen_values = 567667.43158802704
reduced_eigen_values = 0
for e in E_reduced:
    reduced_eigen_values += (e*e)
reduced_eigen_values

554451.312193875

In [36]:
# Save reduced feature matrix
np.save('models/feature_matrix_1000.npy', features_U)

### SVD Matrix

In [41]:
# svd matrix
ratings = pd.read_pickle('pickled/ratings.pkl')

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)

svd = SVD(n_epochs=100, n_factors=300, lr_all=0.005, reg_all=0.02)

In [42]:
# fit SVD model with training data
train = data.build_full_trainset()
svd.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2088223e908>

In [45]:
# save svd model
dump.dump('models/svd_100_300', algo=svd)
# convert SVD object saved by Surprise into numpy array and save that
svd = dump.load('models/svd_100_300')[1]
np.save('models/svd_100_300.npy', svd.qi)
svd.qi.shape

(10000, 300)

### Combined Matrix

In [48]:
# define the dual collaborative & ratings based matrix
ratings_U, _, _ = reduce_matrix(combined_ratings.T, n_components=20)
features_U, _, _ = reduce_matrix(feature_m, n_components=100)

# normalize everything
max_rating = np.max(ratings_U)
ratings_U = ratings_U / max_rating

max_feature = np.max(features_U)
features_U = features_U / max_feature

recs_m = np.hstack((ratings_U, features_U))
recs_m.shape

(10000, 120)

In [49]:
# load in feature matrix (for future dev so you don't need to rerun everything)
feature_m = np.load('models/feature_m_1000.npy')
feature_m.shape

(10000, 1000)

In [51]:
# load in SVD matrix (for future dev so you don't need to rerun everything)
svd_m = np.load('models/svd_100_300.npy')
svd_m.shape

(10000, 300)

In [52]:
# stack arrays horizonatally
recs_m = np.hstack((svd_m, feature_m))
recs_m.shape

(10000, 1300)

In [53]:
# Save final dual collaborative & content based matrix
np.save('models/readme_m.npy', recs_m)