# Preprocessing

This file contains all the code necessary to import/load the book & user data from GoodReads & Amazon, clean it, and prepare it for use

In [2]:
# import *a lot* of stuff
import numpy as np
import pandas as pd
import requests
import pickle
import scipy
import html
import gzip
import json
import csv
import sys
import re
import io

from pandas.api.types import CategoricalDtype

### Import & Clean Amazon Data

In [3]:
# import Amazon data, grab title & asin
asin_titles = []  # asin = Amazon Standard Identification Number

with gzip.open('./data/amazon_data/meta_Books.json.gz', 'r') as fp:
    for line in fp:
        a = json.loads(line)
        if 'title' in a:
            asin_titles.append(a['asin'] + ',' + a['title'] + '\n')

asin_titles[:3]

['0000092878,Biology Gods Living Creation Third Edition 10 (A Beka Book Science Series)\n',
 '000047715X,Mksap 16 Audio Companion: Medical Knowledge Self-Assessment Program\n',
 '0000004545,Flex! Discography of North American Punk, Hardcore, and Powerpop 1975-1985 A-M\n']

In [4]:
# clean Amazon data
undesireables = [
    'hardcover',
    'paperback',
    'volume',
    'unabridged',
    'boxed Set',
    'collection',
    'audiobook',
    'publication'
]

asin_titles_clean = []

# clean html, remove undesired words and ()/[]
for asin_title in asin_titles:
    asin_title = asin_title.lower()
    orig = asin_title
    asin_title = html.unescape(asin_title)
    for word in undesireables:
        asin_title = asin_title.replace(word, '')
    asin_title = re.sub("[\(\[].*?[\)\]]", "", asin_title)
    asin_titles_clean.append(asin_title)

asin_titles_clean[:3]

['0000092878,biology gods living creation third edition 10 \n',
 '000047715x,mksap 16 audio companion: medical knowledge self-assessment program\n',
 '0000004545,flex! discography of north american punk, hardcore, and powerpop 1975-1985 a-m\n']

In [13]:
# create dictionary of asin to name, generate a list of paired ratings w/ name
named_ratings = []
mapper = {}

for asin_title in asin_titles_clean:
    a_split = asin_title.strip().split(',', 1)
    if len(a_split) == 2:
        asin = a_split[0]
        name = a_split[1]
        mapper[asin] = name

with gzip.open('./data/amazon_data/proc_5.json.gz', 'r') as fp:
    for line in fp:
        a = json.loads(line)
        asin = str(a['asin'])
        if asin in mapper:
            a_split = [a['reviewerID'], asin, str(a['overall']), mapper[asin]]
            mapped_string = ",".join(a_split)
            named_ratings.append(mapped_string + '\n')

named_ratings[:3]

### Import & Clean GoodReads Data

In [None]:
# generate GoodReads book_id pairings
goodreadsid_to_bookid = {}

with open('./data/goodbooks-10k/books.csv', "r", encoding='utf8') as f:
    reader = csv.reader(f, delimiter=",")
    for i, line in enumerate(reader):
        book_id = line[0]
        goodreads_id = line[1]
        goodreadsid_to_bookid[goodreads_id] = book_id

with open('./data/goodbooks-10k/book_tags_with_bookid.csv', 'w') as out_file:
    out_file.write('book_id,goodreads_book_id,tag_id,count\n')

    with open('./data/goodbooks-10k/book_tags.csv', "r") as in_file:
        reader = csv.reader(in_file, delimiter=",")
        for i, line in enumerate(reader):
            if i > 0:  # don't include df header
                book_id = goodreadsid_to_bookid[line[0]]
                new_row = book_id + ',' + ','.join(line)
                out_file.write(new_row + '\n')

In [None]:
# decrease maxInt value by factor of 10 for as long as OverflowError occurs
maxInt = sys.maxsize
decrement = True
while decrement:
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
        decrement = True

# util to clean string
def clean(s):
	s = s.lower().strip().replace('&', 'and')
	s = ''.join([i for i in s if (i.isalpha() or i.isspace())])
	s = ' '.join(s.split())
	return s

# load books from csv
with open('./data/goodbooks-10k/books.csv', 'r', encoding="utf-8") as f:
    books = list(csv.reader(f))

# create mapping of book titles & ids
mapper = {}
mapper_original = {}

for book in books:
    book_id = book[0]

    original_title = clean(book[9])
    title = clean(book[10])

    if book_id != 'book_id':
        mapper_original[original_title] = book_id
        mapper[title] = book_id

print(f'Number of books: {len(books)}')

### Compare & Combine GoodReads & Amazon Training/Test Data

In [None]:
# compare Amazon data with GoodReads
file = open('./data/amazon_ratings.csv', 'w')
file.write('user_id,book_id,rating\n')

amazon_ratings_not_in_goodreads = [] 

for rating in named_ratings:
    split = rating.strip().split(',', 3)
    if len(split) == 4:
        user_id = split[0]
        asin = split[1]
        val = int(float(split[2]))
        name = clean(split[3])

        book_id = None
        if name in mapper:
            book_id = mapper[name]
        if book_id == None and name in mapper_original:
            book_id = mapper_original[name]
        if book_id:
            row = user_id + ',' + book_id + ',' + str(val) + '\n'
            file.write(row)
        else:
            row = user_id + ',' + asin + ',' + str(val) + '\n'
            amazon_ratings_not_in_goodreads.append(row)
file.close()

amazon_ratings_not_in_goodreads[:3]

In [3]:
# load in GoodReads ratings data (for future dev so you don't need to rerun everything)
ratings_goodreads = pd.read_csv('./data/goodbooks-10k/ratings.csv')
ratings_goodreads['book_id'] = ratings_goodreads['book_id'].astype(int)
ratings_goodreads[:3]

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5


In [None]:
# load in Amazon ratings data (for future dev so you don't need to rerun everything)
ratings_amazon = pd.read_csv('./data/amazon_ratings.csv')
ratings_amazon['book_id'] = ratings_amazon['book_id'].astype(int)
ratings_amazon = ratings_amazon.drop_duplicates(subset = ['book_id', 'user_id'])
# Create set of users that rated 5 or more items
user_counts = ratings_amazon['user_id'].value_counts() >= 5 # minimum of 5 ratings
to_drop = set()
for key,value in user_counts.items():
    if not value:
        to_drop.add(key)
# drop users that don't have 5 or more ratings
ratings_amazon = ratings_amazon[~ratings_amazon['user_id'].isin(to_drop)]

ratings_amazon[:3]

In [None]:
# combine GoodReads & Amazon data
df = ratings_goodreads.append(ratings_amazon)
# only sort by book_id bc we use the transpose of the User-Item matrix in SVD
df = df.sort_values(by=['book_id'])
df.head()

In [16]:
# combine ratings that overlap
users = list(df.user_id.unique())
books = list(df.book_id.unique())
data = df['rating'].tolist()
user_type = CategoricalDtype(categories=users, ordered=True)
book_type = CategoricalDtype(categories=books, ordered=True)
row = df.user_id.astype(user_type).cat.codes
col = df.book_id.astype(book_type).cat.codes
sparse_matrix_ratings_combined = scipy.sparse.csr_matrix(
    (data, (row, col)), shape=(len(users), len(books)), dtype=np.dtype('u1'))

scipy.sparse.save_npz('output/ratings_combined', sparse_matrix_ratings_combined)

In [23]:
# Mapping amazon user_ids to unique ints
seen = {}
next_uid = 53424 + 1 # 1 more than last user in GoodReads dataset
for index, row in ratings_amazon.iterrows():
    username = row['user_id']
    if username not in seen:
        seen[username] = next_uid
        next_uid += 1
        
    ratings_amazon.at[index, 'user_id'] = seen[username]

ratings = pd.concat([ratings_goodreads, ratings_amazon])

ratings = ratings.sort_values(by=['user_id','book_id'])
ratings = ratings.reset_index(drop=True)

ratings.to_pickle('pickled/ratings.pkl')
ratings.head()


Unnamed: 0,user_id,book_id,rating
0,1,4,5
1,1,10,4
2,1,11,5
3,1,13,4
4,1,16,3


Unnamed: 0,user_id,book_id,rating
0,1,4,5
1,1,10,4
2,1,11,5
3,1,13,4
4,1,16,3


### Misc Data Cleaning

In [12]:
# load in books dataframe (for future dev so you don't need to rerun everything)
books = pd.read_pickle('pickled/books.pkl')
books.head()


Unnamed: 0_level_0,title,image_url,url,author,description,popular_shelves,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"The Hunger Games (The Hunger Games, #1)",https://images.gr-assets.com/books/1447303603m...,https://www.goodreads.com/book/show/2767052-th...,Suzanne Collins,winning will make you famous. losing means cer...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...
2,Harry Potter and the Sorcerer's Stone (Harry P...,https://images.gr-assets.com/books/1474154022m...,https://www.goodreads.com/book/show/3.Harry_Po...,J.K. Rowling,harry potter's life is miserable. his parents ...,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...
3,"Twilight (Twilight, #1)",https://images.gr-assets.com/books/1361039443m...,https://www.goodreads.com/book/show/41865.Twil...,Stephenie Meyer,about three things i was absolutely positive.f...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...
4,To Kill a Mockingbird,https://images.gr-assets.com/books/1361975680m...,https://www.goodreads.com/book/show/2657.To_Ki...,Harper Lee,the unforgettable novel of a childhood in a sl...,classics classics classics classics classics ...,classics classics classics classics classics ...
5,The Great Gatsby,https://images.gr-assets.com/books/1490528560m...,https://www.goodreads.com/book/show/4671.The_G...,F. Scott Fitzgerald,"the great gatsby, f. scott fitzgerald’s third ...",classics classics classics classics classics ...,classics classics classics classics classics ...


In [13]:
# fix book df missing many images
new_books = books.copy(True)
imageExp = re.compile("<img id=\"coverImage\" alt=\"[^\"]*\" src=\"([^\"]*)\"")

j = 0
for i in range(new_books.shape[0]):
    if "nophoto" in new_books.iloc[i, :].image_url:
        response = requests.get(new_books.iloc[i, :].url)
        try:
            new_url = imageExp.search(response.text).group(1)
            new_books.iloc[i, :].image_url = new_url
            j += 1
            if j % 50 == 49:
                print(j + 1)
        except AttributeError:
            print('No image for %d' % i)

new_books.head()

50


In [15]:
new_books.image_url.str.contains('nophoto').sum()

3251

In [6]:
books = pd.read_pickle('pickled/books.pkl')
books.head()

# with open('output/books_df', 'wb') as f:
#     pickle.dump(books, f)


Unnamed: 0_level_0,title,image_url,url,author,description,popular_shelves,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"The Hunger Games (The Hunger Games, #1)",https://images.gr-assets.com/books/1447303603m...,https://www.goodreads.com/book/show/2767052-th...,Suzanne Collins,Winning will make you famous. Losing means cer...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...
2,Harry Potter and the Sorcerer's Stone (Harry P...,https://images.gr-assets.com/books/1474154022m...,https://www.goodreads.com/book/show/3.Harry_Po...,J.K. Rowling,Harry Potter's life is miserable. His parents ...,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...
3,"Twilight (Twilight, #1)",https://images.gr-assets.com/books/1361039443m...,https://www.goodreads.com/book/show/41865.Twil...,Stephenie Meyer,About three things I was absolutely positive.F...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...
4,To Kill a Mockingbird,https://images.gr-assets.com/books/1361975680m...,https://www.goodreads.com/book/show/2657.To_Ki...,Harper Lee,The unforgettable novel of a childhood in a sl...,classics classics classics classics classics ...,classics classics classics classics classics ...
5,The Great Gatsby,https://images.gr-assets.com/books/1490528560m...,https://www.goodreads.com/book/show/4671.The_G...,F. Scott Fitzgerald,"THE GREAT GATSBY, F. Scott Fitzgerald’s third ...",classics classics classics classics classics ...,classics classics classics classics classics ...


In [43]:
indices = pd.Series(books.index, index=books['title']).drop_duplicates()
indices[:3]


title
The Hunger Games (The Hunger Games, #1)                     1
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)    2
Twilight (Twilight, #1)                                     3
Name: id, dtype: int64

In [60]:
books.loc[1000]

title                                   Shadow and Bone (Grisha, #1)
image_url          https://images.gr-assets.com/books/1339533695m...
url                https://www.goodreads.com/book/show/10194157-s...
author                                                 Leigh Bardugo
description        surrounded by enemies, the once-great nation o...
popular_shelves     fantasy fantasy fantasy fantasy fantasy fanta...
tags                fantasy fantasy fantasy fantasy fantasy fanta...
Name: 1000, dtype: object