In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from fuzzywuzzy import fuzz, process
import itertools
import warnings
from bs4 import BeautifulSoup as bs
from requests import get
import json
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
books = pd.read_csv("data/clean/books_clean.csv")
ratings = pd.read_csv("data/clean/ratings_clean.csv")
users = pd.read_csv("data/clean/users_clean.csv")

# Webscrape to fill missing data
Google only permits 1000 hits a day, so this method is not feasible. The code may be useful for other cases though.

In [None]:
isbn = isbns_to_loop[5]
url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}&key={api_key}"
response = get(url)
dict_response = json.loads(response.text)

In [None]:
# Used to test function get_book_details
dict_response["totalItems"]
check_isbn = [[i, resp_sub["identifier"] == isbn] 
  for i, resp in enumerate(dict_response["items"]) 
  for j, resp_sub in enumerate(dict_response["items"][i]["volumeInfo"]["industryIdentifiers"])]

correct_book_index = [value[0] for value in check_isbn if value[1] is True]
correct_book_index

In [None]:
def get_book_details(isbn):
    url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}&key={api_key}"
    response = get(url)
    dict_response = json.loads(response.text)
    title, author, year_of_publication, publisher = [], [], [], []
    total_items = dict_response["totalItems"]
    
    if dict_response["totalItems"]:
        total_items = dict_response["totalItems"]
    
        if total_items >= 1:    

            book_details = dict_response["items"][0]["volumeInfo"]

            try:
                title.append(book_details["title"])
            except:
                title.append(np.nan)
            try:
                author.append(book_details["authors"][0])
            except:
                author.append(np.nan)
            try:
                year_of_publication.append(book_details["publishedDate"])
            except:
                year_of_publication.append(np.nan)
            try:
                publisher.append(book_details["publisher"])
            except:
                publisher.append(np.nan)

            book_df = pd.DataFrame(title, columns=["book_title"])
            book_df["isbn"] = isbn
            book_df["book_author"] = author
            book_df["year_of_publication"] = year_of_publication
            book_df["publisher"] = publisher
            return(book_df)

In [None]:
books_with_ratings_dirty = pd.merge(ratings, books, how="left", on="isbn")
missing_books = books_with_ratings_dirty.loc[books_with_ratings_dirty.book_title.isnull()].drop_duplicates(subset="isbn")
isbns_to_loop = missing_books.isbn.values[963:(963+159)] # [0:975]
isbns_to_loop.shape

In [None]:
# isbns_to_loop = missing_books.sample(n=2).isbn.values
books_retrieved = pd.DataFrame(columns=["book_title", "isbn", "book_author", "year_of_publication", "publisher"])
for i in tqdm(range(0, len(isbns_to_loop))):
    if i is 0:
        books_retrieved = pd.concat([books_retrieved, get_book_details(isbns_to_loop[i])])
    else:
        books_retrieved = pd.concat([books_retrieved, get_book_details(isbns_to_loop[i])], ignore_index=True)
    time.sleep(1)

In [None]:
# books_retrieved.to_csv("books_retrieved_0_through_961.csv", index=False)

# Data cleaning
Above is for filling in missing data with webscraping

__Key problems:__

1. Duplicate ISBNs for same book due to different editions

1. Typos in author name

We will address the duplicate ISBNs by cleaning the author names with text processing methods, including lower case to avoid case-sensitive differences, tokenizing, and removal of special characters (for the purposes of normalizing author names). Once the author names are clean, we will join to `rating` to `books` and normalize the author names with fuzzy string matching using the __fuzzywuzzy__ (no pun intended). The normalization is achieved by selecting the author name that is the most prevelant in `rating`. If only one author name exists, the record is ignored.

In [3]:
min_publication_df = pd.merge(books.groupby(["book_title", "book_author"]).agg({"year_of_publication": np.min}), 
         books.drop("year_of_publication", axis=1), on=["book_title", "book_author"])
clean_pub_year_df = min_publication_df.assign(year_of_publication=min_publication_df.year_of_publication.astype("int"))
pub_year_df = clean_pub_year_df.drop(["publisher"], axis=1)
books_with_ratings = pd.merge(ratings, pub_year_df, on="isbn")

books_with_ratings.book_title = books_with_ratings.book_title.str.lower()
books_with_ratings.book_author = books_with_ratings.book_author.str.lower()
books.book_title = books.book_title.str.lower()
books.book_author = books.book_author.str.lower()

In [4]:
author_count_df = books_with_ratings.groupby(["book_author", "book_title"]).nunique().drop(["book_title", "book_author"], axis=1).reset_index(level=0).reset_index(level=0)
titles_to_loop = author_count_df.book_title.unique()
titles_to_loop.shape

(132021,)

In [5]:
# http://danshiebler.com/2016-09-14-parallel-progress-bar/
def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=3):
    """
        A parallel version of the map function with a progress bar. 

        Args:
            array (array-like): An array to iterate over.
            function (function): A python function to apply to the elements of array
            n_jobs (int, default=16): The number of cores to use
            use_kwargs (boolean, default=False): Whether to consider the elements of array as dictionaries of 
                keyword arguments to function 
            front_num (int, default=3): The number of iterations to run serially before kicking off the parallel job. 
                Useful for catching bugs
        Returns:
            [function(array[0]), function(array[1]), ...]
    """
    # We run the first few iterations serially to catch bugs
    if front_num > 0:
        front = [function(**a) if use_kwargs else function(a) for a in array[:front_num]]
    # If we set n_jobs to 1, just run a list comprehension. This is useful for benchmarking and debugging.
    if n_jobs==1:
        return front + [function(**a) if use_kwargs else function(a) for a in tqdm(array[front_num:])]
    # Assemble the workers
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        # Pass the elements of array into function
        if use_kwargs:
            futures = [pool.submit(function, **a) for a in array[front_num:]]
        else:
            futures = [pool.submit(function, a) for a in array[front_num:]]
        kwargs = {
            'total': len(futures),
            'unit': 'it',
            'unit_scale': True,
            'leave': True
        }
        # Print out the progress as tasks complete
        for f in tqdm(as_completed(futures), **kwargs):
            pass
    out = []
    # Get the results from the futures. 
    for i, future in tqdm(enumerate(futures)):
        try:
            out.append(future.result())
        except Exception as e:
            out.append(e)
    return front + out

In [6]:
def match_author(title, df=author_count_df):
    df = df.loc[df.book_title == title]
    popular_author = df[df.user_id == df.user_id.max()].book_author.values[0]
    check_authors = df[~(df.user_id == df.user_id.max())].book_author
    returned_scores = process.extract(popular_author, check_authors)
    match_dict = {}
    high_matches = [match[0] for match in returned_scores if match[1] >= 90]
    if high_matches:
        match_dict[popular_author] = high_matches
        return(match_dict)

## Process Author Names
This loop takes about 10-15 minutes depending on the local machine. Defaulted to eight cores.

In [7]:
job_results = parallel_process(titles_to_loop, match_author, n_jobs=8)

100%|██████████| 132k/132k [12:13<00:00, 180it/s]    
132018it [00:00, 462406.64it/s]


In [9]:
removed_none_results = [result for result in job_results if result is not None]
mapped_authors = {k: v for result in removed_none_results for k, v in result.items()}

In [10]:
mapped_authors

{'a. a. milne': ['a.a. milne'],
 'a.a. milne': ['a. a. milne'],
 'a.c. crispin': ['a. c. crispin'],
 'a.j. jacobs': ['a. j. jacobs'],
 'a.s.  byatt': ['a. s. byatt'],
 'aaron elkins': ['aaron j. elkins'],
 'edward abbey': ['abbey'],
 'adã¨le geras': ['adele geras'],
 'adriana trigiana': ['adriana trigiani'],
 'agatha christie': ['christie'],
 'aidan, macfarlane': ['aidan macfarlane'],
 'alan lightman': ['alan p. lightman'],
 'albert camus': ['albert  camus'],
 'gore albert': ['albert gore'],
 'louisa may alcott': ['louisa m. alcott'],
 'leopold': ['aldo leopold'],
 'aleksandr solzhenitsyn': ['aleksandr isaevich solzhenitsyn'],
 'alexis de tocqueville': ['alexis de tocquevil'],
 'alistair maclean': ['maclean'],
 'amelie nothomb': ['amã©lie nothomb'],
 'andrew': ['brother andrew'],
 'andrew a. rooney': ['andrew rooney'],
 'andrew vachss': ['andrew h. vachss'],
 'andrew m. greeley': ['andrew m greeley'],
 'andrew weil m.d.': ['andrew weil'],
 'v.c. andrews': ['v. c. andrews'],
 'ann margr

In [11]:
def replace_book_author(df, key, value):
    return(df[df.book_author.isin(value)].assign(book_author=key))

In [12]:
author_mapped_df = pd.concat([replace_book_author(books_with_ratings, key, value) for key, value, in mapped_authors.items()])
author_mapped_df.head()

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,year_of_publication
216069,36907,140361219,10,winnie-the-pooh,a. a. milne,1992
216070,46690,140361219,10,winnie-the-pooh,a. a. milne,1992
216071,76499,140361219,10,winnie-the-pooh,a. a. milne,1992
216072,94347,140361219,8,winnie-the-pooh,a. a. milne,1992
216073,128696,140361219,10,winnie-the-pooh,a. a. milne,1992


In [75]:
books_with_ratings_ex_dirty_authors = books_with_ratings[~books_with_ratings.isbn.isin(author_mapped_df.isbn)]
books_with_ratings_author_clean = pd.concat([books_with_ratings_ex_dirty_authors, author_mapped_df])
books_with_ratings_author_clean.head()

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,year_of_publication
0,276726,0155061224,5,rites of passage,judith rae,2001
1,276729,052165615X,3,help!: level 1,philip prowse,1999
2,276729,0521795028,6,the amsterdam connection : level 4 (cambridge ...,sue leather,2001
3,276744,038550120X,7,a painted house,john grisham,2001
4,11676,038550120X,10,a painted house,john grisham,2001


In [76]:
unique_books = books.drop_duplicates(subset=["book_title", "book_author"]).sort_values(by=["book_title", "book_author"])
books_with_ratings_unique_isbn = pd.merge(books_with_ratings_author_clean, unique_books.drop(["year_of_publication", "publisher"], axis=1), 
         on=["book_title", "book_author"], suffixes=["_original", "_unique"])

In [77]:
data_sets = {"books_with_ratings_unique_isbn": books_with_ratings_unique_isbn.iloc[:, 2:4].drop_duplicates().shape[0],
            "books_with_ratings_author_clean": books_with_ratings_author_clean.iloc[:, 2:4].drop_duplicates().shape[0],
            "books": books.iloc[:, 1:3].drop_duplicates().shape[0]}

In [61]:
data_sets

{'books_with_ratings_unique_isbn': 213889,
 'books_with_ratings_author_clean': 220709,
 'books': 244389}

In [78]:
print(f"The `books_with_ratings_unique_isbn` dataset is now {data_sets['books_with_ratings_unique_isbn']} rows, down {data_sets['books_with_ratings_author_clean'] - data_sets['books_with_ratings_unique_isbn']} and {data_sets['books'] - data_sets['books_with_ratings_unique_isbn']} from `books_with_ratings_author_clean` and `books` datasets, respectively.")

The `books_with_ratings_unique_isbn` dataset is now 213889 rows, down 6820 and 30500 from `books_with_ratings_author_clean` and `books` datasets, respectively.


In [80]:
# checking an example to see that the unique column has been added 
books_with_ratings_unique_isbn[books_with_ratings_unique_isbn.book_title == "jane eyre"].head()

Unnamed: 0,user_id,isbn_original,book_rating,book_title,book_author,year_of_publication,isbn_unique
48097,387,1590071212,9,jane eyre,charlotte bronte,1976,451518845
48098,2718,451518845,10,jane eyre,charlotte bronte,1976,451518845
48099,70205,451518845,9,jane eyre,charlotte bronte,1976,451518845
48100,218121,451518845,10,jane eyre,charlotte bronte,1976,451518845
48101,8370,451523326,8,jane eyre,charlotte bronte,1976,451518845


## Explort data

In [None]:
books_with_ratings_unique_isbn_export = books_with_ratings_unique_isbn.rename(
    columns={'isbn_original':'isbn', 'isbn_unique':'unique_isbn'})

In [86]:
books_with_ratings_unique_isbn_export.columns

Index(['user_id', 'isbn', 'book_rating', 'book_title', 'book_author',
       'year_of_publication', 'unique_isbn'],
      dtype='object')

In [88]:
books_with_ratings_unique_isbn_export.to_csv("data/clean/books_users_ratings.csv", index=False)

# Deprecated
The section below is deprecated until further review.

In [None]:
books_with_ratings_unique_isbn.head()

In [None]:
books_with_ratings_unique_isbn.where(books_with_ratings_unique_isbn.isbn_original == "0001048082").dropna()

In [None]:
books_check = pd.read_csv("data/clean/books_clean.csv")

In [None]:
books_check.where(books_check.book_title == "Made in America").dropna()

In [None]:
books[books.duplicated(["book_title", "book_author"])]

In [None]:
pd.DataFrame(books.book_title.value_counts()[books.book_title.value_counts() > 1]).shape

In [None]:
books.groupby(["book_title", "book_author"]).count().sort_values(by="isbn", ascending=False)

In [None]:
count_books = books.groupby(["book_title", "book_author"]).count().sort_values(by="isbn", ascending=False)

In [None]:
duplicate_books = count_books[count_books.isbn > 1].index.values

In [None]:
def closest_matches(author, author_list):
    unique_authors = np.unique([b.lower() for b in author_list])
    unique_exclude_searching_author = [ua != author.lower() for ua in unique_authors]
    processed_authors = list(itertools.compress(unique_authors, unique_exclude_searching_author))
    len_authors = [len(a) for a in processed_authors]
    len_check = [(len(author) - 8) <= l_authors <= (8 + len(author)) for l_authors in len_authors]
    first_letter_check = [author_c[0] == author.lower()[0] for author_c in processed_authors]
    combined_check = [(letter is True) & (length is True) for length, letter in zip(len_check, first_letter_check)]
    authors_to_check = list(itertools.compress(processed_authors, combined_check))
    return(process.extract(author, authors_to_check, scorer=fuzz.partial_ratio))

def clean_author(author, author_list):
    author_matches = closest_matches(author, author_list)
    results = [returned_author for returned_author, score in author_matches if score >= 90]
    if results:
        return({author: results})
    else:
        return(False)

In [None]:
test_dict = clean_author("Stephen King", books.book_author)

In [None]:
u_authors = sorted(np.unique([b.lower() for b in books.book_author]))[0:10]
u_authors

In [None]:
for auth in u_authors:
    if clean_author(auth, books.book_author) is None:
        continue
    else:
        auth_results = clean_author(auth, books.book_author)
        ba = books.book_author[~books.book_author.isin(auth_results[auth])]
        book_authors = books.book_author[books.book_author]

In [None]:
# Checking if there are any rows with title/author missing
books_with_ratings.book_title.isnull().sum()

In [None]:
books_with_ratings.book_author.isnull().sum()

In [None]:
books_with_ratings.head()

In [None]:
# unifying ISBN numbers
# looking the number of books with multiple isbn numbers
multiple_isbns = books_with_ratings_clean.groupby(["book_title"]).isbn.nunique()
multiple_isbns.value_counts()

In [None]:
has_multiple_isbns = multiple_isbns.where(multiple_isbns > 1) # sets NaN for those with just 1 isb

In [None]:
# removing the ones with just one isbn
has_multiple_isbns.dropna(inplace=True)

In [None]:
len(has_multiple_isbns)

In [None]:
# Create dictionary for books with multiple isbns
def make_isbn_dict(df):
    title_isbn_dict = {}
    for title in has_multiple_isbns.index:
        isbn_series = df.loc[df.book_title == title].isbn.unique() # returns only the unique ISBNs
        title_isbn_dict[title] = isbn_series.tolist()
    return title_isbn_dict

%time dict_unique_isbn = make_isbn_dict(books_with_ratings)

In [None]:
with open('multiple_isbn_dict.pickle', 'wb') as handle:
    pickle.dump(dict_unique_isbn, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# LOAD isbn_dict back into namespace
with open('multiple_isbn_dict.pickle', 'rb') as handle:
    multiple_isbn_dict = pickle.load(handle)

In [None]:
# checking the number of entries in the dictiionaries
len(multiple_isbn_dict)

In [None]:
# Adding 'unique_isbn' column to 'books_with_ratings' dataframe that includes the first ISBN if multiple ISBNS,
# or the single unique isbn
def add_unique_isbn_col(df):
    df['unique_isbn'] = df.apply(lambda row: multiple_isbn_dict[row.book_title][0] if row.book_title in multiple_isbn_dict.keys() else row.isbn, axis=1)
    return df

%time books_with_ratings = add_unique_isbn_col(books_with_ratings)

In [None]:
# checking an example to see that the unique column has been added 
books_with_ratings[books_with_ratings.book_title == "Jane Eyre"].head()

In [None]:
# joining the users table on the user_id field
books_users_ratings = books_with_ratings.merge(users, on='user_id')

In [None]:
books_users_ratings.head()

In [None]:
books_users_ratings.to_csv("data/clean/books_users_ratings.csv", index=False)