In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import warnings


In [83]:
books = pd.read_csv("C:/University of Chicago/Machine Learning/Project\Machine-Learning-Book-Ratings/data/clean/books_clean.csv")
ratings = pd.read_csv("C:/University of Chicago/Machine Learning/Project\Machine-Learning-Book-Ratings/data/clean/ratings_clean.csv")
users = pd.read_csv("C:/University of Chicago/Machine Learning/Project\Machine-Learning-Book-Ratings/data/clean/users_clean.csv")

In [85]:
users.country.isnull().sum()

4543

In [86]:
#unifying the books with ratings tables on isbn field
#number of entries in books table
len(books)


266732

In [87]:
#ratings table size
len(ratings)

433671

In [62]:
books_with_ratings = pd.merge(ratings, books, on='isbn')

In [63]:
#Checking if there are any rows with title/author missing
books_with_ratings.book_title.isnull().sum()

0

In [64]:
books_with_ratings.book_author.isnull().sum()

0

In [65]:
books_with_ratings.head()

Unnamed: 0,Unnamed: 0_x,user_id,isbn,book_rating,Unnamed: 0_y,book_title,book_author,year_of_publication,publisher
0,1,276726,0155061224,5,225827,Rites of Passage,Judith Rae,2001.0,Heinle
1,3,276729,052165615X,3,246849,Help!: Level 1,Philip Prowse,1999.0,Cambridge University Press
2,4,276729,0521795028,6,246850,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001.0,Cambridge University Press
3,8,276744,038550120X,7,9295,A Painted House,JOHN GRISHAM,2001.0,Doubleday
4,48815,11676,038550120X,10,9295,A Painted House,JOHN GRISHAM,2001.0,Doubleday


In [66]:
books_with_ratings =books_with_ratings.drop(columns =["Unnamed: 0_x", "Unnamed: 0_y"])

In [67]:
#unifying ISBN numbers
#looking the number of books with multiple isbn numbers
multiple_isbns = books_with_ratings.groupby('book_title').isbn.nunique()
multiple_isbns.value_counts()

1     123409
2       7733
3       1426
4        457
5        179
6         81
7         37
8         24
9          8
10         6
16         2
15         2
14         2
13         2
12         2
11         2
Name: isbn, dtype: int64

In [68]:
has_multiple_isbns= multiple_isbns.where(multiple_isbns>1) #sets NaN for those with just 1 isb

In [69]:
#removing the ones with just one isbn
has_multiple_isbns.dropna(inplace=True)

In [70]:
len(has_multiple_isbns)

9963

In [71]:
#Create dictionary for books with multiple isbns
def make_isbn_dict(df):
    title_isbn_dict = {}
    for title in has_multiple_isbns.index:
        isbn_series = df.loc[df.book_title==title].isbn.unique() # returns only the unique ISBNs
        title_isbn_dict[title] = isbn_series.tolist()
    return title_isbn_dict

%time dict_unique_isbn = make_isbn_dict(books_with_ratings)

Wall time: 11min 23s


In [72]:
with open('multiple_isbn_dict.pickle', 'wb') as handle:
    pickle.dump(dict_unique_isbn, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [73]:
# LOAD isbn_dict back into namespace
with open('multiple_isbn_dict.pickle', 'rb') as handle:
    multiple_isbn_dict = pickle.load(handle)

In [74]:
#checking the number of entries in the dictiionaries
len(multiple_isbn_dict)

9963

In [75]:
# Adding 'unique_isbn' column to 'books_with_ratings' dataframe that includes the first ISBN if multiple ISBNS,or the single unique isbn
def add_unique_isbn_col(df):
    df['unique_isbn'] = df.apply(lambda row: multiple_isbn_dict[row.book_title][0] if row.book_title in multiple_isbn_dict.keys() else row.isbn, axis=1)
    return df

%time books_with_ratings = add_unique_isbn_col(books_with_ratings)

Wall time: 12.2 s


In [76]:
#checking an example to see that the unique column has been added 
books_with_ratings[books_with_ratings.book_title=="Jane Eyre"].head()

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,year_of_publication,publisher,unique_isbn
40062,387,1590071212,9,Jane Eyre,Charlotte Bronte,2002.0,New Millenium Audio,1590071212
65551,2718,451518845,10,Jane Eyre,Charlotte Bronte,1999.0,New Amer Library Classics,1590071212
65552,70205,451518845,9,Jane Eyre,Charlotte Bronte,1999.0,New Amer Library Classics,1590071212
65553,218121,451518845,10,Jane Eyre,Charlotte Bronte,1999.0,New Amer Library Classics,1590071212
109854,8370,451523326,8,Jane Eyre,Charlotte Bronte,1988.0,Signet Classics,1590071212


In [77]:
#joining the users table on hte user_id field
books_users_ratings = books_with_ratings.merge(users, on='user_id')


In [78]:
books_users_ratings.head()

Unnamed: 0.1,user_id,isbn,book_rating,book_title,book_author,year_of_publication,publisher,unique_isbn,Unnamed: 0,age,city,state,country
0,276726,0155061224,5,Rites of Passage,Judith Rae,2001.0,Heinle,0155061224,276725,34.786876,seattle,washington,usa
1,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999.0,Cambridge University Press,052165615X,276728,16.0,rijeka,,croatia
2,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001.0,Cambridge University Press,0521795028,276728,16.0,rijeka,,croatia
3,276744,038550120X,7,A Painted House,JOHN GRISHAM,2001.0,Doubleday,038550120X,276743,34.786876,torrance,california,usa
4,11676,038550120X,10,A Painted House,JOHN GRISHAM,2001.0,Doubleday,038550120X,11675,34.786876,,,


In [88]:
books_users_ratings.drop(columns = 'Unnamed: 0')

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,year_of_publication,publisher,unique_isbn,age,city,state,country
0,276726,0155061224,5,Rites of Passage,Judith Rae,2001.0,Heinle,0155061224,34.786876,seattle,washington,usa
1,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999.0,Cambridge University Press,052165615X,16.000000,rijeka,,croatia
2,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001.0,Cambridge University Press,0521795028,16.000000,rijeka,,croatia
3,276744,038550120X,7,A Painted House,JOHN GRISHAM,2001.0,Doubleday,038550120X,34.786876,torrance,california,usa
4,11676,038550120X,10,A Painted House,JOHN GRISHAM,2001.0,Doubleday,038550120X,34.786876,,,
5,11676,0671537458,8,Waiting to Exhale,Terry McMillan,1995.0,Pocket,0671537458,34.786876,,,
6,11676,0679776818,8,Birdsong: A Novel of Love and War,Sebastian Faulks,1997.0,Vintage Books USA,0679776818,34.786876,,,
7,11676,0684867621,3,The Girl Who Loved Tom Gordon : A Novel,Stephen King,1999.0,Scribner,0684867621,34.786876,,,
8,11676,3499230933,1,Adressat unbekannt.,Kathrine Kressmann Taylor,2002.0,Rowohlt Tb.,3499230933,34.786876,,,
9,11676,8437606322,8,Anna Karenina,Leo Tolstoy,1999.0,Ediciones Catedra S.A.,8437606322,34.786876,,,


In [89]:
books_users_ratings.to_csv("C:/University of Chicago/Machine Learning/Project/Machine-Learning-Book-Ratings/data/clean/books_users_ratings.csv")