In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import operator
from rake_nltk import Rake
from nltk.corpus import stopwords
from collections import defaultdict
# to disable a warning that had no point(raising an error because of slicing a dataframe on this way).
pd.options.mode.chained_assignment = None  # default='warn'

In [87]:
#Loading data
books = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
users = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


In [88]:
# PRE PROCESSING OVER THE DATA SET 
#dropping last three columns containing image URLs which will not be required for analysis.
books.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'],axis=1,inplace=True)

In [89]:
#ratings dataset should have books only which exist in our books dataset, unless new books are added to books dataset
ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]
print (ratings.shape)
print (ratings_new.shape)
#it can be seen that many rows having book ISBN not part of books dataset got dropped off

(1149780, 3)
(1031136, 3)


In [90]:
#ratings dataset should have ratings from users which exist in users dataset, unless new users are added to users dataset
ratings_new = ratings[ratings.userID.isin(users.userID)]
print (ratings.shape)
print (ratings_new.shape)
#no new users added, hence we will go with above dataset ratings_new (1031136, 3)

(1149780, 3)
(1149780, 3)


In [91]:
# removing the ratins books that don't exist on books dataframe. 
ratings = ratings[ratings.ISBN.isin(books.ISBN)]

In [92]:
books.yearOfPublication.unique()
#investigating the rows having 'DK Publishing Inc' as yearOfPublication
books.loc[books.yearOfPublication == 'DK Publishing Inc',:]
#From above, it is seen that bookAuthor is incorrectly loaded with bookTitle, hence making required corrections
#ISBN '0789466953'
books.loc[books.ISBN == '0789466953','yearOfPublication'] = 2000
books.loc[books.ISBN == '0789466953','bookAuthor'] = "James Buckley"
books.loc[books.ISBN == '0789466953','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '0789466953','bookTitle'] = "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)"
#ISBN '078946697X'
books.loc[books.ISBN == '078946697X','yearOfPublication'] = 2000
books.loc[books.ISBN == '078946697X','bookAuthor'] = "Michael Teitelbaum"
books.loc[books.ISBN == '078946697X','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '078946697X','bookTitle'] = "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)"

In [93]:
#rechecking
books.loc[(books.ISBN == '0789466953') | (books.ISBN == '078946697X'), :]
#corrections done

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",Michael Teitelbaum,2000,DK Publishing Inc
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",James Buckley,2000,DK Publishing Inc


In [94]:
#investigating the rows having 'Gallimard' as yearOfPublication
books.loc[books.yearOfPublication == 'Gallimard',:]

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...


In [95]:
#making required corrections as above, keeping other fields intact
books.loc[books.ISBN == '2070426769','yearOfPublication'] = 2003
books.loc[books.ISBN == '2070426769','bookAuthor'] = "Jean-Marie Gustave Le ClÃ?Â©zio"
books.loc[books.ISBN == '2070426769','publisher'] = "Gallimard"
books.loc[books.ISBN == '2070426769','bookTitle'] = "Peuple du ciel, suivi de 'Les Bergers"

In [96]:
#rechecking
books.loc[books.ISBN == '2070426769',:]
#corrections done

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers",Jean-Marie Gustave Le ClÃ?Â©zio,2003,Gallimard


In [97]:
#Correcting the dtypes of yearOfPublication
books.yearOfPublication=pd.to_numeric(books.yearOfPublication, errors='coerce')

In [98]:
# However, the value 0 is invalid and as this dataset was published in 2004, I have assumed the the years after 2005 to be 
# invalid keeping some margin in case dataset was updated thereafer
# setting invalid years as NaN
books.loc[(books.yearOfPublication > 2005) | (books.yearOfPublication == 0),'yearOfPublication'] = np.NAN

In [99]:
#replacing NaNs with mean value of yearOfPublication
books.yearOfPublication.fillna(round(books.yearOfPublication.mean()), inplace=True)

In [100]:
#rechecking
books.yearOfPublication.isnull().sum()
#resetting the dtype as int32
books.yearOfPublication = books.yearOfPublication.astype(np.int32)

In [101]:
#exploring 'bookAuthor' column
books.loc[books.bookAuthor.isnull(),:]

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing


In [102]:
# I've searched and found that link on amazon https://www.amazon.com/Credit-Suisse-Managing-Personal-International/dp/9627982075
# and the author is also the publisher so I changed it.
books.loc[books.ISBN == '9627982032','bookAuthor'] = 'Edinburgh Financial Publishing'

# END OF MY PRE PROCESSING !

In [103]:
# remove all books that have less than 10 ratings
# count all the books.
count_books = ratings['ISBN'].value_counts()
# apply .map() function with Series count on each element to remove all books that have less than 10 ratings and 
# assign it to a new variable to keep ratings untouched
ratings = ratings[ratings['ISBN'].map(count_books) >= 10]
ratings

Unnamed: 0,userID,ISBN,bookRating
0,276725,034545104X,0
2,276727,0446520802,0
8,276744,038550120X,7
10,276746,0425115801,0
11,276746,0449006522,0
...,...,...,...
1149771,276704,0743211383,7
1149772,276704,080410526X,0
1149776,276706,0679447156,0
1149777,276709,0515107662,10


In [104]:
#group by ISBN in order to do the join with books to remove all the books that have less than 10 ratings
# temp_ratings is a dictionary that contains a key as the ISBN and a dataframe that contains all 
temp_ratings = ratings.groupby(['ISBN'])

In [105]:
# getting dict_keys from temp_ratings then cast it into list in order to make it Series to join it with books
keys_Series = pd.Series(list(temp_ratings.groups.keys()))
# make a dataframe so I can name the column as ISBN to inner join it with books
frame = {'ISBN': keys_Series}
df_uniqueISBN = pd.DataFrame(frame) 

In [106]:
# final df for books
books_over10= pd.merge(left=books, right=df_uniqueISBN, how='inner', left_on='ISBN', right_on='ISBN')
books_over10

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
1,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
2,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
3,0440234743,The Testament,John Grisham,1999,Dell
4,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume
...,...,...,...,...,...
17474,3803112133,Cosi fan tutte. Eine Geschichte.,Alan Bennett,2003,Wagenbach
17475,006019250X,The Illustrated Alchemist: A Fable About Follo...,Paulo Coelho,1998,HarperCollins Publishers
17476,0786890088,The Angel Maker,Ridley Pearson,2001,Hyperion Press
17477,006447027X,The Endless Steppe (rack) : Growing Up in Siberia,Esther Hautzig,1987,HarperTrophy


In [107]:
# count all the users.
count_users = ratings['userID'].value_counts()
# apply .map() function with Series count on each element to remove all books that have less than 5 userID and 
# assign it to a new variable to keep ratings untouched
ratings = ratings[ratings['userID'].map(count_users) >= 5]

In [108]:
#group by ISBN in order to do the join with books to remove all the books that have less than 10 ratings
# temp_ratings is a dictionary that contains a key as the ISBN and a dataframe that contains all 
temp_ratings = ratings.groupby(['userID'])
# getting dict_keys from temp_ratings then cast it into list in order to make it Series to join it with users
keys_Series = pd.Series(list(temp_ratings.groups.keys()))
# make a dataframe so I can name the column as userID to inner join it with books
frame = {'userID': keys_Series}
df_uniqueuserID = pd.DataFrame(frame) 

In [109]:
# final df for users
user_over5= pd.merge(left=users, right=df_uniqueuserID, how='inner', left_on='userID', right_on='userID')
books_over10.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
3,440234743,The Testament,John Grisham,1999,Dell
4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume


In [110]:
# initialize the new column of keywords.
# pre processing for adding keywords for each title.
def extractor(row):
    a=r.extract_keywords_from_text(row)
    return r.get_ranked_phrases() # To get keyword phrases ranked with scores highest to lowest.
   
r = Rake() # Uses stopwords for english from NLTK, and all puntuation characters.
# apply the lambda extractor for each book title and then put it to column Keywords.
books_over10['Keywords'] = books_over10['bookTitle'].map(lambda row : extractor(row))

In [111]:
# pd.set_option('display.max_rows', 30) # display 30 rows
# pd.set_option('display.max_columns', 5)
books_over10.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,Keywords
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,[clara callan]
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"[great influenza pandemic, virus, story, searc..."
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,"[kitchen god, wife]"
3,440234743,The Testament,John Grisham,1999,Dell,[testament]
4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,"[plume contemporary fiction, beloved]"


In [112]:
## START OF RECOMMENDER SYSTEM.
def getRecommendations(new):
    # get the userID as list from each user and then group it by it.
    random_users = new['userID'].tolist()
    new_ratings_grouped = ratings.groupby('userID')
    for index, userID in enumerate(random_users):
        # getting the data frame 
        current_users_ratings = get_group(new_ratings_grouped, userID)
        # get the column with ISBN'S to exclude them from the search.
        excluded_ISBNS = current_users_ratings['ISBN']
        # get the 3 maximum values out of bookRating column.
        df_most_rated_books = current_users_ratings.nlargest(3, ['bookRating'])
        # inner join to get the bookAuthor, Keywords and year.
        most_rated_books = pd.merge(left=df_most_rated_books, right=books_over10, how='inner', left_on='ISBN', right_on='ISBN')
        profile = most_rated_books[['bookAuthor','yearOfPublication','Keywords']]
        # Get every column as Series, turn it into list and then cast it into set.
        years = set(profile['yearOfPublication'].tolist())
        Authors = set(profile['bookAuthor'].tolist())
        Keywords = profile['Keywords'].tolist()
        # Use list comprehension to convert a list of lists to a flat list and then to a set.
        KeywordsList = [item for elem in Keywords for item in elem]
        KeywordsSet = set(KeywordsList)
        # include the books with ISBNS that are NOT (~) currently in column isbn.
        excluded_books_over10 = books_over10.loc[~books_over10['ISBN'].isin(excluded_ISBNS)]
        CalculateSimilaritiesAndWriteExcels(userID, profile, years, Authors, KeywordsSet, excluded_books_over10)
        

In [113]:
# a function that takes as input a  grouped_by dataframe and a key according to grouped, and returns the dataframe 
# that is connected to current key.
def get_group(group_df, key):
     if key in group_df.groups: return group_df.get_group(key)
     return pd.DataFrame()
def CalculateSimilaritiesAndWriteExcels(userID, profile, years, Authors, KeywordsSet, excluded_books_over10):
    # the below list comprehension takes every yearofPublication of all books then takes
    # every year as y in years from 3 most rated, calculating the difference and saves the max.
    # change authors and publishers both user and dataframe's to lowercase strings.
    excluded_books_over10['bookAuthor'] = excluded_books_over10['bookAuthor'].str.lower()
    Authors = list(map(lambda x:x.lower(),Authors))
    year_diff = [max([(1-(np.absolute(x-y)/2005)) for y in years]) for x in excluded_books_over10['yearOfPublication']]
    jac_sim = [len(KeywordsSet.intersection(set(x))) / len(KeywordsSet.union(set(x))) for x in excluded_books_over10['Keywords']]
    # check if any of most rated Authors are in the books. if it is set it 1 otherwise 0.
    Author = [1 if Author in Authors else 0 for Author in excluded_books_over10['bookAuthor']]
    # calculate 
    final_scoreJ = [0.2*jaccard + 0.4*max_year +0.4*Auth 
                    for max_year, jaccard, Auth in zip(year_diff, jac_sim, Author)]
    # get the sorted list with the original indexes also, 
    # then take only the last ten(maximum) elements of each and reverse the list to get them in the right order.
    indices, final_score_sorted = zip(*sorted(enumerate(final_scoreJ), key=operator.itemgetter(1)))
    max_indicesJ = list(indices[-10:])
    max_indicesJ.reverse()
    max_scoresJ = list(final_score_sorted[-10:])
    max_scoresJ.reverse()
    user = 'JaccardOutput' + str(userID) + '.xlsx'
    # set scoresj_df into a dataframe that contains only the top ten books according to max_indices for jaccard similarity.
    scoresj_df = excluded_books_over10.iloc[max_indicesJ[0:10]]
    # set a new column named JaccardScores to keep the scores.
    scoresj_df['JaccardScores'] = max_scoresJ
    # ADD DICE COEF AT JACCARDS BEST IN ORDER TO USE THEM FOR LAST QUESTION (EXPERIMENT 3.).
    dice_sim_temp = [2*len(KeywordsSet.intersection(set(x))) / (len(KeywordsSet) + len(set(x))) for x in scoresj_df['Keywords']]
    Author_temp = [1 if Author in Authors else 0 for Author in scoresj_df['bookAuthor']]
    year_diff_temp = [max([(1-(np.absolute(x-y)/2005)) for y in years]) for x in scoresj_df['yearOfPublication']]
    final_scoreD_temp = [0.5*dice_coef + 0.2*max_year +0.3*Auth
                    for max_year, dice_coef, Auth in zip(year_diff_temp, dice_sim_temp, Author_temp)]
    scoresj_df['DiceCoefScores'] = final_scoreD_temp
    # add the profile of each user on the excel.
    scoresj_df = scoresj_df.append(profile,sort = False)
    scoresj_df.to_excel(user)
    # DO THE SAME AS ABOVE WITH DICE COEFFICIENT INSTEAD.
    dice_sim = [2*len(KeywordsSet.intersection(set(x))) / (len(KeywordsSet) + len(set(x))) for x in excluded_books_over10['Keywords']]
    final_scoreD = [0.5*dice_coef + 0.2*max_year +0.3*Auth
                    for max_year, dice_coef, Auth in zip(year_diff, dice_sim, Author)]
    indices, final_score_sorted = zip(*sorted(enumerate(final_scoreD), key=operator.itemgetter(1)))
    max_indicesC = list(indices[-10:])
    max_indicesC.reverse()
    max_scoresC = list(final_score_sorted[-10:])
    max_scoresC.reverse()
    user = 'DiceCoefOutput' + str(userID) + '.xlsx'
    # set scoresC_df into a dataframe that contains only the top ten books according to max_indices for dice coeff.
    scoresc_df = excluded_books_over10.iloc[max_indicesC[0:10]]
    # set a new column named DiceCoefScores to keep the scores.
    scoresc_df['DiceCoefScores'] = max_scoresC
    # ADD JACCARD AT COEF BEST IN ORDER TO USE THEM FOR LAST QUESTION (EXPERIMENT 3.).
    jac_sim_diff = [len(KeywordsSet.intersection(set(x))) / len(KeywordsSet.union(set(x))) for x in scoresc_df['Keywords']]
    Author_temp = [1 if Author in Authors else 0 for Author in scoresc_df['bookAuthor']]
    year_diff_temp = [max([(1-(np.absolute(x-y)/2005)) for y in years]) for x in scoresc_df['yearOfPublication']]
    final_scoreJ_temp = [0.2*jaccard + 0.4*max_year +0.4*Auth 
                    for max_year, jaccard, Auth in zip(year_diff_temp, jac_sim_diff, Author_temp)]
    scoresc_df['JaccardScores'] = final_scoreJ_temp
    # add the profile of each user on the excel.
    scoresc_df = scoresc_df.append(profile, sort = False)
    scoresc_df.to_excel(user)

In [114]:
def GetOverlap(Dice, Jaccard):
    OverlapValues = []
    # getting only the ISBN'S columns as list to check whether they overlap or not.
    TitlesDice = list(Dice['ISBN'].dropna())
    TitlesJaccard = list(Jaccard['ISBN'].dropna())
    # from 0 to 9. (1-10)
    for i in range(len(TitlesJaccard)):
        score = 0
        Dice = TitlesDice[0:i+1]
        Jaccard = TitlesJaccard[0:i+1]
        # check wheter the the Dice's ISBNS exists on Jaccard's till [0:i+1] elements of the jaccard ISBNS list.
        for ISBN in Dice:
            if ISBN in Jaccard:
                score += 1     
        CurrOverlap = score / (i+1)
        OverlapValues.append(CurrOverlap)
    return sum(OverlapValues)/len(TitlesJaccard)

def GetSortedDict(Dice,Jaccard):
    goldenList = {}
    # create a new colomun on dataframe that is the average between Jaccard and Dice scores in both dfs.
    Dice['averageDice_Jaccard'] = Dice[['JaccardScores', 'DiceCoefScores']].mean(axis=1)
    Jaccard['averageDice_Jaccard'] = Jaccard[['JaccardScores', 'DiceCoefScores']].mean(axis=1)
    # cut the 3 last NaN values.
    Jaccard = Jaccard[:10]
    Dice = Dice[:10]
    TitlesDice = list(Dice['bookTitle'])
    TitlesJaccard = list(Jaccard['bookTitle'])
    ISBNDice = list(Dice['ISBN'])
    ISBNJaccard = list(Jaccard['ISBN'])
    # set key as ISBN and value as titles in order to replace the ISBNS with the book tiltes in the golden list.
    dictDice = dict(zip(ISBNDice, TitlesDice))
    dictJaccard = dict(zip(ISBNJaccard, TitlesJaccard))
    for ISBN in ISBNDice:
        goldenList[ISBN] = goldenList.get(ISBN, 0) + 1
    for ISBN in ISBNJaccard:
        goldenList[ISBN] = goldenList.get(ISBN, 0) + 1
    newDict = {k: [v] for k,v in goldenList.items()}
    # append the average between Dice and jaccard in order to sort them first with popularity and second with average between
    # the two indexes(jaccard and dice.).
    for k,v in newDict.items():
        if k in Dice['ISBN'].values:
            x = Dice['averageDice_Jaccard'].loc[Dice['ISBN'] == k].tolist()
            v.append(x[0])
        elif k in Jaccard['ISBN'].values:
            y = Jaccard['averageDice_Jaccard'].loc[Jaccard['ISBN'] == k].tolist()
            v.append(y[0])   
    
    # iterate over keys and change them in order to replace ISBNS with the bookTitle.
    # The key is represented as the name of book, ISBN number and 
    # in values is the count of ISBN in both lists and then the average between Jaccard and Dice coefficient.
    # Below is an example.
    # ('High Maintenance', '1573221856') [2, 0.8081837073981712]
    for k in list(newDict):
        if k in newDict and k in dictJaccard:
            newDict[dictJaccard.get(k),k] = newDict.pop(k)
        elif k in newDict and k in dictDice:    
            newDict[dictDice.get(k),k] = newDict.pop(k)
            
    # sort the dictionary first by count of ISBNS books in both lists and then by their average score.     
    FinalOrderedDictionary = {k: v for k, v in sorted(newDict.items(), key=lambda item: item[1],reverse = True)}
    for k,v in FinalOrderedDictionary.items():
        print(k,v)
    return FinalOrderedDictionary

In [116]:
#new = user_over5.sample(5)
new = user_over5.loc[(user_over5['userID'] == 86001) | (user_over5['userID'] == 74093) |
                     ( user_over5['userID'] == 120829)  | (user_over5['userID'] == 164442) | (user_over5['userID'] == 62464)]
getRecommendations(new)
for i in range(5):
    userID = str(new['userID'].iloc[i])
    Dice = pd.read_excel('DiceCoefOutput'+ userID + '.xlsx', index_col=0) 
    Jaccard = pd.read_excel('JaccardOutput'+ userID + '.xlsx', index_col=0) 
    Dice = Dice[['ISBN','bookTitle','JaccardScores','DiceCoefScores']]
    Jaccard = Jaccard[['ISBN','bookTitle','JaccardScores','DiceCoefScores']]
    print('userID: ' + userID + ', overlap : '+ str(GetOverlap(Dice, Jaccard))) 
    GoldenList = GetSortedDict(Dice,Jaccard)
    # create the GoldenSeries in order to calculate the overlap between the golden and two lists(Jaccard,Dice)
    GoldenSeries = pd.Series((key[1] for key in list(GoldenList.keys())[:10]))
    GoldenSeries = GoldenSeries.reset_index(name='ISBN')
    print('userID: ' + userID + ', overlap : '+ str(GetOverlap(GoldenSeries, Jaccard)) +\
          ' after using goldenList between GoldenList and Jaccard.')
    print('userID: ' + userID + ', overlap : '+ str(GetOverlap(GoldenSeries, Dice)) +\
        ' after using goldenList between GoldenList and Dice.')
    print ('End of {} user!!!'.format(i+1))

userID: 62464, overlap : 0.9875
('Divine Secrets of the Ya-Ya Sisterhood : A Novel', 60173289.0) [2, 0.8665170407315046]
('The Divine Secrets of the Ya-Ya Sisterhood: A Novel', 60502258.0) [2, 0.8660681629260183]
('Divine Secrets of the Ya-Ya Sisterhood', 61015075.0) [2, 0.8235643844933123]
('The Catcher in the Rye', 316769177.0) [2, 0.7745511221945137]
('On the Road', 140042598.0) [2, 0.7180555555555557]
('Little Altars Everywhere: A Novel', 60976845.0) [2, 0.710961485175949]
('On the Road (Penguin Modern Classics)', 141182679.0) [2, 0.7108118592407869]
('On the Road (Modern Classics S.)', 140031928.0) [2, 0.7026116526864656]
('On the Road (Essential.penguin S.)', 140274154.0) [2, 0.7007575757575757]
('Nine Stories', 316769509.0) [2, 0.65]
userID: 62464, overlap : 0.9875 after using goldenList between GoldenList and Jaccard.
userID: 62464, overlap : 1.0 after using goldenList between GoldenList and Dice.
End of 1 user!!!
userID: 74093, overlap : 1.0
('My Side of the Mountain', 1403481