In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Training**

**Training** - Train a model on the Amazon dataset

In [None]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

import numpy as np

In [None]:
# mount to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Get the first 4 million reviews from CSV named amazon_reviews_1 in Google Drive

# Go to folder containing CSV
%cd "/content/drive/My Drive/data/ce256/project"
# load first 4 million reviews
amazon_df = pd.read_csv("amazon_reviews_1")

# Check the CSV in Google Drive has the correct data
amazon_df.head()

/content/drive/My Drive/data/ce256/project


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,False,"03 30, 2005",A1REUF3A1YCPHM,1713353,{'Format:': ' Hardcover'},TW Ervin II,"The King, the Mice and the Cheese by Nancy Gur...",A story children will love and learn from,1112140800,,
1,5.0,True,"06 20, 2016",AVP0HXC9FG790,1713353,,Amazon Customer,The kids loved it!,Five Stars,1466380800,,
2,5.0,True,"01 24, 2016",A324TTUBKTN73A,1713353,{'Format:': ' Paperback'},Tekla Borner,My students (3 & 4 year olds) loved this book!...,Five Stars,1453593600,,
3,5.0,False,"07 9, 2015",A2RE7WG349NV5D,1713353,{'Format:': ' Paperback'},Deborah K Woroniecki,LOVE IT,Five Stars,1436400000,,
4,5.0,True,"01 18, 2015",A32B7QIUDQCD0E,1713353,,E,Great!,Five Stars,1421539200,,


In [None]:
# Create ratings df containing only user ID, ASIN as book ID, and rating
amazon_ratings_df = amazon_df[["reviewerID", "asin", "overall"]].copy()
amazon_ratings_df.head()

Unnamed: 0,reviewerID,asin,overall
0,A1REUF3A1YCPHM,1713353,5.0
1,AVP0HXC9FG790,1713353,5.0
2,A324TTUBKTN73A,1713353,5.0
3,A2RE7WG349NV5D,1713353,5.0
4,A32B7QIUDQCD0E,1713353,5.0


In [None]:
# Credits-Prof Eirinaki, Rashmi Sharma and Aditya Patel
# conda install -c conda-forge scikit-surprise
!pip install scikit-surprise
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 4.1MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1670925 sha256=0fc3462a09b39d4128fe9497f9fd5af924d36aa7b91a0a15d1599faa1252c52d
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
reader = Reader(rating_scale=(1,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(amazon_ratings_df,reader) #load dataset into Surprise datastructure Dataset

In [None]:
# Create the training and test sets (80% / 20%).
# A fixed random_state makes the shuffle, and therefore the reported RMSE,
# reproducible when the notebook is re-run from a fresh kernel.
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=True)

In [None]:
# SVD
# random_state pins the random initialisation of the factors so that
# repeated runs on the same training set give the same model.
svd = SVD(random_state=42)
svd.fit(trainingSet)  # fit model to the training set
predictions_svd = svd.test(testSet)  # predict for test set values

Testing

In [None]:
#validating rating predictions using RMSE
accuracy.rmse(predictions_svd, verbose=True) 

RMSE: 0.9699


0.9698739953380167

Making Predictions

In [None]:
# Get predicted and actual ratings of a user in the dataset
for pred in predictions_svd:
  if pred[0] == "AQEO3JYVJJH31":
    print(pred)

In [None]:
# Get example predicted ratings of books to recommend to the user
amazon_user_id = "AQEO3JYVJJH31"
svd.predict(amazon_user_id, "0091944244") # A Gentleman in Moscow, user already rated this 5.0
svd.predict(amazon_user_id, "0007548672") # All the Light We Cannot See, user already rated this 5.0
svd.predict(amazon_user_id, "0001713353") # The King, the Mice and the Cheese
svd.predict(amazon_user_id, "0001384198") # The Little Engine that Could
svd.predict(amazon_user_id, "0002005263") # The Sinister Pig
svd.predict(amazon_user_id, "059035342X") # Harry Potter and the Sorcerer's Stone, not in dataset

Prediction(uid='AQEO3JYVJJH31', iid='059035342X', r_ui=None, est=4.472453209445775, details={'was_impossible': False})

In [None]:
def get_recommended_amazon_books_by_asin(user_id, top_n):
  """Get the top-N highest-rated books as prediction tuples from the Amazon dataset.

  Candidate ASINs come from the notebook-level ``amazon_df`` and the rating
  estimates from the notebook-level ``svd`` model.

  :param user_id: The ID of the reviewer who will be recommended books
  :param top_n: Number of books to recommend
  :returns: List of at most top_n prediction tuples, highest estimate first
  """
  # dict.fromkeys drops repeated ASINs while keeping first-seen order
  unique_asins = list(dict.fromkeys(amazon_df["asin"].tolist()))

  # One prediction per distinct ASIN
  predictions = [svd.predict(user_id, asin) for asin in unique_asins]

  # Element 3 of a prediction tuple is the estimated rating; highest first.
  # The sort is stable, so ties keep dataset order.
  ranked = sorted(predictions, key=lambda prediction: prediction[3], reverse=True)
  return ranked[:top_n]

In [None]:
# Now go through all books in Amazon df, and recommend the top-5 highest-rated books
amazon_user_id = "AQEO3JYVJJH31"
recommended_amazon_books = get_recommended_amazon_books_by_asin(amazon_user_id, 5)

In [None]:
for pred in recommended_amazon_books:
  print("ASIN: " + str(pred[1]) + ", predicted rating: " + str(pred[3]))

ASIN: 0001720279, predicted rating: 5
ASIN: 0001720392, predicted rating: 5
ASIN: 0001712845, predicted rating: 5
ASIN: 0001983679, predicted rating: 5
ASIN: 0001473727, predicted rating: 5


Alternatively: Making predictions using user and book dfs

In [None]:
# read the csv into a dataframe
user_df = pd.read_csv("sample_user_name.csv")

In [None]:
user_df.head(5)

Unnamed: 0,username,id
0,user1,AQEO3JYVJJH31


In [None]:
user_dict = {}
for i in range(len(user_df)):
    user_dict[user_df.iloc[i].username] = user_df.iloc[i].id

In [None]:
print(user_dict)

{'user1': 'AQEO3JYVJJH31'}


In [None]:
# read the csv into a dataframe
book_df = pd.read_csv("sample_book_name.csv")

In [None]:
book_df.head(5)

Unnamed: 0,bookName,id
0,A Gentleman in Moscow,91944244
1,All the Light We Cannot See,7548672
2,The Anatomy of Peace: Resolving the Heart of C...,141047666
3,The Complete Idiot's Guide to Music Theory,28643771


In [None]:
def add_leading_zeros_to_ids(book_df):
  """Add leading zeros to get the correct 10-digit ASIN.

  IDs that were read as numbers lose their leading zeros (e.g. 91944244
  instead of "0091944244"). Every value of the "id" column is converted to a
  string and left-padded with zeros to 10 characters.

  :param book_df: The df containing a column called "id" which need to be 10-digit ASINs
  :returns: None; the "id" column of ``book_df`` is overwritten in place
  """
  # Pad the whole column in one pass. The earlier version called
  # Series.replace once per row (quadratic work), rewrote the column while
  # looping over it, and shadowed the builtin ``id``.
  book_df["id"] = book_df["id"].map(lambda raw_id: str(raw_id).zfill(10))

In [None]:
add_leading_zeros_to_ids(book_df)
book_df.head()

Unnamed: 0,bookName,id
0,A Gentleman in Moscow,91944244
1,All the Light We Cannot See,7548672
2,The Anatomy of Peace: Resolving the Heart of C...,141047666
3,The Complete Idiot's Guide to Music Theory,28643771


In [None]:
book_dict = {}
for i in range(len(book_df)):
    book_dict[book_df.iloc[i].id] = book_df.iloc[i].bookName

In [None]:
print(book_dict)

{'0091944244': 'A Gentleman in Moscow', '0007548672': 'All the Light We Cannot See', '0141047666': 'The Anatomy of Peace: Resolving the Heart of Conflict', '0028643771': "The Complete Idiot's Guide to Music Theory"}


In [None]:
from collections import defaultdict

def getBookRecommendations(topN=3, predictions=None):
    """Group predictions by user and keep each user's topN highest estimates.

    :param topN: Maximum number of (book id, estimated rating) pairs kept per user
    :param predictions: Iterable of 5-item prediction tuples
        (uid, iid, true rating, estimate, details). When omitted, the
        notebook-level ``predictions_svd`` is used, as before.
    :returns: defaultdict(list) mapping uid -> list of (iid, est) pairs sorted
        by estimate, highest first. Looking up an unknown uid gives [].
    """
    if predictions is None:
        # Resolved at call time. Pass ``predictions`` explicitly to avoid
        # depending on whichever model last assigned predictions_svd.
        predictions = predictions_svd

    ratings_by_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        ratings_by_user[uid].append((iid, est))

    top_recs = defaultdict(list)
    for uid, user_ratings in ratings_by_user.items():
        # Stable sort: equal estimates keep the order they had in predictions
        top_recs[uid] = sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:topN]

    return top_recs

In [None]:
recommendations = getBookRecommendations(3)

In [None]:
def getBookName(book_id):
    """Look up a book title in the notebook-level ``book_dict``.

    :param book_id: Key to look up in ``book_dict``
    :returns: The name stored for ``book_id``, or "" when it is not a key
    """
    return book_dict.get(book_id, "")

In [None]:
def getBookRecommendationsForUser(userId, recommendations):
    """Return the recommended (book name, estimated rating) pairs for one user.

    :param userId: Key into the notebook-level ``user_dict`` (the cell below
        passes a user name such as 'user1'); its value is the ID used to
        index ``recommendations``
    :param recommendations: Mapping of user ID -> list of (book id, estimate) pairs
    :returns: List of (book name, estimate) tuples; the name is whatever
        ``getBookName`` returns for the book id. Returns None, after printing
        a message, when ``userId`` is not in ``user_dict``.
    """
    if userId in user_dict:
        u_id = user_dict[userId]
        return [(getBookName(rec[0]), rec[1]) for rec in recommendations[u_id]]

    print("User id is not present")
    return None

In [None]:
# change to user ID
getBookRecommendationsForUser('user1',recommendations)

[('All the Light We Cannot See', 4.857948206684828)]

**Training** - Train a model on the Goodreads dataset

In [None]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

import numpy as np

In [None]:
# mount to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Get the first 4 million reviews from CSV named goodreads_reviews_1 in Google Drive

# Go to folder containing CSV
%cd "/content/drive/My Drive/data/ce256/project"
# load first 4 million reviews
goodreads_df = pd.read_csv("goodreads_reviews_1.csv")

# Check the CSV in Google Drive has the correct data
goodreads_df.head()

/content/drive/My Drive/data/ce256/project


Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,24375664,5cd416f3efc3f944fce4ce2db2290d5e,5,Mind blowingly cool. Best science fiction I've...,Fri Aug 25 13:55:02 -0700 2017,Mon Oct 09 08:55:59 -0700 2017,Sat Oct 07 00:00:00 -0700 2017,Sat Aug 26 00:00:00 -0700 2017,16,0
1,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
2,8842281e1d1347389f2ab93d60773d4d,6392944,5e212a62bced17b4dbe41150e5bb9037,3,I haven't read a fun mystery book in a while a...,Mon Jul 24 02:48:17 -0700 2017,Sun Jul 30 09:28:03 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,Mon Jul 24 00:00:00 -0700 2017,6,0
3,8842281e1d1347389f2ab93d60773d4d,22078596,fdd13cad0695656be99828cd75d6eb73,4,"Fun, fast paced, and disturbing tale of murder...",Mon Jul 24 02:33:09 -0700 2017,Sun Jul 30 10:23:54 -0700 2017,Sun Jul 30 15:42:05 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,22,4
4,8842281e1d1347389f2ab93d60773d4d,6644782,bd0df91c9d918c0e433b9ab3a9a5c451,4,A fun book that gives you a sense of living in...,Mon Jul 24 02:28:14 -0700 2017,Thu Aug 24 00:07:20 -0700 2017,Sat Aug 05 00:00:00 -0700 2017,Sun Jul 30 00:00:00 -0700 2017,8,0


In [None]:
# Create ratings df containing only user ID, Goodreads book ID, and rating
goodreads_ratings_df = goodreads_df[["user_id", "book_id", "rating"]].copy()
goodreads_ratings_df.head()

Unnamed: 0,user_id,book_id,rating
0,8842281e1d1347389f2ab93d60773d4d,24375664,5
1,8842281e1d1347389f2ab93d60773d4d,18245960,5
2,8842281e1d1347389f2ab93d60773d4d,6392944,3
3,8842281e1d1347389f2ab93d60773d4d,22078596,4
4,8842281e1d1347389f2ab93d60773d4d,6644782,4


In [None]:
# Credits-Prof Eirinaki, Rashmi Sharma and Aditya Patel
# conda install -c conda-forge scikit-surprise
!pip install scikit-surprise
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy



In [None]:
reader = Reader(rating_scale=(1,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(goodreads_ratings_df,reader) #load dataset into Surprise datastructure Dataset

In [None]:
# Create the training and test sets (80% / 20%).
# A fixed random_state makes the shuffle, and therefore the reported RMSE,
# reproducible when the notebook is re-run from a fresh kernel.
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=True)

In [None]:
# SVD
# random_state pins the random initialisation of the factors so that
# repeated runs on the same training set give the same model.
svd = SVD(random_state=42)
svd.fit(trainingSet)  # fit model to the training set
predictions_svd = svd.test(testSet)  # predict for test set values

Testing

In [None]:
#validating rating predictions using RMSE
accuracy.rmse(predictions_svd, verbose=True) 

RMSE: 1.0804


1.0803740490580012

Making Predictions

In [None]:
# Get example predicted ratings of a user in the dataset
# (only the value of the last expression is displayed as the cell output)
# NOTE(review): the book IDs below are passed as strings, while the model was
# trained on goodreads_df["book_id"], which presumably was parsed as integers
# by read_csv. If so, these items would be unknown to the model -- confirm the
# ID type before relying on these estimates.
goodreads_user_id = "8842281e1d1347389f2ab93d60773d4d"
svd.predict(goodreads_user_id, "24375664") # The Dark Forest (Remembrance of Earth’s Past, #2), user already rated this 5.0
svd.predict(goodreads_user_id, "18245960") # The Three-Body Problem (Remembrance of Earth’s Past, #1), user already rated this 5.0
svd.predict(goodreads_user_id, "6392944") # The Murder on the Links (Hercule Poirot, #2), user already rated this 3.0

Prediction(uid='8842281e1d1347389f2ab93d60773d4d', iid='6392944', r_ui=None, est=3.3522237304915175, details={'was_impossible': False})

In [None]:
def get_recommended_goodreads_books_by_id(user_id, top_n):
  """Get the top-N highest-rated books as prediction tuples from the Goodreads dataset.

  Candidate book IDs come from the notebook-level ``goodreads_df`` and the
  rating estimates from the notebook-level ``svd`` model. The number of
  distinct book IDs is printed before scoring.

  :param user_id: The ID of the reviewer who will be recommended books
  :param top_n: Number of books to recommend
  :returns: List of at most top_n prediction tuples, highest estimate first
  """
  # dict.fromkeys drops repeated book IDs while keeping first-seen order
  unique_book_ids = list(dict.fromkeys(goodreads_df["book_id"].tolist()))
  # Number of unique book IDs that will be scored
  print(len(unique_book_ids))

  # One prediction per distinct book ID
  predictions = [svd.predict(user_id, book_id) for book_id in unique_book_ids]

  # Element 3 of a prediction tuple is the estimated rating; highest first.
  # The sort is stable, so ties keep dataset order.
  ranked = sorted(predictions, key=lambda prediction: prediction[3], reverse=True)
  return ranked[:top_n]

In [None]:
# Now go through all books in Goodreads df, and recommend the top-5 highest-rated books
goodreads_user_id = "8842281e1d1347389f2ab93d60773d4d"
recommended_goodreads_books = get_recommended_goodreads_books_by_id(goodreads_user_id, 5)

963125


In [None]:
for pred in recommended_goodreads_books:
  print("Book ID: " + str(pred[1]) + ", predicted rating: " + str(pred[3]))

Book ID: 7126, predicted rating: 5
Book ID: 2, predicted rating: 5
Book ID: 7304203, predicted rating: 5
Book ID: 23437291, predicted rating: 5
Book ID: 23489258, predicted rating: 5


---
**Display** title and authors

After getting the final list of recommended books for an Amazon user, display each book's title and authors.

Goodreads API Client is a Python wrapper around the Goodreads API.

https://pypi.org/project/goodreads-api-client/

In [None]:
# install to use Goodreads API
!pip install goodreads_api_client

Collecting goodreads_api_client
  Downloading https://files.pythonhosted.org/packages/1f/03/c7c9e027761d382a92c0cb4acf3bf1650ac2e844fd55851e08ccf950687a/goodreads_api_client-0.1.0.dev4-py2.py3-none-any.whl
Collecting requests==2.18.3
[?25l  Downloading https://files.pythonhosted.org/packages/ba/92/c35ed010e8f96781f08dfa6d9a6a19445a175a9304aceedece77cd48b68f/requests-2.18.3-py2.py3-none-any.whl (88kB)
[K     |████████████████████████████████| 92kB 3.6MB/s 
[?25hCollecting xmltodict==0.11.0
  Downloading https://files.pythonhosted.org/packages/42/a9/7e99652c6bc619d19d58cdd8c47560730eb5825d43a7e25db2e1d776ceb7/xmltodict-0.11.0-py2.py3-none-any.whl
Collecting rauth==0.7.3
  Downloading https://files.pythonhosted.org/packages/43/aa/7c8e852275394d65ac5bf3ac9945ecaafe4d083089e09cb0a267efea389a/rauth-0.7.3.tar.gz
Collecting idna<2.6,>=2.5
[?25l  Downloading https://files.pythonhosted.org/packages/11/7d/9bbbd7bb35f34b0169542487d2a8859e44306bb2e6a4455d491800a5621f/idna-2.5-py2.py3-none-any.w

In [None]:
import goodreads_api_client as gr

In [None]:
  # Set client API key
  # Read the developer key from the environment, or prompt for it, instead of
  # hardcoding it: a key written in the notebook is exposed to everyone the
  # notebook is shared with.
  import os
  from getpass import getpass
  api_key = os.environ.get("GOODREADS_API_KEY") or getpass("Goodreads developer key: ")
  client = gr.Client(developer_key=api_key)

In [None]:
def get_title_with_goodreads_api_and_isbn(client, isbn):
  """Look up a book by ISBN with the Goodreads API Python client.

  Calls ``client.Book.show_by_isbn`` and keeps only the 'id', 'title' and
  'isbn' entries of the result. The reduced dictionary is printed before it
  is returned.

  :param client: The Goodreads API Python client
  :param isbn: The ISBN this book should have
  :returns: Dictionary with the Goodreads ID, title, and ISBN of the book
  """
  wanted_keys = ('id', 'title', 'isbn')
  book = client.Book.show_by_isbn(isbn)

  reduced_book = {}
  for key, value in book.items():
    if key in wanted_keys:
      reduced_book[key] = value

  print(reduced_book)
  return reduced_book

In [None]:
def get_title_with_goodreads_api(client, id):
  """Look up a book by Goodreads ID with the Goodreads API Python client.

  Calls ``client.Book.show`` and keeps only the 'id', 'title' and 'isbn'
  entries of the result. The reduced dictionary is printed before it is
  returned.

  :param client: The Goodreads API Python client
  :param id: the Goodreads ID this book should have
  :returns: Dictionary with the Goodreads ID, title, and ISBN of the book
  """
  wanted_keys = ('id', 'title', 'isbn')
  book = client.Book.show(id)

  reduced_book = {}
  for key, value in book.items():
    if key in wanted_keys:
      reduced_book[key] = value

  print(reduced_book)
  return reduced_book

In [None]:
def get_authors_with_goodreads_api(client, id):
  """Fetch the authors of a book by Goodreads ID with the Goodreads API Python client.

  Calls ``client.Book.show`` and keeps only the 'authors' entry of the
  result. Each (key, value) item of that entry is printed.

  :param client: The Goodreads API Python client
  :param id: the Goodreads ID this book should have
  :returns: Dictionary with the authors of the book
  :raises KeyError: if the book record has no 'authors' entry
  """
  book = client.Book.show(id)

  reduced_book = {}
  for key, value in book.items():
    if key == 'authors':
      reduced_book[key] = value

  # Indexing (rather than .get) keeps the KeyError for records without authors
  authors = reduced_book["authors"]
  for item in authors.items():
    print(item)
  return reduced_book

In [None]:
# Get the titles of recommended Amazon books using client and ASIN
import time

for pred in recommended_amazon_books:
  get_title_with_goodreads_api_and_isbn(client, pred[1])
  # Sleep 1 second to not go above max API requests
  time.sleep(1)

{'id': '21083498', 'title': 'Hooray for Diffendoofer Day!', 'isbn': '0001720279'}
{'id': '2711294', 'title': 'Green Eggs and Ham', 'isbn': '0001720392'}
{'id': '668981', 'title': 'The Berenstain Bears and the Spooky Old Tree', 'isbn': '0001712845'}
{'id': '421569', 'title': 'The Complete Brambly Hedge (Brambly Hedge, #1-8)', 'isbn': '0001983679'}
{'id': '5935634', 'title': 'The Greatest Book on "Dispensational Truth" in the World', 'isbn': '0001473727'}


In [None]:
# Get the titles of recommended Goodreads books using client and Goodreads book ID
import time

for pred in recommended_goodreads_books:
  get_title_with_goodreads_api(client, pred[1])
  time.sleep(1)

{'id': '7126', 'title': 'The Count of Monte Cristo', 'isbn': '0140449264'}
{'id': '2', 'title': 'Harry Potter and the Order of the Phoenix (Harry Potter, #5)', 'isbn': '0439358078'}
{'id': '7304203', 'title': 'Shadowfever (Fever, #5)', 'isbn': '0385341679'}
{'id': '23437291', 'title': 'Aflame (Fall Away, #4)', 'isbn': '0698403878'}
{'id': '23489258', 'title': 'Second Debt (Indebted, #3)', 'isbn': '1507628552'}


In [None]:
# time.sleep(1)
# get_authors_with_goodreads_api(client, "24375664")

The model should know which users and books are unique. This is to make sure the model is not accidentally recommending the same book from Goodreads that a user has reviewed on Amazon.

The model assumes that every Amazon user is different from every Goodreads user.
That is, all user IDs are unique: no user has both an Amazon ID and a Goodreads ID.

To know which books are unique, we can check the book title and authors. If two books have the same title and authors, they are the same book.

- The Amazon ASIN of a book is the same as the book's 10-digit ISBN (International Standard Book Number). 
- Can use Goodreads API and ISBN to get book title and author.
 
See links:
- https://www.oreilly.com/library/view/amazon-hacks/0596005423/ch01s03.html


- Can use Goodreads API and Goodreads book ID to get title and author.
See links:
  - https://pypi.org/project/Goodreads/
  - https://www.goodreads.com/api
  - https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/books


In [None]:
# Test using Goodreads API on a random Goodreads book ID
get_title_with_goodreads_api(client, "1")
get_title_with_goodreads_api(client, "4986701")
# Test using Goodreads API on a random ISBN
get_title_with_goodreads_api_and_isbn(client, "0001932349")
get_title_with_goodreads_api_and_isbn(client, "0002005263")

Find Goodreads **reviews** similar to an Amazon user's review. Books that have reviews similar to the Amazon review can also be recommended.

https://dev.to/coderasha/compare-documents-similarity-using-python-nlp-4odp

In [None]:
# Make sure dataframe displays full review text
# None means "do not truncate"; passing -1 for this option is deprecated
# (pandas 1.0 and later) in favour of None.
pd.set_option('display.max_colwidth', None)

  


In [None]:
# Get one review of an Amazon user
amazon_reviewer_id = "AQEO3JYVJJH31"
all_reviews_of_user = amazon_df.loc[amazon_df["reviewerID"] == amazon_reviewer_id]
# Get the user's first review in the df
first_review = all_reviews_of_user.iloc[0]
first_review
first_review_text = first_review["reviewText"]
print(first_review_text)

Thoroughly good read, gives the perspective of the war from many angles, especially the impact on children.


In [None]:
# Get the first n reviews in the dataset as a list
def get_goodreads_review_texts(n):
  """Return the review texts of the first n rows of the Goodreads dataset.

  Reads the "review_text" column of the notebook-level ``goodreads_df``.

  :param n: Number of reviews to take, counted from the top of the df
  :returns: List of review texts in dataset order. It holds fewer than n
      items when the dataset has fewer than n rows, and is empty for n <= 0.
  """
  # Slice the column once instead of building a full row object per review.
  # max(n, 0) keeps "n <= 0 gives an empty list" (a negative slice bound would
  # otherwise count from the end), and slicing stops at the end of the data
  # rather than raising IndexError when n exceeds the number of rows.
  return goodreads_df["review_text"].iloc[:max(n, 0)].tolist()

In [None]:
# Get the first 5 review texts of Goodreads dataset
goodreads_review_texts = get_goodreads_review_texts(5)

Mind blowingly cool. Best science fiction I've read in some time. I just loved all the descriptions of the society of the future - how they lived in trees, the notion of owning property or even getting married was gone. How every surface was a screen. 
 The undulations of how society responds to the Trisolaran threat seem surprising to me. Maybe its more the Chinese perspective, but I wouldn't have thought the ETO would exist in book 1, and I wouldn't have thought people would get so over-confident in our primitive fleet's chances given you have to think that with superior science they would have weapons - and defenses - that would just be as rifles to arrows once were. 
 But the moment when Luo Ji won as a wallfacer was just too cool. I may have actually done a fist pump. Though by the way, if the Dark Forest theory is right - and I see no reason why it wouldn't be - we as a society should probably stop broadcasting so much signal out into the universe.
--------------------
This is a 

In [None]:
!pip install nltk
!pip install gensim



In [None]:
import nltk
import gensim
# sent_tokenize / word_tokenize are called unqualified in the cells below, so
# they have to be imported for the notebook to run on a fresh kernel.
from nltk.tokenize import sent_tokenize, word_tokenize

# The tokenizers need the "punkt" models to be available in the runtime.
nltk.download('punkt', quiet=True)

In [None]:
# each review is a document
file_docs = []

# tokenize sentences
for review_text in goodreads_review_texts:
  print(type(review_text))
  line = sent_tokenize(review_text)
  file_docs.append(line)

print("Number of documents:",len(file_docs))
file_docs

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
Number of documents: 5


[['Mind blowingly cool.',
  "Best science fiction I've read in some time.",
  'I just loved all the descriptions of the society of the future - how they lived in trees, the notion of owning property or even getting married was gone.',
  'How every surface was a screen.',
  'The undulations of how society responds to the Trisolaran threat seem surprising to me.',
  "Maybe its more the Chinese perspective, but I wouldn't have thought the ETO would exist in book 1, and I wouldn't have thought people would get so over-confident in our primitive fleet's chances given you have to think that with superior science they would have weapons - and defenses - that would just be as rifles to arrows once were.",
  'But the moment when Luo Ji won as a wallfacer was just too cool.',
  'I may have actually done a fist pump.',
  "Though by the way, if the Dark Forest theory is right - and I see no reason why it wouldn't be - we as a society should probably stop broadcasting so much signal out into the un

In [None]:
# Tokenize words and create dictionary

gen_docs = []
for review in file_docs:
  words_in_review = []
  for sent in review:
    for w in word_tokenize(sent):
      words_in_review.append(w.lower())
  gen_docs.append(words_in_review)


for doc in gen_docs:
  print(doc)

['mind', 'blowingly', 'cool', '.', 'best', 'science', 'fiction', 'i', "'ve", 'read', 'in', 'some', 'time', '.', 'i', 'just', 'loved', 'all', 'the', 'descriptions', 'of', 'the', 'society', 'of', 'the', 'future', '-', 'how', 'they', 'lived', 'in', 'trees', ',', 'the', 'notion', 'of', 'owning', 'property', 'or', 'even', 'getting', 'married', 'was', 'gone', '.', 'how', 'every', 'surface', 'was', 'a', 'screen', '.', 'the', 'undulations', 'of', 'how', 'society', 'responds', 'to', 'the', 'trisolaran', 'threat', 'seem', 'surprising', 'to', 'me', '.', 'maybe', 'its', 'more', 'the', 'chinese', 'perspective', ',', 'but', 'i', 'would', "n't", 'have', 'thought', 'the', 'eto', 'would', 'exist', 'in', 'book', '1', ',', 'and', 'i', 'would', "n't", 'have', 'thought', 'people', 'would', 'get', 'so', 'over-confident', 'in', 'our', 'primitive', 'fleet', "'s", 'chances', 'given', 'you', 'have', 'to', 'think', 'that', 'with', 'superior', 'science', 'they', 'would', 'have', 'weapons', '-', 'and', 'defenses',

In [None]:
# create a Dictionary object that maps each word to a unique id
dictionary = gensim.corpora.Dictionary(gen_docs)
print(dictionary.token2id)

{"'s": 0, "'ve": 1, ',': 2, '-': 3, '.': 4, '1': 5, 'a': 6, 'actually': 7, 'all': 8, 'and': 9, 'arrows': 10, 'as': 11, 'be': 12, 'best': 13, 'blowingly': 14, 'book': 15, 'broadcasting': 16, 'but': 17, 'by': 18, 'chances': 19, 'chinese': 20, 'cool': 21, 'dark': 22, 'defenses': 23, 'descriptions': 24, 'done': 25, 'eto': 26, 'even': 27, 'every': 28, 'exist': 29, 'fiction': 30, 'fist': 31, 'fleet': 32, 'forest': 33, 'future': 34, 'get': 35, 'getting': 36, 'given': 37, 'gone': 38, 'have': 39, 'how': 40, 'i': 41, 'if': 42, 'in': 43, 'into': 44, 'is': 45, 'it': 46, 'its': 47, 'ji': 48, 'just': 49, 'lived': 50, 'loved': 51, 'luo': 52, 'married': 53, 'may': 54, 'maybe': 55, 'me': 56, 'mind': 57, 'moment': 58, 'more': 59, 'much': 60, "n't": 61, 'no': 62, 'notion': 63, 'of': 64, 'once': 65, 'or': 66, 'our': 67, 'out': 68, 'over-confident': 69, 'owning': 70, 'people': 71, 'perspective': 72, 'primitive': 73, 'probably': 74, 'property': 75, 'pump': 76, 'read': 77, 'reason': 78, 'responds': 79, 'rifl

In [None]:
# Create a bag of words
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

[[(0, 1),
  (1, 1),
  (2, 4),
  (3, 5),
  (4, 9),
  (5, 1),
  (6, 4),
  (7, 1),
  (8, 1),
  (9, 3),
  (10, 1),
  (11, 3),
  (12, 2),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 2),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 5),
  (40, 3),
  (41, 6),
  (42, 1),
  (43, 4),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 3),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 3),
  (62, 1),
  (63, 1),
  (64, 4),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 2),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 2),
  (89, 3),
  (90, 1),
  (91, 1)

In [None]:
tf_idf = gensim.models.TfidfModel(corpus)
for doc in tf_idf[corpus]:
    print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])

[["'s", 0.1], ["'ve", 0.03], [',', 0.05], ['-', 0.07], ['1', 0.1], ['actually', 0.1], ['all', 0.1], ['arrows', 0.1], ['as', 0.09], ['be', 0.06], ['best', 0.1], ['blowingly', 0.1], ['broadcasting', 0.1], ['but', 0.06], ['by', 0.01], ['chances', 0.1], ['chinese', 0.06], ['cool', 0.2], ['dark', 0.03], ['defenses', 0.1], ['descriptions', 0.06], ['done', 0.1], ['eto', 0.1], ['even', 0.06], ['every', 0.03], ['exist', 0.1], ['fiction', 0.06], ['fist', 0.1], ['fleet', 0.1], ['forest', 0.1], ['future', 0.1], ['get', 0.06], ['getting', 0.1], ['given', 0.1], ['gone', 0.06], ['have', 0.15], ['if', 0.01], ['into', 0.06], ['is', 0.01], ['it', 0.03], ['its', 0.03], ['ji', 0.1], ['just', 0.17], ['lived', 0.1], ['loved', 0.03], ['luo', 0.1], ['married', 0.1], ['may', 0.06], ['maybe', 0.06], ['me', 0.06], ['mind', 0.06], ['moment', 0.1], ['more', 0.03], ['much', 0.06], ["n't", 0.04], ['no', 0.03], ['notion', 0.1], ['of', 0.05], ['once', 0.03], ['or', 0.06], ['our', 0.06], ['out', 0.03], ['over-confident

In [None]:
 # Create similarity object. The main class is Similarity, which builds an index for a given set of documents

# building the index
# output_prefix=None lets gensim store the index shards under a random
# temporary filename. An empty-string prefix would instead place shard files
# named ".0", ".1", ... in the current working directory (here the Google
# Drive project folder).
sims = gensim.similarities.Similarity(None, tf_idf[corpus],
                                      num_features=len(dictionary))

  if np.issubdtype(vec.dtype, np.int):
  if np.issubdtype(vec.dtype, np.int):


In [None]:
# Create Query Document

# Once the index is built, we are going to calculate how similar
# this query document is to each document in the index

# query document is the original Amazon review
# Keep the review as a list of sentences, the same shape as each entry of
# file_docs. Previously str() was applied to that list, so its repr --
# including the brackets and quote characters -- was what got tokenized.
file2_docs = [sent_tokenize(first_review_text)]

print("Number of documents:",len(file2_docs))
print(file2_docs)

for sentences in file2_docs:
    # Same processing as the indexed reviews: word-tokenize each sentence, lowercase
    query_doc = [w.lower() for sent in sentences for w in word_tokenize(sent)]
    # Bag of words over the existing dictionary
    # (words that are not in the dictionary are ignored)
    query_doc_bow = dictionary.doc2bow(query_doc)

Number of documents: 1
["['Thoroughly good read, gives the perspective of the war from many angles, especially the impact on children.']"]


In [None]:
# perform a similarity query against the corpus
query_doc_tf_idf = tf_idf[query_doc_bow]
# print(document_number, document_similarity)
print('Comparing Result:', sims[query_doc_tf_idf]) 

Comparing Result: [0.06128649 0.12527286 0.03558587 0.01429004 0.12113652]


  result = numpy.hstack(shard_results)
