In [1]:
import pandas as pd  # Dataframes
import re  # Regular expressions 
import nltk
from nltk.tokenize import word_tokenize  # Split text into words
from nltk.corpus import stopwords  # Lists of unimportant words
from collections import Counter, defaultdict  # Count word frequency & provide more versatile dicts
from pandas.core.common import flatten  # Collapse lists of lists
from nltk.stem.wordnet import WordNetLemmatizer  # Reduce terms to their root
from nltk import pos_tag  # Tag words with parts of speech
import seaborn as sns  # Visualisations
import matplotlib.pyplot as plt  # Visualisations
from sklearn.feature_extraction.text import TfidfVectorizer  # Convert text to TF-IDF representations
from sklearn.metrics.pairwise import cosine_similarity  # Check similarities between vectors
from textwrap import wrap  # format long text

In [2]:
# Uncomment out the following lines if you haven't downloaded the NLTK packages (you only need to do this once)
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

In [3]:
# Create a dataframe by reading the goodreads_books.csv file
books = pd.read_csv('Datasets/goodreads_books.csv')
books.head()

Unnamed: 0,isbn,language_code,description,isbn13,book_id,title,num_pages
0,743294297,eng,Addie Downs and Valerie Adler were eight when ...,9780743294294,6066819,Best Friends Forever,368
1,842379428,eng,What is Heaven really going to be like? What w...,9780842379427,89376,Heaven,533
2,590417010,eng,In Newbery Medalist Cynthia Rylant's classic b...,9780590417013,89378,Dog Heaven,40
3,8495971933,spa,En una hermosa mansion a orillas del Mediterra...,9788495971937,2008910,"Buenos días, tristeza",184
4,1479174661,eng,Arrianna Williams is an ordinary 25 yr. old wo...,9781479174669,16037548,Untold Secrets: Fire & Ice,168


In [4]:
# Replace the NaN values in the description with "" and NaN values in other columns with 0
books['description'] = books['description'].fillna("")
books['title'] = books['description'].fillna("")
books = books.fillna(0)

In [5]:
# Remove non-English books
books = books[books['language_code'] == 'eng']

In [6]:
descriptions = books['description']

descriptions = descriptions.apply(word_tokenize)

In [7]:
# Create a dictionary to map tags to ones that the lemmatiser will understand.

tag_map = defaultdict(lambda : "n")  # by default, assume nouns
tag_map['J'] = "a"  # adjectives
tag_map['V'] = "v"  # verbs
tag_map['R'] = "r"  # adverbs

# Create a function to get the pos tags for a set of tokens, and return the tokens in a way a
# lemmatizer can interpret
def get_wordnet_tags(tokens):
    """Gets wordnet tags from a set of tokens
    
       Input:
           - a list of string tokens
       Output:
           - a list of (word, tag) tuples
    """
    
    # Tag tokens with pos_tagger
    tagged_tokens = pos_tag(tokens)
    
    # Convert each tag to a version wordnet can understand
    tagged_tokens = [(token[0], tag_map[token[1][0]]) for token in tagged_tokens]
    
    return tagged_tokens

# Create a lemmatizing object

lemma = WordNetLemmatizer()

# Tag each token in each description

descriptions = descriptions.apply(get_wordnet_tags)

# Lemmatize the sets of tokens; this code takes a while to run

descriptions = descriptions.apply(lambda tokens: [lemma.lemmatize(word=token[0],
                                                                  pos=token[1])
                                                  for token in tokens])


# Get a list of stopwords

stops = stopwords.words("english")

# Filter out all stopwords and words less than 3 letters long from the descriptions.

descriptions = descriptions.apply(lambda tokens: [word.lower() for word in tokens
                                                  if word not in stops
                                                  and len(word) > 2])

In [8]:
# Add the modified descriptions back into the dataframe
books['modified_description'] = descriptions
books.head(20)

descriptions = books["modified_description"].apply(lambda text: " ".join(text))

# Save the dataframe to a csv file
books.to_csv('Datasets/goodreads_books_modified.csv')

In [9]:
# Calculate the tf-idf matrix for each modified description
tfidf_vectorizer = TfidfVectorizer(max_features=10000, min_df=0.01)

description_dtm = tfidf_vectorizer.fit_transform(descriptions)

In [10]:
# Create a function that processes a text description into the same format as the provided descriptions.

def convert_text_to_vector(text):
    """Converts a text string into a TFIDF vector
    
       Input:
           - text (str): a book description
       Output:
           - vector (scipy sparse matrix): a tf-idf vector for the description
    """
    # Clean text
    text = text.lower()
    text = re.sub("[^a-z ]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    
    # Lemmatize and remove stopwords
    text = text.split(" ")
    text = get_wordnet_tags(text)
    text = [lemma.lemmatize(word=word[0], pos=word[1]) for word in text]
    text = [word for word in text if word not in stops and len(word) > 3]
    text = " ".join(text)
    
    # Convert the description to a TF-IDF vector
    vector = tfidf_vectorizer.transform([text])
    
    return vector

In [11]:
# Example description string 

test_description = """Brittany K. Barnett was only a law student when she came across the case that would change her life forever—that of Sharanda Jones, single mother, business owner, and, like Brittany, Black daughter of the rural South. A victim of America’s devastating war on drugs, Sharanda had been torn away from her young daughter and was serving a life sentence without parole—for a first-time drug offense. In Sharanda, Brittany saw haunting echoes of her own life, as the daughter of a formerly incarcerated mother. As she studied this case, a system came into focus in which widespread racial injustice forms the core of America’s addiction to incarceration. Moved by Sharanda’s plight, Brittany set to work to gain her freedom."""

# Convert the test description to a vector 

query_vector = convert_text_to_vector(test_description)


# Use cosine similarity to find the most similar vectors to the test

similarities = cosine_similarity(query_vector, description_dtm).flatten()

books['similarity'] = similarities

# Sort the books by similarity
books.sort_values('similarity', ascending=False, inplace=True)

# Print the top 10 most similar books
books[['title', 'similarity']].head(10)

Unnamed: 0,title,similarity
179188,The true story as told by a mother and daughte...,0.427919
109371,The daughter of esteemed writer Paula Fox and ...,0.366478
106773,Now Martini delivers Paul Madriani's most chal...,0.35347
448931,Duke Leto Atreides is now the skilful ruler of...,0.342531
292067,"Dedicated to mothers, daughters and sons whose...",0.324415
118996,A witty and irresistible story of a mother and...,0.321737
5595,"When Jess's daughter, Anna, is reported lost i...",0.314948
343346,"When Jess's daughter, Anna, is reported lost i...",0.314948
264353,"After her husband takes a concubine, Madame Li...",0.312969
101648,"The first volume of the book series ""Successfu...",0.312871


In [12]:
# Define the path to the data
path = "Datasets/goodreads_interactions.csv"

# Define the chunksize for reading the data
chunksize = 10 ** 6

# Create a dataframe for user with id 661ff6b7041ea1935101f16846e3cba6 will need to use batch processing

# Initialize reader object: reader
user_id = '661ff6b7041ea1935101f16846e3cba6'

user_df = pd.DataFrame()

# Process the file in chunks
with pd.read_csv(path, chunksize=chunksize) as reader:
    for chunk in reader:
        user_df = pd.concat([user_df, chunk[chunk['user_id'] == user_id]])

In [13]:
# Get the descriptions for the books the user has read
user_books = user_df['book_id'].unique()
user_books = books[books['book_id'].isin(user_books)]
print(user_books)

              isbn language_code  \
316182  0312364121           eng   
33541   1624300138           eng   
64325   0987526146           eng   
231961  1465341889           eng   
383650  0525478817           eng   
371529  1476753164           eng   
196819  1455549002           eng   
371680  1476763976           eng   

                                              description         isbn13  \
316182  From the author of the smash-hit bestseller Fi...  9780312364120   
33541   From the New York Times and USA Today bestsell...  9781624300134   
64325   Ryan Kendall is broken. He understands pain. H...  9780987526144   
231961  Every action has consequences.\nWaking in an u...  9781465341884   
383650  There is an alternate cover edition .\n"I fel...  9780525478812   
371529  From #1 New York Timesbestselling author Colle...  9781476753164   
196819  Five months ago, Camryn and Andrew, both deali...  9781455549009   
371680  A Chicago reporter in her mid-twenties unexpec...  97814767

In [14]:
# Calculate the mean similarty between the provided description and the descriptions of the books the user has read
mean_similarity = user_books['similarity'].mean()
print(mean_similarity)

0.056290355227640046
