In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on; packages that are already
# present are reported as up-to-date by the downloader.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# English stop-word list (a plain Python list of lowercase strings).
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eeejo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eeejo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eeejo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Location of the reviews CSV. Set the REVIEW_DATA_PATH environment variable to
# point at a different copy; the default is the path this notebook was
# originally run against.
REVIEW_DATA_PATH = os.environ.get(
    "REVIEW_DATA_PATH",
    r"C:\Users\eeejo\Downloads\review_data.csv",
)
review_data = pd.read_csv(REVIEW_DATA_PATH)

In [14]:
# Keep only the first row for every (Title, description) pair so that each
# book/description combination appears once in the recommendation corpus.
# The original row labels are kept; later cells address rows by position.
is_repeat = review_data.duplicated(subset=['Title', 'description'])
book_description = review_data[~is_repeat]
book_description.head(5)

Unnamed: 0,Title,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text,description,authors,publisher,publishedYear,categories
0,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,1999-10-23,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,Unknown,Julie Strain,Unknown,1996.0,Fiction
1,Dr. Seuss: American Icon,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,2004-09-21,Really Enjoyed It,I don't care much for Dr. Seuss but after read...,Philip Nel takes a fascinating look into the k...,Philip Nel,A&C Black,2005.0,Biography & Autobiography
10,Wonderful Worship in Smaller Churches,AZ0IOBU20TBOP,Rev. Pamela Tinnin,8/10,5.0,2001-06-02,Outstanding Resource for Small Church Pastors,"I just finished the book, &quot;Wonderful Wors...",This resource includes twelve principles in un...,David R. Ray,Unknown,2000.0,Religion
14,Whispers of the Wicked Saints,A3Q12RK71N74LB,Book Reader,7/11,1.0,2005-05-26,not good,I bought this book because I read some glowing...,Julia Thomas finds her life spinning out of co...,Veronica Haddon,iUniverse,2005.0,Fiction
46,"Nation Dance: Religion, Identity and Cultural ...",AN9WUW5BG7M39,Pink Noodle,1/1,5.0,2008-02-04,interplay of traditions across Caribbean,from publisher:Addresses the interplay of dive...,Unknown,Edward Long,Unknown,2003.0,Others


In [19]:
# (rows, columns) of the de-duplicated frame; the row count is the number of
# documents the TF-IDF matrix is fitted on further down.
book_description.shape

(212397, 13)

In [10]:
# Stop words as a set: tokenizer() uses this as its default `stopwords`
# argument and tests every token for membership in it.
STOPWORDS = set(stop_words)

PATTERN_S = re.compile(r"'s")  # the `'s` suffix (possessive / contraction)
PATTERN_RN = re.compile(r"\r\n|\r|\n")  # any line break: CRLF pair, lone CR or lone LF
PATTERN_PUNC = re.compile(r"[^\w\s]")  # anything that is neither a word character nor whitespace


def clean_text(text):
    """
    Normalise raw text before tokenization.

    The text is lower-cased, then every `'s` suffix, every line break and every
    remaining non-word, non-whitespace character (punctuation, brackets, ...)
    is replaced by a single space. Letters, digits and underscores are kept,
    and runs of spaces are not collapsed.

        text (str): input text
    return (str): cleaned copy of the input text
    """
    text = text.lower()
    # Order matters: `'s` has to be handled before the generic punctuation
    # pattern, otherwise only the apostrophe would be replaced and a stray
    # "s" token would be left behind.
    for pattern in (PATTERN_S, PATTERN_RN, PATTERN_PUNC):
        text = pattern.sub(' ', text)
    return text

def tokenizer(sentence, stopwords=STOPWORDS, lemmatize=True):
    """
    Split a sentence into filtered word tokens.

        sentence (str): text to tokenize with nltk's word_tokenize
        stopwords (collection of str): tokens to discard; a token is dropped if
            either its original form or its lemmatized form is in this collection
        lemmatize (bool): when True (default) every token is passed through
            WordNetLemmatizer; when False tokens are kept as word_tokenize
            produced them
    return (list of str): remaining tokens, each longer than 2 and shorter
        than 200 characters
    """
    lemmatizer = WordNetLemmatizer() if lemmatize else None
    tokens = []
    for word in word_tokenize(sentence):
        # Test the surface form first: lemmatizing can rewrite a stop word
        # into a form the stop list does not contain, which would let it
        # slip through a check done only after lemmatization.
        if word in stopwords:
            continue
        if lemmatizer is not None:
            word = lemmatizer.lemmatize(word)
        if 2 < len(word) < 200 and word not in stopwords:
            tokens.append(word)
    return tokens

In [20]:
def extract_best_indices(m, topk, mask=None):
    """
    Rank entries by cosine similarity and return the indices of the best matches.

    m (np.array): cosine similarities, either 1-D (one score per entry) or 2-D
        of shape (nb_in_tokens, nb_dict_tokens). A 2-D input is averaged over
        the token axis, which ranks entries exactly like summing would.
    topk (int): maximum number of indices to return (from highest to lowest score)
    mask (np.array, optional): one boolean flag per entry (same shape as the
        per-entry scores); only entries flagged True may be returned.
    return (np.array): at most `topk` entry indices, best first. Entries whose
        score is exactly 0 are never returned, so fewer than `topk` indices
        come back when there are not enough non-zero matches.
    """
    # Collapse the per-token scores to a single score per entry.
    cos_sim = m.mean(axis=0) if m.ndim > 1 else m
    index = np.argsort(cos_sim)[::-1]  # entry indices, highest score first
    if mask is None:
        keep = np.ones(len(cos_sim), dtype=bool)
    else:
        mask = np.asarray(mask, dtype=bool)
        # The mask is per entry, so it has to line up with cos_sim, not with
        # the (possibly 2-D) input matrix.
        assert mask.shape == cos_sim.shape, "mask must hold one flag per entry"
        keep = mask[index]
    # An entry survives only if it is allowed by the mask AND has a non-zero
    # similarity; this is what removes the zero-score entries.
    keep = np.logical_and(cos_sim[index] != 0, keep)
    return index[keep][:topk]

def get_recommendations_tfidf(sentence, tfidf_mat, topk=20):
    """
    Return the row indices of the database sentences that best match a query.

    The query is tokenized, every token is embedded on its own with the
    module-level `vectorizer`, and the per-token cosine similarities against
    `tfidf_mat` are combined by extract_best_indices.

    sentence (str): free-text query
    tfidf_mat: TF-IDF matrix of the database sentences, one row per sentence,
        produced by the same `vectorizer`
    topk (int): maximum number of indices to return (default 20)
    return (np.array): row indices into `tfidf_mat`, best match first; empty
        when no token of the query survives tokenization
    """
    # Embed the query sentence, one token per row
    tokens_query = [str(tok) for tok in tokenizer(sentence)]
    if not tokens_query:
        # Nothing left to embed (e.g. the query consisted of stop words only).
        return np.array([], dtype=int)
    embed_query = vectorizer.transform(tokens_query)
    # Similarity between every query token and every database sentence
    mat = cosine_similarity(embed_query, tfidf_mat)
    # Combine the per-token similarities and keep the best-scoring rows
    return extract_best_indices(mat, topk=topk)


In [21]:
# Adapt stop words: run the stop-word list through the notebook's tokenizer so
# the vectorizer's stop list is expressed in the token forms that tokenizer emits.
token_stop = tokenizer(' '.join(stop_words), lemmatize=False)

# Fit TFIDF
# token_pattern=None: the custom tokenizer replaces the regex-based default, so
# the unused default pattern is switched off explicitly instead of being left
# for scikit-learn to report as ignored.
vectorizer = TfidfVectorizer(
    stop_words=token_stop,
    tokenizer=tokenizer,
    token_pattern=None,
)
tfidf_mat = vectorizer.fit_transform(book_description['description'].values)  # -> (num_sentences, num_vocabulary)



In [22]:
# Try the recommender on a short free-text query.
test_query = 'detective story'
best_index = get_recommendations_tfidf(test_query, tfidf_mat)

# best_index holds row positions of tfidf_mat, which was fitted on
# book_description in row order, so rows are selected by position (.iloc).
result = book_description.iloc[best_index][['Title', 'description']]
result

Unnamed: 0,Title,description
389693,August is a Good Time for Killing: And Other B...,A collection of detective mystery stories.
666089,British Women Mystery Writers: Six Authors of ...,Many aspects of British detective fiction are ...
693799,THE DRAMATIZED OLD TESTAMENT - Two Volume Set ...,"Those of us hearing a ""Well Told Story"" may be..."
891238,Twelve Mystery Stories (Oxford Twelves),"The field of detective fiction is vast, and Th..."
306547,The Early Fears,"Features 39 of the author's short stories, inc..."
2175547,They're All Trying to Kill Me!: (Or How I Mana...,The setting of the story takes place in the my...
196641,a capital affair,"This story is not a story of the author, but t..."
293033,Easy True Stories: A Picture-Based Beginning R...,A couple feels something moving inside their m...
2651668,More True Stories: A Beginning Reader,A couple feels something moving inside their m...
2260115,True Stories in the News: A Beginning Reader,A couple feels something moving inside their m...
