# Document similarity

In [20]:
import pandas as pd
import numpy as np
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [3]:
df = pd.read_csv('data/amazon_alexa.tsv', sep='\t')
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [None]:
df.shape

## Load language model, punctuation and stopwords

In [4]:
# load (medium-size) English language model and stopwords
nlp = spacy.load('en_core_web_md')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# list of punctuation
punctuations = string.punctuation

## Custom tokenizer

In [5]:
# creating our tokenizer function
def custom_tokenizer(sentence):
    
    # remove newline chars, punctuation
    exclude = ['\n', '’', *string.punctuation]
    sentence = ''.join(ch for ch in sentence if ch not in exclude)
    
    # creating our token object, which is used to create documents with linguistic annotations.
    tokens = nlp(sentence)

    # lemmatization, lowercase, strip extra whitespace
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens ]
#     tokens = [ w.lower_ for w in tokens ]

    # remove stopwords
    tokens = [ w for w in tokens if w not in stop_words ]

    # return preprocessed list of tokens
    return tokens

In [6]:
# test it out
df.loc[:5, 'verified_reviews'].apply(custom_tokenizer)

0                                         [love, echo]
1                                               [love]
2    [play, game, answer, question, correctly, alex...
3    [lot, fun, thing, 4, yr, old, learn, dinosaur,...
4                                              [music]
5    [receive, echo, gift, need, bluetooth, play, m...
Name: verified_reviews, dtype: object

In [8]:
print(f'Original:\n{df.loc[2, "verified_reviews"]}\n')
print(f'Processed:\n{" ".join(custom_tokenizer(df.loc[2, "verified_reviews"]))}')

Original:
Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you.  I like being able to turn lights on and off while away from home.

Processed:
play game answer question correctly alexa wrong answer  like able turn light away home


## TFIDF similarity

In [14]:
vec = TfidfVectorizer(tokenizer=custom_tokenizer, max_features=1500)

In [15]:
X = vec.fit_transform(df['verified_reviews'])

In [16]:
X.toarray().shape

(3150, 1500)

In [17]:
X = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
X.head()

Unnamed: 0,Unnamed: 1,"""",',1,10,100,15,1st,2,20,...,❤,⭐,️,🏻,👍,💋,😁,😂,😄,😍
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.123961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
nbrs = NearestNeighbors(n_neighbors=4, metric='cosine', n_jobs=-1).fit(X)

distances, indices = nbrs.kneighbors(X)

# add columns for 3 nearest neighbours
for i in range(1,4):
    df[f'nn_{i}'] = indices[:,i]
    df[f'nn_{i}_dist'] = distances[:,i]
    
df.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback,nn_1,nn_1_dist,nn_2,nn_2_dist,nn_3,nn_3_dist
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,265,0.0,960,0.0,825,0.0
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,461,0.0,1366,0.0,2182,0.0
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,441,0.520152,2578,0.538588,2929,0.538588
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,2623,0.57479,2974,0.57479,2401,0.586791
4,5,31-Jul-18,Charcoal Fabric,Music,1,2885,0.293424,2534,0.293424,1158,0.382815
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1,700,0.0,3127,0.565112,2776,0.565112
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1,701,0.0,1228,0.672364,330,0.677549
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1,702,0.0,987,0.578247,292,0.578247
8,5,30-Jul-18,Heather Gray Fabric,looks great,1,8,0.0,703,0.0,1858,0.35433
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1,704,0.0,1599,0.47181,2817,0.544518


## Document vector similarity

In [22]:
# get document vectors for each card
X = np.vstack([i for i in df['verified_reviews'].apply(lambda x: nlp(x).vector)])

In [23]:
nbrs = NearestNeighbors(n_neighbors=4, metric='cosine', n_jobs=-1).fit(X)

distances, indices = nbrs.kneighbors(X)

# add columns for 3 nearest neighbours
for i in range(1,4):
    df[f's_{i}'] = indices[:,i]
    df[f's_{i}_dist'] = distances[:,i]
    
df.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback,nn_1,nn_1_dist,nn_2,nn_2_dist,nn_3,nn_3_dist,s_1,s_1_dist,s_2,s_2_dist,s_3,s_3_dist
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,265,0.0,960,0.0,825,0.0,1069,0.03196222,1095,0.031962,1568,0.032799
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,461,0.0,1366,0.0,2182,0.0,461,0.04017389,2239,0.040174,1412,0.040174
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,441,0.520152,2578,0.538588,2929,0.538588,1322,0.02097732,1440,0.021285,1380,0.02141
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,2623,0.57479,2974,0.57479,2401,0.586791,1472,0.03344595,702,0.034653,7,0.034653
4,5,31-Jul-18,Charcoal Fabric,Music,1,2885,0.293424,2534,0.293424,1158,0.382815,781,0.1658735,86,0.165874,937,0.244975
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1,700,0.0,3127,0.565112,2776,0.565112,5,4.768372e-07,1828,0.017202,258,0.018035
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1,701,0.0,1228,0.672364,330,0.677549,6,0.0,953,0.012108,258,0.012108
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1,702,0.0,987,0.578247,292,0.578247,7,2.384186e-07,1256,0.021797,1515,0.022419
8,5,30-Jul-18,Heather Gray Fabric,looks great,1,8,0.0,703,0.0,1858,0.35433,703,0.0,1465,0.117256,1793,0.125488
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1,704,0.0,1599,0.47181,2817,0.544518,704,0.0,231,0.039261,926,0.039261


## Compare results

In [30]:
def show_nearest(dataframe, idx, cosine=True):
    
    cols = ['s_1', 's_2', 's_3']
    if cosine:
        cols = ['nn_1', 'nn_2', 'nn_3']
    
    print(f'Rating = {dataframe.loc[idx, "rating"]}')
    print('Text:', dataframe.loc[idx, 'verified_reviews'], sep='\n', end='\n\n')
    
    for i in cols:
        nn = dataframe.loc[idx, i]
        
        print(f'{i}, Distance = {dataframe.loc[idx, i+"_dist"]:.2f}, Rating = {dataframe.loc[idx, "rating"]}')
        print('Text:', dataframe.loc[nn, 'verified_reviews'], sep='\n', end='\n\n')

In [31]:
# show tfidf and embedding results for first 5 articles
for i in range(5):
    print('-'*30, 'DOCUMENT', i+1, 'TFIDF', '-'*30)
    show_nearest(df, i)
    print('-'*35, 'EMBEDDING', '-'*35)
    show_nearest(df, i, cosine=False)
    print('-'*40, '-'*40, '\n')

------------------------------ DOCUMENT 1 TFIDF ------------------------------
Rating = 5
Text:
Love my Echo!

nn_1, Distance = 0.00, Rating = 5
Text:
Love my Echo

nn_2, Distance = 0.00, Rating = 5
Text:
Love my Echo

nn_3, Distance = 0.00, Rating = 5
Text:
Love our echo

----------------------------------- EMBEDDING -----------------------------------
Rating = 5
Text:
Love my Echo!

s_1, Distance = 0.03, Rating = 5
Text:
Love my Echo Spot!

s_2, Distance = 0.03, Rating = 5
Text:
Love my Echo Spot!

s_3, Distance = 0.03, Rating = 5
Text:
Love my echo show!

---------------------------------------- ---------------------------------------- 

------------------------------ DOCUMENT 2 TFIDF ------------------------------
Rating = 5
Text:
Loved it!

nn_1, Distance = 0.00, Rating = 5
Text:
Love it!

nn_2, Distance = 0.00, Rating = 5
Text:
Love it

nn_3, Distance = 0.00, Rating = 5
Text:
Love it love it love it

----------------------------------- EMBEDDING ----------------------------------