In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# import spacy
# from spacy.cli import download
# print(download('en'))

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
stopset = set(stopwords.words('english'))

In [7]:
data = pd.read_csv('wine.csv', index_col = 0)

In [8]:
data['reviews'][2]

'Lovely bouquet, first off vanilla balanced by slightly ripened plums and blackberries with a hint of chocolate and a smooth but not too sweet finish. Brilliant. 4.3'

In [9]:
# import nltk
# nltk.download('omw-1.4')

In [10]:
# ## Function to clean the reviews and tokenize and lemmatize it.

# def clean(doc):
#     doc = str(doc)
#     count_vectorizer = CountVectorizer()
#     vector_matrix = count_vectorizer.fit_transform([doc])    
#     tokens = count_vectorizer.get_feature_names_out()
#     ## Cleaning the token of stopwords and punctualtion (len > 2)
#     cleanup = [token.lower() for token in tokens if token.lower() not in stopset and  len(token)>2]
#     lemmatizer = WordNetLemmatizer()
#     fin = [lemmatizer.lemmatize(token) for token in cleanup]    
#     return(fin)

In [11]:
def clean(doc):
    doc = str(doc)
    doc = nlp(doc)
    tokens = [token.lemma_ for token in doc]
    ## Cleaning the token of stopwords and punctualtion (len > 2)
    cleanup = [token.lower() for token in tokens if token.lower() not in stopset and  len(token)>2]
    lemmatizer = WordNetLemmatizer()
    fin = [lemmatizer.lemmatize(token) for token in cleanup]
    return(list(set(fin)))

In [12]:
print(clean(data['reviews'][2]))

['slightly', 'sweet', '4.3', 'first', 'bouquet', 'chocolate', 'finish', 'ripen', 'plum', 'brilliant', 'smooth', 'vanilla', 'lovely', 'balance', 'blackberry', 'hint']


In [13]:
data['cleaned_reviews'] = data.apply(lambda x: clean(x['reviews']), axis = 1)

In [14]:
data.head()

Unnamed: 0,product_name,reviews,rating,cleaned_reviews
0,19 Crimes Red Blend 2020,Hard to fault this wine at its price. Vanilla ...,4.2,"[even, black, first, fruit, hit, drink, follow..."
1,19 Crimes Red Blend 2020,A treasure for the price. Especially with a cl...,4.5,"[bold, bit, tenner, club, card, price, second,..."
2,19 Crimes Red Blend 2020,"Lovely bouquet, first off vanilla balanced by ...",4.0,"[slightly, sweet, 4.3, first, bouquet, chocola..."
3,19 Crimes Red Blend 2020,"Awesome red blend! Well balanced, complex, fru...",4.0,"[braised, balanced, well, bodied, fruity, bras..."
4,19 Crimes Red Blend 2020,Another wine from 19 crimes what I tasted. Aga...,4.8,"[bold, tannin, mention, good, another, taste, ..."


In [15]:
def similarity(doc1, doc2):
    doc = [' '.join(doc1), ' '.join(doc2)]
    count_vectorizer = CountVectorizer()
    vector_matrix = count_vectorizer.fit_transform(doc)
    tokens = count_vectorizer.get_feature_names_out()
    
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    df = create_dataframe(cosine_similarity_matrix, ['doc_1','doc_2'])
    
    return(round(df['doc_1'][1], 2))

In [16]:
def create_dataframe(matrix, tokens):

    doc_names = [f'doc_{i+1}' for i, _ in enumerate(matrix)]
    df = pd.DataFrame(data=matrix, index=doc_names, columns=tokens)
    return(df)

In [17]:
## User defined attributes

doc1 = []
for i in range(0, 3):
    a = input('Enter attribute: ')
    doc1.append(a)

Enter attribute: sweet
Enter attribute: plum
Enter attribute: cherry


In [18]:
data['similarity_score'] = data.apply(lambda x: similarity(doc1, x['cleaned_reviews']), axis = 1)

In [19]:
data.head()

Unnamed: 0,product_name,reviews,rating,cleaned_reviews,similarity_score
0,19 Crimes Red Blend 2020,Hard to fault this wine at its price. Vanilla ...,4.2,"[even, black, first, fruit, hit, drink, follow...",0.23
1,19 Crimes Red Blend 2020,A treasure for the price. Especially with a cl...,4.5,"[bold, bit, tenner, club, card, price, second,...",0.0
2,19 Crimes Red Blend 2020,"Lovely bouquet, first off vanilla balanced by ...",4.0,"[slightly, sweet, 4.3, first, bouquet, chocola...",0.3
3,19 Crimes Red Blend 2020,"Awesome red blend! Well balanced, complex, fru...",4.0,"[braised, balanced, well, bodied, fruity, bras...",0.0
4,19 Crimes Red Blend 2020,Another wine from 19 crimes what I tasted. Aga...,4.8,"[bold, tannin, mention, good, another, taste, ...",0.14


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
def tfid(doc1, doc2):
    doc = [' '.join(doc1), ' '.join(doc2)]
    Tfidf_vect = TfidfVectorizer()
    
    vector_matrix = Tfidf_vect.fit_transform(doc)
    tokens = Tfidf_vect.get_feature_names_out()
    
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    df = create_dataframe(cosine_similarity_matrix, ['doc_1','doc_2'])
    
    return(round(df['doc_1'][1], 2))

In [22]:
data['tfid_similarity_score'] = data.apply(lambda x: tfid(doc1, x['cleaned_reviews']), axis = 1)

In [23]:
data.head()

Unnamed: 0,product_name,reviews,rating,cleaned_reviews,similarity_score,tfid_similarity_score
0,19 Crimes Red Blend 2020,Hard to fault this wine at its price. Vanilla ...,4.2,"[even, black, first, fruit, hit, drink, follow...",0.23,0.15
1,19 Crimes Red Blend 2020,A treasure for the price. Especially with a cl...,4.5,"[bold, bit, tenner, club, card, price, second,...",0.0,0.0
2,19 Crimes Red Blend 2020,"Lovely bouquet, first off vanilla balanced by ...",4.0,"[slightly, sweet, 4.3, first, bouquet, chocola...",0.3,0.19
3,19 Crimes Red Blend 2020,"Awesome red blend! Well balanced, complex, fru...",4.0,"[braised, balanced, well, bodied, fruity, bras...",0.0,0.0
4,19 Crimes Red Blend 2020,Another wine from 19 crimes what I tasted. Aga...,4.8,"[bold, tannin, mention, good, another, taste, ...",0.14,0.08


In [24]:
new_data = data[['product_name', 'similarity_score', 'reviews']]

In [25]:
new_data.head(2)

Unnamed: 0,product_name,similarity_score,reviews
0,19 Crimes Red Blend 2020,0.23,Hard to fault this wine at its price. Vanilla ...
1,19 Crimes Red Blend 2020,0.0,A treasure for the price. Especially with a cl...


In [26]:
new_data1 = data[['product_name', 'tfid_similarity_score', 'reviews']]

In [27]:
new_data1.head(2)

Unnamed: 0,product_name,tfid_similarity_score,reviews
0,19 Crimes Red Blend 2020,0.15,Hard to fault this wine at its price. Vanilla ...
1,19 Crimes Red Blend 2020,0.0,A treasure for the price. Especially with a cl...


In [28]:
new_data.to_csv('similarity.csv')

In [29]:
new_data1.to_csv('tfidf_similarity.csv')

In [35]:
# a = ['apple','banana','cherry', 'hey']
# b = ['the','Banana','disk']
# c = [' '.join(a), ' '.join(b)]

In [36]:
# count_vectorizer = CountVectorizer()
# vector_matrix = count_vectorizer.fit_transform(c)
# vector_matrix

In [37]:
# vector_matrix.toarray()

In [31]:
# tokens

In [38]:
# tokens = count_vectorizer.get_feature_names_out()
# create_dataframe(vector_matrix.toarray(),tokens)

In [39]:
# cosine_similarity_matrix = cosine_similarity(vector_matrix)
# create_dataframe(cosine_similarity_matrix,['doc_1','doc_2'])['doc_1'][1]