In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# import spacy
# from spacy.cli import download
# print(download('en'))

In [3]:
# import nltk
# nltk.download('wordnet')

In [4]:
# import nltk
# nltk.download('omw-1.4')

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
stopset = set(stopwords.words('english'))

In [7]:
data = pd.read_csv('wine.csv', index_col = 0)

In [8]:
data['reviews'][2]

'Lovely bouquet, first off vanilla balanced by slightly ripened plums and blackberries with a hint of chocolate and a smooth but not too sweet finish. Brilliant. 4.3'

In [None]:
data.isnull().sum()

In [9]:
# import nltk
# nltk.download('omw-1.4')

In [10]:
## Function to clean the reviews and tokenize and lemmatize it.

def clean(doc):
    doc = str(doc)
    count_vectorizer = CountVectorizer()
    vector_matrix = count_vectorizer.fit_transform([doc])    
    tokens = count_vectorizer.get_feature_names_out()
    ## Cleaning the token of stopwords and punctualtion (len > 2)
    cleanup = [token.lower() for token in tokens if token.lower() not in stopset and  len(token)>2]
    lemmatizer = WordNetLemmatizer()
    fin = [lemmatizer.lemmatize(token) for token in cleanup]    
    return(fin)

In [11]:
def clean(doc):
    doc = str(doc)
    doc = nlp(doc)
    tokens = [token.lemma_ for token in doc]
    ## Cleaning the token of stopwords and punctualtion (len > 2)
    cleanup = [token.lower() for token in tokens if token.lower() not in stopset and  len(token)>2]
    lemmatizer = WordNetLemmatizer()
    fin = [lemmatizer.lemmatize(token) for token in cleanup]
    return(list(set(fin)))

In [12]:
print(clean(data['reviews'][2]))

['brilliant', 'bouquet', '4.3', 'smooth', 'first', 'balance', 'sweet', 'vanilla', 'blackberry', 'slightly', 'ripen', 'plum', 'finish', 'hint', 'chocolate', 'lovely']


In [13]:
data['cleaned_reviews'] = data.apply(lambda x: clean(x['reviews']), axis = 1)

In [14]:
data.head()

Unnamed: 0,product_name,reviews,rating,cleaned_reviews
0,19 Crimes Red Blend 2020,Hard to fault this wine at its price. Vanilla ...,4.2,"[jam, hit, first, easy, oak, hint, red, palate..."
1,19 Crimes Red Blend 2020,A treasure for the price. Especially with a cl...,4.5,"[third, especially, get, week, tenner, jammy, ..."
2,19 Crimes Red Blend 2020,"Lovely bouquet, first off vanilla balanced by ...",4.0,"[brilliant, bouquet, 4.3, smooth, first, balan..."
3,19 Crimes Red Blend 2020,"Awesome red blend! Well balanced, complex, fru...",4.0,"[balanced, meat, oak, blend, medium, well, red..."
4,19 Crimes Red Blend 2020,Another wine from 19 crimes what I tasted. Aga...,4.8,"[taste, acidic, raspberry, balanced, oak, good..."


In [15]:
def similarity(doc1, doc2):
    doc = [' '.join(doc1), ' '.join(doc2)]
    count_vectorizer = CountVectorizer()
    vector_matrix = count_vectorizer.fit_transform(doc)
    tokens = count_vectorizer.get_feature_names_out()
    
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    df = create_dataframe(cosine_similarity_matrix, ['doc_1','doc_2'])
    
    return(round(df['doc_1'][1], 2))

In [16]:
def create_dataframe(matrix, tokens):

    doc_names = [f'doc_{i+1}' for i, _ in enumerate(matrix)]
    df = pd.DataFrame(data=matrix, index=doc_names, columns=tokens)
    return(df)

In [17]:
## User defined attributes

doc1 = []
for i in range(0, 3):
    a = input('Enter attribute: ')
    doc1.append(a)


Enter attribute: sweet
Enter attribute: cherry
Enter attribute: apple


In [18]:
data['similarity_score'] = data.apply(lambda x: similarity(doc1, x['cleaned_reviews']), axis = 1)

In [19]:
data.head()

Unnamed: 0,product_name,reviews,rating,cleaned_reviews,similarity_score
0,19 Crimes Red Blend 2020,Hard to fault this wine at its price. Vanilla ...,4.2,"[jam, hit, first, easy, oak, hint, red, palate...",0.23
1,19 Crimes Red Blend 2020,A treasure for the price. Especially with a cl...,4.5,"[third, especially, get, week, tenner, jammy, ...",0.0
2,19 Crimes Red Blend 2020,"Lovely bouquet, first off vanilla balanced by ...",4.0,"[brilliant, bouquet, 4.3, smooth, first, balan...",0.15
3,19 Crimes Red Blend 2020,"Awesome red blend! Well balanced, complex, fru...",4.0,"[balanced, meat, oak, blend, medium, well, red...",0.0
4,19 Crimes Red Blend 2020,Another wine from 19 crimes what I tasted. Aga...,4.8,"[taste, acidic, raspberry, balanced, oak, good...",0.14


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
def tfid(doc1, doc2):
    doc = [' '.join(doc1), ' '.join(doc2)]
    Tfidf_vect = TfidfVectorizer()
    
    vector_matrix = Tfidf_vect.fit_transform(doc)
    tokens = Tfidf_vect.get_feature_names_out()
    
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    df = create_dataframe(cosine_similarity_matrix, ['doc_1','doc_2'])
    
    return(round(df['doc_1'][1], 2))

In [22]:
data['tfid_similarity_score'] = data.apply(lambda x: tfid(doc1, x['cleaned_reviews']), axis = 1)

In [23]:
data.head()

Unnamed: 0,product_name,reviews,rating,cleaned_reviews,similarity_score,tfid_similarity_score
0,19 Crimes Red Blend 2020,Hard to fault this wine at its price. Vanilla ...,4.2,"[jam, hit, first, easy, oak, hint, red, palate...",0.23,0.15
1,19 Crimes Red Blend 2020,A treasure for the price. Especially with a cl...,4.5,"[third, especially, get, week, tenner, jammy, ...",0.0,0.0
2,19 Crimes Red Blend 2020,"Lovely bouquet, first off vanilla balanced by ...",4.0,"[brilliant, bouquet, 4.3, smooth, first, balan...",0.15,0.08
3,19 Crimes Red Blend 2020,"Awesome red blend! Well balanced, complex, fru...",4.0,"[balanced, meat, oak, blend, medium, well, red...",0.0,0.0
4,19 Crimes Red Blend 2020,Another wine from 19 crimes what I tasted. Aga...,4.8,"[taste, acidic, raspberry, balanced, oak, good...",0.14,0.08


In [24]:
new_data = data[['product_name', 'similarity_score', 'reviews']]

In [25]:
new_data.head(2)

Unnamed: 0,product_name,similarity_score,reviews
0,19 Crimes Red Blend 2020,0.23,Hard to fault this wine at its price. Vanilla ...
1,19 Crimes Red Blend 2020,0.0,A treasure for the price. Especially with a cl...


In [26]:
new_data1 = data[['product_name', 'tfid_similarity_score', 'reviews']]

In [27]:
new_data1.head(2)

Unnamed: 0,product_name,tfid_similarity_score,reviews
0,19 Crimes Red Blend 2020,0.15,Hard to fault this wine at its price. Vanilla ...
1,19 Crimes Red Blend 2020,0.0,A treasure for the price. Especially with a cl...


In [28]:
# new_data.to_csv('similarity.csv')

In [29]:
# new_data1.to_csv('tfidf_similarity.csv'

In [30]:
# a = ['apple','banana','cherry', 'hey']
# b = ['the','Banana','disk']
# c = [' '.join(a), ' '.join(b)]

In [31]:
# count_vectorizer = CountVectorizer()
# vector_matrix = count_vectorizer.fit_transform(c)
# vector_matrix

In [32]:
# vector_matrix.toarray()

In [33]:
# tokens

In [34]:
# tokens = count_vectorizer.get_feature_names_out()
# create_dataframe(vector_matrix.toarray(),tokens)

In [35]:
# cosine_similarity_matrix = cosine_similarity(vector_matrix)
# create_dataframe(cosine_similarity_matrix,['doc_1','doc_2'])['doc_1'][1]

## Sentiment Aanalysis

In [36]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def get_sentiment(review):
    sid = SentimentIntensityAnalyzer()
    newWords = {'oaky': 2.0, 
                'rich': 2.0, 
                'complex': 2.0,
                'balance': 2.0,
                'angular' : 2.0,
                'backbone' : 2.0,
                'aggressive': -2.0, 
                'acidic': -2.0,
                'alcoholic': -2.0 }
    sid.lexicon.update(newWords)
    score = sid.polarity_scores(review)
    compound = score.get('compound')
    return compound

data['sentiment_score'] = data.apply(lambda x: get_sentiment(x['reviews'].lower()), axis = 1)

data.head()

Unnamed: 0,product_name,reviews,rating,cleaned_reviews,similarity_score,tfid_similarity_score,sentiment_score
0,19 Crimes Red Blend 2020,Hard to fault this wine at its price. Vanilla ...,4.2,"[jam, hit, first, easy, oak, hint, red, palate...",0.23,0.15,0.7579
1,19 Crimes Red Blend 2020,A treasure for the price. Especially with a cl...,4.5,"[third, especially, get, week, tenner, jammy, ...",0.0,0.0,0.6124
2,19 Crimes Red Blend 2020,"Lovely bouquet, first off vanilla balanced by ...",4.0,"[brilliant, bouquet, 4.3, smooth, first, balan...",0.15,0.08,0.6575
3,19 Crimes Red Blend 2020,"Awesome red blend! Well balanced, complex, fru...",4.0,"[balanced, meat, oak, blend, medium, well, red...",0.0,0.0,0.9402
4,19 Crimes Red Blend 2020,Another wine from 19 crimes what I tasted. Aga...,4.8,"[taste, acidic, raspberry, balanced, oak, good...",0.14,0.08,0.8245


In [40]:
data['average_similarity_and_sentiment'] = data.apply(
    lambda x: (x['sentiment_score'] + x['tfid_similarity_score'])/2, axis = 1)

In [41]:
data.head()

Unnamed: 0,product_name,reviews,rating,cleaned_reviews,similarity_score,tfid_similarity_score,sentiment_score,average_similarity_and_sentiment
0,19 Crimes Red Blend 2020,Hard to fault this wine at its price. Vanilla ...,4.2,"[jam, hit, first, easy, oak, hint, red, palate...",0.23,0.15,0.7579,0.45395
1,19 Crimes Red Blend 2020,A treasure for the price. Especially with a cl...,4.5,"[third, especially, get, week, tenner, jammy, ...",0.0,0.0,0.6124,0.3062
2,19 Crimes Red Blend 2020,"Lovely bouquet, first off vanilla balanced by ...",4.0,"[brilliant, bouquet, 4.3, smooth, first, balan...",0.15,0.08,0.6575,0.36875
3,19 Crimes Red Blend 2020,"Awesome red blend! Well balanced, complex, fru...",4.0,"[balanced, meat, oak, blend, medium, well, red...",0.0,0.0,0.9402,0.4701
4,19 Crimes Red Blend 2020,Another wine from 19 crimes what I tasted. Aga...,4.8,"[taste, acidic, raspberry, balanced, oak, good...",0.14,0.08,0.8245,0.45225
