# Sentiment analysis and word embedding

This notebooks contains code to perform sentiment analysis on wine reviews and create word embeddings for wine pairing in order to link wine with food recipes.

In [2]:
import json
import numpy as np
from deep_translator import GoogleTranslator

In [None]:
with open('wine_scrapped_data.json') as f:
    wine_data_raw = json.load(f)

translator = GoogleTranslator(source='auto', target='en')

test_data = wine_data_raw[3]
print(test_data)

## 1. Sentiment analysis

In [2]:
from pysentimiento import create_analyzer

reviews = test_data['reviews']
sentiment_analyzer = create_analyzer(task="sentiment", lang="en")
emotion_analyzer = create_analyzer(task="emotion", lang="en")

In [3]:
def sentiment_analysis(reviews):
    sentiments = []
    emotions = []
    for review in reviews:
        review = translator.translate(review)
        sentiment = sentiment_analyzer.predict(review).probas
        emotion = emotion_analyzer.predict(review).probas
        sentiments.append(sentiment)
        emotions.append(emotion)
    return {key: np.mean([sentiment[key] for sentiment in sentiments]) for key in sentiments[0]}, {key: np.mean([emotion[key] for emotion in emotions]) for key in emotions[0]}

## 2. Word embedding

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('all')

In [5]:
custom_stopwords = set(stopwords.words('english') + ["etc", "hard", "soft", "lean", "seasoned", "ripened"])

In [7]:
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [8]:
def generate_embeddings(pairings):
    tokens = []
    for pairing in pairings:
        pairing = translator.translate(pairing)
        pairing = pairing.lower()
        pairing_tokens = word_tokenize(pairing)
        pairing_tokens = [token for token in pairing_tokens if token not in string.punctuation]
        pairing_tokens = [token for token in pairing_tokens if token not in custom_stopwords]
        tokens.extend([token for token in pairing_tokens])
    # group every word in a single list and remove duplicates
    tokens = list(set(tokens))
    # replace "game" by "wildflow"
    tokens = [token.replace('game', 'wildfowl') for token in tokens]
    return np.mean([model[token] for token in tokens], axis=0)

## 3. Process all wine data

In [None]:
errors_count = 0
wine_data = wine_data_raw.copy()
for wine in wine_data:
    try:
        wine['reviews_sentiment'], wine['reviews_emotion'] = sentiment_analysis(wine['reviews'])
        wine['pairings_embedding'] = generate_embeddings(wine['pairings'])
    except:
        print(wine['url'])
        # delete the wine
        wine_data.remove(wine)
        errors_count += 1
        continue

print("There were {} errors out of {} wines".format(errors_count, len(wine_data_raw)))

In [27]:
wine_data_copy = wine_data.copy()

In [28]:
# convert the numpy arrays to lists
for wine in wine_data_copy:
    # check if embedding pairing exists
    if 'pairings_embedding' in wine:
        # check if it is a numpy array
        if isinstance(wine['pairings_embedding'], np.ndarray):
            wine['pairings_embedding'] = wine['pairings_embedding'].tolist()
    else:
        print("No pairing embedding for {}".format(wine['url']))
        # remove the wine
        wine_data_copy.remove(wine)

print("There are {} wines left out of {}".format(len(wine_data_copy), len(wine_data_raw)))

with open('wine_sentiment_embedding.json', 'w') as f:
    json.dump(wine_data_copy, f, indent=4)

No pairing embedding for https://www.vivino.com/CH/fr/cave-du-rhodan-diversitas-hommage-pinot-noir/w/6929693?year=2019&price_id=27651844
No pairing embedding for https://www.vivino.com/CH/fr/gregor-kuonen-caveau-de-salquenen-merlot/w/2100391?year=2019&price_id=30828418
No pairing embedding for https://www.vivino.com/CH/fr/angelo-delea-carato-merlot/w/1211198?year=2020&price_id=31313439
No pairing embedding for https://www.vivino.com/CH/fr/vincent-girardin-meursault-1er-cru-les-perrieres/w/1415377?year=2020&price_id=30797162
No pairing embedding for https://www.vivino.com/CH/fr/domaine-de-l-aurage-castillon-cotes-de-bordeaux/w/1240817?year=2020&price_id=32194721
No pairing embedding for https://www.vivino.com/CH/fr/yonfigeac-saint-emilion-grand-cru-grand-cru-classe/w/89399?year=2018&price_id=31266318
No pairing embedding for https://www.vivino.com/CH/fr/allegria-la-belle-histoire/w/6270401?year=2020&price_id=30549885
No pairing embedding for https://www.vivino.com/CH/fr/chateau-le-puy-b

## Others

In [24]:
# load the wine_sentiment_embedding.json file
with open('wine_sentiment_embedding.json') as f:
    wine_data_embedding = json.load(f)

print(wine_data_embedding[0]['reviews_sentiment'])

# emotion : String
# sentiment : number
# average_rating : number
for wine in wine_data_embedding:
    if 'reviews_sentiment' not in wine:
        print(wine['url'])
        wine_data_embedding.remove(wine)
        continue
    wine['sentiment'] = wine['reviews_sentiment']['POS']
    wine.pop('reviews_sentiment')
    # delete "others" emotion
    wine['reviews_emotion'].pop('others')
    wine['emotion'] = max(wine['reviews_emotion'], key=wine['reviews_emotion'].get)
    wine.pop('reviews_emotion')
    # convert average rating to a number
    wine['average_rating'] = float(wine['average_rating'])

# create one file per 100 wines
nb_wine = 100
for i in range(0, len(wine_data_embedding), nb_wine):
    with open('wine_sentiment_embedding_{}.json'.format(i), 'w') as f:
        # one element per line
        f.write('[' +
                ',\n'.join(json.dumps(wine) for wine in wine_data_embedding[i:i + nb_wine]) +
                ']')

{'NEG': 0.30208458988151204, 'NEU': 0.03807370116313299, 'POS': 0.6598416943258295}
https://www.vivino.com/CH/fr/bodegas-los-astrales-sl-christina-ribera-del-duero/w/79961?year=2018&price_id=28743271
