In [59]:
import os
import pandas as pd
import re
import string
from tqdm import tqdm
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import logging
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import random
from collections import defaultdict
from multiprocessing import Pool
import functools

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package punkt to C:\Users\Karl
[nltk_data]     Zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Karl
[nltk_data]     Zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Feature extraction for wine reviews
Basically, we want to extract taste features \[sweetness, acidity, salinity, piquancy, fattiness, bitterness\] in numerical values from wine reviews. The feature extraction idea is similar to Roald's work. Firstly, read the wine reviews file.

In [2]:
directory = r"Wine Reviews/"

if_start = True

for file in os.listdir(directory):
    file_location = directory + str(file)
    if if_start:
        wine_df = pd.read_csv(file_location, encoding='latin-1')
        if_start = False
    else:
        append_df = pd.read_csv(file_location, encoding='latin-1', low_memory=False)  # set low_memory to False to make sure that the feature types don't change for each file
        wine_df = pd.concat([wine_df, append_df], axis=0)

wine_df.drop_duplicates(subset=['Name'], inplace=True)

In [3]:
wine_df.head(3)

Unnamed: 0.1,Unnamed: 0,Alcohol,Appellation,Bottle Size,Category,Country,Date Published,Description,Designation,Importer,Name,Price,Province,Rating,Region,Reviewer,Reviewer Twitter Handle,Subregion,User Avg Rating,Variety,Vintage,Winery,ï»¿
0,0.0,14.9%,"Napa Valley, Napa, California, US",750 ml,Red,US,7/1/2019,Made in partnership with Alpha Omega Winery's ...,MR,,Michel Rolland Napa Valley 2014 MR Red (Napa V...,$175,California,95.0,Napa,Virginie Boone,@vboone,Napa Valley,Not rated yet [Add Your Review],Bordeaux-style Red Blend,2014.0,Michel Rolland Napa Valley,
1,1.0,14.5%,"Columbia Valley (WA), Columbia Valley, Washing...",750 ml,Red,US,7/1/2019,Cabernet Sauvignon from esteemed Cold Creek Vi...,,,21 Grams 2014 Red (Columbia Valley (WA)),$125,Washington,94.0,Columbia Valley,Sean P. Sullivan,@wawinereport,Columbia Valley (WA),Not rated yet [Add Your Review],Bordeaux-style Red Blend,2014.0,21 Grams,
2,2.0,13.6%,"Paso Robles, Central Coast, California, US",750 ml,Red,US,7/1/2019,Lush black-raspberry and black-plum aromas com...,Nikiara,,Le Vigne 2016 Nikiara Red (Paso Robles),$51,California,92.0,Central Coast,Matt Kettmann,@mattkettmann,Paso Robles,Not rated yet [Add Your Review],Bordeaux-style Red Blend,2016.0,Le Vigne,


<br>
Normalize the grape varieties in the dataset.

In [4]:
variety_mapping = {'Shiraz': 'Syrah', 'Pinot Gris': 'Pinot Grigio', 'Pinot Grigio/Gris': 'Pinot Grigio', 
                   'Garnacha, Grenache': 'Grenache', 'Garnacha': 'Grenache', 'CarmenÃ¨re': 'Carmenere',
                    'GrÃ¼ner Veltliner': 'Gruner Veltliner', 'TorrontÃ©s': 'Torrontes', 
                   'RhÃ´ne-style Red Blend': 'Rhone-style Red Blend', 'AlbariÃ±o': 'Albarino',
                  'GewÃ¼rztraminer': 'Gewurztraminer', 'RhÃ´ne-style White Blend': 'Rhone-style White Blend',
                  'SpÃƒÂ¤tburgunder, Pinot Noir': 'Pinot Noir', 'Sauvignon, Sauvignon Blanc': 'Sauvignon Blanc',
                  'Pinot Nero, Pinot Noir': 'Pinot Noir', 'Malbec-Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                  'Meritage, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Garnacha, Grenache': 'Grenache',
                   'FumÃ© Blanc': 'Sauvignon Blanc', 'Cabernet Sauvignon-Cabernet Franc, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Cabernet Sauvignon-Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet Blend, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Malbec-Cabernet Sauvignon, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Merlot-Cabernet Franc, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Merlot-Cabernet Sauvignon, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet Franc-Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Merlot-Malbec, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Primitivo, Zinfandel': 'Zinfandel',
                   'AragonÃªs, Tempranillo': 'Aragonez, Tempranillo'
                  }

def standardize_varieties(variety_name):
    if variety_name in variety_mapping:
        return variety_mapping[variety_name]
    else:
        return variety_name      

In [5]:
wine_df_clean = wine_df.copy()
wine_df_clean['Variety'] = wine_df_clean['Variety'].apply(standardize_varieties)

<br>
This is to fill NaN or 0 with 'none'.

In [6]:
order_of_geographies = ['Subregion', 'Region', 'Province', 'Country']

def replace_nan_or_zero(value):
    if str(value) == '0' or str(value) == 'nan':
        return 'none'
    else:
        return value

for o in order_of_geographies:
    wine_df_clean[o] = wine_df_clean[o].apply(replace_nan_or_zero)

wine_df_clean.loc[:, order_of_geographies].fillna('none', inplace=True)

<br>
We extract features based on a combination of geographies level. Therefore, wines with combinations of ['Variety', 'Country', 'Province', 'Region', 'Subregion'] exist less or equal to 1 time are not considered.

In [7]:
variety_geo = wine_df_clean.groupby(['Variety', 'Country', 'Province', 'Region', 'Subregion']).size().reset_index().rename(columns={0:'count'})
variety_geo_sliced = variety_geo.loc[variety_geo['count'] > 1]

vgeos_df = pd.DataFrame(variety_geo_sliced, columns=['Variety', 'Country', 'Province', 'Region', 'Subregion', 'count'])
vgeos_df.to_csv('varieties_all_geos.csv')

In [8]:
# variety_geo_df = pd.read_csv('varieties_all_geos_normalized.csv', index_col=0)
variety_geo_df = vgeos_df.copy()
wine_df_merged = pd.merge(left=wine_df_clean, right=variety_geo_df, left_on=['Variety', 'Country', 'Province', 'Region', 'Subregion'],
                         right_on=['Variety', 'Country', 'Province', 'Region', 'Subregion'])
wine_df_merged.drop(['Unnamed: 0', 'Appellation', 'Bottle Size', 'Category',
                     'Date Published', 'Designation', 'Importer', 'Province', 'Rating', 
                     'Reviewer', 'Reviewer Twitter Handle', 'User Avg Rating', 'count'], 
                    axis=1, inplace=True)
wine_df_merged = wine_df_merged.iloc[:, :-1]

In [9]:
wine_df_merged.head()

Unnamed: 0,Alcohol,Country,Description,Name,Price,Region,Subregion,Variety,Vintage,Winery
0,14.9%,US,Made in partnership with Alpha Omega Winery's ...,Michel Rolland Napa Valley 2014 MR Red (Napa V...,$175,Napa,Napa Valley,Bordeaux-style Red Blend,2014.0,Michel Rolland Napa Valley
1,14.5%,US,This is Cabernet Sauvignon-based with ample ad...,Soquel Vineyards 2015 Intreccio Library Select...,$75,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,Soquel Vineyards
2,14.2%,US,The producer's long-standing Bordeaux-style bl...,Flora Springs 2016 Trilogy Red (Napa Valley),$85,Napa,Napa Valley,Bordeaux-style Red Blend,2016.0,Flora Springs
3,14.5%,US,This blends a majority of Cabernet Sauvignon w...,Michael Pozzan 2015 Marianna Red (Napa Valley),$50,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,Michael Pozzan
4,14.1%,US,This Cabernet Sauvignon-dominant blend is smok...,Liparita 2016 Left Bank Reserve Red (Napa Valley),$100,Napa,Napa Valley,Bordeaux-style Red Blend,2016.0,Liparita


In [10]:
wine_df_merged_filtered = wine_df_merged.copy()
wine_df_merged_filtered.dropna(subset=['Description'], inplace=True)

<br>
Now it's time to use descriptor_mapping_tastes.csv to map the six tastes to reviews.

In [11]:
wine_reviews = list(wine_df_merged_filtered['Description'])
descriptor_mapping = pd.read_csv('descriptor_mapping_tastes.csv', encoding='latin1').set_index('raw descriptor')

core_tastes = ['sweet', 'acid', 'salt', 'piquant', 'fat', 'bitter']
descriptor_mappings = {}

for c in core_tastes:
    descriptor_mapping_filtered = descriptor_mapping.loc[descriptor_mapping['primary taste']==c]
    descriptor_mappings[c] = descriptor_mapping_filtered

<br>
This is the same function used for review vectorization.

In [12]:
stop_words = set(stopwords.words('english'))
punctuation_table = str.maketrans({key: None for key in string.punctuation})  # a table for removing punctuations
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    try:
        word_list = word_tokenize(raw_text)
        normalized_sentence = []
        for w in word_list:
            try:
                w = str(w)
                lower_case_word = str.lower(w)
                stemmed_word = sno.stem(lower_case_word)  # convert all words to their stems
                no_punctuation = stemmed_word.translate(punctuation_table)  # remove punctuations
                if len(no_punctuation) > 1 and no_punctuation not in stop_words:  # remove letters and stop words
                    normalized_sentence.append(no_punctuation)
            except:
                continue
        return normalized_sentence
    except:
        return ''

In [13]:
wine_trigram_model = Phraser.load('wine_trigram_model.pkl')

<br>
This is to grab key phrases for taste in each review, in the order of [sweetness, acidity, salinity, piquancy, fattiness, bitterness].

In [14]:
def return_descriptor_from_mapping(descriptor_mapping, word, core_taste):
    if word in list(descriptor_mapping.index):
        descriptor_to_return = descriptor_mapping['combined'][word]
        return descriptor_to_return
    else:
        return None

review_descriptors = []
for review in tqdm(wine_reviews, desc="Processing Wine Reviews"):
    taste_descriptors = []
    normalized_review = normalize_text(review)
    phrased_review = wine_trigram_model[normalized_review]

    for c in core_tastes:                                                      
        descriptors_only = [return_descriptor_from_mapping(descriptor_mappings[c], word, c) for word in phrased_review]
        no_nones = [str(d).strip() for d in descriptors_only if d is not None]
        descriptorized_review = ' '.join(no_nones)
        taste_descriptors.append(descriptorized_review)
    review_descriptors.append(taste_descriptors)

Processing Wine Reviews: 100%|██████████| 140340/140340 [04:46<00:00, 490.39it/s]


In [15]:
review_descriptors[13:17]

[['', 'high_acid high_acid', '', '', '', ''],
 ['', 'high_acid', '', 'pepper', '', 'high_tannin high_tannin'],
 ['dry', '', '', '', '', ''],
 ['', '', '', '', '', '']]

In [16]:
wine_word2vec_model = Word2Vec.load("word2vec.model")

<br>
In this step, basically we use TD-IDF to vectorize all terms in one kind of taste, and then use them as the weights for the results vectorized by the pre-trained Word2Vec model.

In [17]:
taste_descriptors = []
taste_vectors = []

for n, taste in enumerate(core_tastes):
    # print(taste)
    taste_words = [r[n] for r in review_descriptors]
    
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit(taste_words)
    dict_of_tfidf_weightings = dict(zip(X.get_feature_names(), X.idf_))
        
    wine_review_descriptors = []
    wine_review_vectors = []
    
    for d in taste_words:
        descriptor_count = 0
        weighted_review_terms = []
        terms = d.split(' ')
        for term in terms:
            if term in dict_of_tfidf_weightings.keys():
                tfidf_weighting = dict_of_tfidf_weightings[term]
                try:
                    word_vector = wine_word2vec_model.wv.get_vector(term).reshape(1, 300)
                    weighted_word_vector = tfidf_weighting * word_vector
                    weighted_review_terms.append(weighted_word_vector)
                    descriptor_count += 1
                except:
                    continue
            else:
                continue
        try:
            review_vector = sum(weighted_review_terms)/len(weighted_review_terms)
            review_vector = review_vector[0]
        except:
            review_vector = np.nan
        wine_review_vectors.append(review_vector)
        wine_review_descriptors.append(terms)
    
    taste_vectors.append(wine_review_vectors)
    taste_descriptors.append(wine_review_descriptors)
    

taste_vectors_t = list(map(list, zip(*taste_vectors)))
taste_descriptors_t = list(map(list, zip(*taste_descriptors)))

review_vecs_df = pd.DataFrame(taste_vectors_t, columns=core_tastes)

columns_taste_descriptors = [a + '_descriptors' for a in core_tastes]
review_descriptors_df = pd.DataFrame(taste_descriptors_t, columns=columns_taste_descriptors)

wine_df_vecs = pd.concat([wine_df_merged_filtered, review_descriptors_df, review_vecs_df], axis=1)

<br>
So in the result, it can be seen that for each review, the 6 taste features are presented in the form of TD-IDF weighted Word2Vec vectorized vectors.

In [18]:
wine_df_vecs.head(3)

Unnamed: 0,Alcohol,Country,Description,Name,Price,Region,Subregion,Variety,Vintage,Winery,sweet_descriptors,acid_descriptors,salt_descriptors,piquant_descriptors,fat_descriptors,bitter_descriptors,sweet,acid,salt,piquant,fat,bitter
0,14.9%,US,Made in partnership with Alpha Omega Winery's ...,Michel Rolland Napa Valley 2014 MR Red (Napa V...,$175,Napa,Napa Valley,Bordeaux-style Red Blend,2014.0,Michel Rolland Napa Valley,[],[],[],[],[],[low_tannin],,,,,,
1,14.5%,US,This is Cabernet Sauvignon-based with ample ad...,Soquel Vineyards 2015 Intreccio Library Select...,$75,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,Soquel Vineyards,[],[],[],[],[],[],,,,,,
2,14.2%,US,The producer's long-standing Bordeaux-style bl...,Flora Springs 2016 Trilogy Red (Napa Valley),$85,Napa,Napa Valley,Bordeaux-style Red Blend,2016.0,Flora Springs,[dry],[high_acid],[],[],[],[high_tannin],"[6.8185306, 5.586836, -0.19776316, -0.4909574,...","[0.76527476, 1.4421346, -0.6498075, -0.9933783...",,,,"[1.2513138, -0.09633834, -0.5334033, -0.106580..."


<br>
Simply apply averages for NaN in the taste features. After feature extraction, there happens a phenomenon that NaN shows for all salt feature. Therefore, we have to drop this feature. This may be due to the incomprehensiveness of the descriptor mapping file, or the review dataset may not contain enough content about salinity.

In [19]:
core_tastes_without_salt = ['sweet', 'acid', 'piquant', 'fat', 'bitter']

avg_taste_vecs = dict()
for t in core_tastes_without_salt:
    review_arrays = wine_df_vecs[t].dropna()
    average_taste_vec = np.average(review_arrays)
    avg_taste_vecs[t] = average_taste_vec

for c in core_tastes_without_salt:
    for index, value in tqdm(wine_df_vecs[c].iteritems()):
        if not isinstance(value, np.ndarray):
            wine_df_vecs[c][index] = avg_taste_vecs[c]
            
wine_df_vecs = wine_df_vecs.drop(columns='salt')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine_df_vecs[c][index] = avg_taste_vecs[c]
140931it [02:12, 1067.44it/s]
140931it [02:34, 909.69it/s] 
140931it [01:52, 1247.22it/s]
140931it [01:50, 1277.22it/s]
140931it [02:37, 895.61it/s] 


<br>
In the end, use PCA to reduce the vector features into numerical values.

In [20]:
def reduce_dimension(df, taste, col):
    temp_array = df[taste].values
    features = np.vstack(temp_array)
    
    # meaning centering
    mean = np.mean(features, axis=0)
    features_centered = features - mean
    
    # standardization
    scaler = StandardScaler()
    features_standardized = scaler.fit_transform(features_centered)
    pca = PCA(n_components=1)
    features_pca = pca.fit_transform(features_standardized)
    feature_series = pd.Series(features_pca.flatten())
    df[col] = feature_series
    return df

In [21]:
wine_df_vecs = reduce_dimension(wine_df_vecs, 'sweet', 'sweetness')
wine_df_vecs = reduce_dimension(wine_df_vecs, 'acid', 'acidity')
wine_df_vecs = reduce_dimension(wine_df_vecs, 'piquant', 'piquancy')
wine_df_vecs = reduce_dimension(wine_df_vecs, 'fat', 'fattiness')
wine_df_vecs = reduce_dimension(wine_df_vecs, 'bitter', 'bitterness')

<br>
It can be seen from the result that a larger value represents a stronger taste for a certain feature.

In [22]:
wine_df_vecs.head(10)

Unnamed: 0,Alcohol,Country,Description,Name,Price,Region,Subregion,Variety,Vintage,Winery,sweet_descriptors,acid_descriptors,salt_descriptors,piquant_descriptors,fat_descriptors,bitter_descriptors,sweet,acid,piquant,fat,bitter,sweetness,acidity,piquancy,fattiness,bitterness
0,14.9%,US,Made in partnership with Alpha Omega Winery's ...,Michel Rolland Napa Valley 2014 MR Red (Napa V...,$175,Napa,Napa Valley,Bordeaux-style Red Blend,2014.0,Michel Rolland Napa Valley,[],[],[],[],[],[low_tannin],"[5.4981017, 5.7307286, -1.1419975, 0.7820324, ...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2510151, -0.096375, -0.53324723, -0.1066205...",0.005307,0.00133,-5.678027,-4.390644,-10.963639
1,14.5%,US,This is Cabernet Sauvignon-based with ample ad...,Soquel Vineyards 2015 Intreccio Library Select...,$75,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,Soquel Vineyards,[],[],[],[],[],[],"[5.4981017, 5.7307286, -1.1419975, 0.7820324, ...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2510151, -0.096375, -0.53324723, -0.1066205...",0.000207,-0.000904,-5.653103,-4.383236,-10.965899
2,14.2%,US,The producer's long-standing Bordeaux-style bl...,Flora Springs 2016 Trilogy Red (Napa Valley),$85,Napa,Napa Valley,Bordeaux-style Red Blend,2016.0,Flora Springs,[dry],[high_acid],[],[],[],[high_tannin],"[6.8185306, 5.586836, -0.19776316, -0.4909574,...","[0.76527476, 1.4421346, -0.6498075, -0.9933783...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2513138, -0.09633834, -0.5334033, -0.106580...",-27.316862,-3.860945,-5.652854,-4.383175,27.356512
3,14.5%,US,This blends a majority of Cabernet Sauvignon w...,Michael Pozzan 2015 Marianna Red (Napa Valley),$50,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,Michael Pozzan,[],[high_acid],[],[],[],[high_tannin],"[5.4981017, 5.7307286, -1.1419975, 0.7820324, ...","[0.76527476, 1.4421346, -0.6498075, -0.9933783...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2513138, -0.09633834, -0.5334033, -0.106580...",-0.000156,-3.860946,-5.653067,-4.383206,27.356966
4,14.1%,US,This Cabernet Sauvignon-dominant blend is smok...,Liparita 2016 Left Bank Reserve Red (Napa Valley),$100,Napa,Napa Valley,Bordeaux-style Red Blend,2016.0,Liparita,[dry],[],[],[],[],[low_tannin],"[6.8185306, 5.586836, -0.19776316, -0.4909574,...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2510151, -0.096375, -0.53324723, -0.1066205...",-27.316856,-0.000971,-5.653088,-4.383078,-10.96563
5,14.5%,US,"Thick and reductive, this wine combines 55% Ca...",Silverpoint Cellars 2014 Red (Napa Valley),$75,Napa,Napa Valley,Bordeaux-style Red Blend,2014.0,Silverpoint Cellars,[],[],[],[],[],[],"[5.4981017, 5.7307286, -1.1419975, 0.7820324, ...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2510151, -0.096375, -0.53324723, -0.1066205...",-0.000156,-0.000971,-5.653051,-4.383228,-10.966383
6,14.6%,US,This blends 80% Cabernet Sauvignon with 20% Ma...,Toybox Wine Cellars 2015 Cadden Family Vineyar...,$50,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,Toybox Wine Cellars,[],[],[],[],[],[high_tannin],"[5.4981017, 5.7307286, -1.1419975, 0.7820324, ...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2513138, -0.09633834, -0.5334033, -0.106580...",-0.000156,-0.000971,-5.653162,-4.383281,27.357048
7,14.5%,US,Tenacious in tannin and thickly concentrated f...,High Dive 2015 Red (Napa Valley),$95,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,High Dive,[],[],[],[],[],[],"[5.4981017, 5.7307286, -1.1419975, 0.7820324, ...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2510151, -0.096375, -0.53324723, -0.1066205...",-0.000156,-0.000971,-5.653042,-4.383367,-10.966074
8,14.8%,US,"This is a classically structured, delicious Bo...",Arrow & Branch 2015 Red (Napa Valley),$125,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,Arrow & Branch,[],[],[],[],[],[],"[5.4981017, 5.7307286, -1.1419975, 0.7820324, ...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2510151, -0.096375, -0.53324723, -0.1066205...",-0.000156,-0.000971,-5.65302,-4.383295,-10.966074
9,15%,US,"A well-made, incredibly affordable wine, this ...",Robert Mondavi 2015 Maestro Red (Napa Valley),$44,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,Robert Mondavi,[dry],[],[],[],[],[],"[6.8185306, 5.586836, -0.19776316, -0.4909574,...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2510151, -0.096375, -0.53324723, -0.1066205...",-27.316856,-0.000971,-5.65302,-4.383295,-10.966074


<br>
Now it's time to combine the feature extraction results to the food-wine pairing dataset. Firstly, read the dataset and do some pre-processing.

In [23]:
pair_df = pd.read_csv('pairing_food.csv')
pair_df['Grape'] = pair_df['Grape'].apply(standardize_varieties)

In [24]:
pair_df.head(3)

Unnamed: 0,Pairing Food,Grape,Popularity,Critics' Score,Avg. Price kr / 750ml,Wine,Region,Country
0,beef and venison,Bordeaux Blend Red,1st,96 / 100,7926,Chateau Mouton Rothschild,Pauillac,France
1,beef and venison,Bordeaux Blend Red,3rd,96 / 100,10564,Chateau Lafite Rothschild,Pauillac,France
2,beef and venison,Merlot,4th,96 / 100,47971,Petrus,Pomerol,France


<br>
Now we combine Grape, Wine, Region, Country features as the name of the wine in pair_df. Correspondingly, combine Variety, Winery, Region, Country features as the name of the wine in wine_df_vecs.

In [25]:
pair_df['name_pair'] = pair_df['Grape'].fillna('') + ' ' + pair_df['Wine'].fillna('') + ' ' + pair_df['Region'].fillna('') + ' ' + pair_df['Country'].fillna('')
wine_df_vecs['name_wine_review'] = wine_df_vecs['Variety'].fillna('') + ' ' + wine_df_vecs['Winery'].fillna('') + ' ' + wine_df_vecs['Region'].fillna('') + ' ' + wine_df_vecs['Country'].fillna('')

In [26]:
def remove_multiple_spaces(name):
    return re.sub(r'\s+', ' ', name)

pair_df['name_pair'] = pair_df['name_pair'].apply(remove_multiple_spaces)
wine_df_vecs['name_wine_review'] = wine_df_vecs['name_wine_review'].apply(remove_multiple_spaces)

In [27]:
pair_df.head(3)

Unnamed: 0,Pairing Food,Grape,Popularity,Critics' Score,Avg. Price kr / 750ml,Wine,Region,Country,name_pair
0,beef and venison,Bordeaux Blend Red,1st,96 / 100,7926,Chateau Mouton Rothschild,Pauillac,France,Bordeaux Blend Red Chateau Mouton Rothschild P...
1,beef and venison,Bordeaux Blend Red,3rd,96 / 100,10564,Chateau Lafite Rothschild,Pauillac,France,Bordeaux Blend Red Chateau Lafite Rothschild P...
2,beef and venison,Merlot,4th,96 / 100,47971,Petrus,Pomerol,France,Merlot Petrus Pomerol France


In [28]:
wine_df_vecs.head(3)

Unnamed: 0,Alcohol,Country,Description,Name,Price,Region,Subregion,Variety,Vintage,Winery,sweet_descriptors,acid_descriptors,salt_descriptors,piquant_descriptors,fat_descriptors,bitter_descriptors,sweet,acid,piquant,fat,bitter,sweetness,acidity,piquancy,fattiness,bitterness,name_wine_review
0,14.9%,US,Made in partnership with Alpha Omega Winery's ...,Michel Rolland Napa Valley 2014 MR Red (Napa V...,$175,Napa,Napa Valley,Bordeaux-style Red Blend,2014.0,Michel Rolland Napa Valley,[],[],[],[],[],[low_tannin],"[5.4981017, 5.7307286, -1.1419975, 0.7820324, ...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2510151, -0.096375, -0.53324723, -0.1066205...",0.005307,0.00133,-5.678027,-4.390644,-10.963639,Bordeaux-style Red Blend Michel Rolland Napa V...
1,14.5%,US,This is Cabernet Sauvignon-based with ample ad...,Soquel Vineyards 2015 Intreccio Library Select...,$75,Napa,Napa Valley,Bordeaux-style Red Blend,2015.0,Soquel Vineyards,[],[],[],[],[],[],"[5.4981017, 5.7307286, -1.1419975, 0.7820324, ...","[1.0589797, 1.4053282, -0.5681804, -1.081337, ...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2510151, -0.096375, -0.53324723, -0.1066205...",0.000207,-0.000904,-5.653103,-4.383236,-10.965899,Bordeaux-style Red Blend Soquel Vineyards Napa US
2,14.2%,US,The producer's long-standing Bordeaux-style bl...,Flora Springs 2016 Trilogy Red (Napa Valley),$85,Napa,Napa Valley,Bordeaux-style Red Blend,2016.0,Flora Springs,[dry],[high_acid],[],[],[],[high_tannin],"[6.8185306, 5.586836, -0.19776316, -0.4909574,...","[0.76527476, 1.4421346, -0.6498075, -0.9933783...","[9.727071, 1.105253, -0.4399095, 2.1738553, 3....","[3.8851402, -0.32585424, -2.1894097, -1.142013...","[1.2513138, -0.09633834, -0.5334033, -0.106580...",-27.316862,-3.860945,-5.652854,-4.383175,27.356512,Bordeaux-style Red Blend Flora Springs Napa US


In [29]:
wine_df_vecs_save = wine_df_vecs.copy()

In [33]:
wine_df_vecs = wine_df_vecs[['Alcohol', 'Price', 'Vintage', 'sweetness', 'acidity', 'piquancy', 'fattiness', 'bitterness', 'name_wine_review']]
wine_df_vecs = wine_df_vecs.rename(columns={'Price': 'Price_Review'})

In [37]:
wine_df_vecs.head(3)

Unnamed: 0,Alcohol,Price_Review,Vintage,sweetness,acidity,piquancy,fattiness,bitterness,name_wine_review
0,14.9%,$175,2014.0,0.005307,0.00133,-5.678027,-4.390644,-10.963639,Bordeaux-style Red Blend Michel Rolland Napa V...
1,14.5%,$75,2015.0,0.000207,-0.000904,-5.653103,-4.383236,-10.965899,Bordeaux-style Red Blend Soquel Vineyards Napa US
2,14.2%,$85,2016.0,-27.316862,-3.860945,-5.652854,-4.383175,27.356512,Bordeaux-style Red Blend Flora Springs Napa US


<br>
Now, let's create a jaccard similarity function.

In [36]:
def calculate_jaccard_similarity(text1, text2, n=2):
    def get_ngrams(text, n):
        ngram_set = set()
        for i in range(len(text) - n + 1):
            ngram = text[i:i + n]
            ngram_set.add(ngram)
        return ngram_set

    ngram_set1 = get_ngrams(text1, n)
    ngram_set2 = get_ngrams(text2, n)

    intersection_size = len(ngram_set1.intersection(ngram_set2))
    union_size = len(ngram_set1) + len(ngram_set2) - intersection_size

    jaccard_similarity = intersection_size / union_size

    return jaccard_similarity

<br>
In the end, we find the most similar wine in reviews for each wine in the food pairing dataset. If there exist more than one, apply average, and in this situation, other features are randomly pick from one of the most simliar wines.

In [52]:
# def find_most_similar_wine(name_pair):
#     similarity_scores = []

#     for name_wine_review in wine_df_vecs['name_wine_review']:
#         similarity = calculate_jaccard_similarity(name_pair, name_wine_review)
#         similarity_scores.append(similarity)

#     max_similarity = max(similarity_scores)
#     most_similar_indices = [i for i, score in enumerate(similarity_scores) if score == max_similarity]

#     return most_similar_indices, max_similarity


# sweetness_list = []
# acidity_list = []
# piquancy_list = []
# fattiness_list = []
# bitterness_list = []
# alcohol_list = []
# price_list = []
# vintage_list = []
# name_list = []

# for index, row in tqdm(pair_df.iterrows()):
#     name_pair = row['name_pair']
#     most_similar_indices, max_similarity = find_most_similar_wine(name_pair)
    
#     sweetness_values = wine_df_vecs.loc[most_similar_indices, 'sweetness']
#     average_sweetness = sweetness_values.mean()
#     sweetness_list.append(average_sweetness)
    
#     acidity_values = wine_df_vecs.loc[most_similar_indices, 'acidity']
#     average_acidity = acidity_values.mean()
#     acidity_list.append(average_acidity)

#     piquancy_values = wine_df_vecs.loc[most_similar_indices, 'piquancy']
#     average_piquancy = piquancy_values.mean()
#     piquancy_list.append(average_piquancy)
    
#     fattiness_values = wine_df_vecs.loc[most_similar_indices, 'fattiness']
#     average_fattiness = fattiness_values.mean()
#     fattiness_list.append(average_fattiness)
    
#     bitterness_values = wine_df_vecs.loc[most_similar_indices, 'bitterness']
#     average_bitterness = bitterness_values.mean()
#     bitterness_list.append(average_bitterness)
    
#     if len(most_similar_indices) > 1:
#         chosen_index = random.choice(most_similar_indices)
#         wine_row = wine_df_vecs.iloc[chosen_index]
        
#         alcohol_list.append(wine_row['Alcohol'])
#         price_list.append(wine_row['Price_Review'])
#         vintage_list.append(wine_row['Vintage'])
#         name_list.append(wine_row['name_wine_review'])

# pair_df['sweetness'] = sweetness_list
# pair_df['acidity'] = acidity_list
# pair_df['piquancy'] = piquancy_list
# pair_df['fattiness'] = fattiness_list
# pair_df['bitterness'] = bitterness_list
# pair_df['Alcohol'] = alcohol_list
# pair_df['Price_Review'] = price_list
# pair_df['Vintage'] = vintage_list 
# pair_df['name_wine_review'] = name_list

In [72]:
# A version with lower time complexity
# wine_review_to_indices = defaultdict(list)
# for idx, name in enumerate(wine_df_vecs['name_wine_review']):
#     wine_review_to_indices[name].append(idx)

# index_to_features = defaultdict(list)
# for index, row in wine_df_vecs.iterrows():
#     index_to_features[index].append(row['sweetness'])
#     index_to_features[index].append(row['acidity'])
#     index_to_features[index].append(row['piquancy'])
#     index_to_features[index].append(row['fattiness'])
#     index_to_features[index].append(row['bitterness'])
#     index_to_features[index].append(row['Alcohol'])
#     index_to_features[index].append(row['Price_Review'])
#     index_to_features[index].append(row['Vintage'])
#     index_to_features[index].append(row['name_wine_review'])

# def find_most_similar_wine(name_pair):
#     similarity_scores = []

#     if name_pair in wine_review_to_indices:
#         wine_indices = wine_review_to_indices[name_pair]
#         return wine_indices, [1.0] * len(wine_indices) 
    
#     for name_wine_review, wine_indices in wine_review_to_indices.items():
#         for wine_index in wine_indices:
#             similarity = calculate_jaccard_similarity(name_pair, name_wine_review)
#             similarity_scores.append((wine_index, similarity))

#     similarity_scores.sort(key=lambda x: x[1], reverse=True)
#     max_similarity = similarity_scores[0][1]
#     most_similar_indices = [index for index, score in similarity_scores if score == max_similarity]

#     return most_similar_indices, max_similarity

# def safe_mean(values):
#     filtered_values = [value for value in values if isinstance(value, (int, float))]
#     return sum(filtered_values) / len(filtered_values) if filtered_values else None

# sweetness_list = []
# acidity_list = []
# piquancy_list = []
# fattiness_list = []
# bitterness_list = []
# alcohol_list = []
# price_list = []
# vintage_list = []
# name_list = []

# for index, row in tqdm(pair_df.iterrows()):
#     name_pair = row['name_pair']
#     most_similar_indices, max_similarity = find_most_similar_wine(name_pair)

#     if most_similar_indices:
#         feature_values = [index_to_features[index] for index in most_similar_indices]
#         feature_means = [safe_mean(feat) for feat in zip(*feature_values)]
#         sweetness_list.append(feature_means[0])
#         acidity_list.append(feature_means[1])
#         piquancy_list.append(feature_means[2])
#         fattiness_list.append(feature_means[3])
#         bitterness_list.append(feature_means[4])

#         if len(most_similar_indices) > 1:
#             chosen_index = random.choice(most_similar_indices)
#         else:
#             chosen_index = most_similar_indices[0]
#         wine_features = index_to_features[chosen_index]
#         alcohol_list.append(wine_features[5])
#         price_list.append(wine_features[6])
#         vintage_list.append(wine_features[7])
#         name_list.append(wine_features[8])
#     else:
#         sweetness_list.append(None)
#         acidity_list.append(None)
#         piquancy_list.append(None)
#         fattiness_list.append(None)
#         bitterness_list.append(None)
#         alcohol_list.append(None)
#         price_list.append(None)
#         vintage_list.append(None)
#         name_list.append(None)

# pair_df_test = pair_df.copy()
# pair_df_test['sweetness'] = sweetness_list
# pair_df_test['acidity'] = acidity_list
# pair_df_test['piquancy'] = piquancy_list
# pair_df_test['fattiness'] = fattiness_list
# pair_df_test['bitterness'] = bitterness_list
# pair_df_test['Alcohol'] = alcohol_list
# pair_df_test['Price_Review'] = price_list
# pair_df_test['Vintage'] = vintage_list
# pair_df_test['name_wine_review'] = name_list

<br>
To further improve time complexity, we choose a naive way to simply drop all duplicates in wine_df_vecs, and not compute mean.

In [64]:
wine_df_vecs_save2 = wine_df_vecs.copy()

In [66]:
wine_df_vecs = wine_df_vecs.drop_duplicates(subset=['name_wine_review'])

In [68]:
def find_most_similar_wine(name_pair):
    similarity_scores = []

    for name_wine_review in wine_df_vecs['name_wine_review']:
        similarity = calculate_jaccard_similarity(name_pair, name_wine_review)
        similarity_scores.append(similarity)

    max_similarity = max(similarity_scores)
    most_similar_indices = [i for i, score in enumerate(similarity_scores) if score == max_similarity]

    return most_similar_indices, max_similarity

sweetness_list = []
acidity_list = []
piquancy_list = []
fattiness_list = []
bitterness_list = []
alcohol_list = []
price_list = []
vintage_list = []
name_list = []

for index, row in tqdm(pair_df.head(10).iterrows()):
    name_pair = row['name_pair']
    most_similar_indices, max_similarity = find_most_similar_wine(name_pair)
    
    if most_similar_indices:
        chosen_index = random.choice(most_similar_indices)
        
        wine_row = wine_df_vecs.iloc[chosen_index]
        
        sweetness_list.append(wine_row['sweetness'])
        acidity_list.append(wine_row['acidity'])
        piquancy_list.append(wine_row['piquancy'])
        fattiness_list.append(wine_row['fattiness'])
        bitterness_list.append(wine_row['bitterness'])
        alcohol_list.append(wine_row['Alcohol'])
        price_list.append(wine_row['Price_Review'])
        vintage_list.append(wine_row['Vintage'])
        name_list.append(wine_row['name_wine_review'])

    else:
        sweetness_list.append(None)
        acidity_list.append(None)
        piquancy_list.append(None)
        fattiness_list.append(None)
        bitterness_list.append(None)
        alcohol_list.append(None)
        price_list.append(None)
        vintage_list.append(None)
        name_list.append(None)


pair_df_test = pair_df.head(10).copy()
pair_df_test['sweetness'] = sweetness_list
pair_df_test['acidity'] = acidity_list
pair_df_test['piquancy'] = piquancy_list
pair_df_test['fattiness'] = fattiness_list
pair_df_test['bitterness'] = bitterness_list
pair_df_test['Alcohol'] = alcohol_list
pair_df_test['Price_Review'] = price_list
pair_df_test['Vintage'] = vintage_list
pair_df_test['name_wine_review'] = name_list

10it [00:14,  1.47s/it]


In [71]:
pair_df_test

Unnamed: 0,Pairing Food,Grape,Popularity,Critics' Score,Avg. Price kr / 750ml,Wine,Region,Country,name_pair,sweetness,acidity,piquancy,fattiness,bitterness,Alcohol,Price_Review,Vintage,name_wine_review
0,beef and venison,Bordeaux Blend Red,1st,96 / 100,7926,Chateau Mouton Rothschild,Pauillac,France,Bordeaux Blend Red Chateau Mouton Rothschild P...,-27.316856,-0.000971,-5.65302,-4.383295,-10.966074,14%,$675,2016.0,Bordeaux-style Red Blend ChÃ¢teau Mouton Roths...
1,beef and venison,Bordeaux Blend Red,3rd,96 / 100,10564,Chateau Lafite Rothschild,Pauillac,France,Bordeaux Blend Red Chateau Lafite Rothschild P...,-0.000156,-0.000971,-5.65302,-4.383295,-10.966074,,$869,2016.0,Bordeaux-style Red Blend ChÃ¢teau Lafite Roths...
2,beef and venison,Merlot,4th,96 / 100,47971,Petrus,Pomerol,France,Merlot Petrus Pomerol France,-0.000156,-3.860946,-5.65302,-4.383295,27.357056,14%,$65,2014.0,Merlot ChÃ¢teau Fayat Pomerol France
3,beef and venison,Cabernet Franc - Cabernet Sauvignon,5th,95 / 100,3930,Tenuta San Guido Sassicaia Bolgheri,Tuscany,Italy,Cabernet Franc - Cabernet Sauvignon Tenuta San...,-0.000156,-3.860946,53.069016,-4.383295,-10.966074,14%,$250,2016.0,"Red Blends, Red Blends Tenuta San Guido Bolghe..."
4,beef and venison,Bordeaux Blend Red,6th,96 / 100,8419,Chateau Margaux,Margaux,France,Bordeaux Blend Red Chateau Margaux Margaux France,-0.000156,-3.860946,-5.65302,-4.383295,-10.966074,13.5%,$589,2016.0,Bordeaux-style Red Blend ChÃ¢teau Margaux Marg...
5,beef and venison,Bordeaux Blend Red,7th,94 / 100,4889,Opus One,Napa Valley,USA,Bordeaux Blend Red Opus One Napa Valley USA,-0.000156,-0.000971,-5.65302,-4.383295,-10.966074,14.7%,$195,2016.0,Bordeaux-style Red Blend Anderson Conn Valley ...
6,beef and venison,Cabernet Sauvignon - Merlot,8th,96 / 100,8832,Chateau Latour,Pauillac,France,Cabernet Sauvignon - Merlot Chateau Latour Pau...,-0.000156,-3.860946,-5.65302,-4.383295,-10.966074,13%,$12,2006.0,Cabernet Sauvignon La Patache MÃ©doc France
7,beef and venison,Bordeaux Blend Red,10th,96 / 100,7074,Chateau Haut-Brion,Pessac-Leognan,France,Bordeaux Blend Red Chateau Haut-Brion Pessac-L...,-0.000156,-0.000971,-5.65302,-4.383295,27.357056,14%,$650,2016.0,Bordeaux-style Red Blend ChÃ¢teau Haut-Brion P...
8,beef and venison,Cabernet - Sangiovese,13th,94 / 100,2038,Marchesi Antinori Tignanello Toscana IGT,Tuscany,Italy,Cabernet - Sangiovese Marchesi Antinori Tignan...,-27.316856,-0.000971,-5.65302,-4.383295,-10.966074,,$150,2012.0,Sangiovese Marchesi Antinori Brunello di Monta...
9,beef and venison,Bordeaux Blend Red,15th,94 / 100,1838,Chateau Lynch-Bages,Pauillac,France,Bordeaux Blend Red Chateau Lynch-Bages Pauilla...,-27.316856,-0.000971,-5.65302,-4.383295,-10.966074,13.5%,$154,2016.0,Bordeaux-style Red Blend ChÃ¢teau Lynch-Bages ...
