# Introduction
Living to 100 doesn’t have to mean a strict regimen of steamed vegetables and joyless meals.

A healthy, balanced, and stress-free life includes happy hours, time spent with family and friends and the occasional glass of wine with delicious dinners.

In this notebook, wine reviews, tastes and food pairing will be analised to build a wine-pairing recommender and wine recommender.

# Import libraries

In [1]:
#Standard libraries for data analysis:
    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm, skew, shapiro
from scipy import stats
import statsmodels.api as sm
import os


# sklearn modules for data preprocessing:
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#sklearn modules for Model Selection:
from sklearn import svm, tree, linear_model, neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors


#Standard libraries for data visualization:
import seaborn as sns
from scipy.stats import boxcox 
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib 
%matplotlib inline
color = sns.color_palette()
import matplotlib.ticker as mtick
from pandas.plotting import scatter_matrix
from sklearn.metrics import roc_curve


#Standard libraries for text analysis:
import re #regex
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import ssl
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import PorterStemmer 
nltk.download('stopwords')
from collections import Counter, OrderedDict
from operator import itemgetter

from gensim.models.phrases import Phrases, Phraser
import gensim
from gensim.models import Word2Vec

#Standard libraries for web scraping / API
from tqdm import tqdm # tqdm is for printing the status bar
from bs4 import BeautifulSoup


pd.set_option('display.max_columns', None)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariaesquivel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mariaesquivel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mariaesquivel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load dataset

### Wine descriptors list
Wine descriptors list is based on the Wine Flavor Wheel, a visual glossary of wine terms organized by origin.

In [2]:
descriptor_mapping = pd.read_csv('descriptor_mapping.csv', sep = ';')
print(descriptor_mapping.shape)
descriptor_mapping.head(3)

(1520, 5)


Unnamed: 0,raw descriptor,occurences,level_3,level_2,level_1
0,abras,190,abrasive,high_tannin,tannin
1,acacia,130,acacia,flowery,flower
2,acacia_flower,77,acacia,flowery,flower


### Wine dataset
Sources: Vivino, Wine Enthusiast Magazine

#### Vivino

In [3]:
wines_type1 = pd.read_csv('/Users/mariaesquivel/Ironhack/Projects/Final_project/Wine-5---wine-pairing-recommender/datasets/all_wines_df_type1_v3.csv')
wines_type1 = wines_type1.drop('Unnamed: 0', axis = 1)
print(wines_type1.shape)

  wines_type1 = pd.read_csv('/Users/mariaesquivel/Ironhack/Projects/Final_project/Wine-5---wine-pairing-recommender/datasets/all_wines_df_type1_v3.csv')


(60225, 246)


In [4]:
wines_type2 = pd.read_csv('/Users/mariaesquivel/Ironhack/Projects/Final_project/Wine-5---wine-pairing-recommender/datasets/all_wines_df_type2_v3.csv')
wines_type2 = wines_type2.drop('Unnamed: 0', axis = 1)
print(wines_type2.shape)


  wines_type2 = pd.read_csv('/Users/mariaesquivel/Ironhack/Projects/Final_project/Wine-5---wine-pairing-recommender/datasets/all_wines_df_type2_v3.csv')


(124975, 246)


In [5]:
wines_type3 = pd.read_csv('/Users/mariaesquivel/Ironhack/Projects/Final_project/Wine-5---wine-pairing-recommender/datasets/all_wines_df_type3_v3.csv')
wines_type3 = wines_type3.drop('Unnamed: 0', axis = 1)
print(wines_type3.shape)


  wines_type3 = pd.read_csv('/Users/mariaesquivel/Ironhack/Projects/Final_project/Wine-5---wine-pairing-recommender/datasets/all_wines_df_type3_v3.csv')


(124975, 245)


In [6]:
wines_type4 = pd.read_csv('/Users/mariaesquivel/Ironhack/Projects/Final_project/Wine-5---wine-pairing-recommender/datasets/all_wines_df_type4_v3.csv')
wines_type4 = wines_type4.drop('Unnamed: 0', axis = 1)
print(wines_type4.shape)


(342, 230)


In [7]:
# Combine all data sets
all_wines = pd.concat([wines_type1,wines_type2,wines_type3,wines_type4], axis = 0).reset_index(drop = True)
print(all_wines.shape)
all_wines.head(3)

(310517, 246)


Unnamed: 0,id,seo_name,name,year,amount,code,ratings_count,ratings_average,wine_id,wine_name,wine_seo_name,wine_type_id,wine_vintage_type,wine_is_natural,wine_has_valid_ratings,wine_region_id,wine_region_name,wine_region_name_en,wine_region_seo_name,wine_region_country.code,wine_region_country.name,wine_region_country.native_name,wine_region_country.seo_name,wine_region_country.currency.code,wine_region_country.currency.name,wine_region_country.currency.prefix,wine_region_country.currency.suffix,wine_region_country.regions_count,wine_region_country.users_count,wine_region_country.wines_count,wine_region_country.wineries_count,wine_region_country.most_used_grapes,wine_region_background_image.location,wine_region_background_image.variations.large,wine_region_background_image.variations.medium,wine_region_background_image,wine_winery_id,wine_winery_name,wine_winery_seo_name,wine_winery_status,wine_winery_background_image,wine_winery_background_image.location,wine_winery_background_image.variations.large,wine_winery_background_image.variations.medium,wine_winery_background_image.variations.small,wine_taste_structure.acidity,wine_taste_structure.fizziness,wine_taste_structure.intensity,wine_taste_structure.sweetness,wine_taste_structure.tannin,wine_taste_structure.user_structure_count,wine_taste_structure.calculated_structure_count,wine_taste_structure,wine_taste_flavor_1group,wine_taste_flavor_1secondary_keywords,wine_taste_flavor_1stats.count,wine_taste_flavor_1stats.score,wine_taste_flavor_2group,wine_taste_flavor_2secondary_keywords,wine_taste_flavor_2stats.count,wine_taste_flavor_2stats.score,wine_taste_flavor_3group,wine_taste_flavor_3secondary_keywords,wine_taste_flavor_3stats.count,wine_taste_flavor_3stats.score,wine_taste_flavor_4group,wine_taste_flavor_4secondary_keywords,wine_taste_flavor_4stats.count,wine_taste_flavor_4stats.score,wine_taste_flavor_1primary_keywordsid,wine_taste_flavor_1primary_keywordsname,wine_taste_flavor_1primary_keywordscount,wine_taste_flavor_2primary_keywordsid,wine_taste_flavor_2primary_keywordsname,wine_taste_flavor_2primary_keywordscount,wine_taste_flavor_3primary_keywordsid,wine_taste_flavor_3primary_keywordsname,wine_taste_flavor_3primary_keywordscount,wine_taste_flavor_4primary_keywordsid,wine_taste_flavor_4primary_keywordsname,wine_taste_flavor_4primary_keywordscount,wine_taste_flavor_5primary_keywordsid,wine_taste_flavor_5primary_keywordsname,wine_taste_flavor_5primary_keywordscount,wine_taste_flavor_1primary_keywordsid.1,wine_taste_flavor_1primary_keywordsname.1,wine_taste_flavor_1primary_keywordscount.1,wine_taste_flavor_1primary_keywordsid.2,wine_taste_flavor_1primary_keywordsname.2,wine_taste_flavor_1primary_keywordscount.2,wine_taste_flavor_1primary_keywordsid.3,wine_taste_flavor_1primary_keywordsname.3,wine_taste_flavor_1primary_keywordscount.3,wine_taste_flavor_1primary_keywordsid.4,wine_taste_flavor_1primary_keywordsname.4,wine_taste_flavor_1primary_keywordscount.4,wine_taste_flavor_1primary_keywordsid.5,wine_taste_flavor_1primary_keywordsname.5,wine_taste_flavor_1primary_keywordscount.5,wine_taste_flavor_3primary_keywordsid.1,wine_taste_flavor_3primary_keywordsname.1,wine_taste_flavor_3primary_keywordscount.1,wine_taste_flavor_3primary_keywordsid.2,wine_taste_flavor_3primary_keywordsname.2,wine_taste_flavor_3primary_keywordscount.2,wine_taste_flavor_3primary_keywordsid.3,wine_taste_flavor_3primary_keywordsname.3,wine_taste_flavor_3primary_keywordscount.3,wine_taste_flavor_3primary_keywordsid.4,wine_taste_flavor_3primary_keywordsname.4,wine_taste_flavor_3primary_keywordscount.4,wine_taste_flavor_3primary_keywordsid.5,wine_taste_flavor_3primary_keywordsname.5,wine_taste_flavor_3primary_keywordscount.5,wine_taste_flavor_4primary_keywordsid.1,wine_taste_flavor_4primary_keywordsname.1,wine_taste_flavor_4primary_keywordscount.1,wine_taste_flavor_4primary_keywordsid.2,wine_taste_flavor_4primary_keywordsname.2,wine_taste_flavor_4primary_keywordscount.2,wine_taste_flavor_4primary_keywordsid.3,wine_taste_flavor_4primary_keywordsname.3,wine_taste_flavor_4primary_keywordscount.3,wine_taste_flavor_4primary_keywordsid.4,wine_taste_flavor_4primary_keywordsname.4,wine_taste_flavor_4primary_keywordscount.4,wine_taste_flavor_4primary_keywordsid.5,wine_taste_flavor_4primary_keywordsname.5,wine_taste_flavor_4primary_keywordscount.5,wine_statistics_status,wine_statistics_ratings_count,wine_statistics_ratings_average,wine_statistics_labels_count,wine_statistics_vintages_count,wine_style_id,wine_style_seo_name,wine_style_regional_name,wine_style_varietal_name,wine_style_name,wine_style_image,wine_style_description,wine_style_blurb,wine_style_interesting_facts,wine_style_body,wine_style_body_description,wine_style_acidity,wine_style_acidity_description,wine_style_wine_type_id,wine_style_background_image.location,wine_style_background_image.variations.small,wine_style_country.code,wine_style_country.name,wine_style_country.native_name,wine_style_country.seo_name,wine_style_country.currency.code,wine_style_country.currency.name,wine_style_country.currency.prefix,wine_style_country.currency.suffix,wine_style_country.regions_count,wine_style_country.users_count,wine_style_country.wines_count,wine_style_country.wineries_count,wine_style_country.most_used_grapes,wine_style_region.id,wine_style_region.name,wine_style_region.name_en,wine_style_region.seo_name,wine_style_region.country.code,wine_style_region.country.name,wine_style_region.country.native_name,wine_style_region.country.seo_name,wine_style_region.country.currency.code,wine_style_region.country.currency.name,wine_style_region.country.currency.prefix,wine_style_region.country.currency.suffix,wine_style_region.country.regions_count,wine_style_region.country.users_count,wine_style_region.country.wines_count,wine_style_region.country.wineries_count,wine_style_region.country.most_used_grapes,wine_style_region.parent_id,wine_style_region.background_image.location,wine_style_region.background_image.variations.large,wine_style_region.background_image.variations.medium,wine_style_region.statistics.wineries_count,wine_style_region.statistics.wines_count,wine_style_region.statistics.sub_regions_count,wine_style_region.statistics.parent_regions_count,wine_style_background_image,wine_style_region,wine_style_region.background_image,wine_food_id,wine_food_name,wine_food_seo_name,wine_food_background_image.location,wine_food_background_image.variations.small,wine_food_id.1,wine_food_name.1,wine_food_seo_name.1,wine_food_background_image.location.1,wine_food_background_image.variations.small.1,wine_food_id.2,wine_food_name.2,wine_food_seo_name.2,wine_food_background_image.location.2,wine_food_background_image.variations.small.2,wine_food_id.3,wine_food_name.3,wine_food_seo_name.3,wine_food_background_image.location.3,wine_food_background_image.variations.small.3,wine_food_id.4,wine_food_name.4,wine_food_seo_name.4,wine_food_background_image.location.4,wine_food_background_image.variations.small.4,wine_style_grapesid,wine_style_grapesname,wine_style_grapesseo_name,wine_style_grapeshas_detailed_info,wine_style_grapeswines_count,wine_style_grapes_id,wine_style_grapes_name,wine_style_grapes_seo_name,wine_style_grapes_has_detailed_info,wine_style_grapes_wines_count,wine_style_grapes_id.1,wine_style_grapes_name.1,wine_style_grapes_seo_name.1,wine_style_grapes_has_detailed_info.1,wine_style_grapes_wines_count.1,wine_style_grapes_id.2,wine_style_grapes_name.2,wine_style_grapes_seo_name.2,wine_style_grapes_has_detailed_info.2,wine_style_grapes_wines_count.2,wine_style_grapes_id.3,wine_style_grapes_name.3,wine_style_grapes_seo_name.3,wine_style_grapes_has_detailed_info.3,wine_style_grapes_wines_count.3,wine_style_grapes_id.4,wine_style_grapes_name.4,wine_style_grapes_seo_name.4,wine_style_grapes_has_detailed_info.4,wine_style_grapes_wines_count.4
0,8519046,franco-biondi-santi-brunello-di-montalcino-ris...,Biondi-Santi Brunello di Montalcino Riserva 1955,1955,1970.05,EUR,33,4.9,82698,Brunello di Montalcino Riserva,brunello-di-montalcino-riserva,1,0,False,True,1796.0,Brunello di Montalcino,,brunello-di-montalcino,it,Italië,Italia,italy,EUR,Euros,€,,560.0,4071126.0,372183.0,42165.0,"[{'id': 16, 'name': 'Sangiovese', 'seo_name': ...",//images.vivino.com/regions/backgrounds/c25PzK...,//thumbs.vivino.com/region_backgrounds/c25PzK2...,//thumbs.vivino.com/region_backgrounds/c25PzK2...,,11500,Biondi-Santi,franco-biondi-santi,0,,,,,,3.811572,,3.170349,1.678928,3.586557,158.0,163.0,,earth,"[{'id': 422, 'name': 'tobacco', 'count': 45}, ...",129.0,19105.0,oak,"[{'id': 242, 'name': 'leather', 'count': 57}, ...",114.0,12427.0,red_fruit,"[{'id': 229, 'name': 'jam', 'count': 7}, {'id'...",96.0,11998.0,non_oak,"[{'id': 422, 'name': 'tobacco', 'count': 45}, ...",86.0,2446.0,242.0,leather,57.0,284.0,mushroom,40.0,156.0,earthy,26.0,430.0,truffle,14.0,384.0,smoke,14.0,422.0,tobacco,45.0,292.0,oak,23.0,117.0,coffee,12.0,101.0,chocolate,11.0,83.0,cedar,8.0,422.0,tobacco,45.0,292.0,oak,23.0,117.0,coffee,12.0,101.0,chocolate,11.0,83.0,cedar,8.0,422.0,tobacco,45.0,292.0,oak,23.0,117.0,coffee,12.0,101.0,chocolate,11.0,83.0,cedar,8.0,Normal,3302,4.6,17492,135,22.0,italian-brunello,Italiaans,Brunello,Italiaanse Brunello,,"From Tuscany in central Italy, Brunello enjoys...",DOCG-wijn uit Toscane van enkel sangiovese,['There are fewer than 10 winemakers for all B...,5.0,Zeer full-bodied,3.0,Hoog,1.0,//images.vivino.com/backgrounds/styles/KWGLMNU...,//images.vivino.com/backgrounds/styles/thumbs/...,it,Italië,Italia,italy,EUR,Euros,€,,560.0,4071126.0,372183.0,42165.0,"[{'id': 16, 'name': 'Sangiovese', 'seo_name': ...",394.0,Toscane,Tuscany,tuscany,it,Italië,Italia,italy,EUR,Euros,€,,560.0,4071126.0,372183.0,42165.0,"[{'id': 16, 'name': 'Sangiovese', 'seo_name': ...",4961.0,//images.vivino.com/regions/backgrounds/qa-Yh0...,//thumbs.vivino.com/region_backgrounds/qa-Yh0S...,//thumbs.vivino.com/region_backgrounds/qa-Yh0S...,6642.0,31057.0,44.0,1.0,,,,4.0,Rundvlees,beef,//images.vivino.com/backgrounds/foods/4_beef.png,//images.vivino.com/backgrounds/foods/thumbs/4...,8.0,Lam,lamb,//images.vivino.com/backgrounds/foods/8_lamb.png,//images.vivino.com/backgrounds/foods/thumbs/8...,11.0,Wild (hert),game,//images.vivino.com/backgrounds/foods/11_venis...,//images.vivino.com/backgrounds/foods/thumbs/1...,20.0,Gevogelte,poultry,//images.vivino.com/backgrounds/foods/20_chick...,//images.vivino.com/backgrounds/foods/thumbs/2...,,,,,,16.0,Sangiovese,sangiovese,True,125094.0,,,,,,,,,,,,,,,,,,,,,,,,,
1,2890763,quintarelli-giuseppe-amarone-della-valpolicell...,Quintarelli Giuseppe Amarone della Valpolicell...,2013,389.0,EUR,311,4.8,84065,Amarone della Valpolicella Classico,amarone-della-valpolicella-classico,1,0,False,True,3237.0,Amarone della Valpolicella Classico,,amarone-della-valpolicella-classico,it,Italië,Italia,italy,EUR,Euros,€,,560.0,4071126.0,372183.0,42165.0,"[{'id': 16, 'name': 'Sangiovese', 'seo_name': ...",//images.vivino.com/regions/backgrounds/Z4sEIs...,//thumbs.vivino.com/region_backgrounds/Z4sEIsQ...,//thumbs.vivino.com/region_backgrounds/Z4sEIsQ...,,11601,Quintarelli Giuseppe,quintarelli-giuseppe,0,,,,,,2.357734,,4.611889,3.318892,2.318914,327.0,422.0,,oak,"[{'id': 242, 'name': 'leather', 'count': 57}, ...",252.0,33983.0,earth,"[{'id': 422, 'name': 'tobacco', 'count': 70}, ...",204.0,19844.0,black_fruit,"[{'id': 341, 'name': 'prune', 'count': 40}]",194.0,24497.0,non_oak,"[{'id': 422, 'name': 'tobacco', 'count': 70}, ...",180.0,5838.0,101.0,chocolate,99.0,422.0,tobacco,70.0,292.0,oak,42.0,117.0,coffee,42.0,434.0,vanilla,38.0,242.0,leather,57.0,22.0,balsamic,23.0,113.0,cocoa,18.0,156.0,earthy,17.0,384.0,smoke,16.0,242.0,leather,57.0,22.0,balsamic,23.0,113.0,cocoa,18.0,156.0,earthy,17.0,384.0,smoke,16.0,242.0,leather,57.0,22.0,balsamic,23.0,113.0,cocoa,18.0,156.0,earthy,17.0,384.0,smoke,16.0,Normal,8465,4.7,49774,93,3.0,italian-amarone,Italiaans,Amarone,Italiaanse Amarone,,"Amarone della Valpolicella is a rich, dry Ital...",Corvina-based DOCG wine from Veneto,"['Due to the drying process, Amarone requires ...",5.0,Zeer full-bodied,3.0,Hoog,1.0,//images.vivino.com/backgrounds/styles/U7OaGBJ...,//images.vivino.com/backgrounds/styles/thumbs/...,it,Italië,Italia,italy,EUR,Euros,€,,560.0,4071126.0,372183.0,42165.0,"[{'id': 16, 'name': 'Sangiovese', 'seo_name': ...",460.0,Veneto,,veneto,it,Italië,Italia,italy,EUR,Euros,€,,560.0,4071126.0,372183.0,42165.0,"[{'id': 16, 'name': 'Sangiovese', 'seo_name': ...",4962.0,//images.vivino.com/regions/backgrounds/WCIaSS...,//thumbs.vivino.com/region_backgrounds/WCIaSSA...,//thumbs.vivino.com/region_backgrounds/WCIaSSA...,6100.0,37191.0,28.0,1.0,,,,4.0,Rundvlees,beef,//images.vivino.com/backgrounds/foods/4_beef.png,//images.vivino.com/backgrounds/foods/thumbs/4...,8.0,Lam,lamb,//images.vivino.com/backgrounds/foods/8_lamb.png,//images.vivino.com/backgrounds/foods/thumbs/8...,11.0,Wild (hert),game,//images.vivino.com/backgrounds/foods/11_venis...,//images.vivino.com/backgrounds/foods/thumbs/1...,38.0,Blauwe kaas,,//images.vivino.com/backgrounds/foods/38_bluec...,//images.vivino.com/backgrounds/foods/thumbs/3...,,,,,,43.0,Corvina,corvina,False,22300.0,45.0,Rondinella,rondinella,False,19930.0,226.0,Corvinone,corvinone,False,6040.0,,,,,,,,,,,,,,,
2,14245513,vega-sicilia-unico-reserva-especial-edicion-2015,Vega Sicilia Unico Reserva Especial Edición 2015,2015,608.32,EUR,644,4.8,77136,Unico Reserva Especial Edición,unico-reserva-especial-edicion,1,2,False,True,405.0,Ribera del Duero,,ribera-del-duero,es,Spanje,España,spain,EUR,Euros,€,,152.0,2153435.0,138476.0,17963.0,"[{'id': 19, 'name': 'Tempranillo', 'seo_name':...",//images.vivino.com/regions/backgrounds/_nMNiD...,//thumbs.vivino.com/region_backgrounds/_nMNiDR...,//thumbs.vivino.com/region_backgrounds/_nMNiDR...,,11050,Vega Sicilia,vega-sicilia,0,,,,,,3.67321,,3.852942,1.838485,3.483987,496.0,322.0,,oak,"[{'id': 242, 'name': 'leather', 'count': 89}, ...",252.0,37277.0,non_oak,"[{'id': 422, 'name': 'tobacco', 'count': 83}, ...",187.0,4915.0,earth,"[{'id': 422, 'name': 'tobacco', 'count': 83}, ...",187.0,21630.0,black_fruit,"[{'id': 341, 'name': 'prune', 'count': 14}]",148.0,21076.0,292.0,oak,93.0,422.0,tobacco,83.0,434.0,vanilla,80.0,83.0,cedar,36.0,101.0,chocolate,25.0,6.0,almond,5.0,417.0,toast,5.0,438.0,walnut,3.0,213.0,hazelnut,2.0,267.0,marzipan,2.0,6.0,almond,5.0,417.0,toast,5.0,438.0,walnut,3.0,213.0,hazelnut,2.0,267.0,marzipan,2.0,6.0,almond,5.0,417.0,toast,5.0,438.0,walnut,3.0,213.0,hazelnut,2.0,267.0,marzipan,2.0,Normal,12819,4.7,68460,177,180.0,spanish-ribera-del-duero-red,Spaans,Rode wijn uit Ribero del Duero,Spaanse Ribera Del Duero Rood,,"Rioja may be the most famous region in Spain, ...",Often Tempranillo-based,"['In Ribera del Duero, the grape Tempranillo i...",5.0,Zeer full-bodied,3.0,Hoog,1.0,//images.vivino.com/backgrounds/styles/jvbiWuq...,//images.vivino.com/backgrounds/styles/thumbs/...,es,Spanje,España,spain,EUR,Euros,€,,152.0,2153435.0,138476.0,17963.0,"[{'id': 19, 'name': 'Tempranillo', 'seo_name':...",405.0,Ribera del Duero,,ribera-del-duero,es,Spanje,España,spain,EUR,Euros,€,,152.0,2153435.0,138476.0,17963.0,"[{'id': 19, 'name': 'Tempranillo', 'seo_name':...",404.0,//images.vivino.com/regions/backgrounds/_nMNiD...,//thumbs.vivino.com/region_backgrounds/_nMNiDR...,//thumbs.vivino.com/region_backgrounds/_nMNiDR...,677.0,5020.0,0.0,1.0,,,,4.0,Rundvlees,beef,//images.vivino.com/backgrounds/foods/4_beef.png,//images.vivino.com/backgrounds/foods/thumbs/4...,8.0,Lam,lamb,//images.vivino.com/backgrounds/foods/8_lamb.png,//images.vivino.com/backgrounds/foods/thumbs/8...,11.0,Wild (hert),game,//images.vivino.com/backgrounds/foods/11_venis...,//images.vivino.com/backgrounds/foods/thumbs/1...,,,,,,,,,,,19.0,Tempranillo,tempranillo,True,172842.0,,,,,,,,,,,,,,,,,,,,,,,,,


#### Wine Enthusiast Magazine

In [8]:
# base_location = r"wine_data"

# i = 0
# for file in os.listdir(base_location):
#     file_location = base_location + '/' + str(file)
#     if i==0:
#         wine_mag = pd.read_csv(file_location, encoding='latin-1')
#         i+=1
#     else:
#         df_to_append = pd.read_csv(file_location, encoding='latin-1', low_memory=False)
#         wine_mag = pd.concat([wine_mag, df_to_append], axis=0)

# print(wine_mag.shape)
# wine_mag.head(3)

(145397, 23)


Unnamed: 0.1,Unnamed: 0,Alcohol,Appellation,Bottle Size,Category,Country,Date Published,Description,Designation,Importer,Name,Price,Province,Rating,Region,Reviewer,Reviewer Twitter Handle,Subregion,User Avg Rating,Variety,Vintage,Winery,ï»¿
0,0.0,13%,"Moulin-Ã -Vent, Beaujolais, France",750 ml,Red,France,6/1/2019,Owned by the Beaune negociant firm Louis Jadot...,Clos des Thorins,Kobrand,ChÃ¢teau des Jacques 2016 Clos des Thorins (M...,$41,Beaujolais,93.0,Moulin-Ã -Vent,Roger Voss,@vossroger,,Not rated yet [Add Your Review],Gamay,2016.0,ChÃ¢teau des Jacques,
1,1.0,14.5%,"Santa Maria Valley, Central Coast, California, US",750 ml,Red,US,6/1/2019,The potential for Gamay Noir to become an impo...,Murmur Vineyard,,Stasis 2016 Murmur Vineyard Gamay Noir (Santa ...,$42,California,92.0,Central Coast,Matt Kettmann,@mattkettmann,Santa Maria Valley,Not rated yet [Add Your Review],"Gamay Noir, Gamay",2016.0,Stasis,
2,2.0,13.5%,"Moulin-Ã -Vent, Beaujolais, France",750 ml,Red,France,6/1/2019,"This well-structured, layered wine offers beau...",La Roche,Kobrand,ChÃ¢teau des Jacques 2016 La Roche (Moulin-Ã ...,$41,Beaujolais,92.0,Moulin-Ã -Vent,Roger Voss,@vossroger,,Not rated yet [Add Your Review],Gamay,2016.0,ChÃ¢teau des Jacques,


In [53]:
# wine_mag.to_csv('wine_mag_db.csv')

In [56]:
wine_mag = pd.read_csv('wine_mag_db.csv')
print(wine_mag.shape)
wine_mag.head(3)

(145397, 25)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Alcohol,Appellation,Bottle Size,Category,Country,Date Published,Description,Designation,Importer,Name,Price,Province,Rating,Region,Reviewer,Reviewer Twitter Handle,Subregion,User Avg Rating,Variety,Vintage,Winery,ï»¿,Currency
0,0,0.0,13%,"Moulin-Ã -Vent, Beaujolais, France",750 ml,Red,France,6/1/2019,Owned by the Beaune negociant firm Louis Jadot...,Clos des Thorins,Kobrand,ChÃ¢teau des Jacques 2016 Clos des Thorins (M...,$41,Beaujolais,93.0,Moulin-Ã -Vent,Roger Voss,@vossroger,,Not rated yet [Add Your Review],Gamay,2016.0,ChÃ¢teau des Jacques,,USD
1,1,1.0,14.5%,"Santa Maria Valley, Central Coast, California, US",750 ml,Red,US,6/1/2019,The potential for Gamay Noir to become an impo...,Murmur Vineyard,,Stasis 2016 Murmur Vineyard Gamay Noir (Santa ...,$42,California,92.0,Central Coast,Matt Kettmann,@mattkettmann,Santa Maria Valley,Not rated yet [Add Your Review],"Gamay Noir, Gamay",2016.0,Stasis,,USD
2,2,2.0,13.5%,"Moulin-Ã -Vent, Beaujolais, France",750 ml,Red,France,6/1/2019,"This well-structured, layered wine offers beau...",La Roche,Kobrand,ChÃ¢teau des Jacques 2016 La Roche (Moulin-Ã ...,$41,Beaujolais,92.0,Moulin-Ã -Vent,Roger Voss,@vossroger,,Not rated yet [Add Your Review],Gamay,2016.0,ChÃ¢teau des Jacques,,USD


# Data wrangling

## Function

In [9]:
def clean_up_vivino(df):
    
    #concatenate grapes and food
    df['grapes'] = df[['wine_style_grapesname','wine_style_grapes_name','wine_style_grapes_name','wine_style_grapes_name.1','wine_style_grapes_name.2','wine_style_grapes_name.3','wine_style_grapes_name.4']].fillna(' ').agg(' '.join, axis=1)
    df['food'] = df[['wine_food_seo_name','wine_food_seo_name.1','wine_food_seo_name.2','wine_food_seo_name.3','wine_food_seo_name.4']].fillna(' ').agg(' '.join, axis=1)
    df['taste'] = df[['wine_taste_flavor_1group','wine_taste_flavor_2group','wine_taste_flavor_3group',
                      'wine_taste_flavor_4group','wine_taste_flavor_1primary_keywordsname',
                      'wine_taste_flavor_2primary_keywordsname','wine_taste_flavor_3primary_keywordsname',
                      'wine_taste_flavor_4primary_keywordsname','wine_taste_flavor_5primary_keywordsname',
                      'wine_taste_flavor_1primary_keywordsname.1','wine_taste_flavor_1primary_keywordsname.2',
                      'wine_taste_flavor_1primary_keywordsname.3','wine_taste_flavor_1primary_keywordsname.4',
                      'wine_taste_flavor_1primary_keywordsname.5',
                      'wine_taste_flavor_3primary_keywordsname.1','wine_taste_flavor_3primary_keywordsname.2',
                      'wine_taste_flavor_3primary_keywordsname.3','wine_taste_flavor_3primary_keywordsname.4',
                      'wine_taste_flavor_3primary_keywordsname.5','wine_taste_flavor_4primary_keywordsname.1',
                      'wine_taste_flavor_4primary_keywordsname.2','wine_taste_flavor_4primary_keywordsname.3',
                      'wine_taste_flavor_4primary_keywordsname.4','wine_taste_flavor_4primary_keywordsname.5']].fillna(' ').agg(' '.join, axis=1)
    
    
    # drop duplicates
    df = df.drop_duplicates()
    
    # drop columns where %nulls is greater than 0.9
    nulls_wines = pd.DataFrame(df.isna().sum()/len(df)).reset_index()
    nulls_wines.columns = ['column_name', 'nulls_percentage']
    columns_NaN = nulls_wines[nulls_wines['nulls_percentage'] >= 0.9]
    drop_columns_list = list(columns_NaN['column_name'])
    df = df.drop(drop_columns_list, axis = 1)
    
    # translate dutch words to english
    df['wine_style_acidity_description'] = df['wine_style_acidity_description'].replace(['Hoog', 'Laag'], ['High','Low'])
    df['wine_style_body_description'] = df['wine_style_body_description'].replace(['Zeer full-bodied', 'Zeer licht-bodied'], ['Very full-bodied','Very light-bodied'])

    
    #replace type_id by text
    df['wine_type_id'] = df['wine_type_id'].replace(['1', '2','3','4'], ['Red','White','Sparkling','Rose'])
    
    df['wine_characteristics'] = df[['wine_style_description','wine_style_acidity_description','wine_style_body_description']].fillna(' ').agg(' '.join, axis=1)
    
    df = df[['wine_name', 'year',  'wine_type_id', 'wine_region_country.seo_name', 'wine_region_name', 
             'ratings_average', 'grapes', 'wine_winery_name', 'amount', 'code','wine_characteristics']]
    
    return df

In [10]:
def clean_up_wine_mag(df):
    
    df['Currency'] = 'USD'
    df = df[['Name', 'Vintage', 'Category', 'Country', 'Province', 'Rating', 'Variety', 'Winery',
             'Price', 'Currency', 'Description']]
    
    # drop duplicates
    df = df.drop_duplicates()
    
    # drop NaN 
    df = df.dropna().reset_index(drop=True)
    
    return df

In [11]:
# Find duplicate columns

def getDuplicateColumns(df):
 
    # Create an empty set
    duplicateColumnNames = set()
 
    # Iterate through all the columns
    # of dataframe
    for x in range(df.shape[1]):
 
        # Take column at xth index.
        col = df.iloc[:, x]
 
        # Iterate through all the columns in
        # DataFrame from (x + 1)th index to
        # last index
        for y in range(x + 1, df.shape[1]):
 
            # Take column at yth index.
            otherCol = df.iloc[:, y]
 
            # Check if two columns at x & y
            # index are equal or not,
            # if equal then adding
            # to the set
            if col.equals(otherCol):
                duplicateColumnNames.add(df.columns.values[y])
 
    # Return list of unique column names
    # whose contents are duplicates.
    return list(duplicateColumnNames)

In [12]:
all_wines_vivino = clean_up_vivino(all_wines)
print(all_wines_vivino.shape)
all_wines_vivino.head(3)

(6586, 11)


Unnamed: 0,wine_name,year,wine_type_id,wine_region_country.seo_name,wine_region_name,ratings_average,grapes,wine_winery_name,amount,code,wine_characteristics
0,Brunello di Montalcino Riserva,1955,1,italy,Brunello di Montalcino,4.9,Sangiovese,Biondi-Santi,1970.05,EUR,"From Tuscany in central Italy, Brunello enjoys..."
1,Amarone della Valpolicella Classico,2013,1,italy,Amarone della Valpolicella Classico,4.8,Corvina Rondinella Rondinella Corvinone,Quintarelli Giuseppe,389.0,EUR,"Amarone della Valpolicella is a rich, dry Ital..."
2,Unico Reserva Especial Edición,2015,1,spain,Ribera del Duero,4.8,Tempranillo,Vega Sicilia,608.32,EUR,"Rioja may be the most famous region in Spain, ..."


In [13]:
all_wines_wine_mag = clean_up_wine_mag(wine_mag)
print(all_wines_wine_mag.shape)
all_wines_wine_mag.head(3)

(131132, 11)


Unnamed: 0,Name,Vintage,Category,Country,Province,Rating,Variety,Winery,Price,Currency,Description
0,ChÃ¢teau des Jacques 2016 Clos des Thorins (M...,2016.0,Red,France,Beaujolais,93.0,Gamay,ChÃ¢teau des Jacques,$41,USD,Owned by the Beaune negociant firm Louis Jadot...
1,Stasis 2016 Murmur Vineyard Gamay Noir (Santa ...,2016.0,Red,US,California,92.0,"Gamay Noir, Gamay",Stasis,$42,USD,The potential for Gamay Noir to become an impo...
2,ChÃ¢teau des Jacques 2016 La Roche (Moulin-Ã ...,2016.0,Red,France,Beaujolais,92.0,Gamay,ChÃ¢teau des Jacques,$41,USD,"This well-structured, layered wine offers beau..."


In [14]:
# Combine both datasets

all_wines_vivino.columns = ['Name', 'Vintage', 'Category', 'Country', 'Province', 'Rating', 'Variety', 'Winery',
             'Price', 'Currency', 'Description']

wine_dataframe = pd.concat([all_wines_vivino,all_wines_wine_mag], axis = 0)
wine_dataframe = wine_dataframe.reset_index(drop=True)
print(wine_dataframe.shape)
wine_dataframe.head(3)

(137718, 11)


Unnamed: 0,Name,Vintage,Category,Country,Province,Rating,Variety,Winery,Price,Currency,Description
0,Brunello di Montalcino Riserva,1955,1,italy,Brunello di Montalcino,4.9,Sangiovese,Biondi-Santi,1970.05,EUR,"From Tuscany in central Italy, Brunello enjoys..."
1,Amarone della Valpolicella Classico,2013,1,italy,Amarone della Valpolicella Classico,4.8,Corvina Rondinella Rondinella Corvinone,Quintarelli Giuseppe,389.0,EUR,"Amarone della Valpolicella is a rich, dry Ital..."
2,Unico Reserva Especial Edición,2015,1,spain,Ribera del Duero,4.8,Tempranillo,Vega Sicilia,608.32,EUR,"Rioja may be the most famous region in Spain, ..."


In [15]:
duplicateColumnNames = getDuplicateColumns(wine_dataframe)
duplicateColumnNames

[]

In [16]:
# all_wines = all_wines.drop(columns = duplicateColumnNames)
# all_wines.shape

##### Re-check NaN values

In [17]:
# check NaN
# create a dataframe to store percentage of nulls
nulls_wines = pd.DataFrame(wine_dataframe.isna().sum()/len(wine_dataframe)).reset_index()
nulls_wines.columns = ['column_name', 'nulls_percentage']
nulls_wines[nulls_wines['nulls_percentage'] > 0.5]

Unnamed: 0,column_name,nulls_percentage


## NLP -  pre-processing

Next steps:

1. Normalize words in wine description (tokenization, remove stopwords, punctuation and non alpha-numeric charactersc, stemming, lemmatization)
2. Enhance the set of normalized words with phrases (bi-grams and tri-grams)
3. Apply mapping of wine terms (based on wine wheel) to each description
4. Retrieve the Word2Vec word embedding for each mapped term in the description
5. Weight each word embedding in the wine description with a TF-IDF weighting
6. Sum the word embeddings within each wine description to create a single vector representation of the wine description

### Remove special characters, numbers, etc.

In [18]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [19]:
preprocessed_name = []

for word in tqdm(wine_dataframe['Name'].values):
    word = re.sub(r"http\S+", "", word)
    word = decontracted(word)
    word = re.sub("\S*\d\S*", "", word).strip()
    word = re.sub('[^A-Za-z]+', ' ', word)
    word = str.lower(word)

    preprocessed_name.append(word.strip())
    
wine_dataframe['Name'] = preprocessed_name

100%|████████████████████████████████| 137718/137718 [00:02<00:00, 59724.96it/s]


In [20]:
preprocessed_description = []

for word in tqdm(wine_dataframe['Description'].values):
    word = re.sub(r"http\S+", "", word)
    word = decontracted(word)
    word = re.sub("\S*\d\S*", "", word).strip()
    word = re.sub('[^A-Za-z]+', ' ', word)
    word = str.lower(word)

    preprocessed_description.append(word.strip())
    
wine_dataframe['Description'] = preprocessed_description

100%|████████████████████████████████| 137718/137718 [00:07<00:00, 18299.69it/s]


### Tokenization
Tokenization is breaking the raw text into small chunks. Tokenization breaks the raw text into words, sentences called tokens. These tokens help in understanding the context or developing the model for the NLP. The tokenization helps in interpreting the meaning of the text by analyzing the sequence of the words.

https://towardsdatascience.com/tokenization-for-natural-language-processing-a179a891bad4


### Stemming and lemmatization

"Stemming is the process of reducing inflection in words to their root forms such as mapping a group of words to the same stem even if the stem itself is not a valid word in the Language."

Stem (root) is the part of the word to which you add inflectional (changing/deriving) affixes such as (-ed,-ize, -s,-de,mis). So stemming a word or sentence may result in words that are not actual words. Stems are created by removing the suffixes or prefixes used with a word.

Information: Removing suffixes from a word is called Suffix Stripping


Lemmatization is the process of converting a word to its base form. 


The difference between stemming and lemmatization is, lemmatization considers the context and converts the word to its meaningful base form, whereas stemming just removes the last few characters, often leading to incorrect meanings and spelling errors.


### Stopwords
Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query. 

These words should not take up space in the database, or take up valuable processing time. For this, they should be removed.

In [21]:
description_list = list(wine_dataframe['Description'])
description_list = [str(r) for r in description_list]
full_corpus = ' '.join(description_list)
sentences_tokenized = sent_tokenize(full_corpus)

#print(sentences_tokenized[:5])

In [22]:
stop_words = set(stopwords.words('english')) 

punctuation_table = str.maketrans({key: None for key in string.punctuation})
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    try:
        word_list = word_tokenize(raw_text)
        normalized_sentence = []
        for w in word_list:
            try:
                w = str(w)
                lower_case_word = str.lower(w)
                stemmed_word = sno.stem(lower_case_word)
                no_punctuation = stemmed_word.translate(punctuation_table)
                if len(no_punctuation) > 1 and no_punctuation not in stop_words:
                    normalized_sentence.append(no_punctuation)
            except:
                continue
        return normalized_sentence
    except:
        return ''

# sentence_sample = sentences_tokenized[:10]
normalized_sentences = []
for s in sentences_tokenized:
    normalized_text = normalize_text(s)
    normalized_sentences.append(normalized_text)

### Phrases
Automatically detect common phrases – aka multi-word expressions, word n-gram collocations – from a stream of sentences.

https://radimrehurek.com/gensim/models/phrases.html

In paragraphs, certain words always tend to occur in pairs (bigram) or in groups of threes (trigram). Because the two words combined together form the actual entity. For example: The word ‘French’ refers the language or region and the word ‘revolution’ can refer to the planetary revolution. But combining them, ‘French Revolution’, refers to something completely different.

It’s quite important to form bigrams and trigrams from sentences, especially when working with bag-of-words models.

So how to create the bigrams?

It’s quite easy and efficient with gensim’s Phrases model. The created Phrases model allows indexing, so, just pass the original text (list) to the built Phrases model to form the bigrams.

https://www.machinelearningplus.com/nlp/gensim-tutorial/#10howtocreatebigramsandtrigramsusingphrasermodels

In [23]:
phrases = Phrases(normalized_sentences)
phrases = Phrases(phrases[normalized_sentences])

ngrams = Phraser(phrases)

phrased_sentences = []
for sent in normalized_sentences:
    phrased_sentence = ngrams[sent]
    phrased_sentences.append(phrased_sentence)

full_list_words = [item for sublist in phrased_sentences for item in sublist]

Extract the most common words and rank these by how frequently they appear.

In [24]:
word_counts = Counter(full_list_words)
sorted_counts = OrderedDict(word_counts.most_common(5000))
counter_df = pd.DataFrame.from_dict(sorted_counts, orient='index')
# top_5000_words = counter_df.head(5000)
counter_df.to_csv('top_5000_descriptors.csv')

### Map wine description to Wine Wheel

In [25]:
descriptor_mapping = descriptor_mapping.set_index('raw descriptor')
descriptor_mapping.head(3)

Unnamed: 0_level_0,occurences,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abras,190,abrasive,high_tannin,tannin
acacia,130,acacia,flowery,flower
acacia_flower,77,acacia,flowery,flower


In [26]:
descriptor_mapping.index

Index(['abras', 'acacia', 'acacia_flower', 'acacia_honey', 'accid_benedictin',
       'acid', 'acid_german', 'ad_depth', 'add_complex', 'add_depth',
       ...
       'yellow_flower', 'yellow_peach', 'yellow_pear', 'yellow_stone', 'zest',
       'zesti', 'zesti_lime', 'zing', 'zingi', 'zippi'],
      dtype='object', name='raw descriptor', length=1520)

In [27]:
def map_wine_wheel(word):
    if word in list(descriptor_mapping.index):
        normalized_word = descriptor_mapping['level_3'][word]
        return normalized_word
    else:
        return word

normalized_sentences = []
for sentence in phrased_sentences:
    normalized_sentence = []
    for word in sentence:
        normalized_word = map_wine_wheel(word)
        normalized_sentence.append(str(normalized_word))
    normalized_sentences.append(normalized_sentence)

## Word Embeddings with Word2Vec

Word2Vec approach uses deep learning and neural networks-based techniques to convert words into corresponding vectors in such a way that the semantically similar vectors are close to each other in N-dimensional space, where N refers to the dimensions of the vector.

Word2Vec returns some astonishing results. Word2Vec's ability to maintain semantic relation is reflected by a classic example where if you have a vector for the word "King" and you remove the vector represented by the word "Man" from the "King" and add "Women" to it, you get a vector which is close to the "Queen" vector.

Word2Vec has several advantages over bag of words. Word2Vec retains the semantic meaning of different words in a document. The context information is not lost. Another great advantage of Word2Vec approach is that the size of the embedding vector is very small. Each dimension in the embedding vector contains information about one aspect of the word. 

In [28]:
model = Word2Vec(normalized_sentences, vector_size=300, min_count=5, epochs=15)
print(model)

model.save('wine_word2vec_model.bin')

Word2Vec(vocab=14772, vector_size=300, alpha=0.025)


#### test the model

In [29]:
# get 10 most similar words
model.wv.most_similar(positive='meat', topn=10)

[('would_overwhelm', 0.99947190284729),
 ('dish', 0.99944669008255),
 ('avoid_lighter', 0.9993506669998169),
 ('pinot_noir', 0.9992489814758301),
 ('dish_caution', 0.9991723895072937),
 ('varieti', 0.999150812625885),
 ('produc_region', 0.999139130115509),
 ('differ_tier', 0.999128520488739),
 ('valpolicella_typic', 0.9991271495819092),
 ('avoid_pair', 0.9991248250007629)]

Results above seem to be related to chocolate. Since the model is perfoming well, I will apply the same logic to wine descriptions

## Wine Description Embeddings

In [30]:
wine_reviews = list(wine_dataframe['Description'])

def map_wine_wheel_description(word):
    if word in list(descriptor_mapping.index):
        descriptor_to_return = descriptor_mapping['level_3'][word]
        return descriptor_to_return

descriptorized_reviews = []
for review in wine_reviews:
    normalized_review = normalize_text(review)
    phrased_review = ngrams[normalized_review]
    descriptors_only = [map_wine_wheel_description(word) for word in phrased_review]
    no_nones = [str(d) for d in descriptors_only if d is not None]
    descriptorized_review = ' '.join(no_nones)
    descriptorized_reviews.append(descriptorized_review)

In [31]:
descriptorized_reviews

['full_bodied bold rich full_bodied red black fruit soil velvety tannin medium_bodied acid full_bodied complex salt cheese meat saline meat full_bodied',
 'amarone dry red amarone ripe bold fresh dry raisin concentrated sugar full_bodied wood_age amarone amarone wood_age wood_age amarone amarone amarone rich robust low_alcohol full_bodied',
 'dark dark fruit tobacco vanilla dark bread grapey dark delicate polished oak wood_age full_bodied',
 'dark dark fruit tobacco vanilla dark bread grapey dark delicate polished oak wood_age full_bodied',
 'medium_bodied',
 'rich concentrated full_bodied red grippy tannin cassis cigar cake_spice gravel full_bodied',
 'dark dark fruit tobacco vanilla dark bread grapey dark delicate polished oak wood_age full_bodied',
 'perfumed flower violet soft silky tannin powerful elegant soil limestone chalk gravel gravel soil full_bodied',
 'dark dark fruit tobacco vanilla dark bread grapey dark delicate polished oak wood_age full_bodied',
 'elegant grain full_b

### TF-IDF (term frequency-inverse document frequency)

TF- the number of times the word t occurs in document d divided by the total number of the words in document d. In other words, it is the probability of finding a word in document d.


TFIDF works by proportionally increasing the number of times a word appears in the document but is counterbalanced by the number of documents in which it is present. Hence, words like 'this', 'are' etc., that are commonly present in all the documents are not given a very high rank.

TFIDF takes into consideration both the frequency of each term across all descriptions, as well as the number of descriptors in each wine review.

In [32]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit(descriptorized_reviews)

dict_of_tfidf_weightings = dict(zip(X.get_feature_names_out(), X.idf_))

wine_review_vectors = []
for d in descriptorized_reviews:
    descriptor_count = 0
    weighted_review_terms = []
    terms = d.split(' ')
    for term in terms:
        if term in dict_of_tfidf_weightings.keys():
            tfidf_weighting = dict_of_tfidf_weightings[term]
            word_vector = model.wv.get_vector(term).reshape(1, 300)
            weighted_word_vector = tfidf_weighting * word_vector
            weighted_review_terms.append(weighted_word_vector)
            descriptor_count += 1
        else:
            continue
    try:
        review_vector = sum(weighted_review_terms)/len(weighted_review_terms)
    except:
        review_vector = []
    vector_and_count = [terms, review_vector, descriptor_count]
    wine_review_vectors.append(vector_and_count)

wine_dataframe['normalized_descriptors'] = list(map(itemgetter(0), wine_review_vectors))
wine_dataframe['review_vector'] = list(map(itemgetter(1), wine_review_vectors))
wine_dataframe['descriptor_count'] = list(map(itemgetter(2), wine_review_vectors))

wine_dataframe.reset_index(inplace=True)
wine_dataframe.head()

Unnamed: 0,index,Name,Vintage,Category,Country,Province,Rating,Variety,Winery,Price,Currency,Description,normalized_descriptors,review_vector,descriptor_count
0,0,brunello di montalcino riserva,1955,1,italy,Brunello di Montalcino,4.9,Sangiovese,Biondi-Santi,1970.05,EUR,from tuscany in central italy brunello enjoys ...,"[full_bodied, bold, rich, full_bodied, red, bl...","[[1.0030519, 1.7826862, -0.5469858, 0.28920358...",20
1,1,amarone della valpolicella classico,2013,1,italy,Amarone della Valpolicella Classico,4.8,Corvina Rondinella Rondinella Corvinone,Quintarelli Giuseppe,389.0,EUR,amarone della valpolicella is a rich dry itali...,"[amarone, dry, red, amarone, ripe, bold, fresh...","[[2.2180426, 4.3720107, -1.171321, 0.38832542,...",24
2,2,unico reserva especial edici n,2015,1,spain,Ribera del Duero,4.8,Tempranillo,Vega Sicilia,608.32,EUR,rioja may be the most famous region in spain b...,"[dark, dark, fruit, tobacco, vanilla, dark, br...","[[0.58700037, 2.1194954, -0.39164963, -0.10664...",14
3,3,unico reserva especial edici n,2019,1,spain,Ribera del Duero,4.8,Tempranillo,Vega Sicilia,593.07,EUR,rioja may be the most famous region in spain b...,"[dark, dark, fruit, tobacco, vanilla, dark, br...","[[0.58700037, 2.1194954, -0.39164963, -0.10664...",14
4,4,chambertin clos de beze grand cru,1995,1,france,Chambertin-Clos de Bèze Grand Cru,4.9,Pinot Noir,Domaine Armand Rousseau,4228.95,EUR,high medium bodied,[medium_bodied],"[[2.7672644, 3.5603874, -1.7324052, 0.97336274...",1


## Wine Description Vectors

### Similarities Between Grape Varieties

Similarities between grape varieties will be done by taking the average of all wine description vectors for each grape variety, compressing them into two dimensions using PCA (Principal Component Analysis) and then producing a visual mapping.

In [33]:
# keep descriptions with more than 5 descriptors
wine_reviews_mincount = wine_dataframe.loc[wine_dataframe['descriptor_count'] > 5]
wine_reviews_mincount.reset_index(inplace=True)

In [34]:
variety_mapping = {'Shiraz': 'Syrah', 'Pinot Gris': 'Pinot Grigio', 'Pinot Grigio/Gris': 'Pinot Grigio', 
                   'Garnacha, Grenache': 'Grenache', 'Garnacha': 'Grenache', 'CarmenÃ¨re': 'Carmenere',
                    'GrÃ¼ner Veltliner': 'Gruner Veltliner', 'TorrontÃ©s': 'Torrontes', 
                   'RhÃ´ne-style Red Blend': 'Rhone-style Red Blend', 'AlbariÃ±o': 'Albarino',
                  'GewÃ¼rztraminer': 'Gewurztraminer', 'RhÃ´ne-style White Blend': 'Rhone-style White Blend'}

In [35]:
def consolidate_varieties(variety_name):
    if variety_name in variety_mapping:
        return variety_mapping[variety_name]
    else:
        return variety_name

wine_reviews_clean = wine_reviews_mincount.copy()
wine_reviews_clean['Variety'] = wine_reviews_clean['Variety'].apply(consolidate_varieties)

def subset_wine_vectors(list_of_varieties):
    wine_variety_vectors = []
    for v in list_of_varieties:
        one_var_only = wine_reviews_clean.loc[wine_reviews_clean['Variety'] == v]
        review_arrays = one_var_only['review_vector'].apply(lambda x: x[0])
        average_variety_vec = np.average(review_arrays)
        wine_variety_vector = [v, average_variety_vec]
        wine_variety_vectors.append(wine_variety_vector)
    return wine_variety_vectors

def pca_wine_variety(list_of_varieties):
    wine_var_vectors = subset_wine_vectors(list_of_varieties)
    pca = PCA(n_components=2)
    pca.fit([w[1] for w in wine_var_vectors])  
    pca_dataset = pca.fit_transform([w[1] for w in wine_var_vectors])
    pca_dataframe = pd.DataFrame(pca_dataset, columns=['pca_1', 'pca_2'])
    pca_dataframe.index = [w[0] for w in wine_var_vectors]
    # print(pca_dataframe)
    return pca_dataframe

### Wine pairing Recommender

In [36]:
input_vectors = list(wine_reviews_mincount['review_vector'])
input_vectors_listed = [a.tolist() for a in input_vectors]
input_vectors_listed = [a[0] for a in input_vectors_listed]

knn = NearestNeighbors(n_neighbors=10, algorithm= 'brute', metric='cosine')
model_knn = knn.fit(input_vectors_listed)

In [37]:
wine_reviews_mincount

Unnamed: 0,level_0,index,Name,Vintage,Category,Country,Province,Rating,Variety,Winery,Price,Currency,Description,normalized_descriptors,review_vector,descriptor_count
0,0,0,brunello di montalcino riserva,1955,1,italy,Brunello di Montalcino,4.9,Sangiovese,Biondi-Santi,1970.05,EUR,from tuscany in central italy brunello enjoys ...,"[full_bodied, bold, rich, full_bodied, red, bl...","[[1.0030519, 1.7826862, -0.5469858, 0.28920358...",20
1,1,1,amarone della valpolicella classico,2013,1,italy,Amarone della Valpolicella Classico,4.8,Corvina Rondinella Rondinella Corvinone,Quintarelli Giuseppe,389.0,EUR,amarone della valpolicella is a rich dry itali...,"[amarone, dry, red, amarone, ripe, bold, fresh...","[[2.2180426, 4.3720107, -1.171321, 0.38832542,...",24
2,2,2,unico reserva especial edici n,2015,1,spain,Ribera del Duero,4.8,Tempranillo,Vega Sicilia,608.32,EUR,rioja may be the most famous region in spain b...,"[dark, dark, fruit, tobacco, vanilla, dark, br...","[[0.58700037, 2.1194954, -0.39164963, -0.10664...",14
3,3,3,unico reserva especial edici n,2019,1,spain,Ribera del Duero,4.8,Tempranillo,Vega Sicilia,593.07,EUR,rioja may be the most famous region in spain b...,"[dark, dark, fruit, tobacco, vanilla, dark, br...","[[0.58700037, 2.1194954, -0.39164963, -0.10664...",14
4,5,5,grand vin pauillac premier grand cru class,1990,1,france,Pauillac,4.8,Cabernet Sauvignon Cabernet Franc Cabernet Fra...,Château Latour,937.75,EUR,bordeaux pauillac wines are rich and concentra...,"[rich, concentrated, full_bodied, red, grippy,...","[[1.464141, 2.5000045, -0.7663763, 0.563424, 0...",11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130610,137711,137711,castillo de almansa reserva tempranillo almansa,1993.0,Red,Spain,Central Spain,87.0,Tempranillo,Castillo de Almansa,$10,USD,bright cherry flavors up front morph into love...,"[bright, cherry, earth, complex, light_bodied,...","[[0.36946848, 0.7590681, -0.19092798, 0.083533...",7
130611,137712,137712,la rioja alta vina ardanza reserva rioja,1990.0,Red,Spain,Northern Spain,86.0,Tempranillo,La Rioja Alta,$27,USD,mature shows browning at the rim and plenty of...,"[mature, leafy, tobacco, sweet, vanilla, cherr...","[[0.20185587, 0.78000206, -0.1353619, -0.07407...",7
130612,137713,137713,pr ncipe de viana tempranillo navarra,1996.0,Red,Spain,Northern Spain,85.0,Tempranillo,PrÃ­ncipe de Viana,$11,USD,dark cherries earth and some oak scents mark t...,"[dark, cherry, earth, oak, easy, wood_age, pop...","[[0.4642195, 1.358213, -0.24858989, -0.0084054...",9
130613,137714,137714,torres coronas tempranillo pened s,1997.0,Red,Spain,Catalonia,84.0,Tempranillo,Torres,$9,USD,a mainly tempranillo wine with a small additio...,"[fresh, purple, sweet, ripe, simple, fruit, ea...","[[0.4444789, 0.83052886, -0.24661751, 0.138429...",11


In [38]:
def best_similar_wines(name_test,k):
    
    wine_test_vector = wine_reviews_mincount.loc[wine_reviews_mincount['Name'] == name_test]['review_vector'].tolist()[0]
    
    distance, indice = model_knn.kneighbors(wine_test_vector, n_neighbors=k+1)
    distance_list = distance[0].tolist()[1:]
    indice_list = indice[0].tolist()[1:]

    main_wine = wine_reviews_mincount.loc[wine_reviews_mincount['Name'] == name_test]

    print('Wine to match:', name_test)
    print('The original wine has the following descriptors:', list(main_wine['normalized_descriptors'])[0])
    print('_________')

    n = 1
    for d, i in zip(distance_list, indice_list):
        wine_name = wine_reviews_mincount['Name'][i]
        wine_category = wine_reviews_mincount['Category'][i]
        wine_country = wine_reviews_mincount['Country'][i]
        wine_grape = wine_reviews_mincount['Variety'][i]
        wine_descriptors = wine_reviews_mincount['normalized_descriptors'][i]
        print('Suggestion', str(n), ':', wine_name)
        print('Category', str(n), ':', wine_category)
        print('Grapes', str(n), ':', wine_grape)
        print('Country', str(n), ':', wine_country)
        print('This wine has the following descriptors:', wine_descriptors)
        print('')
        n+=1

best_similar_wines('la rioja alta vina ardanza reserva rioja',5)

Wine to match: la rioja alta vina ardanza reserva rioja
The original wine has the following descriptors: ['dill', 'flower', 'dry', 'fruit', 'raisin', 'plum', 'cinnamon', 'tomato', 'acid']
_________
Suggestion 1 : nieto senetiner don nicanor malbec mendoza
Category 1 : Red
Grapes 1 : Malbec
Country 1 :  Argentina
This wine has the following descriptors: ['black', 'jam', 'heavy', 'flower', 'thick', 'raisin', 'prune', 'heavy', 'inky', 'elegant']

Suggestion 2 : telmo rodr guez gazur ribera del duero
Category 2 : Red
Grapes 2 : Tinto Fino, Tempranillo
Country 2 :  Spain
This wine has the following descriptors: ['plum', 'berry', 'raisin', 'funky', 'flower', 'gritty', 'chunky', 'ripe', 'baked', 'plum', 'raisin', 'high_tannin', 'soft', 'sticky', 'full_bodied']

Suggestion 3 : louis latour bourgogne
Category 3 : Red
Grapes 3 : Pinot Noir
Country 3 :  France
This wine has the following descriptors: ['juicy', 'jam', 'strawberry', 'light_bodied', 'soft', 'acid']

Suggestion 4 : antonin rodet ch t

In [39]:
def best_wine_paring(list_of_descriptors,number_of_suggestions):
    

    weighted_review_terms = []
    for term in list_of_descriptors:
        if term not in dict_of_tfidf_weightings:
            if term not in descriptor_mapping.index:
                print('choose a different descriptor from', term)
                continue
            else:
                term = descriptor_mapping['normalized'][term]
        tfidf_weighting = dict_of_tfidf_weightings[term]
        word_vector = model.wv.get_vector(term).reshape(1, 300)
        weighted_word_vector = tfidf_weighting * word_vector
        weighted_review_terms.append(weighted_word_vector)
    review_vector = sum(weighted_review_terms)
    
    distance, indice = model_knn.kneighbors(review_vector, n_neighbors=number_of_suggestions+1)
    distance_list = distance[0].tolist()[1:]
    indice_list = indice[0].tolist()[1:]

    n = 1
    for d, i in zip(distance_list, indice_list):
        wine_name = wine_reviews_mincount['Name'][i]
        wine_category = wine_reviews_mincount['Category'][i]
        wine_country = wine_reviews_mincount['Country'][i]
        wine_grape = wine_reviews_mincount['Variety'][i]
        wine_descriptors = wine_reviews_mincount['normalized_descriptors'][i]
        print('Suggestion', str(n), ':', wine_name)
        print('Category', str(n), ':', wine_category)
        print('Grapes', str(n), ':', wine_grape)
        print('Country', str(n), ':', wine_country)
        print('This wine has the following descriptors:', wine_descriptors)
        print('')
        n+=1
    

In [40]:
taste_food = ['jam', 'cheese']
best_wine_paring(list_of_descriptors=taste_food,number_of_suggestions = 3)

Suggestion 1 : migration dierberg vineyard chardonnay santa maria valley
Category 1 : White
Grapes 1 : Chardonnay
Country 1 :  US
This wine has the following descriptors: ['fresh', 'apple', 'wet_rocks', 'salt', 'brown_butter', 'marmalade', 'ripe', 'nectarine']

Suggestion 2 : brewer clifton chardonnay sta rita hills
Category 2 : White
Grapes 2 : Chardonnay
Country 2 :  US
This wine has the following descriptors: ['brine', 'wet_earth', 'sour', 'salt', 'citrus', 'nectarine', 'salt', 'pear']

Suggestion 3 : cusumano angimb tenuta ficuzza white terre siciliane
Category 3 : White
Grapes 3 : White Blend
Country 3 :  Italy
This wine has the following descriptors: ['ripe', 'pear', 'acacia', 'bread_crust', 'linear', 'apple', 'citrus', 'bitter_almond', 'fresh', 'acid', 'salt', 'closed']



# Wine @ 5

In [41]:
!jupyter nbextension install --py widgetsnbextension --sys-prefix
!jupyter nbextension enable --py widgetsnbextension --sys-prefix
!jupyter serverextension enable voila --sys-prefix

Installing /Users/mariaesquivel/opt/anaconda3/lib/python3.9/site-packages/widgetsnbextension/static -> jupyter-js-widgets
Up to date: /Users/mariaesquivel/opt/anaconda3/share/jupyter/nbextensions/jupyter-js-widgets/extension.js.map
Up to date: /Users/mariaesquivel/opt/anaconda3/share/jupyter/nbextensions/jupyter-js-widgets/extension.js
- Validating: [32mOK[0m

    To initialize this nbextension in the browser every time the notebook (or other app) loads:
    
          jupyter nbextension enable widgetsnbextension --py --sys-prefix
    
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m
Enabling: voila
- Writing config: /Users/mariaesquivel/opt/anaconda3/etc/jupyter
    - Validating...
      voila 0.4.0 [32mOK[0m


In [42]:
import warnings
warnings.filterwarnings('ignore')

import ipywidgets as widgets
from IPython.display import display, clear_output

In [43]:
# Image Widget - wine pairing

file = open('pairing.jpg', 'rb')
image = file.read()

image_headline = widgets.Image(
                    value=image,
                    format='jpg',
                    width='300'
                )

label_headline = widgets.Label(
                    style={'description_width': 'initial'}
                )

vbox_headline = widgets.VBox([image_headline, label_headline])

In [44]:
# Image Widget - wine recommender

file1 = open("recommender.jpg", "rb")
image1 = file1.read()

image_headline1 = widgets.Image(
                    value=image1,
                    format='jpg',
                    width='300'
                )

label_headline1 = widgets.Label(
                    style={'description_width': 'initial'}
                )

vbox_headline1 = widgets.VBox([image_headline1, label_headline1])

In [45]:
# wine_name

pairing = widgets.Text(placeholder = 'Tastes, Flavours, Ingredients')

wine_name = widgets.Text(placeholder = 'Wine to match')

In [46]:
# number of suggestions

suggestions1 = widgets.Text(placeholder = 'Number of recommendations')
suggestions2 = widgets.Text(placeholder = 'Number of recommendations')

In [47]:
# button send - wine pairing

button_send = widgets.Button(
                description='Search',
                tooltip='Send',
                style={'description_width': 'initial'}
            )

output = widgets.Output()

def on_button_clicked(event):
    taste_food = [pairing.value]
    with output:
        clear_output()
        
        print(f"Here are your {suggestions1.value} suggestions for your delicious food!")
        print('')
        print('')
        best_wine_paring(list_of_descriptors=taste_food,number_of_suggestions = int(suggestions1.value))
        
        
        
        

button_send.on_click(on_button_clicked)

vbox_result = widgets.VBox([button_send, output])

In [48]:
# button send - wine recommender

button_send1 = widgets.Button(
                description='Search',
                tooltip='Send',
                style={'description_width': 'initial'}
            )

output1 = widgets.Output()

def on_button_clicked_1(event):
    with output1:
        clear_output()
        
        print(f"Here are your {suggestions2.value} suggestions that best match your wine!")
        print('')
        print('')
        best_similar_wines(str(wine_name.value),int(suggestions2.value))
        
        
        
        

button_send1.on_click(on_button_clicked_1)

vbox_result_1 = widgets.VBox([button_send1, output1])

In [49]:
# stacked right hand side - wine pairing

text_0 = widgets.HTML(value="<h1>WINE @ 5</h1>")
text_1 = widgets.HTML(value="<h2> Wine Pairing</h2>")
space = widgets.HTML(value="<h2>    </h2>")
text_2 = widgets.HTML(value="<h2> Today, I am in a mood for:  </h2>")
text_4 = widgets.HTML(value="<h2> How many recommendations?  </h2>")




vbox_text = widgets.VBox([text_0, text_1, space, text_2,pairing, text_4, suggestions1, vbox_result])

In [50]:
# stacked right hand side - wine recommender

text_0 = widgets.HTML(value="<h1>WINE @ 5</h1>")
text_1 = widgets.HTML(value="<h2> Wine Recommender</h2>")
space = widgets.HTML(value="<h2>    </h2>")
text_3 = widgets.HTML(value="<h2> I am looking for a similar wine to this one: </h2>")
text_4 = widgets.HTML(value="<h2> How many recommendations?  </h2>")




vbox_text1 = widgets.VBox([text_0, text_1, space, text_3, wine_name, text_4, suggestions2, vbox_result_1])

## Wine @ 5 - Wine pairing recommender 

In [51]:
page = widgets.HBox([vbox_headline, vbox_text])
display(page)

HBox(children=(VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x01,\x01,\x00\x00\xff…

## Wine @ 5 - Similar Wine recommender 

In [52]:
page1 = widgets.HBox([vbox_headline1, vbox_text1])
display(page1)

HBox(children=(VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x…