# Wine tasting Notebook

#### author: A. Tomberg
#### date: 04/12/2019
##### In this notebook, we will analyze a set of wines, pulled from Kaggle website (https://www.kaggle.com/zynicide/wine-reviews). We will look at how the description from a sommalier can be used to cluster wines by similar features, and use these cluster to recommend wines that you may like based on a selected example.

In [1]:
import string
import collections

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import FreqDist

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn import metrics
from pprint import pprint

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

#nltk.download('stopwords')
#nltk.download('punkt')

from joblib import dump, load

In [37]:
%run my_functions.ipynb

In [None]:
wine_data = pd.read_csv('winemag-data-130k-v2.csv/winemag-data-130k-v2.csv')
wine_data.info()

In [None]:
wine_data.describe()

In [None]:
wine_data.head()

In [None]:
# Plot what kind of wines are found in the set
# N.B. expects 1 word per row in selected column

plot_frequency_of_occurence(wine_data, 'variety')

In [None]:
wine_data['parsed_descr'] = wine_data['description'].apply(parse_decription)

In [None]:
wine_data.to_csv(r'parsed_data.csv')

In [None]:
wine_data = pd.read_csv('parsed_data.csv')

In [None]:
m = plot_freq_words(wine_data, 'parsed_descr', how_many = 50)

In [None]:
# vectorize the words found in the descriptions of wines

# 1) define words that you don't want to use:
stop_words = set(stopwords.words('english'))
stop_words.update(["drink", "now", "wine", "flavor", "flavors"])


In [None]:
# 2) create a vector representation of descriptions
vectorizer = TfidfVectorizer(lowercase=True, tokenizer=parse_decription, stop_words=stop_words, max_df=50000, min_df=500, max_features=None, analyzer='word')
tfidf_model = vectorizer.fit_transform(wine_data['description'])


In [None]:
# save vectorizer for later
dump(tfidf_model, 'tfidf_model.joblib') 

In [None]:
# load saved vectorizer
tfidf_model = load('tfidf_model.joblib') 

print("n_samples: %d, n_features: %d" % tfidf_model.shape)

In [None]:
# Do the actual clustering

do_mini_batches = True
how_many_clusters = 6


if do_mini_batches:
    km = MiniBatchKMeans(n_clusters=how_many_clusters, init='k-means++', n_init=100,
                         init_size=1000, batch_size=1000)
else:
    km = KMeans(n_clusters=how_many_clusters, init='k-means++', max_iter=100, n_init=10)

print("Clustering data with %s" % km)
km.fit(tfidf_model);

In [None]:
silhouette_score(tfidf_model, km.labels_, sample_size = 10000)

In [None]:
dump(km, 'wine_kmeans6.joblib') 

In [None]:
km=load('wine_kmeans6.joblib') 

In [None]:
# add a cluster column to the wine dataset:

wine_data["cluster"] = ""

for idx, label in enumerate(km.labels_):
    wine_data.at[idx,"cluster"] = label

In [None]:
wine_data.head(10)

In [None]:
my_clusters = create_clusters_from_dataframe(wine_data, number_of_clusters = 5)

In [None]:
# let's look at how big the clusters are

sizes = [c.shape[0] for c in my_clusters]
labels = ['cluster_'+str(i) for i in range(len(my_clusters))]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=False)
ax1.axis('equal')
plt.show()

In [None]:
# for each cluster, which words are the most frequent? 
most_frequent_words = list()

i = 0
for c in my_clusters : 
    most_frequent_words.append(plot_freq_words(c, 'parsed_descr', how_many = 20, size = (10,8), \
                    title = 'Plotting frequency of occurence for cluster '+ str(i) ))
    i = i+1

In [None]:
# Create and generate a word cloud image:

for i in range(len(my_clusters)):
    text = " ".join(list(most_frequent_words[i].word))
    print('CLUSTER '+str(i))
    draw_word_cloud(text)

In [None]:
#which variety is the most frequent? 

i = 0
for c in my_clusters : 
    (plot_variety_in_cluster(c, n = 20, \
                    title = 'Plotting variety for cluster '+ str(i) ))
    i = i+1


In [None]:
wine_data.to_csv(r'clustered_data.csv')

### NOW LET'S SEE WHAT YOU LIKE  :)

We're going to vectorize the descriptions in each cluster, then compute a similarity matrix. Using this matrix, we can find nearest neighbours to a selected wine, and be able to recommend another wine to try based on preference.

In [17]:
# load clusters from file
wine_data = pd.read_csv('clustered_data.csv')
my_clusters = create_clusters_from_dataframe(wine_data, number_of_clusters = 5)

I created 5 clusters.


In [None]:
# 1) define words that you don't want to use:
stop_words = set(stopwords.words('english'))
stop_words.update(["drink", "now", "wine", "flavor", "flavors"])

# 2) vectorze using counts
vectorizer = CountVectorizer(lowercase=True, tokenizer=parse_decription, stop_words=stop_words, analyzer='word', max_features=100)

In [None]:
vectorizers = list()

count = 0
for c in my_clusters:
    tfidf = vectorizer.fit_transform(c.description)
    print('Cluster ', count, " n_samples: %d, n_features: %d" % tfidf.shape)
    vectorizers.append(tfidf)
    count = count+1


In [None]:
# save vectorizers for later
count = 0
for v in vectorizers:
    dump(v, 'vectorizer_'+str(count)+'.joblib') 
    count = count+1

In [5]:
# load saved vectorizers

vectorizers = list()

for c in range(len(my_clusters)):
    vectorizers.append(load('vectorizer_'+str(c)+'.joblib')) 
    

In [6]:
# compute similarity matrices for each cluster
similarity_scores = list()
for v in vectorizers:
    similarity_scores.append(cosine_similarity(v.toarray()))


In [38]:
# finds the first index of the wine wiht search found in title    
wine_tag = 'Stoneleigh'
wine_variety = 'Blanc'

idx_in_data, which_cluster, idx = get_index_from_title(wine_tag, wine_variety, wine_data, my_clusters)

#print(wine_data[wine_data.title.str.contains('Moscato')])

In [36]:

similar = list(enumerate(similarity_scores[which_cluster][idx])) #accessing the row corresponding to given wine to find all the similarity scores, enumerating over it
sorted_similar = sorted(similar,key=lambda x:x[1],reverse=True)[1:]

print("Top 5 similar movies to "+wine_tag+" & " + wine_variety+" are:\n")

best_matches = [idx]
for element in sorted_similar[:5]:
    best_matches.append(element[0])
    
best = (my_clusters[which_cluster]).loc[best_matches]
best = best[['title', 'country','variety','description', 'price']].copy()

best

Top 5 similar movies to Stoneleigh & Blanc are:



Unnamed: 0,title,country,variety,description,price
31,Stoneleigh 2008 Sauvignon Blanc (Marlborough),New Zealand,Sauvignon Blanc,The Stoneleigh style traditionally favors ripe...,19.0
9103,Marqués de la Concordia 2012 MM Reserva de la ...,Spain,Sparkling Blend,This round yet direct Cava offers stone-fruit ...,15.0
12764,Siren Song 2013 The Muse Brut Blanc de Noirs M...,US,Sparkling Blend,"Aromas of brioche, strawberry and citrus are f...",45.0
1720,Terrapura 2016 Sauvignon Blanc (Curicó Valley),Chile,Sauvignon Blanc,Light simple citrus aromas fall halfway betwee...,11.0
5169,Terrapura 2016 Sauvignon Blanc (Curicó Valley),Chile,Sauvignon Blanc,Light simple citrus aromas fall halfway betwee...,11.0
18036,Hatzimichalis 2009 Estate Hatzimichalis Chardo...,Greece,Chardonnay,Buttery spice and fresh citrus aromas lead thi...,22.0


And we're done!