In [44]:
import networkx as nx
import mpld3
import matplotlib.pyplot as plt
from numpy import random
import pandas as pd
import pickle
import numpy as np
import sys
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import time
import nltk
import gensim
from gensim.models.doc2vec import Doc2Vec
from nltk.stem.snowball import SnowballStemmer
import re

sns.set_palette('Dark2')
plt.rcParams['figure.figsize'] = (8,6)
%matplotlib inline


In [4]:
# Load the wine records from CSV, keep only the rows that have a tasting
# note, and renumber the rows from 0 (the previous index is kept as a column).
wines = (
    pd.read_csv(
        '~/Google Drive/Data Science/WineData/wineData_1_44000.csv',
        index_col=0,
        encoding="utf-8",
    )
    .dropna(subset=['description'])
    .reset_index()
)
wines.head()

Unnamed: 0,level_0,index,@context,@id,@type,alc,category,closure,description,foodnote,...,rs,size,sku,style,ta,type,variety,winemaker,wood,year
0,2,0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=5,Product,12.0,Cabernet Sauvignon,,"Immense in all proportions, this wine defines ...",,...,1.0,,5,Dry,6.0,Red,Cabernet Sauvignon,Rod Easthope,wooded,1996.0
1,3,0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=6,Product,13.0,Shiraz,,This is an elegant and flavoursome Shiraz with...,,...,2.0,,6,,5.0,Red,Shiraz,Andr van Dyk,,1999.0
2,4,0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=7,Product,12.0,Sauvignon Blanc,,"A well balanced, full tropical flavoured with ...",,...,4.0,,7,,6.0,White,Sauvignon Blanc,Johan Joubert,,2000.0
3,5,0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=10,Product,18.0,Ruby Cabernet,,"A Ruby tipe port blended from Ruby Cabernet, f...",,...,104.0,,10,,5.0,,Ruby Cabernet,Christie Steytler/Elmo du Plessis,,1998.0
4,6,0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=11,Product,13.0,Chenin Blanc,,Dry wine with fruity quava bouquet and flavour.,,...,3.0,,11,,5.0,White,Chenin Blanc,De Wet Lategan,,1999.0


In [5]:
# Summarise column dtypes and non-null counts to see which fields are sparse.
wines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27687 entries, 0 to 27686
Data columns (total 25 columns):
level_0        27687 non-null int64
index          27687 non-null int64
@context       27687 non-null object
@id            27687 non-null object
@type          27687 non-null object
alc            26190 non-null float64
category       27098 non-null object
closure        19725 non-null object
description    27687 non-null object
foodnote       14252 non-null object
image          27687 non-null object
name           27687 non-null object
origin         24996 non-null object
pH             25175 non-null float64
pack           24768 non-null object
rs             25512 non-null float64
size           9527 non-null object
sku            27687 non-null int64
style          17278 non-null object
ta             25104 non-null float64
type           25110 non-null object
variety        26722 non-null object
winemaker      26925 non-null object
wood           12877 non-null object
yea

# Cleaning data

In [6]:
# def decode_reencode(text):
#     return text.decode('utf-8', 'ignore').encode('ascii', 'ignore')
# wines.description = wines.description
# wines.description.head()

# Finding similarity between wines

To measure the similarity between tasting notes, I will use gensim's Doc2Vec (https://radimrehurek.com/gensim/models/doc2vec.html),
following the approach in the gensim Doc2Vec tutorial notebook (https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb).

First, split the descriptions into a training set (roughly 90%) and a test set (the remainder).

In [8]:
# Randomly assign roughly 90% of the descriptions to training and the rest to
# testing. A seeded generator makes the split repeatable across kernel restarts.
split_rng = np.random.RandomState(0)
mask = split_rng.random_sample(size=len(wines)) < 0.9
# Persist the mask so the same split can be recovered later (np.save appends
# ".npy"). The filename previously had a stray leading space (' training_mask');
# update any external reader of that file accordingly.
np.save('training_mask', mask)
train = list(wines.description[mask])
test = list(wines.description[~mask])

In [9]:
def read_docs(text, tokens_only=False):
    """Tokenise an iterable of raw strings for use with Doc2Vec.

    Every string is passed through ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    text : iterable of str
        Raw documents, one string per document.
    tokens_only : bool, optional
        If True, return the plain token lists. If False (the default), wrap
        each token list in a ``TaggedDocument`` tagged with ``[i]``, where
        ``i`` is the document's position in ``text``.

    Returns
    -------
    list
        One entry per input document, in input order.
    """
    tokenised = (gensim.utils.simple_preprocess(line) for line in text)
    if tokens_only:
        return list(tokenised)
    make_tagged = gensim.models.doc2vec.TaggedDocument
    return [make_tagged(tokens, [position])
            for position, tokens in enumerate(tokenised)]
        
# read_docs already returns a list, so no extra list() copy is needed.
# Training documents are tagged by position; test documents stay as bare tokens.
train_corpus = read_docs(train)
test_corpus = read_docs(test, tokens_only=True)

In [10]:
# Configure the Doc2Vec model (see the gensim Doc2Vec documentation for the
# meaning of each hyperparameter), then build its vocabulary from the
# tagged training documents. No training happens in this cell.
model = Doc2Vec(
    size=50,
    window=5,
    min_count=4,
    workers=2,
    iter=50,
)

model.build_vocab(train_corpus)

In [11]:
# Train on the tagged training corpus, passing the model's own corpus_count
# as the number of examples and its configured iter value as the epoch count.
model.train(train_corpus, 
            total_examples=model.corpus_count,
            epochs=model.iter)

34280317

In [12]:
# Persist the model trained on the training split to disk.
model.save('descriptions_doc2vec')

In [13]:
# Show one tagged training document and the vector the model infers from its
# tokens. A single pre-formatted argument to print(...) behaves the same on
# Python 2 and Python 3, matching the print(...) form used in the other cells.
example_doc = train_corpus[1]
# Wrap the TaggedDocument in a 1-tuple: it is itself a tuple, so '%' would
# otherwise try to unpack its fields as separate format arguments.
print('Training example:\n%s\n' % (example_doc,))
print('Inferred vector:\n%s' % model.infer_vector(example_doc.words))


Training example:
TaggedDocument([u'this', u'is', u'an', u'elegant', u'and', u'flavoursome', u'shiraz', u'with', u'typical', u'smoky', u'aromas', u'the', u'pleasant', u'oak', u'element', u'adds', u'additional', u'flavour', u'to', u'wine', u'which', u'is', u'already', u'extremely', u'drinkable', u'but', u'will', u'benefit', u'further', u'from', u'few', u'years', u'in', u'the', u'bottle'], [1]) 

Inferred vector:
[-0.0930359  -0.30942932  0.39607388  0.45649219  0.27134851  0.44686744
  0.61827892  0.30839178 -0.08189704 -0.00677106 -0.22757369  0.44066882
 -0.05589475  0.14599653  0.13611734 -0.25776219 -0.14768726 -0.15126583
  0.10687597  0.343086    0.03484654  0.10171247  0.25902629 -0.06561485
  0.66637981  0.78560984  0.51179874  0.51891065 -0.00267251  0.14817163
  0.40904647  0.07971036 -0.50989825 -0.09189577 -0.55828309  0.5867455
  0.06072885 -0.03525263  0.06929871  0.23024754  0.39947405 -0.09008864
  0.18071322  0.22674729  0.06049408 -0.01762876  0.17230511  0.5450272
 -0

In [14]:
# Infer a vector for one held-out description and rank every document the
# model was trained on by similarity to it.
inferred_vector = model.infer_vector(test_corpus[1])
sims = model.docvecs.most_similar(
    [inferred_vector], topn=len(model.docvecs))

# Print the query text followed by the best, middle and worst matches from
# the training corpus as a quick sanity check of the learned vectors.
print('Test Document ({}): «{}»\n'.format(1, ' '.join(test_corpus[1])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
checkpoints = (
    ('MOST', 0),
    ('MEDIAN', len(sims)//2),
    ('LEAST', len(sims) - 1),
)
for label, rank in checkpoints:
    match = sims[rank]  # (document tag, similarity score)
    matched_words = train_corpus[match[0]].words
    print(u'%s %s: «%s»\n' % (label, match, ' '.join(matched_words)))

Test Document (1): «bright and clear the colour of straw with pleasant gooseberry flavours and hints of straw and asparagus complex interplay of tastes crisp and dry with medium to long aftertaste»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc4,s0.001,t2):

MOST (16310, 0.75765460729599): «natural fermentation no added yeast months matured in french oak barrels citrus fruit and crispness is retained wood perfectly integrated on palate»

MEDIAN (22904, 0.27647846937179565): «considered as one of the noble wine varietals full bodied wine which can age very well with flavours of cassis blackberries and other dark fruits with tobacco and cigar box complexities well balanced with fine integrated tannins»

LEAST (24610, -0.2500000596046448): «the brick red colour of the grenache is well matched by ripe plum perfumes savoury note adds to the allure on the palate the wine is succulent supple and round the bright fruit endures to the finish and creates persistent aftertaste»



# Making a graph

Now that I've established that the document training works, I want every wine in the model, so I retrain on all of the descriptions and can then check similarities across the whole collection.

In [46]:
# Tokenise and tag every description (not just the training split) so each
# wine gets its own document vector; tag i is the wine's position in
# wines.description.
descriptions = read_docs(wines.description)
# Cache the preprocessed corpus. The with-block guarantees the file is flushed
# and closed; previously the handle returned by open() was never closed.
with open('preprocessed_descriptions.pkl', 'wb') as corpus_file:
    pickle.dump(descriptions, corpus_file)
# Same hyperparameters as the train/test experiment, now fit on the full corpus.
model = Doc2Vec(size=50,
                window=5,
                min_count=4,
                workers=2,
                iter=50)
model.build_vocab(descriptions)
model.train(descriptions,
            total_examples=model.corpus_count,
            epochs=model.iter)

38098747

In [42]:
# Build an undirected similarity graph: one node per wine, with edges to the
# wines whose document vectors are most similar to the vector inferred from
# that wine's own description.
#
# Fixes relative to the previous version of this cell:
#   * the similarity query now uses the vector inferred for the current wine
#     (it used the stale `inferred_vector` left over from the test-document
#     cell, so every wine was linked to the same three neighbours);
#   * the loop covers every description in the model rather than only the
#     first len(train) of them;
#   * a wine is never linked to itself;
#   * unused per-row DataFrame lookups were removed.
n_neighbours = 3
G = nx.Graph()
for i in range(len(descriptions)):
    current_vector = model.infer_vector(descriptions[i].words)
    G.add_node(i)
    # Request one extra hit because the wine itself is in the model and may
    # come back as its own nearest match.
    connections = model.docvecs.most_similar([current_vector],
                                             topn=n_neighbours + 1)
    neighbours = [hit[0] for hit in connections if hit[0] != i]
    for index in neighbours[:n_neighbours]:
        # add_edge creates the neighbour node if it is not in the graph yet.
        G.add_edge(i, index)


In [None]:
# Compute the spring layout once and hand it to the drawing call. Without
# pos=..., nx.draw ignores `pos` and computes a separate layout of its own.
pos = nx.spring_layout(G)
nx.draw(G, pos=pos)

In [None]:
import json
from networkx.readwrite import json_graph

In [None]:
# Copy selected wine fields onto each graph node as attributes.
# Each pair is (node attribute name, wines column name).
attribute_columns = (
    ('name', 'name'),
    ('year', 'year'),
    ('type', 'type'),
    ('variety', 'variety'),
    ('link', '@id'),
    ('category', 'category'),
)
for n in G:
    # Missing values become empty strings before being stored on the node.
    record = wines.iloc[n].fillna('')
    node_data = G.node[n]
    node_data['id'] = n
    for attribute, column in attribute_columns:
        node_data[attribute] = record[column]

In [None]:
# Convert the graph to node-link form and write it out as JSON. The with-block
# closes the file deterministically, so the data is fully flushed to disk;
# previously the handle returned by open() was never closed.
d = json_graph.node_link_data(G)
with open('graph/data/graph.json', 'w') as graph_file:
    json.dump(d, graph_file)

In [None]:
import flask
# Serve the file over http to allow for cross origin requests
app = flask.Flask(__name__, static_folder="graph")

# Catch-all route: any requested path is looked up in the static folder.
@app.route('/<path:path>')
def static_proxy(path):
    """Return the file at ``path`` from the app's static folder ("graph")."""
    return app.send_static_file(path)

print('\nGo to http://localhost:8000/graph.html to see the example\n')
# NOTE(review): app.run() starts Flask's development server and is expected to
# block this cell until the server is stopped, so run it last. The port here
# must stay in sync with the URL printed above.
app.run(port=8000)