In [52]:
import networkx as nx
import mpld3
import matplotlib.pyplot as plt
from numpy import random
import pandas as pd
import numpy as np
import sys
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import time
import nltk
import gensim
from gensim.models.doc2vec import Doc2Vec
from scipy.spatial.distance import cosine
import re
import pickle

sns.set_palette('Dark2')
plt.rcParams['figure.figsize'] = (8,6)
%matplotlib inline

# custom file
import wine_config # wine_config.get_config() is a dict
config = wine_config.get_config()

In [54]:
path = config['path']

wines = pd.read_csv(
    path +'Google Drive/Data Science/WineData/cleaned_wine_data.csv', 
    encoding='utf-8')

In [55]:
wines.head()

Unnamed: 0,@context,@id,@type,alc,category,closure,description,foodnote,image,name,...,rs,size,sku,style,ta,type,variety,winemaker,wood,year
0,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=2,Product,17.0,Muscat dAlexandrie,,,,http://images.wine.co.za/GetWineImage.ashx?Ima...,L Emigre Muscat d Alexandrie 1998,...,,,2,,,,Muscat dAlexandrie,,,1998.0
1,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=4,Product,12.0,Cabernet Sauvignon,,,,http://images.wine.co.za/GetWineImage.ashx?Ima...,Genesis Cabernet Sauvignon 1997/1998,...,1.0,,4,,5.0,Red,Cabernet Sauvignon,Chris Kelly,,1998.0
2,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=5,Product,12.0,Cabernet Sauvignon,,"Immense in all proportions, this wine defines ...",,http://images.wine.co.za/GetWineImage.ashx?Ima...,Rustenberg Peter Barlow 1996,...,1.0,,5,Dry,6.0,Red,Cabernet Sauvignon,Rod Easthope,wooded,1996.0
3,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=6,Product,13.0,Shiraz,,This is an elegant and flavoursome Shiraz with...,,http://images.wine.co.za/GetWineImage.ashx?Ima...,Henri Roselt Shiraz 1999,...,2.0,,6,,5.0,Red,Shiraz,Andr van Dyk,,1999.0
4,http://schema.org,https://wine.co.za/wine/wine.aspx?WINEID=7,Product,12.0,Sauvignon Blanc,,"A well balanced, full tropical flavoured with ...",,http://images.wine.co.za/GetWineImage.ashx?Ima...,Boland Sauvignon Blanc 2000,...,4.0,,7,,6.0,White,Sauvignon Blanc,Johan Joubert,,2000.0


# Assigning profiles

Using the wine profiles described in https://winefolly.com/tutorial/wine-aroma-wheel-100-flavors/ and updating it with words seen in the wine descriptions, we have a list of wine flavour profiles and words that belong to each profile

In [50]:
complete_corpus = pickle.load(open('preprocessed_descriptions.pkl',
                 'wb'))

In [51]:
comlete_corpus[:10]

0    [immense, in, all, proportions, this, wine, de...
1    [this, is, an, elegant, and, flavoursome, shir...
2    [a, well, balanced, full, tropical, flavoured,...
3    [a, ruby, tipe, port, blended, from, ruby, cab...
4    [dry, wine, with, fruity, quava, bouquet, and,...
Name: description, dtype: object


In [6]:
profiles = pd.read_csv('flavourProfiles.csv', index_col=0)
profiles

Unnamed: 0,Profile,Varieties
0,Flower,"iris,peony,elderflower,acacia,lilac,jasmine,ho..."
1,Citrus,"lime,lemon,grapefruit,orange,marmalade,lemons,..."
2,Tree Fruit,"quince,apple,pear,nectarine,peach,apricot,pers..."
3,Tropical Fruit,"pineapple,mango,guava,passion fruit,lychee,bub..."
4,Red Fruit,"cranberry,red plum,pomegranate,sour cherry,str..."
5,Black Fruit,"boysenberry,blackcurrant,black cherry,plum,bla..."
6,Dried Fruit,"raisin,fig,date,fruit cake,figs,fruitcake"
7,Noble Rot,"bees wax,ginger,honey"
8,Spice,"white pepper,red pepper,black pepper,cinnamon,..."
9,Vegetable,"grass,tomato leaf,gooseberry,bell pepper,jalap..."


In [7]:
profiles['Varieties'] = profiles['Varieties'].apply(lambda x: x.split(','))
profiles.head()

Unnamed: 0,Profile,Varieties
0,Flower,"[iris, peony, elderflower, acacia, lilac, jasm..."
1,Citrus,"[lime, lemon, grapefruit, orange, marmalade, l..."
2,Tree Fruit,"[quince, apple, pear, nectarine, peach, aprico..."
3,Tropical Fruit,"[pineapple, mango, guava, passion fruit, lyche..."
4,Red Fruit,"[cranberry, red plum, pomegranate, sour cherry..."


In [8]:
import gensim

model = Doc2Vec.load('descriptions_doc2vec')

# Load Google's pre-trained Word2Vec model.
# model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [9]:
profile_vectors = []
for p in profiles.Varieties:
    vec = model.infer_vector(p)
    profile_vectors.append(vec)

In [43]:
def most_likely_profiles(row, profiles, 
                         profile_vectors,
                         model):
    position = row.name
    similarities = [0] * len(profiles)
    for profile in range(len(profiles)):
        profile_vec = profile_vectors[profile]
        tokens = tokenize(row.description)
        text_vec = model.infer_vector(tokens)
        sim = cosine(text_vec, profile_vec)
        similarities[profile] = sim
    similarities = np.array(similarities)
    most_similar = similarities.argsort()[-3:][::-1]
    most_similar_profiles = list(
        profiles.Profile.loc[most_similar])
    row['First'] = most_similar_profiles[0]
    row['Second'] = most_similar_profiles[1]
    row['Third'] = most_similar_profiles[2]
    return row

In [44]:
top3_profiles = wines[['description']]
top3_profiles['First'] = ''
top3_profiles['Second'] = ''
top3_profiles['Third'] = ''
top3_profiles.apply(lambda x: most_likely_profiles(x,
                                                   profiles,
                                                   profile_vectors,
                                                   model),
                    axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,description,First,Second,Third
0,"Immense in all proportions, this wine defines ...",Sulfides & Mercaptans,Volatile Acidity (AceticAcid),Noble Rot
1,This is an elegant and flavoursome Shiraz with...,Sulfides & Mercaptans,Volatile Acidity (AceticAcid),Oak Aging
2,"A well balanced, full tropical flavoured with ...",Volatile Acidity (AceticAcid),Sulfides & Mercaptans,Oak Aging
3,"A Ruby tipe port blended from Ruby Cabernet, f...",Sulfides & Mercaptans,Volatile Acidity (AceticAcid),Microbial
4,Dry wine with fruity quava bouquet and flavour.,Sulfides & Mercaptans,Volatile Acidity (AceticAcid),Oak Aging
5,This wine is supported by a full bouquet of pe...,Volatile Acidity (AceticAcid),Sulfides & Mercaptans,Vegetable
6,"This wine shows lemon, melon and citrus flavou...",Volatile Acidity (AceticAcid),Sulfides & Mercaptans,Dried Fruit
7,"A refined, dry white wine, subtly wooded, with...",Volatile Acidity (AceticAcid),Sulfides & Mercaptans,Oak Aging
8,A naturally fermented Apple wine with the dept...,Oak Aging,General Aging,Brettanomyces
9,"Bright and clear, a light straw colour with a ...",Sulfides & Mercaptans,Volatile Acidity (AceticAcid),Vegetable


In [47]:
top3_profiles.First.value_counts()

Sulfides & Mercaptans            17079
Volatile Acidity (AceticAcid)    10509
Oak Aging                           33
Noble Rot                           22
Brettanomyces                        6
Microbial                            5
Vegetable                            5
Madeirized (orCooked)                4
Spice                                4
General Aging                        3
Red Fruit                            3
Tree Fruit                           3
Cork Taint(TCA)                      3
Dried Fruit                          2
Flower                               2
Tropical Fruit                       2
Earth                                1
Black Fruit                          1
Name: First, dtype: int64

# other method

In [5]:
descriptors = wines.description.apply(tokenize_and_stem)
print(descriptors.head())

0    [immens, in, all, proport, this, wine, defin, ...
1    [this, is, an, eleg, and, flavoursom, shiraz, ...
2    [a, well, balanc, full, tropic, flavour, with,...
3    [a, rubi, tipe, port, blend, from, rubi, caber...
4    [dri, wine, with, fruiti, quava, bouquet, and,...
Name: description, dtype: object


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=20000,
                                   min_df=0.2, stop_words='english',
                                   use_idf=True, 
                                   tokenizer=tokenize_and_stem,
                                   ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform( \
    wines.description) #fit the vectorizer to wine descr

print(tfidf_matrix.shape)

In [None]:
terms = tfidf_vectorizer.get_feature_names()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
from sklearn.cluster import KMeans

num_clusters = 10

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
vocab_frame = pd.DataFrame({'words': list(descriptors)}, index = descriptors)
print 'there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame'
print(vocab_frame.head())

In [None]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

# joblib.dump(km, 
#             '/Users/lauren/Google Drive/Data Science/WineData/descr_cluster.pkl')

km = joblib.load('/Users/lauren/Google Drive/Data Science/WineData/descr_cluster.pkl')
clusters = km.labels_.tolist()

In [None]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print "Cluster %d words:" % i 
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print ' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore')
        

In [None]:
from nltk.corpus import stopwords
import nltk
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.tokenize import word_tokenize
import re

def review_to_wordlist( review, remove_stopwords=True ): 
    """
    from https://www.kaggle.com/c/word2vec-nlp-tutorial
    
    Function to convert a document to a sequence of words,
    optionally removing stop words.  Returns a list of words.
    """

    review_text = review
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    review_text = re.sub('gold', '', review_text)
    review_text = re.sub('silver', '', review_text)
    review_text = re.sub('bronze', '', review_text)
    review_text = re.sub('[0-9]+', '', review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)


# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False ):   #from https://www.kaggle.com/c/word2vec-nlp-tutorial
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    if isinstance(review,float):
        return ['']
    else:
        review = review.decode('utf-8')
        raw_sentences = tokenizer.tokenize(review.strip())

    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence,remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

def sentences_to_list(sentences):
    newList = []
    skipped = 0
    for s in range(len(sentences)):
        try:
            newList += sentences[s]
        except:
            skipped +=1
    print 'Skipping ',skipped,' lines since they are NoneType'
    return newList

def unicodeStrToStr(x):
    try:
        x = map(str, x)
    except:
        pass
    return x

There are also some words in the tasting notes that are superfluous or unhelpful, so I have defined some functions to remove these

In [None]:
with open('wine_stopwords.txt') as f:
    wine_stopwords = []
    lines = f.readlines()
    for line in lines:
        wine_stopwords.append(str(line.split('\n')[0]))

def removeWineStopWords(descriptionList):
    global wine_stopwords
    if isinstance(descriptionList,float):
        return descriptionList
    else:
        list_u = []
        [list_u.append(v) for v in descriptionList if v not in wine_stopwords]
        if 'tint' in descriptionList:
            c_index = list_u.index("tint")
            list_u = [list_u[:c_index+1],list_u[c_index+1:]]
        elif 'colour' in descriptionList:
            c_index = list_u.index("colour")
            list_u = [list_u[:c_index+1],list_u[c_index+1:]] 
        elif 'color' in descriptionList:
            c_index = list_u.index("color")
            list_u = [list_u[:c_index+1],list_u[c_index+1:]] 
        return list_u
    
def getFlavoursnotColors(description):
    if isinstance(description,float):
        return []
    elif len(description)==0:
        return description        
    elif isinstance(description[0],unicode):
        return description
    else:
        for i in description:
            if 'tint' in i:
                pass
            elif 'colour' in i:
                pass
            elif 'color' in i:
                pass
            else:
                return i

Now the normal stopwords are removed...

In [None]:
descriptions = wines.TastingNotes.str.lower()
descriptionWords = descriptions.apply(lambda x: review_to_sentences(x,tokenizer,True))
descriptionWords = descriptionWords.apply(sentences_to_list)
descriptionWords.head()

Followed by the wine stopwords. I also make sure that I am not using visible descriptions in my analysis

In [None]:
cleanDescriptions = descriptionWords.apply(removeWineStopWords)

cleanDescriptionFlavours = cleanDescriptions.apply(getFlavoursnotColors)
cleanDescriptionFlavours.head()

I then read in the flavour profiles

In [None]:
flavourProfiles = pd.read_csv('flavourProfiles.csv',skiprows=1,names=['Profile','Varieties'])
flavourProfiles.Varieties = flavourProfiles.Varieties.str.lower()#.str.split(',')
# profileText = flavourProfiles.Varieties.str.replace(',',' ').str.cat(sep='')
flavourProfiles


In [None]:
from difflib import SequenceMatcher

def similarList(lst, b):
    similarities = []
    for i in range(len(lst)):
        similarities.append(SequenceMatcher(None, lst, b).ratio())
    return similarities

def assignFlavourProfile(flavourList,flavourProfiles,winestopwords):
    """
    This function looks at the leftover words in the tasting notes and either:
    1) Finds extra stopwords that were missed
    2) Updates the flavour profile to recognise new (possibly misspelled) descriptors
    3) Adds the word to a dictionary so tasting notes can be mapped to Flavour Profiles
    """
    profileDict= {}
    #address easy fits and multi-word flavours first
    for n in nltk.bigrams(flavourList):
        combinedBigram = n[0]+' '+n[1]
        if flavourProfiles['Varieties'].str.contains(combinedBigram).any():
            location =  flavourProfiles['Profile'].where(flavourProfiles['Varieties'].str.contains(combinedBigram))
            profileDict[combinedBigram] = list(location.dropna())[0]    
            
    #now we consider the flavours with frivolous adjectives
    for n in range(len(flavourList)):
        if n>0 and n%50 ==0:
            contin = raw_input('Stop or continue?[[s]/c]')
        else: 
            contin = 'c'
        if contin == 'c':                             
            if flavourList[n] in profileDict:
                pass
            else:
                f = flavourList[n]
                profile = []
                if flavourProfiles['Varieties'].str.contains(f).any():

                    check = flavourProfiles['Varieties'].where(flavourProfiles['Varieties'].str.contains(f)).dropna()
                    checkList = list(check)[0]
                    location =  flavourProfiles['Profile'].where(flavourProfiles['Varieties'].str.contains(f))
                    profile.append(list(location.dropna())[0]) 
                else:
                    print f, 'does not seem to belong to any profile.'
                    print 'Profiles are\n',flavourProfiles.Profile
                    user_input = raw_input('What profile should %s belong to? Return "w" if this is an winestopword, "p" if you want to see the previous word or "n" if you want to see the next word, else return the profile number'%f)
                    if user_input == 'w': 
                        winestopwords.append(f)
                    elif user_input == 'p': 
                        try:
                            f = flavourList[n-1]+' '+flavourList[n]
                        except:
                            pass
                        user_input = raw_input('What profile should %s belong to in context %s? Return x if still cannot be classified, else the profile number'%(f,flavourList))
                    elif user_input == 'n': 
                        try:
                            f = flavourList[n]+' '+flavourList[n+1]
                        except:
                            pass
                        user_input = raw_input('What profile should %s belong to? Return x if still cannot be classified, else the profile number'%f)
                    if user_input not in ['w','x']:
                        user_input = int(user_input)
                        flavourProfiles.Varieties[user_input]+=','+str(f)
                        profile.append(flavourProfiles.Profile[user_input])
                if len(profile)>1:
                    print 'Flavour is ',flavourList[n],'and chosen profiles are',profile
                    profileChoice = raw_input('What profile should  be chosen? Please choose index or "n" to go back to original list')
                    if profileChoice == 'n':
                        print 'Profiles are\n',flavourProfiles.Profile
                        profileChoice = raw_input('What profile should %s belong to? Return the profile number'%flavourList[n])
                        profile = list(flavourProfiles.Profile[int(profileChoice)])
                    else: 
                        profile = profile[int(profileChoice)]
                    profile = list(profileChoice)
                if len(profile) > 0:
                    profileDict[flavourList[n]] = profile[0]
        else:
            return profileDict,winestopwords,flavourList
    return profileDict,winestopwords

Now I combine all words in the descriptions and take the set of descriptors to check for stoppwords and create the dictionary to map tasting notes to profiles

In [None]:
uniqueFlavours = sentences_to_list(list(cleanDescriptionFlavours))
print 'Description words ',len(uniqueFlavours)
uniqueFlavours = list(set(uniqueFlavours))
print 'Unique description words: ',len(uniqueFlavours)

In [None]:
assignDict,wineStopWords = assignFlavourProfile(uniqueFlavours,flavourProfiles,wine_stopwords)
print assignDict

with open('wine_stopwords.txt','w') as f:
    for a in wineStopWords:
        if a == wineStopWords[-1]:
            f.write(a)
        else:
            f.write(a+'\n')
flavourProfiles.to_csv('flavourProfiles.csv',columns =['Profile','Varieties'])

In [None]:
wines['clean_TastingNotes'] = cleanDescriptionFlavours 
wines.clean_TastingNotes.head()

### Map tasting notes to profile

In [None]:
def flavour_to_profile(assignDict,row):
    """
    Take the flavours identified from the tasting notes and map them to profiles
    The result will be a table where the row will show '1' if the profile is present in that wine and '0' otherwise
    """
    profiles = []
    for f in row.clean_TastingNotes:
        try:
            if row[assignDict[f]] ==1:
                pass
            else:
                row[assignDict[f]] +=1
        except:
            pass
    return row
    
wine_FlavourProfiles = wines[['WineName','WineType','Vintage_dateTime','clean_TastingNotes']].dropna()
wine_FlavourProfiles.clean_TastingNotes = wine_FlavourProfiles.clean_TastingNotes.apply(lambda y: np.nan if len(y)==0 else y)
for p in flavourProfiles.Profile:
    wine_FlavourProfiles[p] = 0
wine_FlavourProfiles = wine_FlavourProfiles.dropna()
wine_FlavourProfiles = wine_FlavourProfiles.apply(lambda row: flavour_to_profile(assignDict,row), axis = 1)
wine_FlavourProfiles.describe()

In [None]:
wines.info()
wines.to_csv('../WineData/WineData_withFlavours.csv', columns = wines.columns, encoding='utf-8')

In [None]:
print 'Number of instances of each profile: '
print wine_FlavourProfiles.sum(axis = 0)
wine_sums = wine_FlavourProfiles.sum(axis = 1)
print 'Number of wines where a profile has been identified: ',sum(wine_sums > 0)
wine_FlavourProfiles.to_csv('../WineData/wine_FlavourPofiles.csv', columns = wine_FlavourProfiles.columns, encoding='utf-8')