In [5]:
# -*- coding: utf8 -*-

from datasketch.minhash import MinHash
from datasketch.lsh import MinHashLSH
import pandas

# sudo pip3 install nltk
from nltk.stem.porter import *  # import porter stemmer from nltk (natural language toolkit) package

# nltk (natural language toolkit) is a collection of instruments to deal with text documets
# https://www.nltk.org/
from nltk import word_tokenize, download
from nltk.corpus import stopwords
from nltk.stem.porter import *  # import Porter stemmer algo from nltk (natural language toolkit) package

# download some data needed by nltk. It creates a cached version, 
# then the download occur only the first time
download("punkt")  # download data needed by word_tokenizer 
download("stopwords")  # download english stopwords


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\luca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luca\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def get_keywords(s):
    """
    Return a list of keyword for a given text
    Eliminates the stop words
    optionally (uncomment it) does also the stemming
    
    :param s:
    :return:
    """

    en_stopwords = set(stopwords.words('english'))
    en_stopwords.add("aromas")  # manually appended some more words to stop words list
    
    # the function set converts lists to sets
    en_stopwords = set(en_stopwords)  # huge optimization, without it the process lasts forever
    
    keywords = []
    
    # some pre-processing needed in this particular case where some comments created problems
    # split creates a list from a string using white spaces as separator
    for w in s.split():
        if len(w) >= 3 and w.lower() not in en_stopwords:
            # some badly typed comments needs to remove punctuation
            tw = w.translate("".maketrans({".": None, ",": None}))
            #pippo.pluto  ---> pippopluto
            #Pippo. ---> Pippo 
            keywords.append(tw.lower())

    return keywords
    
    # try with inflection free keywords
    # stemmer = PorterStemmer()
    # return [stemmer.stem(w) for w in keywords]


In [7]:
def get_shingles(s, k):
    """
    Convert string s into shingles of len k
    :param s: 
    :return: 
    """
    # remove some punctuation symbols
    base_text = s.translate("".maketrans({".": None, ",": None}))
    
    # convert to lowercase
    base_text = base_text.lower()
    
    
    if len(s) <= k:  # if len s less than shingle length
        return [s]
    else:  # convert string to list of shingles of len k
        return [base_text[i:i + k] for i in range(len(s) - k + 1)]


In [8]:
# load winemag-data-130k-v2.csv in pandas dataframe
dataset = pandas.read_csv("winemag-data-130k-v2.csv")
# print(dataset.head())

# leave only title and description
dataset = dataset.drop(columns=["country", "designation", "points", "price", "province", "region_1", "region_2", "taster_name", "taster_twitter_handle",
                               "variety", "winery"])

# print out how many wines you have in the dataset
print("We have %d wines" % len(dataset))
dataset.head()

We have 129971 wines


Unnamed: 0.1,Unnamed: 0,description,title
0,0,"Aromas include tropical fruit, broom, brimston...",Nicosia 2013 Vulkà Bianco (Etna)
1,1,"This is ripe and fruity, a wine that is smooth...",Quinta dos Avidagos 2011 Avidagos Red (Douro)
2,2,"Tart and snappy, the flavors of lime flesh and...",Rainstorm 2013 Pinot Gris (Willamette Valley)
3,3,"Pineapple rind, lemon pith and orange blossom ...",St. Julian 2013 Reserve Late Harvest Riesling ...
4,4,"Much like the regular bottling from 2012, this...",Sweet Cheeks 2012 Vintner's Reserve Wild Child...


In [9]:
# clean all row with Nan values, it will cause issues later. See pandas doc.
# 0 means row (first axis)
dataset = dataset.dropna(axis=0)

dataset = dataset.truncate(after=20000, axis=0)

# print out how many wines you have after data cleanup
print("We have %d wines after cleanup" % len(dataset))

print("\n--------------")


We have 20001 wines after cleanup

--------------


In [10]:
wine_map = {}           # dictionary used to store wine names and description 
wine_min_hash_map = {}  # dict. used to store every computed minHash for later use
                        # allow to retrieve wine minhashes fron wine name (title fields)
    
threshold = 0.7 # The Jaccard similarity threshold between 0.0 and 1.0 for the index.
                # It determines how many similar item are in the result of the queries.
                # The initialized MinHash LSH will be optimized for the threshold by 
                # minizing the false positive and false negative.
num_perm = 128  # number of permutation

# creating LSH object. This is an index to store MinHashes objects.
# rows and bands parameter will be computed by threshold, num_perm and weight 
# arguments.
# weights is a tuple of weights of false negative and false positive, (0.5, 0.5) by default.
# If you want to pass row and bands directly use params=(num_bands, num_rows). This
# overwrite the computation of b and r based on threshold, num_perm and weights. 
# param argument is almost undocumented feature, take a look at the source of datasketch.
lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

In [11]:
# iterate over dataset and construct a map
# for wine title -> wine description and a map for wine_title -> minhash of description
# then it insert our minhash into a MinHashLSH index to query all minhashes in sublinear time.
for row in dataset.itertuples():
    title = row.title  # wine name
    desc  = row.description

    # check whether there are duplicates
    if title in wine_map:
        continue  # if it already exists, skip it, to avoid duplicates

    wine_map[title] = desc # store the wine title and description

    # compute minhash of the wine description
    min_hash = MinHash(num_perm=num_perm)
    # for word in get_shingles(desc, 5):
    for word in get_keywords(desc):
        min_hash.update(word.encode('utf-8'))

    # insert the wine minhash into the LSH index with his on title
    lsh.insert(title, min_hash)
    wine_min_hash_map[title] = min_hash  # store the 

print("DONE!")

DONE!


In [12]:
# iterate over each wine in the dataset and query minhashLSH index for similar wines
# query each wine in dataset for similarities
for wine in wine_map:
    result = lsh.query(wine_min_hash_map[wine])

    if len(result) > 1:
        print("Wine %s has %d similar item:" % (wine, len(result) - 1))
        for item in result:
            if item != wine:
                print("    %s" % item)
        print("\n---------")
    # else:
    #    print("%s has not similar wines" % wine)


Wine Vignerons des Terres Secrètes 2015  Mâcon-Milly Lamartine has 4 similar item:
    Sanguinhal 2013 Casabel Red (Lisboa)
    Roche de Bellene 2014 Vieilles Vignes  (Meursault)
    Cave de Cleebourg 2013 Prestige Pinot Gris (Alsace)
    Domaine Pascal et Mireille Renaud 2015  Saint-Véran

---------
Wine Casca Wines 2015 Cascas Winemaker Selection Red (Lisboa) has 1 similar item:
    Garriotin 2011 Sélection Malbec (Cahors)

---------
Wine Gabriel Meffre 2016 La Châsse Chardonnay (Vin de France) has 1 similar item:
    Cave de Lugny 2012 Florières  (Mâcon-Villages)

---------
Wine Cave de Cleebourg 2013 Prestige Pinot Gris (Alsace) has 1 similar item:
    Vignerons des Terres Secrètes 2015  Mâcon-Milly Lamartine

---------
Wine Coteaux da Murta 2013 Quinta da Murta Arinto (Bucelas) has 1 similar item:
    Jean de la Fontaine NV L'Eloquente Brut  (Champagne)

---------
Wine Ridolfi 2014  Rosso di Montalcino has 1 similar item:
    Carpineto 2014  Rosso di Montalcino

---------
Wine Dom