# Importing libraries and loading data

In [236]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re

%matplotlib inline

In [237]:
wine = pd.read_csv('data/clustered_data/det_cao_100_iter_12_clusters.csv')

# Preparing data for further exploration after clustering

## Dropping all rows with cluster 0

We decided to drop cluster 0 because after clustering with K-Modes this cluster had no particular features. 

In [238]:
wine = wine[wine['cluster']!=0]

In [239]:
wine['cluster'].value_counts()

5     10728
6      3982
3      3748
8      3224
7      2583
9      2375
2      2026
1      1283
11     1277
10     1152
4      1116
Name: cluster, dtype: int64

## Defining functions 

In [240]:
def variety_counter(cluster_number):
    
    variety_list = []
    
    wordfreq = {}
        
    for value in wine[wine['cluster']==cluster_number]['variety']:
        variety_list.append(value)
            
    for word in variety_list:
            if ( word not in wordfreq.keys() ): 
                wordfreq[word] = 1 
            else: 
                wordfreq[word] += 1   
                
    sorted_tuple = sorted(wordfreq.items(), key=lambda word: word[1], reverse=True)
                
    return sorted_tuple

In [241]:
def province_counter(cluster_number):
    
    province_list = []
    
    wordfreq = {}
        
    for value in wine[wine['cluster']==cluster_number]['province']:
        province_list.append(value)
            
    for word in province_list:
            if ( word not in wordfreq.keys() ): 
                wordfreq[word] = 1 
            else: 
                wordfreq[word] += 1   
                
    sorted_tuple = sorted(wordfreq.items(), key=lambda word: word[1], reverse=True)
                
    return sorted_tuple

In [242]:
def region_counter(cluster_number):
    
    region_list = []
    
    wordfreq = {}
        
    for value in wine[wine['cluster']==cluster_number]['region']:
        region_list.append(value)
            
    for word in region_list:
            if ( word not in wordfreq.keys() ): 
                wordfreq[word] = 1 
            else: 
                wordfreq[word] += 1   
                
    sorted_tuple = sorted(wordfreq.items(), key=lambda word: word[1], reverse=True)
                
    return sorted_tuple

In [243]:
def clean_text(string):
    
    string = re.sub(r'\W+',' ',string) # Replace everything non-alpahnumeric by ' '
    string = re.sub(r'\s+',' ',string) # Replace one or more whitespaces by  ' '
    string = re.sub(r'\d+',' ',string) # Replace one or more digits by  ' '
    
    return string

## Cleaning and tokenizing the description column again 

In [244]:
wine['description'] = list(map(clean_text, wine['description']))

In [245]:
wine['description'] = list(map(nltk.word_tokenize, wine['description']))

In [246]:
cluster_1_variety = variety_counter(1)
print('Cluster_1')
display(cluster_1_variety[0:11])

cluster_2_variety = variety_counter(2)
print('Cluster_2')
display(cluster_2_variety[0:11])

cluster_3_variety = variety_counter(3)
print('Cluster_3')
display(cluster_3_variety[0:11])

cluster_4_variety = variety_counter(4)
print('Cluster_4')
display(cluster_4_variety[0:11])

cluster_5_variety = variety_counter(5)
print('Cluster_5')
display(cluster_5_variety[0:11])

cluster_6_variety = variety_counter(6)
print('Cluster_6')
display(cluster_6_variety[0:11])

cluster_7_variety = variety_counter(7)
print('Cluster_7')
display(cluster_7_variety[0:11])

cluster_8_variety = variety_counter(8)
print('Cluster_8')
display(cluster_8_variety[0:11])

cluster_9_variety = variety_counter(9)
print('Cluster_9')
display(cluster_9_variety[0:11])

cluster_10_variety = variety_counter(10)
print('Cluster_10')
display(cluster_10_variety[0:11])

cluster_11_variety = variety_counter(11)
print('Cluster_11')
display(cluster_11_variety[0:11])

Cluster_1


[('cabernet sauvignon', 179),
 ('red blend', 166),
 ('nebbiolo', 152),
 ('pinot noir', 128),
 ('sangiovese', 76),
 ('bordeaux-style red blend', 74),
 ('tempranillo', 61),
 ('malbec', 47),
 ('syrah', 42),
 ('zinfandel', 39),
 ('chardonnay', 34)]

Cluster_2


[('red blend', 357),
 ('pinot noir', 267),
 ('nebbiolo', 198),
 ('sangiovese', 159),
 ('cabernet sauvignon', 137),
 ('syrah', 117),
 ('tempranillo', 86),
 ('zinfandel', 66),
 ('tempranillo blend', 50),
 ('rhône-style red blend', 46),
 ('bordeaux-style red blend', 43)]

Cluster_3


[('pinot noir', 687),
 ('red blend', 529),
 ('nebbiolo', 435),
 ('cabernet sauvignon', 374),
 ('sangiovese', 176),
 ('syrah', 173),
 ('malbec', 169),
 ('bordeaux-style red blend', 168),
 ('tempranillo', 113),
 ('rhône-style red blend', 85),
 ('merlot', 80)]

Cluster_4


[('pinot noir', 159),
 ('red blend', 152),
 ('nebbiolo', 139),
 ('cabernet sauvignon', 116),
 ('sangiovese', 44),
 ('bordeaux-style red blend', 41),
 ('syrah', 40),
 ('malbec', 39),
 ('chardonnay', 37),
 ('tempranillo', 36),
 ('merlot', 33)]

Cluster_5


[('chardonnay', 2173),
 ('riesling', 1103),
 ('sauvignon blanc', 775),
 ('sparkling blend', 747),
 ('white blend', 681),
 ('rosé', 429),
 ('champagne blend', 395),
 ('pinot noir', 337),
 ('glera', 332),
 ('pinot gris', 257),
 ('gewürztraminer', 226)]

Cluster_6


[('red blend', 769),
 ('pinot noir', 721),
 ('nebbiolo', 256),
 ('sangiovese', 197),
 ('zinfandel', 174),
 ('syrah', 170),
 ('cabernet sauvignon', 164),
 ('rhône-style red blend', 106),
 ('rosé', 103),
 ('merlot', 101),
 ('bordeaux-style red blend', 100)]

Cluster_7


[('cabernet sauvignon', 595),
 ('red blend', 378),
 ('bordeaux-style red blend', 209),
 ('pinot noir', 174),
 ('syrah', 154),
 ('malbec', 136),
 ('zinfandel', 111),
 ('tempranillo', 103),
 ('merlot', 94),
 ('sangiovese', 58),
 ('rhône-style red blend', 54)]

Cluster_8


[('pinot noir', 884),
 ('red blend', 353),
 ('cabernet sauvignon', 245),
 ('nebbiolo', 153),
 ('bordeaux-style red blend', 139),
 ('malbec', 122),
 ('tempranillo', 120),
 ('sangiovese', 114),
 ('rhône-style red blend', 100),
 ('syrah', 98),
 ('rosé', 95)]

Cluster_9


[('red blend', 330),
 ('syrah', 325),
 ('cabernet sauvignon', 289),
 ('zinfandel', 174),
 ('bordeaux-style red blend', 136),
 ('malbec', 128),
 ('tempranillo', 77),
 ('pinot noir', 69),
 ('rhône-style red blend', 64),
 ('petite sirah', 56),
 ('chardonnay', 55)]

Cluster_10


[('chardonnay', 615),
 ('sauvignon blanc', 70),
 ('pinot noir', 60),
 ('white blend', 46),
 ('tempranillo', 31),
 ('sparkling blend', 26),
 ('viognier', 25),
 ('cabernet sauvignon', 17),
 ('pinot gris', 16),
 ('malbec', 15),
 ('rhône-style white blend', 14)]

Cluster_11


[('pinot noir', 326),
 ('cabernet sauvignon', 159),
 ('red blend', 154),
 ('sangiovese', 65),
 ('bordeaux-style red blend', 60),
 ('tempranillo', 59),
 ('nebbiolo', 53),
 ('merlot', 44),
 ('syrah', 42),
 ('malbec', 33),
 ('shiraz', 27)]

In [247]:
cluster_1_province = province_counter(1)
print('Cluster_1')
display(cluster_1_province[0:11])

cluster_2_province = province_counter(2)
print('Cluster_2')
display(cluster_2_province[0:11])

cluster_3_province = province_counter(3)
print('Cluster_3')
display(cluster_3_province[0:11])

cluster_4_province = province_counter(4)
print('Cluster_4')
display(cluster_4_province[0:11])

cluster_5_province = province_counter(5)
print('Cluster_5')
display(cluster_5_province[0:11])

cluster_6_province = province_counter(6)
print('Cluster_6')
display(cluster_6_province[0:11])

cluster_7_province = province_counter(7)
print('Cluster_7')
display(cluster_7_province[0:11])

cluster_8_province = province_counter(8)
print('Cluster_8')
display(cluster_8_province[0:11])

cluster_9_province = province_counter(9)
print('Cluster_9')
display(cluster_9_province[0:11])

cluster_10_province = province_counter(10)
print('Cluster_10')
display(cluster_10_province[0:11])

cluster_11_province = province_counter(11)
print('Cluster_11')
display(cluster_11_province[0:11])

Cluster_1


[('california', 355),
 ('washington', 198),
 ('piedmont', 151),
 ('tuscany', 145),
 ('northern spain', 113),
 ('mendoza province', 63),
 ('oregon', 44),
 ('veneto', 21),
 ('sicily & sardinia', 19),
 ('south australia', 18),
 ('new york', 17)]

Cluster_2


[('california', 531),
 ('tuscany', 341),
 ('piedmont', 219),
 ('washington', 204),
 ('northern spain', 164),
 ('oregon', 99),
 ('veneto', 84),
 ('mendoza province', 60),
 ('southern italy', 58),
 ('sicily & sardinia', 53),
 ('central italy', 34)]

Cluster_3


[('california', 1266),
 ('piedmont', 453),
 ('tuscany', 443),
 ('washington', 347),
 ('mendoza province', 224),
 ('northern spain', 215),
 ('oregon', 114),
 ('sicily & sardinia', 86),
 ('new york', 49),
 ('veneto', 48),
 ('central italy', 47)]

Cluster_4


[('california', 415),
 ('piedmont', 149),
 ('tuscany', 115),
 ('northern spain', 83),
 ('washington', 83),
 ('mendoza province', 58),
 ('oregon', 23),
 ('central italy', 20),
 ('sicily & sardinia', 19),
 ('south australia', 18),
 ('southern italy', 16)]

Cluster_5


[('california', 2708),
 ('alsace', 754),
 ('new york', 660),
 ('washington', 580),
 ('veneto', 565),
 ('oregon', 509),
 ('champagne', 448),
 ('northeastern italy', 395),
 ('burgundy', 392),
 ('loire valley', 357),
 ('catalonia', 336)]

Cluster_6


[('california', 1426),
 ('tuscany', 553),
 ('piedmont', 355),
 ('washington', 231),
 ('sicily & sardinia', 162),
 ('veneto', 154),
 ('oregon', 140),
 ('southern italy', 113),
 ('northern spain', 112),
 ('central italy', 89),
 ('mendoza province', 77)]

Cluster_7


[('california', 1178),
 ('washington', 279),
 ('northern spain', 198),
 ('mendoza province', 186),
 ('tuscany', 160),
 ('oregon', 106),
 ('south australia', 91),
 ('veneto', 43),
 ('central italy', 37),
 ('sicily & sardinia', 35),
 ('piedmont', 35)]

Cluster_8


[('california', 1031),
 ('washington', 381),
 ('oregon', 281),
 ('northern spain', 228),
 ('tuscany', 202),
 ('piedmont', 168),
 ('mendoza province', 156),
 ('new york', 112),
 ('sicily & sardinia', 56),
 ('catalonia', 51),
 ('rhône valley', 41)]

Cluster_9


[('california', 1189),
 ('mendoza province', 179),
 ('tuscany', 159),
 ('northern spain', 155),
 ('washington', 117),
 ('sicily & sardinia', 67),
 ('piedmont', 53),
 ('south australia', 50),
 ('veneto', 48),
 ('southern italy', 45),
 ('central italy', 41)]

Cluster_10


[('california', 661),
 ('northern spain', 63),
 ('oregon', 55),
 ('washington', 53),
 ('mendoza province', 52),
 ('northeastern italy', 46),
 ('burgundy', 23),
 ('tuscany', 19),
 ('new york', 19),
 ('piedmont', 14),
 ('catalonia', 13)]

Cluster_11


[('california', 536),
 ('oregon', 114),
 ('washington', 110),
 ('tuscany', 106),
 ('northern spain', 90),
 ('piedmont', 62),
 ('mendoza province', 38),
 ('south australia', 32),
 ('rhône valley', 22),
 ('veneto', 20),
 ('southern italy', 15)]

In [248]:
cluster_1_region = region_counter(1)
print('Cluster_1')
display(cluster_1_region[0:11])

cluster_2_region = region_counter(2)
print('Cluster_2')
display(cluster_2_region[0:11])

cluster_3_region = region_counter(3)
print('Cluster_3')
display(cluster_3_region[0:11])

cluster_4_region = region_counter(4)
print('Cluster_4')
display(cluster_4_region[0:11])

cluster_5_region = region_counter(5)
print('Cluster_5')
display(cluster_5_region[0:11])

cluster_6_region = region_counter(6)
print('Cluster_6')
display(cluster_6_region[0:11])

cluster_7_region = region_counter(7)
print('Cluster_7')
display(cluster_7_region[0:11])

cluster_8_region = region_counter(8)
print('Cluster_8')
display(cluster_8_region[0:11])

cluster_9_region = region_counter(9)
print('Cluster_9')
display(cluster_9_region[0:11])

cluster_10_region = region_counter(10)
print('Cluster_10')
display(cluster_10_region[0:11])

cluster_11_region = region_counter(11)
print('Cluster_11')
display(cluster_11_region[0:11])

Cluster_1


[('barolo', 108),
 ('columbia valley (wa)', 83),
 ('rioja', 53),
 ('brunello di montalcino', 43),
 ('mendoza', 40),
 ('chianti classico', 39),
 ('napa valley', 38),
 ('barbaresco', 36),
 ('red mountain', 29),
 ('paso robles', 27),
 ('russian river valley', 26)]

Cluster_2


[('barolo', 124),
 ('chianti classico', 105),
 ('rioja', 95),
 ('brunello di montalcino', 93),
 ('columbia valley (wa)', 70),
 ('toscana', 67),
 ('paso robles', 58),
 ('barbaresco', 54),
 ('russian river valley', 51),
 ('amarone della valpolicella classico', 47),
 ('willamette valley', 46)]

Cluster_3


[('barolo', 267),
 ('mendoza', 149),
 ('chianti classico', 135),
 ('barbaresco', 132),
 ('columbia valley (wa)', 131),
 ('toscana', 117),
 ('napa valley', 106),
 ('rioja', 103),
 ('russian river valley', 99),
 ('sta. rita hills', 91),
 ('paso robles', 79)]

Cluster_4


[('barolo', 88),
 ('napa valley', 50),
 ('rioja', 48),
 ('russian river valley', 45),
 ('mendoza', 41),
 ('barbaresco', 39),
 ('paso robles', 37),
 ('columbia valley (wa)', 34),
 ('toscana', 33),
 ('chianti classico', 29),
 ('brunello di montalcino', 20)]

Cluster_5


[('alsace', 666),
 ('finger lakes', 450),
 ('champagne', 448),
 ('russian river valley', 335),
 ('columbia valley (wa)', 263),
 ('cava', 258),
 ('willamette valley', 227),
 ('napa valley', 203),
 ('mendoza', 146),
 ('alto adige', 139),
 ('california', 137)]

Cluster_6


[('russian river valley', 165),
 ('barolo', 149),
 ('toscana', 146),
 ('paso robles', 135),
 ('chianti classico', 114),
 ('napa valley', 93),
 ('columbia valley (wa)', 85),
 ('sta. rita hills', 81),
 ('barbaresco', 75),
 ('sonoma coast', 74),
 ('brunello di montalcino', 71)]

Cluster_7


[('napa valley', 240),
 ('paso robles', 149),
 ('mendoza', 126),
 ('columbia valley (wa)', 121),
 ('rioja', 72),
 ('ribera del duero', 62),
 ('alexander valley', 51),
 ('toscana', 50),
 ('walla walla valley (wa)', 50),
 ('sonoma county', 40),
 ('russian river valley', 40)]

Cluster_8


[('columbia valley (wa)', 153),
 ('russian river valley', 136),
 ('willamette valley', 134),
 ('mendoza', 111),
 ('rioja', 104),
 ('barolo', 98),
 ('sonoma coast', 87),
 ('sta. rita hills', 74),
 ('paso robles', 70),
 ('walla walla valley (wa)', 63),
 ('finger lakes', 62)]

Cluster_9


[('napa valley', 148),
 ('paso robles', 135),
 ('mendoza', 114),
 ('ribera del duero', 65),
 ('toscana', 58),
 ('santa barbara county', 57),
 ('dry creek valley', 56),
 ('santa ynez valley', 52),
 ('california', 52),
 ('russian river valley', 49),
 ('brunello di montalcino', 41)]

Cluster_10


[('russian river valley', 116),
 ('napa valley', 70),
 ('carneros', 48),
 ('mendoza', 37),
 ('sonoma coast', 34),
 ('santa maria valley', 32),
 ('columbia valley (wa)', 32),
 ('santa lucia highlands', 31),
 ('rioja', 31),
 ('california', 26),
 ('willamette valley', 24)]

Cluster_11


[('napa valley', 82),
 ('russian river valley', 68),
 ('rioja', 58),
 ('willamette valley', 49),
 ('columbia valley (wa)', 46),
 ('sonoma coast', 41),
 ('barolo', 33),
 ('mendoza', 31),
 ('brunello di montalcino', 25),
 ('toscana', 24),
 ('chianti classico', 21)]

In [249]:
scent_dict = {'floral': ['floral','iris', 'peony', 'elderflower', 'acacia', 'lilac', 'jasmine', 'honeysuckle', 'violet', 'lavender', 'rose', 'potpourri', 'hibiscus','flower','chamomile','daffodil' ], 
              'citrus' : ['lemongrass','citrus','citric', 'citrusy','lime','lemon','lemony','grapefruit','orange','marmalade','tangerine'],
              'tree_fruit' : ['quince','apple','pear','peach','apricot','persimmon','prune','nectarine'],
              'tropical_fruit' : ['banana','coconut','exotic','tropical','pineapple','mango','guava','kiwi','lychee', 'bubblegum','melon','papaya','honeydew','watermelon'],
              'red_fruit' : ['cranberry','plum','currant','cherry','strawberry', 'raspberry','gooseberry'],
              'black_fruit' : ['boysenberry','cassis','blackberry','blueberry','olive','elderberry'],
              'dried_fruit' : ['raisin','raisiny','fig','date','fruitcake','candied','candy','sugar'],
              'noble_rot' : ['beeswax','ginger','saffron','honey'],
              'herbal' : ['minty','herbs','herb','herbal','thyme','mint','sage','rosemary','dill','menthol','oregano'],
              'spice' : ['woodspice','nutmeg','pepper','peppercorn','cinammon','cardammom','anise','spice', 'fennel','eucalyptus','spicy','liquorice','clove'],
              'vegetable' : ['rhubarb','tomato','jalapeno','bell','tomato-leaf','grass'],
              'nut' : ['nut','pine','almond','nutty','hazelnut'],
              'earth' : ['stoney','root','tea','earth','earthy','petroleum','petrol','kerosene','rocks','beet','soil','gravel','slate','clay','mineral','salt','salty','chalk'],
              'microbial' : ['yeasty','yeast','microbial','mushroom','truffle','lager','sourdough','cream','milk','buttery','bread','creamy','butter','balsamic','vinegar'],
              'sauvage' : ['sauvage','indigenous','wild','animal','meat','gamey','meaty','game'],
              'aldehyde' : ['leather','chocolate','vanilla','cola','caramel','cocoa','leathery','toffee'],
              'fumee' : ['barbecue','smoke','smokey','cigar','tobacco','barrel','coffee','mocha','smoked','charred'],
              'wood' : ['wood','woody','cedar','oak','oaky','sandalwood','forest','acacia','pine']}

## Exploring clusters

### Creating a dictionary with clusters as keys and flavor profiles as values 

In [250]:
cluster_dictionary = {1:['red_fruit','herbal','spice','aldehyde','fumee','wood'],2:['red_fruit','spice','earth','aldehyde','fumee'],3:['red_fruit','herbal','spice','wood'],4:['red_fruit','herbal','spice','aldehyde','wood'],5:['citrus','tree_fruit'],6:['red_fruit','spice'], 7:['red_fruit','spice'],8:['red_fruit','black_fruit','aldehyde','fumee'],9:['black_fruit','spice','aldehyde'],10:['citrus','tropical_fruit','microbial','aldehyde','wood'],11:['red_fruit','aldehyde','fumee','wood']}

In [251]:
cluster_dictionary

{1: ['red_fruit', 'herbal', 'spice', 'aldehyde', 'fumee', 'wood'],
 2: ['red_fruit', 'spice', 'earth', 'aldehyde', 'fumee'],
 3: ['red_fruit', 'herbal', 'spice', 'wood'],
 4: ['red_fruit', 'herbal', 'spice', 'aldehyde', 'wood'],
 5: ['citrus', 'tree_fruit'],
 6: ['red_fruit', 'spice'],
 7: ['red_fruit', 'spice'],
 8: ['red_fruit', 'black_fruit', 'aldehyde', 'fumee'],
 9: ['black_fruit', 'spice', 'aldehyde'],
 10: ['citrus', 'tropical_fruit', 'microbial', 'aldehyde', 'wood'],
 11: ['red_fruit', 'aldehyde', 'fumee', 'wood']}

### Looking for possible notes in every cluster

In [252]:
def find_possible_notes(cluster_number):   
    
    possible_notes = []

    for key, value in scent_dict.items():
            
        if key in cluster_dictionary[cluster_number]:
            possible_notes.append(value)
        
    from itertools import chain
    pos_list = list(chain(*possible_notes))   

    return pos_list     

In [253]:
notes_cluster_1 = find_possible_notes(1)
notes_cluster_2 = find_possible_notes(2)
notes_cluster_3 = find_possible_notes(3)
notes_cluster_4 = find_possible_notes(4)
notes_cluster_5 = find_possible_notes(5)
notes_cluster_6 = find_possible_notes(6)
notes_cluster_7 = find_possible_notes(7)
notes_cluster_8 = find_possible_notes(8)
notes_cluster_9 = find_possible_notes(9)
notes_cluster_10 = find_possible_notes(10)
notes_cluster_11 = find_possible_notes(11)

### Finding most common words in every cluster

We wanted to find most common words in every cluster to be able to define the flavor profile of each cluster more exactly and in detail. 

In [254]:
def wordfreq_counter():
    
    wordfreq = {}
        
    for value in wine[wine['cluster']==cluster_number]['province']:
        province_list.append(value)
            
    for word in province_list:
            if ( word not in wordfreq.keys() ): 
                wordfreq[word] = 1 
            else: 
                wordfreq[word] += 1   
                
    sorted_tuple = sorted(wordfreq.items(), key=lambda word: word[1], reverse=True)
                
    return sorted_tuple

In [255]:
def find_most_common_notes_in_a_cluster(cluster_number, list_of_scents):
            
    wordfreq = {}
    
    for word in list_of_scents:
        wordfreq[word] = 0
    
    for cell in wine[wine['cluster']==cluster_number]['description']:
        for word in cell:
            if ( word in wordfreq.keys() ): 
                wordfreq[word] += 1 
            else: 
                continue 
                
    sorted_tuple = sorted(wordfreq.items(), key=lambda word: word[1], reverse=True)
    
    return sorted_tuple

In [256]:
clust_1 = find_most_common_notes_in_a_cluster(1,notes_cluster_1)
print('Cluster_1')
display(clust_1[0:21])

clust_2 = find_most_common_notes_in_a_cluster(2,notes_cluster_2)
print('Cluster_2')
display(clust_2[0:21])

clust_3 = find_most_common_notes_in_a_cluster(3,notes_cluster_3)
print('Cluster_3')
display( clust_3[0:21])

clust_4 = find_most_common_notes_in_a_cluster(4,notes_cluster_4)
print('Cluster_4')
display( clust_4[0:21])

clust_5 = find_most_common_notes_in_a_cluster(5,notes_cluster_5)
print('Cluster_5')
display( clust_5[0:21])

clust_6 = find_most_common_notes_in_a_cluster(6,notes_cluster_6)
print('Cluster_6')
display( clust_6[0:21])

clust_7 = find_most_common_notes_in_a_cluster(7,notes_cluster_7)
print('Cluster_7')
display( clust_7[0:21])

clust_8 = find_most_common_notes_in_a_cluster(8,notes_cluster_8)
print('Cluster_8')
display( clust_8[0:21])

clust_9 = find_most_common_notes_in_a_cluster(9,notes_cluster_9)
print('Cluster_9')
display( clust_9[0:21])

clust_10 = find_most_common_notes_in_a_cluster(10,notes_cluster_10)
print('Cluster_10')
display( clust_10[0:21])

clust_11 = find_most_common_notes_in_a_cluster(11,notes_cluster_11)
print('Cluster_11')
display( clust_11[0:21])

Cluster_1


[('oak', 646),
 ('cherry', 548),
 ('vanilla', 508),
 ('spice', 478),
 ('chocolate', 300),
 ('herb', 289),
 ('coffee', 268),
 ('tobacco', 252),
 ('plum', 249),
 ('leather', 239),
 ('pepper', 200),
 ('barrel', 190),
 ('wood', 183),
 ('cedar', 146),
 ('cola', 137),
 ('mocha', 136),
 ('spicy', 129),
 ('mint', 126),
 ('herbs', 118),
 ('herbal', 118),
 ('sage', 108)]

Cluster_2


[('cherry', 1073),
 ('spice', 937),
 ('tobacco', 716),
 ('leather', 544),
 ('plum', 527),
 ('vanilla', 457),
 ('earth', 432),
 ('chocolate', 424),
 ('earthy', 414),
 ('pepper', 411),
 ('coffee', 391),
 ('cola', 252),
 ('raspberry', 241),
 ('mocha', 225),
 ('barrel', 197),
 ('mineral', 191),
 ('spicy', 186),
 ('clove', 165),
 ('smoke', 161),
 ('soil', 158),
 ('anise', 137)]

Cluster_3


[('cherry', 1988),
 ('spice', 1666),
 ('oak', 1200),
 ('plum', 1012),
 ('herb', 1006),
 ('pepper', 979),
 ('raspberry', 606),
 ('spicy', 588),
 ('herbal', 553),
 ('anise', 427),
 ('currant', 423),
 ('mint', 401),
 ('sage', 386),
 ('clove', 374),
 ('cedar', 321),
 ('herbs', 313),
 ('wood', 293),
 ('cranberry', 292),
 ('menthol', 273),
 ('strawberry', 226),
 ('oaky', 196)]

Cluster_4


[('cherry', 631),
 ('spice', 591),
 ('oak', 482),
 ('vanilla', 458),
 ('chocolate', 278),
 ('plum', 271),
 ('leather', 241),
 ('pepper', 236),
 ('herb', 211),
 ('cola', 187),
 ('raspberry', 148),
 ('spicy', 148),
 ('anise', 122),
 ('currant', 121),
 ('mint', 103),
 ('herbal', 97),
 ('wood', 90),
 ('cedar', 90),
 ('clove', 86),
 ('menthol', 72),
 ('oaky', 69)]

Cluster_5


[('citrus', 4227),
 ('apple', 3718),
 ('lemon', 3571),
 ('lime', 2107),
 ('peach', 2024),
 ('orange', 1997),
 ('pear', 1563),
 ('grapefruit', 1326),
 ('apricot', 758),
 ('tangerine', 713),
 ('nectarine', 527),
 ('citrusy', 447),
 ('lemony', 427),
 ('citric', 232),
 ('lemongrass', 117),
 ('marmalade', 99),
 ('quince', 92),
 ('prune', 22),
 ('persimmon', 8)]

Cluster_6


[('cherry', 2682),
 ('spice', 2192),
 ('pepper', 1253),
 ('plum', 1119),
 ('raspberry', 772),
 ('spicy', 584),
 ('currant', 457),
 ('clove', 443),
 ('strawberry', 356),
 ('anise', 298),
 ('cranberry', 267),
 ('nutmeg', 130),
 ('peppercorn', 63),
 ('fennel', 59),
 ('eucalyptus', 53),
 ('woodspice', 21),
 ('gooseberry', 13),
 ('cinammon', 0),
 ('cardammom', 0),
 ('liquorice', 0)]

Cluster_7


[('cherry', 1252),
 ('plum', 688),
 ('currant', 381),
 ('raspberry', 236),
 ('cranberry', 57),
 ('strawberry', 57),
 ('spice', 29),
 ('pepper', 14),
 ('spicy', 6),
 ('gooseberry', 1),
 ('nutmeg', 1),
 ('anise', 1),
 ('clove', 1),
 ('woodspice', 0),
 ('peppercorn', 0),
 ('cinammon', 0),
 ('cardammom', 0),
 ('fennel', 0),
 ('eucalyptus', 0),
 ('liquorice', 0)]

Cluster_8


[('cherry', 1915),
 ('plum', 1070),
 ('raspberry', 652),
 ('cranberry', 363),
 ('strawberry', 316),
 ('currant', 275),
 ('blackberry', 175),
 ('vanilla', 149),
 ('cola', 134),
 ('chocolate', 132),
 ('leather', 129),
 ('tobacco', 113),
 ('coffee', 93),
 ('cassis', 72),
 ('blueberry', 67),
 ('smoke', 55),
 ('mocha', 48),
 ('olive', 46),
 ('barrel', 44),
 ('leathery', 41),
 ('cocoa', 39)]

Cluster_9


[('blackberry', 1568),
 ('spice', 1062),
 ('pepper', 726),
 ('chocolate', 604),
 ('vanilla', 469),
 ('cassis', 450),
 ('blueberry', 409),
 ('spicy', 345),
 ('leather', 222),
 ('boysenberry', 169),
 ('cola', 161),
 ('olive', 143),
 ('anise', 142),
 ('clove', 138),
 ('cocoa', 85),
 ('nutmeg', 76),
 ('elderberry', 72),
 ('caramel', 67),
 ('peppercorn', 61),
 ('leathery', 50),
 ('fennel', 44)]

Cluster_10


[('oak', 788),
 ('vanilla', 585),
 ('creamy', 316),
 ('pineapple', 297),
 ('tropical', 231),
 ('lemon', 183),
 ('citrus', 180),
 ('orange', 154),
 ('oaky', 149),
 ('caramel', 133),
 ('cream', 129),
 ('melon', 122),
 ('lime', 120),
 ('butter', 120),
 ('wood', 97),
 ('buttery', 87),
 ('coconut', 78),
 ('exotic', 77),
 ('mango', 72),
 ('chocolate', 69),
 ('banana', 45)]

Cluster_11


[('oak', 977),
 ('cherry', 689),
 ('vanilla', 506),
 ('plum', 339),
 ('chocolate', 301),
 ('coffee', 203),
 ('raspberry', 201),
 ('cola', 201),
 ('wood', 195),
 ('cedar', 188),
 ('tobacco', 171),
 ('barrel', 164),
 ('currant', 149),
 ('mocha', 121),
 ('leather', 101),
 ('oaky', 86),
 ('smoke', 82),
 ('sandalwood', 72),
 ('caramel', 63),
 ('strawberry', 60),
 ('cranberry', 52)]

In [257]:
dict_of_clusters = {'cluster1': clust_1[0:11],'cluster2':clust_2[0:11],'cluster3':clust_3[0:11],'cluster4':clust_4[0:11],'cluster5':clust_5[0:11],'cluster6':clust_6[0:11],'cluster7':clust_7[0:11],'cluster8':clust_8[0:11],'cluster9':clust_9[0:11],'cluster10':clust_10[0:11],'cluster11':clust_11[0:11]}

In [258]:
df = pd.DataFrame(
    [[k, v[0], v[1]] for k, ls_v in dict_of_clusters.items() for v in ls_v],
    columns=['cluster','scent_note','count']
    )

df

Unnamed: 0,cluster,scent_note,count
0,cluster1,oak,646
1,cluster1,cherry,548
2,cluster1,vanilla,508
3,cluster1,spice,478
4,cluster1,chocolate,300
...,...,...,...
116,cluster11,raspberry,201
117,cluster11,cola,201
118,cluster11,wood,195
119,cluster11,cedar,188


## Finding final names for each cluster

### Version with wine examples

In [259]:
cluster_description_dict = {'cluster1' : "wood and smoky notes, cherry, plum, vanilla, coffee, tobacco, leather; wine examples : pinot noir, barolo, rioja, brunello di montalcino, chianti, barbaresco",
                            'cluster2' : "earthy and mineral notes, red fruit, tobacco, chocolate and spices; wine examples: zinfandel, syrah,  barbaresco, tempranillo, rioja, tempranillo blends",
                            'cluster3' : "strong red fruit notes, sage, mint and spices;wine examples: bordeaux style blends, rhone style blends, sangiovese, rioja, rose",
                            'cluster4' : "cherry and berry notes, oak wood, spices, vanilla and chocolate;wine examples: pinot noir, cabernet sauvignon, red blends, malbec, brunello di montalcino",
                            'cluster5' : "citrus fruit, lemongrass, marmalade, peach, apricot, pear and apple;wine examples: lighter white wines, chardonnay, riesling, sauvignon blanc, sparkling blend, rose, pinot gris, champagne, verdejo",
                            'cluster6' : "berries, strong spicy notes of nutmeg, pepper, cloves and anise; wine examples: rose blends, sparkling rose, lambrusco, red blends, barbera, nero d’avola, grenache noir, salice salentino",
                            "cluster7" : "fruity notes of cherry, plums, currants, raspberry and strawberry;wine examples: tempranillo blend, ribera del duero, pinot noir, malbec, cabernet franc, negroamaro",
                            'cluster8' : "strong fruit notes mixed with vanilla, chocolate, coffee, tobacco and cocoa;wine examples: rose, malbec, heavier white wines, nebbiolo",
                            "cluster9" : "creamy notes of vanilla, caramel and coconut mixed with tropical and citrus fruit;wine examples: heavier white wines, semillon-chardonnay, chardonnay, pinot grigio, champagne, tempranillo, pinot noir",
                            'cluster10' : "creamy notes of vanilla, caramel and coconut mixed with tropical and citrus fruit;wine examples: heavier white wines, semillon-chardonnay, chardonnay, pinot grigio, champagne, tempranillo, pinot noir",
                            'cluster11' : "barrel notes, coffee, caramel and chocolate mixed with fruit and wood notes like oak cedar, and sandalwood;wine examples: pinot, cabernet, ribera, rioja, sangiovese"
                           }

### Version without wine examples

In [261]:
cluster_name_dict = {'cluster1' : "wood and smoky notes, cherry, plum, vanilla, coffee, tobacco, leather; wine examples",
                    'cluster2' : "earthy and mineral notes, red fruit, tobacco, chocolate and spices",
                    'cluster3' : "strong red fruit notes, sage, mint and spices",
                    'cluster4' : "cherry and berry notes, oak wood, spices, vanilla and chocolate",
                    'cluster5' : "citrus fruit, lemongrass, marmalade, peach, apricot, pear and apple",
                    'cluster6' : "berries, strong spicy notes of nutmeg, pepper, cloves and anise",
                    "cluster7" : "fruity notes of cherry, plums, currants, raspberry and strawberry",
                    'cluster8' : "strong fruit notes mixed with vanilla, chocolate, coffee, tobacco and cocoa",
                    "cluster9" : "strong black fruit notes of blackberry, cassis or elderberry, mixed with various spices",
                    'cluster10' : "creamy notes of vanilla, caramel and coconut mixed with tropical and citrus fruit",
                    'cluster11' : "barrel notes, coffee, caramel and chocolate mixed with fruit and wood notes like oak, cedar, and sandalwood"
                    }