### Part 1: Import Libraries & Data

In [64]:
# Import Libraries
import csv
import pandas as pd 
import numpy as np
import nltk
import os
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Import dataset
data = pd.read_csv('wine.csv')

In [65]:
# Only take the first 5 to test out text mining.
# data = data.head(1000)
data

Unnamed: 0,ything,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,type
0,1.0,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87.0,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red,Portuguese Red,Quinta dos Avidagos,Red
1,2.0,US,"Tart and snappy, the flavors of lime flesh and...",,87.0,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris,Pinot Gris,Rainstorm,White
2,3.0,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87.0,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling,Riesling,St. Julian,White
3,4.0,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87.0,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Red
4,5.0,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87.0,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot,Tempranillo-Merlot,Tandem,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1299.0,Italy,Aromas suggest red berry and plum with a hint ...,Cornell Villa Nigra,89.0,49.0,Northeastern Italy,Alto Adige,,Kerin O’Keefe,@kerinokeefe,Colterenzio 2011 Cornell Villa Nigra Pinot Nero,Pinot Nero,Colterenzio,Red
996,1300.0,US,This is a successful straight-ahead Chardonnay...,,89.0,35.0,California,Santa Barbara County,Central Coast,Matt Kettmann,@mattkettmann,Demetria 2012 Chardonnay,Chardonnay,Demetria,White
997,1301.0,France,"Lightly herbaceous, this is a ripe, lively win...",,89.0,20.0,Loire Valley,Menetou-Salon,,Roger Voss,@vossroger,Domaine de l'Ermitage 2013 Menetou-Salon,Sauvignon Blanc,Domaine de l'Ermitage,White
998,1302.0,France,"This is a ripe, fresh and fruity wine that's f...",,89.0,21.0,Loire Valley,Sancerre,,Roger Voss,@vossroger,Domaine de Rome 2013 Sancerre,Sauvignon Blanc,Domaine de Rome,White


### Part 2: Tokenize description boxes. 

In [66]:
# Break up the sentences into lists of individual words
for i in range(len(data)):
    text = data['description'][i]
    data['description'][i] = word_tokenize(text.lower())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [67]:
from nltk.corpus import stopwords
a = set(stopwords.words("english"))

for i in range(len(data)):
    data['description'][i] = [x for x in data['description'][i] if x not in a]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [68]:
# Part-of-Speech Tagging
# Definitions: https://www.guru99.com/pos-tagging-chunking-nltk.html
# NN is the main part of speech we want to keep here. Everything else can go. 
# - NN = singular

# Add the type of speech to each word in the list. 
for i in range(len(data)):
    words = []
    for token in data['description'][i]:
        words.append(nltk.pos_tag([token]))
        data['description'][i] = words       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [69]:
for i in range(len(data)):
    new_list = []
    final_set = []
    for tag in range(len(data['description'][i])):
        if data['description'][i][tag][0][1] == 'NN' or data['description'][i][tag][0][1] == 'JJ':
            final_set.append(data['description'][i][tag][0][0])
    data['description'][i] = final_set

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


### Part 3: Match with Flavor Categories

In [70]:
# Create new box, "category"
data['category'] = np.empty((len(data), 0)).tolist()

In [71]:
# Source: https://winefolly.com/deep-dive/wine-tasting-terms-to-use/
# Source 2: https://www.decanter.com/learn/advice/understand-tasting-notes-decoded-344920/#fruit

# ========== Flavors ============ #

# Fruity
fruit_forward_red = ["Berry", 'black-cherry', "Raspberry", "Cherry", "Coconut", "Cassis", 'red-berry', "Blackberry", "Blueberry", "Jam", "Prune", "Candied", "Raisin", "Toffee", "Vanilla", "Bergamot", "Olive", "Bramble", "Cranberry", "Fig" , "Jammy", "Juniper" , "Kirsch", "Loganberry" , "Plum", "Raisin", "Raspberry" , "Strawberry", "Fruit", "Fruity"]
fruit_forward_white = ["Lemon", "Apple", "Peach", "Mango", "Pear", "Berry", "Cantaloupe", "Creme Brulee", "Crème Brûlée", "Caramel", "Vanilla", "Apricot", "Banana", "Candied", "Citrus", "Honey", "Gooseberry", "Kiwi", "Lychee", "Marmalade", "Melon", "Orange", "Papaya", "Passion Fruit", "Pineapple", "Prune", "Sherbet", "Fruit", "Fruity"]

# Savory 
savory_red = ["Savory", "Cranberry", "Soy", "Onion", "Rhubarb"," Black Currant", "Cassis", "Pepper", 'lemon-zest', "Peppercorn", "Olive", "Mulberry", "Bilberry", "Dried Herbs", "Game", "Sage", "Leather", "Tobacco", "Charcoal", "Tar", "Underbrush", "Garrigue", "Gravel", "Torrefaction", "Mineral", "Woodsmoke"]
savory_white = ["Savory", "Lime", "Pith", "Quince", "Almond", "Gooseberry", "Jalapeno", "Grapefruit", "Papaya", "Thyme", "Chervil", "Grass", "Flint", "Chalk", "Chalky", "Petrichor", "Minerally", "Mineral", "biscuit", "brioche", "buttery", "butter", "caramel", "cereal", "cream", "marzipan", "croissant", "pastry"]

# Earthy
earthy_red = ["Earthy", "Rough", "Tannic", "Rusty", "Rustic", 'lead', "Earthy", "Balsamic", 'herbal', 'woody','spicy', 'clove', "Eucalyptus", "Pepper", "Leafy", "Medicinal", "Mint", "Mushroom", "Rhubarb", "Tomato", "beetroot", "tea", "meat", "tobacco", "cardboard", "iodine", "charcoal", "chocolate", "coffee", "leather","tar", "smoke", "wood", "vinyl", "velvet", "velvety", "Pepper", "Spice", "Spices", "Cedar", "Cinnamon", "Clove", "Cola", "Cumin", "Licorice"]
earthy_white = ["Earthy", "Asparagus", "Cabbage", "Fennel", "Grass", "Hay", "Hedgerow", "Lemongrass", 'herbal', 'woody','spicy', 'clove', "Vegetal", "Chalky", "flint", "chalk", "graphite", "mineral", "oyster", "salt", "slate", "steely", "wool", "almond", "beeswax", "petrol", "gasoline", "smoky", "Rough", "Tannic", "Rusty", "Rustic", "Earthy", "smoke", "toffee", "vanilla", "walnut", "wax", "match", "Pepper", "Spice", "Spices", "Cedar", "Cinnamon"]
                      
# Floral 
floral_red = ["Floral", "Blossom", "Rosy", "Rose", 'fragrant', "Lavender", "Peony", "Flower", "Flowery", "Rose", "Turkish Delight", "Violet"]
floral_white = ["Floral", "Blossom", 'Camomile', "Geranium", "Elderflower", 'fragrant', "Honeysuckle", "Jasmine", "Ginger", "Flower", "Flowery", "Rosy", "Rose"]
  
# Bitter 
bitter_red = ["Chewy", "Muscular", "Structured", "Firm", "Rigid", "Closed", "Dried Herbs", "Herby", "Oregano", "Bay Leaf", "Bitter Chocolate", "Baker’s Chocolate", "Bitter Herbs", "Austere", "Angular", "Grippy", "Harsh", "Coarse", "Dense"]
bitter_white = ["Austere", "Citrus Pith", "Quince", "Bitter", "Almond", "Green", "Almond", "Chalk", "Chalky"]

# ========== Body Profile ============ #
# Light-Bodied
light_bodied_red = ["Light-bodied", 'summer', "Light", "fresh:, ""Subtle", "Delicate", "Elegant", "Crisp", "Thin", "Finesse", "Bright", "Floral"]
light_bodied_white = ["Light-bodied", 'summer', "Light","Zesty", "Airy", "Lean", "fresh", "Racy", "Crisp", "Zippy", "Austere", "Long Tingly Finish", "Brilliant", "Lively"]

# Full-Bodied
full_bodied_red = ["Full-bodied", "Rich", "Lush", "Opulent", 'richness', 'syrah', "Rigid", "Intense", "Extracted", "High Alcohol", "High Tannin", "Firm", "Structured", "Muscular", "Concentrated", "Hot",'ripe', 'luscious','heft', 'bold', 'lavish']
full_bodied_white = ["Full-bodied", "Rich", "Lush", "Oily", "Buttery", 'richness', "biscuit", "brioche", "buttery", "butter", "caramel", "shortcake", "cereal", "cream", "marzipan", "croissant", "pastry",'ripe', 'luscious','heft', 'bold', 'lavish']


In [72]:
# Put in lowercase: 
fruit_forward_red = [x.lower() for x in fruit_forward_red]
fruit_forward_white = [x.lower() for x in fruit_forward_white]

savory_red = [x.lower() for x in savory_red]
savory_white = [x.lower() for x in savory_white]

earthy_red = [x.lower() for x in earthy_red]
earthy_white = [x.lower() for x in earthy_white]

floral_red = [x.lower() for x in floral_red]
floral_white = [x.lower() for x in floral_white]

bitter_red = [x.lower() for x in bitter_red]
bitter_white = [x.lower() for x in bitter_white]

light_bodied_red = [x.lower() for x in light_bodied_red]
light_bodied_white = [x.lower() for x in light_bodied_white]

full_bodied_red = [x.lower() for x in full_bodied_red]
full_bodied_white = [x.lower() for x in full_bodied_white]

In [73]:
for i in range(len(data)):
    
    # ======================== Red Wines ================================================================
    if ' Red ' in data["type"][i].split():
    # =====================================================
    # Fruity Reds
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in fruit_forward_red:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Fruity")
    # =====================================================
    # Savory Reds
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in savory_red:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Savory") 
            
    # =====================================================
    # Earthy Reds
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in earthy_red:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Earthy") 
    # =====================================================
    # Floral Reds
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in floral_red:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Floral") 
            
    # =====================================================
    # Bitter Reds
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in bitter_red:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Bitter") 
            
    # =====================================================
    # Light-Bodied Red
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in light_bodied_red:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Light-Bodied")
            
    # =====================================================
    # Full-Bodied Red
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in full_bodied_red:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Full-Bodied")
                     
                
  # ======================== White Wines ================================================================
    elif ' White ' in data["type"][i].split():
    # =====================================================
    # Fruity Whites
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in fruit_forward_white:
                  sum += 1
        if sum >= 2:
            data["category"][i].append("Fruity")

    # =====================================================
    # Savory Whites
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in savory_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Savory")
    # =====================================================   
    # Earthy Whites
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in earthy_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Earthy") 
    # =====================================================
    # Floral Whites
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in floral_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Floral") 
        
    # =====================================================
    # Bitter Reds
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in bitter_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Bitter") 
            
    # =====================================================
    # Light-Bodied White
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in light_bodied_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Light-Bodied")   
    # =====================================================
    # Full-Bodied White
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in full_bodied_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Full-Bodied")  
            
  # ======================== Rosé and Blends ================================================================
    else: 
        # Fruity
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in fruit_forward_red or data["description"][i][j] in fruit_forward_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Fruity")
            
        # Savory 
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in savory_red or data["description"][i][j] in savory_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Savory") 
            
       # Earthy 
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in earthy_red or data["description"][i][j] in earthy_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Earthy") 
              
        # Floral
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in floral_red or data["description"][i][j] in floral_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Floral") 
        
        # Bitter
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in bitter_red or data["description"][i][j] in bitter_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Bitter") 
            
        # Light-Bodied
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in light_bodied_red or data["description"][i][j] in light_bodied_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Light-Bodied")      
                 
        # Full-Bodied 
        sum = 0
        for j in range(len(data["description"][i])):
            if data["description"][i][j] in full_bodied_red or data["description"][i][j] in full_bodied_white:
                sum += 1
        if sum >= 2:
            data["category"][i].append("Full-Bodied")

In [75]:
'''
[' Red ', ' White ', ' White Blend', ' Red Blend',
       'Sparkling Red Blend', ' Rosé ', 'Sparkling White Blend',
       'Sparkling White ', ' Rosé Blend', 'Sparkling Red ']
'''
#for i in range(len(data)):
#    if "Blend" in data["type"][i]:
#        print(data["category"][i])
for i in range(len(data)):
    if "Rosé" in data["type"][i]:
        print(i, data["description"][i])

61 ['pale', 'copper', 'hue', 'wine', 'passion', 'fruit', 'palate', 'muskmelon', 'crisp', 'racy', 'medium', 'body', 'summertime', 'refreshment', 'drink']
214 ['easy', 'selection', 'streak', 'soft', 'red', 'currant', 'cherry', 'core', 'wine', 'light', 'breezy', 'palate', 'thin', 'fleeting', 'finish']
215 ['wine', 'earthy', 'rustic', 'fruit', 'spicy', 'pepper', 'edge', 'chill']
275 ['blossom', 'berry', 'nut', 'light-bodied', 'wine', 'zesty', 'lithe', 'palate', 'ideal', 'summer-to-fall', 'apéritif', 'porch', 'sipper', 'finish', 'hint', 'waxy']
331 ['big', 'spicy', 'wine', 'ripe', 'red', 'ss', 'pepper', 'tight', 'lemon-zest', 'acidity']
332 ['watermelon', 'natural', 'vanilla', 'mark', 'bouquet', 'palate', 'fleshy', 'crisp', 'nectarine', 'red', 'apple', 'strawberry', 'finish', 'sweetness']
382 ['remote', 'estate', 'edge', 'lubéron', 'north', 'provence', 'wine', 'rich', 'great', 'concentration', 'structure', 'cabernet', 'sauvignon', 'syrah', 'blend', 'full', 'ripe', 'red', 'dense', 'end']
384

In [76]:
for i in range(len(data)):
    if "Rosé" in data["type"][i]:
        print(i, data["category"][i])

61 ['Light-Bodied']
214 ['Light-Bodied']
215 ['Earthy']
275 ['Light-Bodied']
331 ['Savory', 'Earthy']
332 ['Fruity']
382 ['Full-Bodied']
384 ['Fruity', 'Savory', 'Earthy', 'Full-Bodied']
404 []
407 ['Fruity', 'Full-Bodied']
409 ['Fruity', 'Light-Bodied']
513 ['Earthy', 'Full-Bodied']
514 ['Fruity']
515 ['Bitter', 'Full-Bodied']
517 ['Fruity', 'Light-Bodied']
518 ['Full-Bodied']
519 []
528 ['Full-Bodied']
531 ['Fruity', 'Light-Bodied']
537 ['Fruity']
548 ['Full-Bodied']
651 ['Earthy', 'Light-Bodied', 'Full-Bodied']
683 ['Fruity', 'Light-Bodied']
727 ['Fruity', 'Light-Bodied']
896 ['Light-Bodied']


In [None]:
# Save back to CSV
data.to_csv("wine_final.csv", index=False)