### Part 1: Import Libraries & Data

In [None]:
# Import Libraries
import csv
import pandas as pd 
import numpy as np
import nltk
import os
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Load the wine dataset from the CSV file next to this notebook into the
# DataFrame that every later cell works on.
data = pd.read_csv(filepath_or_buffer='wine.csv')

In [None]:
# Spot check: the rows in this positional range previously caused errors
# because of NaN values in the "type" column. Display them to confirm the
# issue is fixed.
data.iloc[13998:14002]

### Part 2: Tokenize description boxes. 

In [None]:
# Break each description up into a list of individual lowercase words.
#
# The column is rebuilt with a single assignment. The element-wise form
# `data['description'][i] = ...` is chained indexing: it writes through a
# temporary Series, which triggers SettingWithCopyWarning and does not
# update `data` at all once pandas Copy-on-Write is active.
data['description'] = data['description'].apply(
    lambda text: word_tokenize(text.lower())
)

In [None]:
from nltk.corpus import stopwords

# English stop words, held in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words("english"))
a = stop_words  # original name of this set, kept so existing references to `a` still work

# Drop stop words from every token list. The column is reassigned in one step
# rather than via `data['description'][i] = ...`, because that chained form
# writes through a temporary Series and is not guaranteed to update `data`.
data['description'] = data['description'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Part-of-Speech Tagging
# Definitions: https://www.guru99.com/pos-tagging-chunking-nltk.html
# The tags kept by the filtering step that follows are:
# - NN = noun, singular
# - JJ = adjective

# Attach a part-of-speech tag to each word in the list.
# Every token is passed to nltk.pos_tag on its own (as a one-element list), so
# each entry of the resulting list is itself a one-element list holding a
# (word, tag) pair: [[(word, tag)], [(word, tag)], ...].
#
# The column is rebuilt with one assignment instead of writing
# `data['description'][i] = words` inside the token loop: that form re-wrote the
# cell once per token and relied on chained indexing, which is not guaranteed
# to update `data`.
data['description'] = data['description'].apply(
    lambda tokens: [nltk.pos_tag([token]) for token in tokens]
)

In [None]:
# Keep only the words tagged NN or JJ and drop the tags again, so each
# description goes back to being a flat list of words.
# Each entry produced by the tagging step is a one-element list holding a
# (word, tag) pair, hence the entry[0][0] / entry[0][1] lookups.
KEPT_TAGS = ('NN', 'JJ')

# Single column assignment instead of chained `data['description'][i] = ...`,
# which is not guaranteed to write back into `data`.
data['description'] = data['description'].apply(
    lambda tagged: [entry[0][0] for entry in tagged if entry[0][1] in KEPT_TAGS]
)

### Part 3: Match with Flavor Categories

In [None]:
# Add a "category" column in which every row starts out with its own,
# independent empty list.
data['category'] = [[] for _row in range(len(data))]

In [None]:
# Source: https://winefolly.com/deep-dive/wine-tasting-terms-to-use/
# Source 2: https://www.decanter.com/learn/advice/understand-tasting-notes-decoded-344920/#fruit
#
# Vocabulary lists used to assign flavor / body categories to a wine. Each
# category has one list for red wines and one for white wines.
#
# NOTE(review): the descriptions these lists are matched against are lists of
# single word tokens, so entries containing a space (e.g. "Passion Fruit",
# "Dried Herbs") presumably never match as written -- confirm whether they
# should be split into individual words.

# ========== Flavors ============ #

# Fruity
fruit_forward_red = [
    "Berry", 'black-cherry', "Raspberry", "Cherry", "Coconut", "Cassis",
    'red-berry', "Blackberry", "Blueberry", "Jam", "Prune", "Candied",
    "Bergamot", "Olive", "Bramble", "Cranberry", "Fig", "Jammy", "Juniper",
    "Kirsch", "Loganberry", "Plum", "Raisin", "Raspberry", "Strawberry",
    "Fruit", "Fruity",
]
fruit_forward_white = [
    "Lemon", "Apple", "Peach", "Mango", "Pear", "Berry", "Cantaloupe",
    "Creme Brulee", "Crème Brûlée", "Caramel", "Vanilla", "Apricot", "Banana",
    "Candied", "Citrus", "Honey", "Gooseberry", "Kiwi", "Lychee", "Marmalade",
    "Melon", "Orange", "Papaya", "Passion Fruit", "Pineapple", "Prune",
    "Sherbet", "Fruit", "Fruity",
]

# Savory
# ("Black Currant" used to carry a leading space, which made it a different
# string from the intended term.)
savory_red = [
    "Savory", "Cranberry", "Soy", "Onion", "Rhubarb", "Black Currant",
    "Cassis", "Pepper", 'lemon-zest', "Peppercorn", "Olive", "Mulberry",
    "Bilberry", "Dried Herbs", "Game", "Sage", "Leather", "Tobacco",
    "Charcoal", "Tar", "Underbrush", "Garrigue", "Gravel", "Torrefaction",
    "Mineral", "Woodsmoke",
]
savory_white = [
    "Savory", "Lime", "Pith", "Quince", "Almond", "Gooseberry", "Jalapeno",
    "Grapefruit", "Papaya", "Thyme", "Chervil", "Grass", "Flint", "Chalk",
    "Chalky", "Petrichor", "Minerally", "Mineral", "biscuit", "brioche",
    "buttery", "butter", "caramel", "cereal", "cream", "marzipan",
    "croissant", "pastry",
]

# Earthy
earthy_red = [
    "Earthy", "Rough", "Tannic", "Rusty", "Rustic", 'lead', "Earthy",
    "Balsamic", 'herbal', 'woody', 'spicy', 'clove', "Eucalyptus", "Pepper",
    "Leafy", "Medicinal", "Mint", "Mushroom", "Rhubarb", "Tomato", "beetroot",
    "tea", "meat", "tobacco", "cardboard", "iodine", "charcoal", "chocolate",
    "coffee", "leather", "tar", "smoke", "wood", "vinyl", "velvet", "velvety",
    "Pepper", "Spice", "Spices", "Cedar", "Cinnamon", "Clove", "Cola",
    "Cumin", "Licorice",
]
earthy_white = [
    "Earthy", "Asparagus", "Cabbage", "Fennel", "Grass", "Hay", "Hedgerow",
    "Lemongrass", 'herbal', 'woody', 'spicy', 'clove', "Vegetal", "Chalky",
    "flint", "chalk", "graphite", "mineral", "oyster", "salt", "slate",
    "steely", "wool", "almond", "beeswax", "petrol", "gasoline", "smoky",
    "Rough", "Tannic", "Rusty", "Rustic", "Earthy", "smoke", "toffee",
    "vanilla", "walnut", "wax", "match", "Pepper", "Spice", "Spices", "Cedar",
    "Cinnamon",
]

# Floral
floral_red = [
    "Floral", "Blossom", "Rosy", "Rose", 'fragrant', "Lavender", "Peony",
    "Flower", "Flowery", "Rose", "Turkish Delight", "Violet",
]
floral_white = [
    "Floral", "Blossom", 'Camomile', "Geranium", "Elderflower", 'fragrant',
    "Honeysuckle", "Jasmine", "Ginger", "Flower", "Flowery", "Rosy", "Rose",
]

# Bitter
bitter_red = [
    "Chewy", "Muscular", "Structured", "Firm", "Rigid", "Closed",
    "Dried Herbs", "Herby", "Oregano", "Bay Leaf", "Bitter Chocolate",
    "Baker’s Chocolate", "Bitter Herbs", "Austere", "Angular", "Grippy",
    "Harsh", "Coarse", "Dense",
]
bitter_white = [
    "Austere", "Citrus Pith", "Quince", "Bitter", "Almond", "Green", "Almond",
    "Chalk", "Chalky",
]

# ========== Body Profile ============ #
# Light-Bodied
# ("fresh" and "Subtle" used to be fused into the single entry
# "fresh:, Subtle" by a misplaced quote, so neither word could ever match.)
light_bodied_red = [
    "Light-bodied", 'summer', "Light", "fresh", "Subtle", "Delicate",
    "Elegant", "Crisp", "Thin", "Finesse", "Bright", "Floral",
]
light_bodied_white = [
    "Light-bodied", 'summer', "Light", "Zesty", "Airy", "Lean", "fresh",
    "Racy", "Crisp", "Zippy", "Austere", "Long Tingly Finish", "Brilliant",
    "Lively",
]

# Full-Bodied
full_bodied_red = [
    "Full-bodied", "Rich", "Lush", "Opulent", 'richness', 'syrah', "Rigid",
    "Intense", "Extracted", "High Alcohol", "High Tannin", "Firm",
    "Structured", "Muscular", "Concentrated", "Hot", 'ripe', 'luscious',
    'heft', 'bold', 'lavish',
]
full_bodied_white = [
    "Full-bodied", "Rich", "Lush", "Oily", "Buttery", 'richness', "biscuit",
    "brioche", "buttery", "butter", "caramel", "shortcake", "cereal", "cream",
    "marzipan", "croissant", "pastry", 'ripe', 'luscious', 'heft', 'bold',
    'lavish',
]


In [None]:
# Normalise every vocabulary list to lowercase so its entries can be compared
# with the lowercased description tokens.
fruit_forward_red = list(map(str.lower, fruit_forward_red))
fruit_forward_white = list(map(str.lower, fruit_forward_white))

savory_red = list(map(str.lower, savory_red))
savory_white = list(map(str.lower, savory_white))

earthy_red = list(map(str.lower, earthy_red))
earthy_white = list(map(str.lower, earthy_white))

floral_red = list(map(str.lower, floral_red))
floral_white = list(map(str.lower, floral_white))

bitter_red = list(map(str.lower, bitter_red))
bitter_white = list(map(str.lower, bitter_white))

light_bodied_red = list(map(str.lower, light_bodied_red))
light_bodied_white = list(map(str.lower, light_bodied_white))

full_bodied_red = list(map(str.lower, full_bodied_red))
full_bodied_white = list(map(str.lower, full_bodied_white))

In [None]:
# Some rows have a null "type". Remove them so that the wine type can be
# inspected as a string in the categorisation step.
#
# The index is rebuilt as 0..n-1 afterwards: without it, the removed rows leave
# gaps in the index labels, and any later lookup of the form `data[col][i]`
# with `i in range(len(data))` would either raise KeyError or read a different
# row than intended.
data = data[data['type'].notnull()].reset_index(drop=True)

In [None]:
# Assign flavor / body categories to every wine.
#
# A wine receives a category when at least `min_hits` of its description tokens
# (counted with repetition) appear in that category's vocabulary. Red wines are
# matched against the red vocabulary, white wines against the white one, and
# everything else (rosé, blends, ...) against the union of both.

# (label, red vocabulary, white vocabulary, minimum number of matching tokens).
# The order of this table is the order in which labels appear in "category".
_CATEGORY_SPECS = (
    ("Fruity", fruit_forward_red, fruit_forward_white, 3),
    ("Savory", savory_red, savory_white, 2),
    ("Earthy", earthy_red, earthy_white, 2),
    ("Floral", floral_red, floral_white, 2),
    ("Bitter", bitter_red, bitter_white, 2),
    ("Light-Bodied", light_bodied_red, light_bodied_white, 2),
    ("Full-Bodied", full_bodied_red, full_bodied_white, 2),
)

# Pre-built lookup sets (red, white, red-or-white) so the per-row work is a
# handful of hash lookups instead of repeated list scans.
_CATEGORY_RULES = [
    (label, frozenset(red), frozenset(white), frozenset(red) | frozenset(white), min_hits)
    for label, red, white, min_hits in _CATEGORY_SPECS
]


def _categories_for(wine_type, tokens):
    """Return the category labels earned by one wine.

    Parameters
    ----------
    wine_type : str
        Value of the "type" column for the wine.
    tokens : list of str
        The wine's processed description tokens.

    Returns
    -------
    list of str
        Matching labels, in the order of `_CATEGORY_SPECS`.
    """
    # str.split() strips all whitespace, so the colour has to be compared as a
    # bare word. The earlier test for ' Red ' / ' White ' (with surrounding
    # spaces) could never succeed, which sent every wine down the
    # "rosé and blends" path.
    # NOTE(review): assumes the colour appears as the standalone, capitalised
    # word "Red" / "White" in the type column -- confirm against wine.csv.
    type_words = wine_type.split()
    is_red = "Red" in type_words
    is_white = (not is_red) and "White" in type_words

    labels = []
    for label, red_terms, white_terms, any_terms, min_hits in _CATEGORY_RULES:
        if is_red:
            vocabulary = red_terms
        elif is_white:
            vocabulary = white_terms
        else:
            vocabulary = any_terms
        # len() of a filtered list rather than the builtin sum(): the name
        # `sum` may have been rebound to an int by an earlier run of this
        # notebook in the same kernel.
        hits = len([token for token in tokens if token in vocabulary])
        if hits >= min_hits:
            labels.append(label)
    return labels


# Iterate over the column values directly (no `data[col][i]` label lookups, so
# gaps in the index cannot cause a KeyError) and assign the whole column at
# once, which also makes re-running this cell give the same result.
data["category"] = pd.Series(
    [
        _categories_for(wine_type, tokens)
        for wine_type, tokens in zip(data["type"], data["description"])
    ],
    index=data.index,
    dtype=object,
)
                     

In [None]:
# for i in range(len(data)):
#    data['category'][i] = set(data['category'][i])  

In [None]:
# Write the processed DataFrame out as a new CSV file, leaving the index
# out of the output.
data.to_csv(path_or_buf="wine_final.csv", index=False)