In [294]:
# Import Libraries
import csv
import pandas as pd 
import numpy as np
import nltk
import os
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Import dataset
data = pd.read_csv('Wine V2.csv')

In [295]:
# Only take the first 5 to test out text mining.
data = data.head(1000)
data['description'][4]

"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew."

In [296]:
# Identify a single column
print(data['description'])

# Print individual entries. 
data['description'][0]
data['description'][1]
data['description'][2]
data['description'][3]
data['description'][4]

0      Aromas include tropical fruit, broom, brimston...
1      This is ripe and fruity, a wine that is smooth...
2      Tart and snappy, the flavors of lime flesh and...
3      Pineapple rind, lemon pith and orange blossom ...
4      Much like the regular bottling from 2012, this...
                             ...                        
995    Edèlmio is a sophisticated and toasty blend of...
996    Here's a Syrah with bursting aromas of mature ...
997    Blended from a patchwork of old vineyards thro...
998    Rich in the mouth, this creamy and textural wi...
999    Creamy and textural, this brings on a nice mix...
Name: description, Length: 1000, dtype: object


"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew."

In [297]:
# Break up the sentences into lists of individual words
for i in range(len(data)):
    text = data['description'][i]
    data['description'][i] = word_tokenize(text.lower())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [298]:
# Check that everything 1) was make lowercase, 2) was split up into lists.
data['description']

0      [aromas, include, tropical, fruit, ,, broom, ,...
1      [this, is, ripe, and, fruity, ,, a, wine, that...
2      [tart, and, snappy, ,, the, flavors, of, lime,...
3      [pineapple, rind, ,, lemon, pith, and, orange,...
4      [much, like, the, regular, bottling, from, 201...
                             ...                        
995    [edèlmio, is, a, sophisticated, and, toasty, b...
996    [here, 's, a, syrah, with, bursting, aromas, o...
997    [blended, from, a, patchwork, of, old, vineyar...
998    [rich, in, the, mouth, ,, this, creamy, and, t...
999    [creamy, and, textural, ,, this, brings, on, a...
Name: description, Length: 1000, dtype: object

In [301]:
from nltk.corpus import stopwords
a = set(stopwords.words("english"))

for i in range(len(data)):
    data['description'][i] = [x for x in data['description'][i] if x not in a]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [302]:
# Test to make sure each point had stop words taken out. 
data['description'][2]

['tart',
 'snappy',
 ',',
 'flavors',
 'lime',
 'flesh',
 'rind',
 'dominate',
 '.',
 'green',
 'pineapple',
 'pokes',
 ',',
 'crisp',
 'acidity',
 'underscoring',
 'flavors',
 '.',
 'wine',
 'stainless-steel',
 'fermented',
 '.']

In [303]:
# Part-of-Speech Tagging
# Definitions: https://www.guru99.com/pos-tagging-chunking-nltk.html
# NN is the main part of speech we want to keep here. Everything else can go. 
# - NN = singular noun

# Add the type of speech to each word in the list. 
for i in range(len(data)):
    words = []
    for token in data['description'][i]:
        words.append(nltk.pos_tag([token]))
        data['description'][i] = words       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [304]:
for i in range(len(data)):
    new_list = []
    final_set = []
    for tag in range(len(data['description'][i])):
        if data['description'][i][tag][0][1] == 'NN':
            final_set.append(data['description'][i][tag][0][0])
    data['description'][i] = final_set

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [305]:
data['description'][50]

['blend',
 'nero',
 "d'avola",
 'syrah',
 'savory',
 'meat',
 'berry',
 'cassis',
 'tobacco',
 'wet',
 'earth',
 'touch',
 'almond',
 'bitterness',
 'finish']

In [279]:
# Add Stemmer
# Words like "Earthy" and "Earth [Flavors]" should match, "Almondy" and "Almond", etc. 

['ripe', 'fruity', 'wine', 'smooth', 'firm', 'juicy', 'berry', 'acidity']

In [306]:
# Add Frequency Count
# This will be interesting to see what words are most often used to describe the wines. 

# 1) Make one big list of all the lists. 
full_list = []
for i in range(len(data['description'])):
    for j in range(len(data['description'][i])):
        full_list.append(data['description'][i][j])

# 2) Look at the top 25 words frequently used as descriptors
token_freq = FreqDist(full_list)
top_25_freq = token_freq.most_common(25)
top_25_freq

[('wine', 579),
 ('fruit', 345),
 ('palate', 331),
 ('finish', 281),
 ('acidity', 260),
 ('drink', 245),
 ('ripe', 219),
 ('cherry', 203),
 ('%', 151),
 ('spice', 141),
 ('berry', 127),
 ('oak', 115),
 ('plum', 107),
 ('apple', 106),
 ('citrus', 106),
 ('dry', 104),
 ('sweet', 102),
 ('crisp', 97),
 ('blend', 96),
 ('texture', 93),
 ('light', 93),
 ('cabernet', 83),
 ('blackberry', 82),
 ('pepper', 82),
 ('bright', 81)]