In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import seaborn as sns

import re

import nltk

%matplotlib inline

wine = pd.read_csv('data/wine_cleaned_dataframe.csv')

In [13]:
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

In [21]:
from nltk.corpus import stopwords

stop_words = list(stopwords.words('english')) 

In [6]:
def clean_text(string):
    
    string = re.sub(r'\W+',' ',string) # Replace everything non-alpahnumeric by ' '
    string = re.sub(r'\s+',' ',string) # Replace one or more whitespaces by  ' '
    string = re.sub(r'\d+',' ',string) # Replace one or more digits by  ' '
    
    return string


def tokenize_and_lemmatize_string(string):
    
    tokenized_string = nltk.word_tokenize(string)
    
    lemmatized_tokens = []
    
    for word in tokenized_string:
        lem.lemmatize(word)
        lemmatized_tokens.append(word) 
        
    return lemmatized_tokens


def pos_tagger(lemmatized_tokens):
    
    tagged_words = nltk.pos_tag(lemmatized_tokens)
    
    return tagged_words
    

def remove_unnecessary_pos_tags(tagged_words):
    
    selected_tags = ['JJ','NN','NNP']
    
    selected_words = []

    for tup in tagged_words:
        if tup[1] in selected_tags:
            selected_words.append(tup[0])
        else:
            continue
    
    return selected_words


    
def remove_stopwords(selected_words):
    
    selected_words_1 = []

    for word in selected_words:
        if word not in stop_words:
            selected_words_1.append(word)
        else:
            continue
        
    return selected_words_1  
    
    

def remove_short_words(selected_words_1):
    
    selected_words_2 = []

    for word in selected_words_1:
        if len(word) > 2:
            selected_words_2.append(word)
        else:
            continue

    return selected_words_2
   

In [4]:
example = wine['description'][0]

In [8]:
example

"aromas include tropical fruit, broom, brimstone and dried herb. the palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [11]:
## step 1 cleaning

clean_example = clean_text (example)

clean_example

'aromas include tropical fruit broom brimstone and dried herb the palate isn t overly expressive offering unripened apple citrus and dried sage alongside brisk acidity '

In [16]:
## step 2 tokenize

tokenized_example = tokenize_and_lemmatize_string (clean_example)
tokenized_example

['aromas',
 'include',
 'tropical',
 'fruit',
 'broom',
 'brimstone',
 'and',
 'dried',
 'herb',
 'the',
 'palate',
 'isn',
 't',
 'overly',
 'expressive',
 'offering',
 'unripened',
 'apple',
 'citrus',
 'and',
 'dried',
 'sage',
 'alongside',
 'brisk',
 'acidity']

In [18]:
tagged_example = pos_tagger (tokenized_example)
tagged_example

[('aromas', 'NNS'),
 ('include', 'VBP'),
 ('tropical', 'JJ'),
 ('fruit', 'NN'),
 ('broom', 'NN'),
 ('brimstone', 'NN'),
 ('and', 'CC'),
 ('dried', 'VBD'),
 ('herb', 'NN'),
 ('the', 'DT'),
 ('palate', 'NN'),
 ('isn', 'NN'),
 ('t', 'VBD'),
 ('overly', 'RB'),
 ('expressive', 'JJ'),
 ('offering', 'NN'),
 ('unripened', 'JJ'),
 ('apple', 'NN'),
 ('citrus', 'NN'),
 ('and', 'CC'),
 ('dried', 'JJ'),
 ('sage', 'NN'),
 ('alongside', 'RB'),
 ('brisk', 'JJ'),
 ('acidity', 'NN')]

In [19]:
filtered_tagged_example = remove_unnecessary_pos_tags(tagged_example)
filtered_tagged_example

['tropical',
 'fruit',
 'broom',
 'brimstone',
 'herb',
 'palate',
 'isn',
 'expressive',
 'offering',
 'unripened',
 'apple',
 'citrus',
 'dried',
 'sage',
 'brisk',
 'acidity']

In [23]:
len (filtered_tagged_example)

16

In [24]:
len (no_stopwords_example)

15

In [22]:
no_stopwords_example = remove_stopwords (filtered_tagged_example)
no_stopwords_example 

['tropical',
 'fruit',
 'broom',
 'brimstone',
 'herb',
 'palate',
 'expressive',
 'offering',
 'unripened',
 'apple',
 'citrus',
 'dried',
 'sage',
 'brisk',
 'acidity']

In [25]:
final_example = remove_short_words(no_stopwords_example)
final_example

['tropical',
 'fruit',
 'broom',
 'brimstone',
 'herb',
 'palate',
 'expressive',
 'offering',
 'unripened',
 'apple',
 'citrus',
 'dried',
 'sage',
 'brisk',
 'acidity']

In [26]:
len(final_example)

15