In [3]:
import re
import string

import pandas as pd
from nltk import pos_tag
from nltk.tokenize import casual_tokenize, word_tokenize
# NOTE: this module path was removed in newer scikit-learn releases; there the same
# constant is available as sklearn.feature_extraction.text.ENGLISH_STOP_WORDS.
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [4]:
# Load the raw reviews table from the CSV that sits next to the notebook.
review = pd.read_csv(filepath_or_buffer='review.csv')

In [22]:
# Work on an independent copy of the first 100 reviews. Slicing before copying
# duplicates only the rows that are used (instead of the whole table) and leaves
# `review` untouched by the in-place text cleaning done further down.
df = review.head(100).copy()

In [6]:
# Universal-tagset labels that have a WordNet part of speech; tokens carrying any
# other tag are returned unchanged.
_WORDNET_TAGS = frozenset({'NOUN', 'VERB', 'ADJ', 'ADV'})


def lemmatize(x):
    """Lemmatize a single ``(word, pos)`` pair with NLTK's WordNet lemmatizer.

    Parameters
    ----------
    x : tuple
        A two-item ``(word, pos)`` pair. ``pos`` is compared against the
        universal-tagset labels ``'NOUN'``, ``'VERB'``, ``'ADJ'`` and ``'ADV'``.

    Returns
    -------
    The WordNet lemma of ``word`` for one of the four labels above; ``word``
    itself, unchanged, for every other tag.

    Raises
    ------
    TypeError / ValueError
        If ``x`` cannot be unpacked into exactly two items.
    """
    word, pos = x

    # Guard first: tokens with a tag WordNet does not cover (punctuation,
    # determiners, numbers, ...) need neither NLTK nor a lemmatizer instance.
    if pos not in _WORDNET_TAGS:
        return word

    # Kept as function-scope imports, as in the original cell, so the pass-through
    # path above never touches NLTK.
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV

    wordnet_pos = {'NOUN': NOUN, 'VERB': VERB, 'ADJ': ADJ, 'ADV': ADV}[pos]
    return WordNetLemmatizer().lemmatize(word, wordnet_pos)

In [27]:
# Text-cleaning pipeline for df['text'].
# The steps run on a local Series and the column is written back once at the end,
# so a failure part-way through does not leave df['text'] half-processed.

# Lowercase conversion
text = df['text'].str.lower()

# Remove stopwords (casual_tokenize keeps contractions such as "don't" as one token)
text = text.apply(
    lambda row: ' '.join(tok for tok in casual_tokenize(row) if tok not in ENGLISH_STOP_WORDS)
)

## Expand contractions (TODO: not implemented yet)

# Remove punctuations
# string.punctuation contains regex metacharacters (\ ] ^ -). Without escaping, the
# sequence "\]" is read as an escaped bracket and backslashes are never removed.
punct = r'[{}]'.format(re.escape(string.punctuation))
# regex=True is explicit because the pandas default for str.replace differs between versions.
text = text.str.replace(punct, '', regex=True)

# Remove digits
digit = r'\d+'
text = text.str.replace(digit, '', regex=True)

# Whitespace removal: collapse every run of whitespace into a single space
whitespace = r'\s+'
text = text.str.replace(whitespace, ' ', regex=True)

# Strip leading/trailing whitespace
text = text.str.strip()

# Tag part of speech: each row becomes a list of (token, universal POS tag) pairs
tagged = text.apply(lambda doc: pos_tag(word_tokenize(doc), tagset='universal'))

# Lemmatization: lemmatize every tagged token and re-join into one string per row
df['text'] = tagged.apply(lambda pairs: ' '.join(lemmatize(pair) for pair in pairs))

In [29]:
# Show the first five cleaned reviews as plain Python strings
df['text'].iloc[:5].tolist()

['chevos chandler delicious ahwatukee different reason order chicken roll taco today tiny lil piece chicken basically roll deep fry tortilla yuck flavor order carne asada taco meat taste old like cook earlier just throw grill warm dissapointed',
 'place dirty grimy twice customer service horrible',
 'holy portion size lot bang buck service super fast love tempura avocado appetizer',
 'flavor actually pretty good use eat menudo tortilla pleasant lemonade good flavor ask refill bring service prompt food table time really busy maybe help good experience',
 'place great flavor server thing ask bring chip salsa begin meal great flavor bread menudo toast spread butter home flavor gladly recommend place star instead star coke zero expire gladly replace new question ask great service']

In [28]:
# Show the last five cleaned reviews as plain Python strings
df['text'].iloc[-5:].tolist()

['carlsbad truly favorite place valley usually opportunity lunch place hold special place heart real meal arizona lunch trip interview favorite place coworkers visit thing atmosphere fantastic inside outside great seat love welcome feel arrive staff decor outdoor seat area best ive youve sit outdoor bar pond truly miss food fantastic portion price blue corn enchilada favorites warn eat fan cheese live cheese love lunch special incredible definitely worth try dont know thats disappointed time roast chicken enchilada mushroom cream sauce say eat enjoy fantastic food amazingly personable staff great setting',
 'abridge good food hotel restaurant feel head alexis friend good old carbs veggie run saturday night arrive little worried starch burgundy table decor lack patron think stiff uncomfortable hotel restaurant server friendly knowledgeable menu look fantastic soso price end primavera dish stay water drink food great bread devour pleasure end dont think patron staff amuse ordinary conver

## textblob

In [40]:
! pip install textblob

Collecting textblob
  Using cached https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [41]:
# Requires the textblob package (pip install textblob)
from textblob import TextBlob, Word

# Lemmatize a single word: wrap it in a TextBlob Word and call lemmatize() with
# no part-of-speech argument.
word = 'stripes'
w = Word(word)
w.lemmatize()  # last expression, so its value is the cell output
#> stripe

'stripe'

In [42]:
# Sentence demo: lemmatize each word of a TextBlob without passing any POS tag
sentence = "The striped bats are hanging on their feet for best"
sent = TextBlob(sentence)
" ".join(token.lemmatize() for token in sent.words)
#> 'The striped bat are hanging on their foot for best'

'The striped bat are hanging on their foot for best'

In [43]:
# Define function to lemmatize each word with its POS tag
def lemmatize_with_postag(sentence):
    """Lemmatize every word of ``sentence`` using its TextBlob POS tag.

    The first letter of each tag reported by ``TextBlob(sentence).tags`` selects
    the part-of-speech code handed to ``lemmatize``: ``J`` -> ``'a'``,
    ``N`` -> ``'n'``, ``V`` -> ``'v'``, ``R`` -> ``'r'``; any other initial
    falls back to ``'n'``.

    Returns the lemmas joined by single spaces.
    """
    initial_to_pos = {"J": 'a',
                      "N": 'n',
                      "V": 'v',
                      "R": 'r'}

    lemmas = []
    for token, tag in TextBlob(sentence).tags:
        pos_code = initial_to_pos.get(tag[0], 'n')
        lemmas.append(token.lemmatize(pos_code))
    return " ".join(lemmas)

# Lemmatize the demo sentence, this time passing each word's POS tag to the lemmatizer
sentence = "The striped bats are hanging on their feet for best"
lemmatize_with_postag(sentence)  # last expression, so its value is the cell output

'The striped bat be hang on their foot for best'

In [None]:
# Filter POS tag: e.g. VERB, JJ, NN

In [None]:
# Find better lemmatizer: try spacy

In [None]:
# Expand contractions