In [20]:
import numpy as np 
import pandas as pd

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [21]:
# Download the NLTK resource packages this notebook asks for, in a fixed order.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(_resource)

In [33]:
# Read the training and test tables from CSV files in the working directory
# (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('training_data.csv', 'test_data.csv')
)

In [9]:
# Keep direct handles on the 'text' and 'stars' columns of the training table
# and on the 'text' column of the test table.
train_data_text, train_label = train_data['text'], train_data['stars']
test_data_text = test_data['text']

In [10]:
# Pre-compiled patterns used to find and/or replace particular characters or
# words while cleaning the text.
# Any run of characters that is not an ASCII letter, a space or an apostrophe.
pat_letter = re.compile(r"[^a-zA-Z ']+")
# 's directly after one of these words (a contracted "is"). The leading \b
# keeps it from firing inside longer words such as "rabbit's"; re.I ignores case.
pat_is = re.compile(r"\b(it|he|she|that|this|there|here)('s)", re.I)
# 's directly after any letter.
pat_s = re.compile(r"(?<=[a-zA-Z])'s")
# ' (optionally followed by s) directly after a word ending in s.
pat_s2 = re.compile(r"(?<=s)'s?")
# The contraction of "not" (n't) after a letter.
pat_not = re.compile(r"(?<=[a-zA-Z])n't")
# The word "cannot".
pat_cannot = re.compile("cannot")
# The contraction of "would" ('d) after a letter.
pat_would = re.compile(r"(?<=[a-zA-Z])'d")
# The contraction of "will" ('ll) after a letter.
pat_will = re.compile(r"(?<=[a-zA-Z])'ll")
# The contraction of "am" ('m) after I/i. The class is [Ii]; writing it as
# [I|i] would also accept a literal "|".
pat_am = re.compile(r"(?<=[Ii])'m")
# The contraction of "are" ('re) after a letter.
pat_are = re.compile(r"(?<=[a-zA-Z])'re")
# The contraction of "have" ('ve) after a letter.
pat_ve = re.compile(r"(?<=[a-zA-Z])'ve")
# "a m" / "p m": what remains of "a.m." / "p.m." once the dots have become
# spaces. The word boundaries are required: without them the pattern also eats
# text that merely spans two words ("a meal" -> "eal", "pizza menu" -> "pizzenu").
pat_AMPM = re.compile(r"\b[ap] m\b")

In [11]:
# Single WordNetLemmatizer instance shared by the lemmatization helper below.
lmtzr = WordNetLemmatizer()

In [12]:
# Use Regular Expression and Stop Words to clean the text
with_negative_words = 0 # Set 1 if apply LSTM; set 0 if apply Xgboost Regressor

# The first line of the file holds the extra stop words as one comma-separated list.
with open('common_english_words.txt') as f:
    content = f.readlines()
# Strip every entry so the line's trailing newline (or padding around commas)
# does not end up inside a word, and skip empty entries. An empty file simply
# contributes no extra words instead of raising IndexError.
first_line = content[0] if content else ''
my_stop_words = [word.strip() for word in first_line.split(',') if word.strip()]
stop_words = set(ENGLISH_STOP_WORDS.union(my_stop_words))
# "can't" is rewritten to "ca not" by the n't expansion, so drop the leftover "ca".
stop_words.add('ca')
# Words that must NOT be treated as stop words. Set difference is used rather
# than remove() so a word missing from the source lists does not abort the cell.
stop_words -= {
    'cry', 'dear', 'interest', 'like', 'never',
    'please', 'serious', 'top', 'well',
}
if with_negative_words == 1:
    # Keep negation words in the text when the flag is set.
    stop_words -= {'against', 'except', 'neither', 'no', 'nor', 'not', 'none'}
stop_words = frozenset(stop_words)

In [14]:
# The function to replace abbreviation
def replace_abbreviations(text):
    """Normalise raw text and rewrite contractions via the module-level patterns.

    The text first has every run matched by ``pat_letter`` replaced by a single
    space, is stripped and lower-cased; the remaining patterns are then applied
    one after another in a fixed order, and finally any apostrophe still
    present is turned into a space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    cleaned = pat_letter.sub(' ', text).strip().lower()
    # Order matters: e.g. the pronoun "'s" rule must run before the generic
    # "'s" removal, which in turn runs before the trailing-apostrophe rule.
    substitutions = (
        (pat_is, r"\1 is"),
        (pat_s, ""),
        (pat_s2, ""),
        (pat_not, " not"),
        (pat_cannot, " can not"),
        (pat_would, " would"),
        (pat_will, " will"),
        (pat_am, " am"),
        (pat_are, " are"),
        (pat_ve, " have"),
        (pat_AMPM, ""),
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.replace('\'', ' ')

In [15]:
# The function to get the word's pos
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to ``nltk.corpus.wordnet``'s ADJ, VERB,
    NOUN and ADV respectively; any other tag yields an empty string.

    Args:
        treebank_tag: tag string as produced by the POS tagger (e.g. 'JJR').

    Returns:
        The WordNet POS constant, or '' when the tag has no counterpart.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Looked up only on a match, so unmatched tags never touch the corpus object.
            return getattr(nltk.corpus.wordnet, constant_name)
    return ''

In [16]:
# The function to lemmatize each words
def lemmatize_word(words):
    """Lemmatize a sequence of tokens using POS-aware WordNet lemmatization.

    Empty tokens are dropped. The remaining tokens are POS-tagged together in
    a single ``nltk.pos_tag`` call, so the tagger is invoked once per text
    instead of once per word and each tag is chosen with the neighbouring
    tokens available. Tokens whose tag has a WordNet counterpart are
    lemmatized with ``lmtzr``; the others are passed through unchanged.

    Args:
        words: iterable of token strings (already split on whitespace).

    Returns:
        A list with one entry per non-empty input token, in the same order.
    """
    tokens = [word for word in words if word]
    if not tokens:
        # Nothing to tag; avoid invoking the tagger at all.
        return []
    new_words = []
    # tagged is like [('bigger', 'JJR'), ('plates', 'NNS'), ...]
    tagged = nltk.pos_tag(tokens)
    for word, treebank_tag in tagged:
        pos = get_wordnet_pos(treebank_tag)
        new_words.append(lmtzr.lemmatize(word, pos) if pos else word)
    return new_words

In [17]:
# The function to clean the stop words
def clean_stop_words(words,stops):
    """Return the tokens of ``words`` that are not members of ``stops``.

    Args:
        words: iterable of token strings.
        stops: container of stop words (membership is tested with ``in``).

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [token for token in words if token not in stops]

In [18]:
# Combine the functions above to preprocess the text data
def preprocess_text(text,stops):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: contraction/character clean-up, whitespace split,
    lemmatization, stop-word removal, and re-joining with single spaces.

    Args:
        text: the raw string to process.
        stops: container of stop words to drop after lemmatization.

    Returns:
        The processed text as a single space-separated string.
    """
    tokens = replace_abbreviations(text).split()
    lemmas = lemmatize_word(tokens)
    kept = clean_stop_words(lemmas, stops)
    return ' '.join(kept)

In [None]:
# Replace the 'text' column of both tables with its preprocessed version,
# using the shared stop-word set (training table first, then the test table).
for _frame in (train_data, test_data):
    _frame['text'] = _frame['text'].map(lambda raw: preprocess_text(raw, stop_words))

In [15]:
# Spot-check one processed row: print its text, then its star rating.
i = 5
print(train_data.loc[i, 'text'], train_data.loc[i, 'stars'], sep='\n')

avoid cost awful service slow hell disposable plasticware plate add landfill insult heinous food despite tell rush catch plane multiple reassurance food arrive plenty time lollygagged end pay seriously piss hungry food blech
1


In [16]:
# Write both processed tables to CSV; the file names record whether the
# negation words were kept (with_negative_words == 1) or removed.
if with_negative_words == 1:
    _train_csv = 'train_after_nltk_with_negative.csv'
    _test_csv = 'test_after_nltk_with_negative.csv'
else:
    _train_csv = 'train_after_nltk_without_negative.csv'
    _test_csv = 'test_after_nltk_without_negative.csv'
train_data.to_csv(_train_csv)
test_data.to_csv(_test_csv)