# Text Processing and Tokenization

In [20]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import textacy
import pickle
import time

### Load Data
Loading in our data, im only pulling in the 'text' column to save on memory usage

In [5]:
rests = pd.read_csv('/Users/jonathanbeatty/DSI-US-5/Capstone/restaurants_review_final.csv', compression='gzip', usecols=['text'])

In [6]:
rests.shape

(3638625, 1)

In [10]:
df_subset = rests.sample(n=100000)

### Preprocessing Text
Its important to preprocess our text to ensure the computer can easily understand the words and eventually feed them into a vectorizer to conver them to numbers for the computer to read for machine learning. Using textacy's preprocess method, I converted all review text to lowercase and remove numbers, URLs, and punctuation. The textacy preprocessor will convert numbers to the string 'numb' and URLs to the string 'URL'. I chose to combine numbers because time and price descriptions are likely to be very common in reviews. Rather than having individual tokens for '5 minutes' and '10 dollars', documents processed in this way will contain a 'numb' token whenever a numeric term is encountered.

In [9]:
rests.head()

Unnamed: 0,text
0,"""The pizza was okay. Not the best I've had. I ..."
1,I love this place! My fiance And I go here atl...
2,Terrible. Dry corn bread. Rib tips were all fa...
3,Back in 2005-2007 this place was my FAVORITE t...
4,Delicious healthy food. The steak is amazing. ...


In [11]:
%%time
df_subset['processed'] = df_subset['text'].map(lambda x: textacy.preprocess.preprocess_text(x, lowercase=True, 
                                                                                    no_urls=True, 
                                                                                    no_punct=True, 
                                                                                    no_numbers=True))

CPU times: user 31.4 s, sys: 156 ms, total: 31.6 s
Wall time: 31.8 s


### Checking for non English reviews
In yelp there are reviews from around the world, even if they are given from a state or country we have not dropped. To get better results for our modeling we are going to remove all non english characters, sentences etc using the function below. I define the function isEnglish to filter out non-ASCII characters as non-ASCII characters would predominantly be used by users typing reviews in non-English languages.

In [12]:
def isEnglish(s):
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    else:
        return True

In [13]:
df_subset['isEnglish'] = df_subset['processed'].astype('str').astype('unicode').apply(lambda x: isEnglish(x) == True)

In [14]:
df_subset = df_subset[df_subset['isEnglish'] == True]

In [15]:
np.save('subset_rests_eng_index.npy', df_subset[df_subset['isEnglish'] == True].index)

### Tokenizing
In this section we are setting up a spacy tokenizer. We will disable part-of-speech tagging, semantic parsing, and text categorization to reduce overall memory usage, and retain the lemmas of each token. We also create a filter function to eliminate stopwords and short tokens (less than 3 characters). The tokenized documents are then added to a list which we can pass through a vectorizer later on.

In [16]:
def token_filter(token): 
    return not (token.is_stop | len(token.text) <= 4)


In [24]:
nlp = textacy.load_spacy("en_core_web_sm", disable = ("tagger", "parser", "ner", "textcat"))

In [25]:
docs = df_subset['processed'].astype('str').astype('unicode').tolist()

In [26]:
filtered_tokens = []
start = time.time()
i = 1
for doc in nlp.pipe(docs, disable=['tagger', 'parser', 'ner', 'textcat'], batch_size=10000):
    try:
        tokens = [token.lemma_ for token in doc if token_filter(token)]
        filtered_tokens.append(tokens)
        i += 1
        if i % 10000 == 0:
            print(f'Tokenized {i} documents in {(time.time()-start)/60} minutes')
    except:
        print(f'Document {i} has an encoding error/has error characters.')

Tokenized 10000 documents in 0.3950824499130249 minutes
Tokenized 20000 documents in 0.7973081151644389 minutes
Tokenized 30000 documents in 1.2111178835233052 minutes
Tokenized 40000 documents in 1.5982892513275146 minutes
Tokenized 50000 documents in 1.9746153473854064 minutes
Tokenized 60000 documents in 2.357376515865326 minutes
Tokenized 70000 documents in 2.7354410688082376 minutes
Tokenized 80000 documents in 3.123135312398275 minutes
Tokenized 90000 documents in 3.512531514962514 minutes
Tokenized 100000 documents in 3.8927324175834657 minutes


In [27]:
with open('tokenized_rest_reviews.pkl', 'wb') as f:
    pickle.dump(filtered_tokens, f)

In [28]:
filtered_tokens

[['lobster',
  'tacos',
  'must',
  'they',
  'size',
  'crispy',
  'full',
  'flavor',
  'be',
  'here',
  'twice',
  'happy',
  'appetizer',
  'haven',
  'be',
  'disappoint',
  'pricey',
  'what',
  'restaurant',
  'vega',
  'between',
  'decorative',
  'ceiling',
  'atmosphere',
  'great'],
 ['very',
  'unique',
  'friendly',
  'staff',
  'lunch',
  'with',
  'family',
  'great',
  'with',
  'great',
  'never',
  'rush',
  'plenty',
  'refill',
  'side',
  'dish',
  'would',
  'return',
  'again'],
 ['dinuguan',
  'blood',
  'director',
  'birthday',
  'crave',
  'filipino',
  'from',
  'even',
  'though',
  'next',
  'scarfed',
  'love',
  'figure',
  'that'],
 ['here',
  'when',
  'chill',
  'middle',
  'vega',
  'europeans',
  'fight',
  'hangover',
  'greasy',
  'food',
  'good',
  'drink',
  'enough',
  'have',
  'hangover',
  'agree',
  'with',
  'everything',
  'europeans',
  'house',
  'that',
  'greasy',
  'have',
  'that',
  'onto',
  'review',
  'nfood',
  'nportions',
 