## Preparation

In [0]:
import os
import json
import ast
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import keras
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

Using TensorFlow backend.


In [0]:
# Mount Google Drive into the Colab runtime before touching any data files.
# The call prompts for an authorization code the first time it runs.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Make the shared project folder on the mounted Drive the working directory,
# so the relative paths used by later cells (data/, models/) resolve.
os.chdir('/content/gdrive/My Drive/STAT-628-3')
# List the folder contents as a quick check that the mount and path are right.
print('The work directory contains:')
!ls

The work directory contains:
business.json  images		models	     tip.json
data	       load_data.ipynb	review.json  user.json


## Load the chosen data
(Start here if the ```.csv``` file has already been generated)

In [0]:
# Read the prepared review table from the working directory.
final_data = pd.read_csv('data/final_data.csv')
# Review bodies as an array; missing texts are replaced by empty strings.
text = final_data['text'].fillna('').values
# Star rating of each review, aligned row-for-row with `text`.
star = final_data['stars'].values

## Pre-processing
### Tokenizer

In [0]:
# Hyper-parameters of the text pre-processing step.

# Vocabulary cap handed to the tokenizer: words are ranked by frequency and
# only the most frequent ones are kept when texts are turned into sequences.
MAX_NUM_WORDS = 1000

# Number of word positions used per review; shorter reviews are padded with 0
# on the left up to this length.
MAX_SEQUENCE_LENGTH = 100

In [0]:
# Fit a tokenizer on the review texts: lowercases the input, splits on single
# spaces, and caps the vocabulary used for encoding at MAX_NUM_WORDS.
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=MAX_NUM_WORDS,
    lower=True,
    split=" ",
)
tokenizer.fit_on_texts(text)

# Encode every review as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(text)

# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
print('The length of the dictionary: %s' % len(word_index))

# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries.
data = keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
# One-hot encode the star ratings.
label = keras.utils.to_categorical(np.asarray(star))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', label.shape)

The length of the dictionary: 122150
Shape of data tensor: (403941, 100)
Shape of label tensor: (403941, 6)


In [0]:
# Inspect the padded, integer-encoded sequence of the second review (index 1).
data[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0, 605, 171, 155,  20,   1,   2,   1,
       729,  62, 133,   4,  45, 397, 327,  17, 107, 159, 594, 176,   2,
        33,   5, 812,  10, 137,  29,  31,  35,  98], dtype=int32)

In [0]:
# Look up the integer index the tokenizer assigned to the word "pick".
word_index["pick"]

605

In [0]:
# Show the raw text of the second review (index 1) to compare it with its
# encoded sequence.
text[1]

'Pick any meat on the planet and the chef will make a Mexican style dish with amazing flavor! Wow, fish and lamb tacos to die for. Drinks are great as well.'

In [0]:
## build df for some analyses
# senti = ["not"] * len(star)
# for i in range(len(star)):
#     if star[i] > 3:
#         senti[i] = 'Posi'
#     else:
#         senti[i] = 'Nega'
# senti[:10]

# output = pd.DataFrame()
# word_dict = pd.DataFrame()
# output['text'] = sequences
# output['star'] = star
# output['senti'] = senti

# word_dict['dict_keys'] = keys
# word_dict['dict_values'] = values

# output.to_csv("data/mexican_token.csv")
# word_dict.to_csv("data/word_dict.csv")

# output.head(3)

Unnamed: 0,text,star,senti
0,"[3, 87, 12, 2, 3, 87, 45, 12, 71, 86, 52, 362,...",3.0,Nega
1,"[605, 171, 155, 20, 1, 2, 1, 729, 62, 133, 4, ...",5.0,Posi
2,"[474, 7, 140, 64, 1, 176, 33, 209, 241, 204, 2...",5.0,Posi


### Word Embedding

In [0]:
# Load the pre-trained GloVe vectors into a {word: vector} dictionary.
GLOVE_PATH = './models/'
EMBEDDING_DIM = 50  # must match the dimensionality of the GloVe file read below

embeddings_index = {}
# Every line of the file is parsed as "<word> <v1> <v2> ...".
# The context manager closes the file even if parsing fails part-way, and the
# explicit encoding keeps the read independent of the platform default.
with open(os.path.join(GLOVE_PATH, 'glove.6B.50d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Length of word embedding dictionary: %s' % len(embeddings_index))

Length of word embedding dictionary: 400000


In [0]:
# Weight table for the embedding layer: row i receives the pre-trained vector
# of the word whose tokenizer index is i. One extra row is allocated so that
# index 0, used as padding filler in the sequences, has a row of its own.
embedding_matrix = np.zeros(shape=(len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [0]:
# Inspect the embedding row stored for word index 2 as a sanity check.
embedding_matrix[2]

array([ 0.26818001,  0.14346001, -0.27877   ,  0.016257  ,  0.11384   ,
        0.69923002, -0.51332003, -0.47367999, -0.33074999, -0.13834   ,
        0.27020001,  0.30937999, -0.45012   , -0.4127    , -0.09932   ,
        0.038085  ,  0.029749  ,  0.10076   , -0.25058001, -0.51818001,
        0.34558001,  0.44922   ,  0.48791   , -0.080866  , -0.10121   ,
       -1.37769997, -0.10866   , -0.23201001,  0.012839  , -0.46507999,
        3.84629989,  0.31362   ,  0.13643   , -0.52244002,  0.33019999,
        0.33706999, -0.35600999,  0.32431   ,  0.12041   ,  0.35120001,
       -0.069043  ,  0.36884999,  0.25167999, -0.24517   ,  0.25380999,
        0.1367    , -0.31178001, -0.63209999, -0.25027999, -0.38097   ])

In [0]:
# Define the embedding layer, initialised with the pre-trained vectors in
# embedding_matrix and frozen (trainable=False) so they are not updated
# during training.
# The class is referenced through the already-imported `keras` package: the
# bare name `Embedding` is not brought in by the import cell, so using it
# directly raises a NameError on a fresh kernel.
embedding_layer = keras.layers.Embedding(len(word_index) + 1,
                                         EMBEDDING_DIM,
                                         weights=[embedding_matrix],
                                         input_length=MAX_SEQUENCE_LENGTH,
                                         trainable=False)

122150