In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
raw_data_path = '/home/adam/R/Yelp/dataset/model_dataset_large.csv'

In [4]:
EMBEDDING_FILE = f'/home/adam/R/Yelp/dataset/glove.6B.50d.txt'

In [5]:
full = pd.read_csv(raw_data_path, usecols = ['stars', 'text'])

In [None]:
full.head()

In [None]:
full.shape

In [6]:
# Pos/Neg 
reviews = full[full.stars!=3]

In [7]:
reviews.stars.unique()

array([5, 4, 1, 2])

In [7]:
reviews['labels'] = reviews['stars'].apply(lambda x:1 if x > 3 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [8]:
reviews = reviews.drop("stars", axis = 1)

In [9]:
reviews.head()

Unnamed: 0,text,labels
0,Stopped in on a Thursday around 5. Soft lighti...,1
1,"AYCE appetizers, AYCE daily specials and AYCE ...",1
2,Food here is always great. Place looks insigni...,1
6,SO yummy! We came here for breakfast and to sa...,1
7,Favorite spots in Vegas during the night after...,1


In [10]:
texts = reviews['text'].values
labels = reviews['labels'].values

In [20]:
texts

array(["Stopped in on a Thursday around 5. Soft lighting, patio seating with very little outside nose, right next to a quaint little courtyard. Very relaxed atmosphere. Comfy for a small space, but with enough room that you're not on top of other guests. \n\nHappy hour prices were very reasonable. 2 flatbreads and 2 beers for $16. The flatbreads we got (Margarita and BBQ Chicken) were delicious and good sized. \n\nService was quick and very friendly. He stopped the table when he was needed and left us alone when he wasn't. \n\nHighly recommend popping in for a drink and a bite to eat! I will definitely be back!",
       "AYCE appetizers, AYCE daily specials and AYCE dessert.  That, to me, is what makes Goyemon stand out above the competition.  We're not just talking miso soup or mochi ice cream either!  Delicious pork belly, yummy grilled veggies, daily sushi specials, green tea cake, black sesame creme brulee...these are some unique offerings.\n\nSushi here is good, and the daily spec

In [11]:
MAX_NUM_WORDS=20000 # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH=100 # max number of words in a review to use

tokenizer = Tokenizer(num_words =MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [12]:
labels = to_categorical(np.asarray(labels))

In [None]:
labels

In [13]:
valid_split = 0.2

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(valid_split * data.shape[0])

x_train = data[: -nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
word_index = tokenizer.word_index

In [14]:
embeddings_index = {}
GLOVE_DIR = '/home/adam/R/Yelp/dataset/'
f = open(os.path.join(GLOVE_DIR, 'glove.6B.50d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [15]:
EMBEDDING_DIM = 50
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [16]:
embedding_layer = Embedding(len(word_index) + 1,
                           EMBEDDING_DIM,
                           weights = [embedding_matrix],
                           input_length = MAX_SEQUENCE_LENGTH,
                           trainable = False)

In [17]:
inp = Input(shape = (MAX_SEQUENCE_LENGTH,))
x = embedded_seqeuences = embedding_layer(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(2, activation='sigmoid')(x)
model = Model(inputs = inp, outputs = x)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])


In [18]:
model.fit(x_train, y_train, validation_data= (x_val, y_val),
         epochs = 2, batch_size = 1024)

Train on 208687 samples, validate on 52171 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f976e8dafd0>