In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.utils import to_categorical
from sklearn import metrics

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_path = '/home/adam/R/Yelp/dataset/model_train.csv'
test_path = '/home/adam/R/Yelp/dataset/model_test.csv'

In [64]:
EMBEDDING_FILE = f'/home/adam/R/Yelp/dataset/glove.6B.50d.txt'
EMBEDDING_FILE_LARGE = f'/home/adam/R/Yelp/dataset/glove.6B.100d.txt'

In [4]:
# config parameters
embed_size = 50 # How big is each word vector
max_features = 4000 # how many unqiue words to use(# rows in embedding vector)
maxlen = 100 # max # of words in review to use

In [6]:
# Going to use only about half the training data this time to see if I can up other training features
# It's a memory issue at this point

train = pd.read_csv(train_path, usecols = ['stars', 'text'])
train = pd.get_dummies(train, columns = ['stars'])

train = train.sample(frac = .5)

In [7]:
train.shape

(140000, 6)

In [8]:
test = pd.read_csv(test_path, usecols = ['stars', 'text'])
test = pd.get_dummies(test, columns = ['stars'])

In [9]:
list_sentences_train = train['text'].values

In [10]:
list_classes = ['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']
y = train[list_classes].values

In [11]:
y_te = test[list_classes].values

In [12]:
list_sentences_test = test['text'].values

In [13]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [14]:
print('Shape of data tensor:', X_t.shape)

Shape of data tensor: (140000, 100)


In [15]:
print('Shape of data tensor:', y.shape)

Shape of data tensor: (140000, 5)


In [16]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [17]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.020940498, 0.6441043)

In [18]:
word_index = tokenizer.word_index

In [19]:
len(word_index)

84785

In [20]:
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [21]:
embedding_matrix.shape

(2000, 50)

In [22]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights = [embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(5, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
model.fit(X_t, y, batch_size = 256, epochs = 2, validation_split = 0.1)

Train on 126000 samples, validate on 14000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb2383796d8>

In [21]:
y_test = model.predict([X_te], batch_size = 1024, verbose = 1)



In [24]:
scores = model.evaluate(X_te, y_te, verbose = 1, batch_size = 1024)



In [25]:
scores

[0.30282640877451217, 0.8572314246995109]

In [27]:
# config parameters
embed_size = 50 # How big is each word vector
max_features = 10000 # how many unqiue words to use(# rows in embedding vector)
maxlen = 100 # max # of words in review to use

In [28]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [29]:
word_index = tokenizer.word_index

In [30]:
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [31]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights = [embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(5, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
model.fit(X_t, y, batch_size = 256, epochs = 2, validation_split = 0.1)

Train on 126000 samples, validate on 14000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb2c5560400>

In [33]:
# config parameters
embed_size = 50 # How big is each word vector
max_features = 10000 # how many unqiue words to use(# rows in embedding vector)
maxlen = 100 # max # of words in review to use

In [34]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [35]:
word_index = tokenizer.word_index

In [36]:
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [37]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights = [embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(5, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [38]:
model.fit(X_t, y, batch_size = 256, epochs = 2, validation_split = 0.1)

Train on 126000 samples, validate on 14000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb235615978>

In [39]:
# config parameters
embed_size = 50 # How big is each word vector
max_features = 10000 # how many unqiue words to use(# rows in embedding vector)
maxlen = 150 # max # of words in review to use

In [40]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [41]:
word_index = tokenizer.word_index

In [42]:
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [43]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights = [embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(5, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [44]:
model.fit(X_t, y, batch_size = 256, epochs = 2, validation_split = 0.1)

Train on 126000 samples, validate on 14000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb2bbf1d4e0>

In [65]:
# config parameters
embed_size = 100 # How big is each word vector
max_features = 10000 # how many unqiue words to use(# rows in embedding vector)
maxlen = 150 # max # of words in review to use

In [66]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [67]:
word_index = tokenizer.word_index

In [68]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE_LARGE))

In [69]:
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [70]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights = [embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(5, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [71]:
model.fit(X_t, y, batch_size = 256, epochs = 2, validation_split = 0.1)

Train on 126000 samples, validate on 14000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb21a8c57f0>