In [23]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.utils import to_categorical
from sklearn import metrics

In [2]:
train_path = '/home/adam/R/Yelp/dataset/model_train.csv'
test_path = '/home/adam/R/Yelp/dataset/model_test.csv'

In [3]:
EMBEDDING_FILE = f'/home/adam/R/Yelp/dataset/glove.6B.50d.txt'

In [4]:
# config parameters
embed_size = 50 # How big is each word vector
max_features = 2000 # how many unqiue words to use(# rows in embedding vector)
maxlen = 100 # max # of words in review to use

In [5]:
train = pd.read_csv(train_path, usecols = ['stars', 'text'])
train = pd.get_dummies(train, columns = ['stars'])

In [6]:
test = pd.read_csv(test_path, usecols = ['stars', 'text'])
test = pd.get_dummies(test, columns = ['stars'])

In [7]:
list_sentences_train = train['text'].values

In [8]:
list_classes = ['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']
y = train[list_classes].values

In [24]:
y_te = test[list_classes].values

In [9]:
list_sentences_test = test['text'].values

In [10]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [11]:
print('Shape of data tensor:', X_t.shape)

Shape of data tensor: (280000, 100)


In [12]:
print('Shape of data tensor:', y.shape)

Shape of data tensor: (280000, 5)


In [13]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [14]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.020940498, 0.6441043)

In [15]:
word_index = tokenizer.word_index

In [16]:
len(word_index)

119817

In [17]:
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [18]:
embedding_matrix.shape

(2000, 50)

In [26]:
embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [52]:
embedding_matrix.shape

(5000, 50)

In [19]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights = [embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(5, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
model.fit(X_t, y, batch_size = 256, epochs = 2, validation_split = 0.1)

Train on 252000 samples, validate on 28000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4acc190dd8>

In [21]:
y_test = model.predict([X_te], batch_size = 1024, verbose = 1)



In [29]:
y_test[0]

array([0.00059915, 0.0035895 , 0.09894684, 0.5776345 , 0.29376775],
      dtype=float32)

In [26]:
metrics.accuracy_score(y_te, y_test)

ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets

In [32]:
scores = model.evaluate(X_te, y_te, verbose = 1, batch_size = 1024)



In [33]:
scores

[0.2918419709273747, 0.8611542702402387]