In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import metrics

In [3]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPool1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# Path to the pre-trained word-vector file that is parsed into `embeddings_index`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
EMBEDDING_FILE = '/home/adam/R/Yelp/dataset/crawl-300d-2M.vec'

In [5]:
# Load only the `text` and `stars` columns from the train and test CSV files.
train = pd.read_csv('/home/adam/R/Yelp/dataset/model_train.csv', usecols=['text', 'stars'])
test = pd.read_csv('/home/adam/R/Yelp/dataset/model_test.csv', usecols = ['text', 'stars'])

In [6]:
# One-hot encode `stars` into indicator columns (selected below as `stars_1` ... `stars_5`).
train = pd.get_dummies(train, columns = ['stars'])
test = pd.get_dummies(test, columns = ['stars'])

In [7]:
# Down-sample the training frame to half of its rows. The fixed seed makes the
# subset reproducible across kernel restarts (same seed as the validation split).
# Note: this rebinds `train`, so re-running the cell halves the data again.
train = train.sample(frac = .5, random_state = 233)

In [8]:
# Inputs are the raw `text` values as NumPy arrays; targets are the five one-hot
# star columns, kept as DataFrames.
X_train = train['text'].values
y_train = train[['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']]
X_test = test['text'].values
y_test = test[['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']]

In [9]:
max_features = 10000  # vocabulary cap given to the Tokenizer and used as the Embedding input size
maxlen = 100  # fixed sequence length used for padding and for the model input shape
embed_size = 300  # width of each embedding vector (second dimension of `embedding_matrix`)

In [10]:
# Build the vocabulary; `num_words` caps how many words are kept when texts are encoded.
# NOTE(review): the tokenizer is fit on train AND test text, so the vocabulary is
# influenced by test data — confirm this is intended.
tokenizer = text.Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

In [11]:
# Encode the texts as integer sequences, then pad them to `maxlen`.
# Naming: upper-case `X_*` now hold the token-id sequences (the raw text is
# overwritten); lower-case `x_*` are the padded arrays used by the models below.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen = maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [12]:
def get_coefs(word, *arr):
    """Return `(word, vector)`, where `vector` is `arr` converted to a float32 NumPy array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into {word: float32 vector}; each line is split on single
# spaces into a word followed by its coefficients. The `with` block guarantees the
# file handle is closed once the dict has been built.
with open(EMBEDDING_FILE) as embedding_file:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file)

In [13]:
# Build the initial Embedding weights: row i holds the pre-trained vector of the
# word whose tokenizer id is i; words without a pre-trained vector keep a zero row.
word_index = tokenizer.word_index
# Tokenizer word ids start at 1 (0 is reserved), so covering every id requires
# len(word_index) + 1 rows when the vocabulary is smaller than `max_features`.
nb_words = min(max_features, len(word_index) + 1)
# Always allocate `max_features` rows so the matrix shape matches the
# Embedding(max_features, embed_size, weights=[embedding_matrix]) layers below,
# even for a small vocabulary; unused rows simply stay zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    # Skip (rather than break) so the result does not depend on dict iteration order.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [14]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set at the end of an epoch.

    Parameters
    ----------
    validation_data : tuple
        `(X_val, y_val)` pair; it is unpacked in the constructor, so anything
        other than a 2-item sequence (including the empty default) raises
        `ValueError`.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of `interval` (1 = every epoch).
    """

    def __init__(self, validation_data=(), interval = 1):
        # Name this class in super() so that Callback.__init__ itself runs;
        # super(Callback, self) would start the lookup *after* Callback and skip it.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs = None):
        """Predict on the validation inputs and print the ROC-AUC for this epoch.

        `logs` is accepted for the callback interface but not used; it defaults
        to None instead of a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose = 0)
            score = roc_auc_score(self.y_val, y_pred)
            # `epoch` is zero-based, hence the +1 in the printed message.
            print("\n ROC-AUC - epoch: %d - socre: %.6f \n" % (epoch+1, score))

In [15]:
def get_model(trainable_embeddings = True, spatial_dropout = 0.2, gru_units = 80,
              gru_dropout = 0.0, recurrent_dropout = 0.0):
    """Build and compile the bidirectional-GRU classifier on pre-trained embeddings.

    Called without arguments it builds the same baseline network as before.
    The keyword arguments expose the hyper-parameters that were hard-coded, so
    variants can be created without copy-pasting the whole builder.

    Reads the module-level `maxlen`, `max_features`, `embed_size` and
    `embedding_matrix`.

    Parameters
    ----------
    trainable_embeddings : bool
        Whether the Embedding weights are updated during training.
    spatial_dropout : float
        Rate passed to `SpatialDropout1D` after the embedding.
    gru_units : int
        Number of units of the GRU in each direction.
    gru_dropout : float
        `dropout` rate of the GRU.
    recurrent_dropout : float
        `recurrent_dropout` rate of the GRU.

    Returns
    -------
    A compiled Keras `Model` taking `(batch, maxlen)` token ids and producing
    5 sigmoid outputs.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights = [embedding_matrix],
                  trainable = trainable_embeddings)(inp)
    x = SpatialDropout1D(spatial_dropout)(x)
    x = Bidirectional(GRU(gru_units, dropout = gru_dropout,
                          recurrent_dropout = recurrent_dropout,
                          return_sequences=True))(x)
    # Summarise the sequence of GRU states by both its mean and its max over time.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPool1D()(x)
    conc = concatenate([avg_pool, max_pool])
    # NOTE(review): the targets are one-hot star columns, yet the head is 5 independent
    # sigmoids with binary cross-entropy, so 'accuracy' is presumably measured per
    # label rather than per review — confirm this is the intended objective.
    outp = Dense(5, activation = 'sigmoid')(conc)

    model = Model(inputs = inp, outputs = outp)
    model.compile(loss='binary_crossentropy',
                  optimizer = 'adam',
                  metrics = ['accuracy'])

    return model

In [16]:
# Build and compile the first model variant.
model = get_model()

In [17]:
batch_size = 256  # mini-batch size shared by every `fit` call below
epochs = 2  # number of training epochs shared by every `fit` call below

In [18]:
# Hold out 5% of the padded training data for validation; the fixed seed keeps the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size = 0.95, random_state = 233)



In [19]:
# Callback instance that prints ROC-AUC on the hold-out split after every epoch (interval = 1).
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval = 1)

In [20]:
# Train the first model; the hold-out split is passed as validation data and the
# callback additionally prints ROC-AUC at the end of each epoch.
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                callbacks = [RocAuc], verbose = 2)

Train on 133000 samples, validate on 7000 samples
Epoch 1/2
 - 91s - loss: 0.3365 - acc: 0.8452 - val_loss: 0.2965 - val_acc: 0.8590

 ROC-AUC - epoch: 1 - socre: 0.884945 

Epoch 2/2
 - 90s - loss: 0.2870 - acc: 0.8647 - val_loss: 0.2836 - val_acc: 0.8652

 ROC-AUC - epoch: 2 - socre: 0.894973 



In [21]:
y_test = model.predict([x_test], batch_size=1024, verbose = 1)



In [26]:
p = pd.DataFrame(y_test, columns = [['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']])

In [28]:
# Persist the per-class prediction table to CSV, without the row index.
p.to_csv('gru_fasttext_preds.csv', index = False)

In [29]:
# Preview the first rows only instead of rendering the whole prediction frame.
p.head(10)

Unnamed: 0,stars_1,stars_2,stars_3,stars_4,stars_5
0,0.001430,0.002741,0.057203,0.592002,0.356129
1,0.001737,0.002654,0.010059,0.122033,0.887182
2,0.000700,0.000317,0.003364,0.117154,0.885646
3,0.123683,0.377102,0.308473,0.047827,0.011795
4,0.001725,0.001485,0.011115,0.122977,0.886352
5,0.307285,0.389155,0.253022,0.032887,0.009971
6,0.004798,0.012381,0.106488,0.446022,0.554152
7,0.001388,0.000744,0.005315,0.213599,0.839225
8,0.000298,0.000165,0.001636,0.090557,0.924011
9,0.001129,0.007314,0.143498,0.594986,0.377356


In [30]:
# Evaluate `model` on `x_test` against `y_test`; with metrics=['accuracy'] the
# result is the pair [loss, accuracy].
scores = model.evaluate(x_test, y_test, verbose=1, batch_size = 1024)



In [31]:
# Display the [loss, accuracy] pair returned by `evaluate`.
scores

[0.2816612017086574, 0.0]

In [44]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     3000000     input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 100, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 160)     182880      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
global_ave

In [51]:
def get_model2():
    """Build and compile the bidirectional-GRU classifier with frozen embeddings.

    Same architecture as the first model, except the Embedding layer is created
    with `trainable = False`, so the pre-trained vectors are not updated.
    Reads the module-level `maxlen`, `max_features`, `embed_size` and
    `embedding_matrix`; returns the compiled Keras `Model`.
    """
    token_ids = Input(shape=(maxlen, ))

    # Frozen pre-trained embeddings followed by spatial dropout.
    embedded = Embedding(max_features, embed_size, weights = [embedding_matrix], trainable = False)(token_ids)
    embedded = SpatialDropout1D(0.2)(embedded)

    # Bidirectional GRU returning the full sequence, pooled by mean and by max over time.
    encoded = Bidirectional(GRU(80, return_sequences=True))(embedded)
    pooled = concatenate([GlobalAveragePooling1D()(encoded), GlobalMaxPool1D()(encoded)])

    probabilities = Dense(5, activation = 'sigmoid')(pooled)

    network = Model(inputs = token_ids, outputs = probabilities)
    network.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return network

In [46]:
# Build and compile the second model variant.
# NOTE(review): this cell's execution count (46) is lower than that of the cell
# defining `get_model2` (51), so the recorded run may have used an earlier version
# of the function — re-run top to bottom to confirm.
model2 = get_model2()

In [47]:
# Train the second model with the same data, batch size, epochs and callback instance as the first.
hist2 = model2.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                callbacks = [RocAuc], verbose = 2)

Train on 133000 samples, validate on 7000 samples
Epoch 1/2
 - 78s - loss: 0.3518 - acc: 0.8400 - val_loss: 0.3168 - val_acc: 0.8519

 ROC-AUC - epoch: 1 - socre: 0.865366 

Epoch 2/2
 - 77s - loss: 0.3095 - acc: 0.8548 - val_loss: 0.3006 - val_acc: 0.8576

 ROC-AUC - epoch: 2 - socre: 0.880859 



In [48]:
# Evaluate `model2` on `x_test` against `y_test`; the result is the pair [loss, accuracy].
# This rebinds `scores`, replacing the first model's result.
scores = model2.evaluate(x_test, y_test, verbose=1, batch_size = 1024)



In [49]:
# Display the [loss, accuracy] pair for `model2`.
scores

[0.299527583530971, 0.8585457019124713]

In [52]:
def get_model3():
    """Build and compile a wider, more heavily regularised bidirectional-GRU classifier.

    Uses frozen pre-trained embeddings, spatial dropout of 0.3, and a 128-unit
    GRU per direction with `dropout = 0.3` and `recurrent_dropout = 0.5`.
    Reads the module-level `maxlen`, `max_features`, `embed_size` and
    `embedding_matrix`; returns the compiled Keras `Model`.
    """
    token_ids = Input(shape=(maxlen, ))

    # Frozen pre-trained embeddings followed by spatial dropout.
    embedded = Embedding(max_features, embed_size, weights = [embedding_matrix], trainable = False)(token_ids)
    embedded = SpatialDropout1D(0.3)(embedded)

    # Regularised bidirectional GRU returning the full sequence.
    recurrent_cell = GRU(128, dropout = 0.3, recurrent_dropout=0.5, return_sequences=True)
    encoded = Bidirectional(recurrent_cell)(embedded)

    # Pool the sequence by mean and by max over time, then classify.
    pooled = concatenate([GlobalAveragePooling1D()(encoded), GlobalMaxPool1D()(encoded)])
    probabilities = Dense(5, activation = 'sigmoid')(pooled)

    network = Model(inputs = token_ids, outputs = probabilities)
    network.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return network

In [53]:
# Build and compile the third model variant.
model3 = get_model3()

In [54]:
# Train the third model with the same data, batch size, epochs and callback instance as the others.
hist3 = model3.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                callbacks = [RocAuc], verbose = 2)

Train on 133000 samples, validate on 7000 samples
Epoch 1/2
 - 89s - loss: 0.3695 - acc: 0.8343 - val_loss: 0.3281 - val_acc: 0.8499

 ROC-AUC - epoch: 1 - socre: 0.856875 

Epoch 2/2
 - 89s - loss: 0.3251 - acc: 0.8496 - val_loss: 0.3101 - val_acc: 0.8545

 ROC-AUC - epoch: 2 - socre: 0.873605 

