In [1]:
import pandas as pd
import numpy as np
import math
import string
import copy
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from tqdm import tqdm
from tensorflow_addons.metrics import F1Score
import warnings
import os
warnings.filterwarnings("ignore")

In [41]:
stfold = StratifiedKFold(n_splits=5, shuffle=True)
train_l = list()
val_l = list()
test_l = list()
maxlen = 100
cnt = 0
for train_index, test_index in stfold.split(data['comment'], data['label']):
    cnt += 1
    train = data.iloc[train_index]
    train_x, train_y = train['comment'], train['label']
    tmp = data.iloc[test_index]

    val, test = train_test_split(tmp, test_size=0.5, stratify=tmp['label'])    
    val_x, val_y = val['comment'], val['label']
    test_x, test_y = test['comment'], test['label']    
    
    train_x = keras.preprocessing.sequence.pad_sequences(train_x, maxlen=maxlen)
    val_x = keras.preprocessing.sequence.pad_sequences(val_x, maxlen=maxlen)
    test_x = keras.preprocessing.sequence.pad_sequences(test_x, maxlen=maxlen)
    
    embed_dim = 32  # Embedding size for each token
    vocab_size = n_word_unique + 1

    inputs = layers.Input(shape=(maxlen,))
    embedding_layer = layers.Embedding(input_dim = vocab_size, output_dim = embed_dim, input_length=maxlen)
    x = embedding_layer(inputs)
    x = layers.Bidirectional(layers.SimpleRNN(32, input_dim=(None, embed_dim), 
                                         return_sequences=False, activation='tanh', 
                                         dropout=0.4))(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model_path = "rnn_models/{itr:04}/".format(itr=cnt)
    checkpoint_path = model_path + "{epoch:04d}.ckpt"
    checkpoint_dir = os.path.dirname(checkpoint_path)
    !mkdir $model_path
    
    
    model.compile('adam', "binary_crossentropy", metrics=["accuracy", 
                                                          tf.keras.metrics.AUC(name='auc')])
    
    
    #create checkpoint to save model
    #with best validation loss
    model.save_weights(checkpoint_path.format(epoch=0))

    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1,
            save_weights_only=True, save_best_only=True, mode='auto')
    
    history = model.fit(
                        train_x, train_y, 
                        batch_size=128, epochs=10, 
                        validation_data=(val_x, val_y),
                        callbacks = [checkpoint]
                        )

    latest = tf.train.latest_checkpoint(checkpoint_dir)
    model.load_weights(latest)
    
    train_eval = model.evaluate(train_x, train_y)
    val_eval = model.evaluate(val_x, val_y)
    test_eval = model.evaluate(test_x, test_y)
    
    test_l.append(test_eval)
    val_l.append(val_eval)
    train_l.append(train_eval)

mkdir: cannot create directory ‘rnn_models/0001/’: No such file or directory
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.65479, saving model to rnn_models/0001/0001.ckpt
Epoch 2/10
Epoch 00002: val_loss improved from 0.65479 to 0.54949, saving model to rnn_models/0001/0002.ckpt
Epoch 3/10
Epoch 00003: val_loss did not improve from 0.54949
Epoch 4/10
Epoch 00004: val_loss did not improve from 0.54949
Epoch 5/10
Epoch 00005: val_loss did not improve from 0.54949
Epoch 6/10
Epoch 00006: val_loss did not improve from 0.54949
Epoch 7/10
Epoch 00007: val_loss did not improve from 0.54949
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.54949
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.54949
Epoch 10/10
Epoch 00010: val_loss did not improve from 0.54949
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.66639, saving model to rnn_models/0002/0001.ckpt
Epoch 2/10
Epoch 00002: val_loss improved from 0.66639 to 0.63292, saving model to rnn_models/0002/0002.ck

In [42]:
test_l = np.array(test_l)
val_l = np.array(val_l)
train_l = np.array(train_l)

print("test avg loss: ", np.mean(test_l[:, 0]), "+/-" ,np.std(test_l[:, 0]))
print("test avg acc: ", np.mean(test_l[:, 1]),  "+/-" ,np.std(test_l[:, 1]))
print("test avg auc: ", np.mean(test_l[:, 2]),  "+/-" ,np.std(test_l[:, 2]))
print('\n')


print("val avg loss: ", np.mean(val_l[:, 0]), "+/-" ,np.std(val_l[:, 0]))
print("val avg acc: ", np.mean(val_l[:, 1]),  "+/-" ,np.std(val_l[:, 1]))
print("val avg auc: ", np.mean(val_l[:, 2]),  "+/-" ,np.std(val_l[:, 2]))

print('\n')


print("train avg loss: ", np.mean(train_l[:, 0]), "+/-" ,np.std(train_l[:, 0]))
print("train avg acc: ", np.mean(train_l[:, 1]),  "+/-" ,np.std(train_l[:, 1]))
print("train avg auc: ", np.mean(train_l[:, 2]),  "+/-" ,np.std(train_l[:, 2]))


test avg loss:  0.5448884963989258 +/- 0.06499129011133134
test avg acc:  0.7584000110626221 +/- 0.04081959976389082
test avg auc:  0.8318245768547058 +/- 0.038958615474521216


val avg loss:  0.5375609397888184 +/- 0.022291112132782263
val avg acc:  0.7685999870300293 +/- 0.027954242640204272
val avg auc:  0.8390363693237305 +/- 0.022960162845348783


train avg loss:  0.091223568841815 +/- 0.10088357637319861
train avg acc:  0.9791249990463257 +/- 0.028066106593395763
train avg auc:  0.9945710659027099 +/- 0.0098548423747382
