In [1]:
# Imports
from tensorflow.keras.layers import TextVectorization
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense 
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers


In [2]:
# Import data
data = pd.read_csv("../../data/mturk_experiment_2.csv",encoding='unicode_escape')
labels = data["Formality"]
samples = data["Sentence"]

train_samples, test_samples, train_labels,test_labels = train_test_split(samples, labels, test_size=0.2)

train_samples = np.array(train_samples)
test_samples = np.array(test_samples)
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [3]:
vectoriser = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectoriser.adapt(text_ds)

In [4]:
voc = vectoriser.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [5]:
glove_path = "glove.6B.100d.txt"
embeddings_index = {}
with open(glove_path,encoding="utf8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs,"f",sep= " ")
        embeddings_index[word] = coefs

In [6]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens,embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1

In [7]:
embedding_layer = Embedding(num_tokens,embedding_dim,embeddings_initializer=keras.initializers.Constant(embedding_matrix),trainable=False)

In [8]:
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.GRU(256,return_sequences=True)(embedded_sequences)
x = layers.SimpleRNN(128)(x)
out = layers.Dense(1,activation='relu')(x)

model = keras.Model(int_sequences_input,out)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         1372100   
_________________________________________________________________
gru (GRU)                    (None, None, 256)         274944    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               49280     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,696,453
Trainable params: 324,353
Non-trainable params: 1,372,100
_________________________________________________________________


In [9]:
x_train = vectoriser(np.array([[s] for s in train_samples])).numpy()
x_val = vectoriser(np.array([[s] for s in test_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(test_labels)

In [10]:
model.compile(optimizer=Adam(learning_rate=0.0001),loss='mean_squared_error',metrics=[tf.keras.metrics.MeanSquaredError(),tf.keras.losses.MeanAbsoluteError(),tf.keras.losses.MeanAbsolutePercentageError()])

In [11]:
history = model.fit(x=x_train,y=y_train,batch_size=32,epochs=80)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80

KeyboardInterrupt: 

In [None]:
mse = history.history['mean_squared_error']
mae = history.history['mean_absolute_error']
mpe = history.history['mean_absolute_percentage_error']

scores = pd.DataFrame()
scores["MSE"] = mse
scores["MAE"] = mae
scores["% Error"] = mpe

scores

Unnamed: 0,MSE,MAE,% Error
0,4.522540,1.743243,41.392445
1,1.148622,0.866527,24.020020
2,0.987804,0.793727,22.710367
3,0.794971,0.709071,20.334349
4,0.710212,0.670712,19.134657
...,...,...,...
95,1.268346,0.940937,28.151360
96,1.268001,0.941780,28.214973
97,1.268312,0.941148,28.205439
98,1.267933,0.942255,28.153749


In [None]:
scores = model.evaluate(x=x_val,y=y_val)
scores[1:]



[1.3393315076828003,
 1.3393315076828003,
 0.9713089466094971,
 29.133699417114258]