In [1]:
# Imports
from tensorflow.keras.layers import TextVectorization
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense 
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers


In [2]:
# Import data
# Fixed seed for the train/test partition: without it every kernel restart
# produces a different split, so the reported metrics are not reproducible.
SPLIT_SEED = 42

# NOTE(review): encoding='unicode_escape' is kept from the original call;
# confirm it matches how the CSV was actually written.
data = pd.read_csv("../../data/mturk_experiment_2.csv",encoding='unicode_escape')
labels = data["Formality"]   # regression target column
samples = data["Sentence"]   # raw text column

# Hold out 20% of the rows for evaluation.
train_samples, test_samples, train_labels, test_labels = train_test_split(
    samples, labels, test_size=0.2, random_state=SPLIT_SEED
)

# Convert the pandas Series returned by the split into NumPy arrays for the
# later tf.data / Keras calls.
train_samples = np.array(train_samples)
test_samples = np.array(test_samples)
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [3]:
# Build the text vectoriser: at most 20000 vocabulary entries, every output
# sequence fixed to length 200.
vectoriser = TextVectorization(
    max_tokens=20000,
    output_sequence_length=200,
)

# Learn the vocabulary from the training sentences only, fed in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(train_samples)
text_ds = text_ds.batch(128)
vectoriser.adapt(text_ds)

In [4]:
# Vocabulary learned by the vectoriser, and a token -> integer index lookup
# built from each token's position in that vocabulary.
voc = vectoriser.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [5]:
# Load the pre-trained GloVe vectors into a {token: vector} dictionary.
# Each line of the file is split once on whitespace: the first field is the
# token, the remainder is a space-separated list of float coefficients.
glove_path = "glove.6B.300d.txt"
embeddings_index = {}

with open(glove_path, encoding="utf8") as glove_file:
    for raw_line in glove_file:
        token, vector_text = raw_line.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(vector_text, dtype="f", sep=" ")

In [6]:
# Build the embedding weight matrix: row i holds the GloVe vector of the
# vocabulary token with index i. Tokens with no GloVe entry keep an all-zero
# row. Two extra rows are allocated beyond the vocabulary size.
embedding_dim = 300
num_tokens = len(voc) + 2
embedding_matrix = np.zeros((num_tokens, embedding_dim))

# Counters for how many vocabulary tokens were / were not found in GloVe.
hits = 0
misses = 0

for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        misses += 1
        continue
    embedding_matrix[row] = glove_vector
    hits += 1

In [7]:
# Embedding layer initialised from the pre-built matrix and frozen
# (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [8]:
# Model: integer token ids -> frozen embeddings -> GRU (full sequence out)
# -> SimpleRNN (last state only) -> single-unit dense output.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

gru_block = layers.GRU(300, return_sequences=True)
rnn_block = layers.SimpleRNN(300)
# NOTE(review): a ReLU output cannot produce negative predictions; confirm the
# "Formality" targets are non-negative, otherwise a linear output is needed.
output_block = layers.Dense(1, activation='relu')

x = rnn_block(gru_block(embedded_sequences))
out = output_block(x)

model = keras.Model(int_sequences_input, out)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 300)         4125300   
_________________________________________________________________
gru (GRU)                    (None, None, 300)         541800    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 300)               180300    
_________________________________________________________________
dense (Dense)                (None, 1)                 301       
Total params: 4,847,701
Trainable params: 722,401
Non-trainable params: 4,125,300
_________________________________________________________________


In [9]:
# Turn the raw sentences into integer sequences. Each sentence is wrapped in
# its own one-element row before being passed through the vectoriser.
train_column = np.array([[sentence] for sentence in train_samples])
val_column = np.array([[sentence] for sentence in test_samples])

x_train = vectoriser(train_column).numpy()
x_val = vectoriser(val_column).numpy()

# Regression targets for the two partitions.
y_train = np.array(train_labels)
y_val = np.array(test_labels)

In [10]:
# Compile the regressor: Adam with a small learning rate, MSE as the training
# loss, and MSE / MAE / MAPE tracked as metrics.
# The metrics list uses the tf.keras.metrics classes rather than the
# tf.keras.losses ones: Loss objects are meant for `loss=`, Metric objects for
# `metrics=`. Their default names ('mean_squared_error',
# 'mean_absolute_error', 'mean_absolute_percentage_error') are the history
# keys read after training.
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='mean_squared_error',
    metrics=[
        tf.keras.metrics.MeanSquaredError(),
        tf.keras.metrics.MeanAbsoluteError(),
        tf.keras.metrics.MeanAbsolutePercentageError(),
    ],
)

In [11]:
# Train on the training partition only: 20 epochs, mini-batches of 32.
# The returned History object keeps the per-epoch loss and metric values.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
# Collect the per-epoch training metrics recorded by fit() into one table,
# one row per epoch.
training_log = history.history
mse = training_log['mean_squared_error']
mae = training_log['mean_absolute_error']
mpe = training_log['mean_absolute_percentage_error']

scores = pd.DataFrame({"MSE": mse, "MAE": mae, "% Error": mpe})

scores

Unnamed: 0,MSE,MAE,% Error
0,2.203917,1.150421,32.619602
1,1.413933,0.977252,29.146376
2,1.401496,0.973975,29.020006
3,1.400865,0.972558,28.997356
4,1.398536,0.972866,29.018929
5,1.387576,0.971273,28.973028
6,1.400633,0.972999,29.014864
7,1.402478,0.973969,29.063896
8,1.396552,0.97414,29.07361
9,1.402896,0.974347,28.980837


In [13]:
# Evaluate on the held-out partition. evaluate() returns the loss first,
# followed by the compiled metrics; the slice drops the leading loss value so
# only the metrics are displayed.
scores = model.evaluate(
    x=x_val,
    y=y_val,
)
scores[1:]



[1.321852684020996, 0.9411579370498657, 28.015836715698242]