In [1]:
# Imports
from tensorflow.keras.layers import TextVectorization
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [2]:
# Import data
# Read the annotated sentences and split them into train / test partitions.
SEED = 42  # fixed so the train/test partition is identical on every Restart & Run All

data = pd.read_csv("../../data/mturk_experiment_2.csv",encoding='unicode_escape')
labels = data["Formality"]
samples = data["Sentence"]

# Hold out 20% of the rows for evaluation. Without random_state the split (and
# therefore every score reported further down) would change from run to run.
train_samples, test_samples, train_labels, test_labels = train_test_split(
    samples, labels, test_size=0.2, random_state=SEED
)

# Convert the pandas Series returned by the split into NumPy arrays for the later cells.
train_samples = np.array(train_samples)
test_samples = np.array(test_samples)
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [3]:
# Embedding setup
# Fit the text vectoriser on the training sentences only (vocabulary capped at
# 20000 tokens, output sequences of length 200).
vectoriser = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectoriser.adapt(text_ds)

# Map every vocabulary token to its position in the vocabulary list.
voc = vectoriser.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

# Parse the GloVe text file: the first field of each line is the word, the rest
# of the line holds its space-separated vector components.
glove_path = "glove.6B.100d.txt"
embeddings_index = {}
with open(glove_path, encoding="utf8") as glove_file:
    for raw_line in glove_file:
        token, vector_text = raw_line.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(vector_text, "f", sep=" ")

embedding_dim = 100
num_tokens = len(voc) + 2

# Rows start out as zeros, so vocabulary tokens without a GloVe vector keep an
# all-zero embedding; hits / misses count how many tokens were (not) found.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
hits = 0
misses = 0
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        misses += 1
        continue
    embedding_matrix[row] = pretrained
    hits += 1

# Embedding layer initialised from the GloVe matrix and frozen (trainable=False).
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [4]:
# CNN setup
# Three Conv1D layers over the shared frozen embeddings, global max pooling,
# then a dense layer with dropout and a single linear output unit (regression).
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
# NOTE(review): kernel size 1 and pool size 1 mean no neighbouring tokens are
# combined before the global max pool - confirm this is the intended architecture.
x = layers.Conv1D(128, 1, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(1)(x)
x = layers.Conv1D(128, 1, activation="relu")(x)
x = layers.MaxPooling1D(1)(x)
x = layers.Conv1D(128, 1, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(units=1)(x)
cnn = keras.Model(int_sequences_input, preds)
# Report MAE and MAPE through the metrics classes rather than loss objects:
# metrics are the supported type for `metrics=` and accumulate over the whole
# evaluation set. evaluate() still returns [loss, MAE, MAPE] in that order.
cnn.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='mean_squared_error',
    metrics=[
        tf.keras.metrics.MeanAbsoluteError(),
        tf.keras.metrics.MeanAbsolutePercentageError(),
    ],
)

In [5]:
# LSTM setup
# A single LSTM layer over the shared frozen embeddings followed by a one-unit output.

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
# No input_shape argument: in a functional model the layer takes its input shape
# from the tensor it is called on, and (1,5) would not match the 100-dimensional
# embeddings anyway.
x = layers.LSTM(4)(embedded_sequences)
# NOTE(review): the ReLU output cannot go below 0 whereas the CNN head is linear -
# confirm this suits the range of the "Formality" labels.
out = layers.Dense(1,activation='relu')(x)

lstm = keras.Model(int_sequences_input,out)
# A Keras model must be compiled before fit()/evaluate() can be called on it.
# Same optimiser, loss and metrics as the CNN so the results are comparable.
lstm.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='mean_squared_error',
    metrics=[
        tf.keras.metrics.MeanAbsoluteError(),
        tf.keras.metrics.MeanAbsolutePercentageError(),
    ],
)

In [6]:
# RNN setup
# GRU (returning the full sequence) feeding a SimpleRNN, then a one-unit output.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.GRU(256,return_sequences=True)(embedded_sequences)
x = layers.SimpleRNN(128)(x)
# NOTE(review): the ReLU output cannot go below 0 whereas the CNN head is linear -
# confirm this suits the range of the "Formality" labels.
out = layers.Dense(1,activation='relu')(x)

rnn = keras.Model(int_sequences_input,out)
# A Keras model must be compiled before fit()/evaluate() can be called on it.
# Same optimiser, loss and metrics as the CNN so the results are comparable.
rnn.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='mean_squared_error',
    metrics=[
        tf.keras.metrics.MeanAbsoluteError(),
        tf.keras.metrics.MeanAbsolutePercentageError(),
    ],
)

In [7]:
# Data setup
# Put every sentence in its own single-element row, then turn the rows into
# integer token-id arrays with the fitted vectoriser.
train_sentence_rows = np.array([[sentence] for sentence in train_samples])
val_sentence_rows = np.array([[sentence] for sentence in test_samples])
x_train = vectoriser(train_sentence_rows).numpy()
x_val = vectoriser(val_sentence_rows).numpy()

# Regression targets for the two partitions.
y_train, y_val = np.array(train_labels), np.array(test_labels)

# Empty table; the training cells add one column of evaluation scores per configuration.
results = pd.DataFrame()

In [8]:
# Try CNN with batch sizes 10,32 and epochs 50,75 and 100
# Each configuration is trained on a fresh copy of the CNN. Calling fit()
# repeatedly on one model would continue from the previous run's weights, so
# e.g. the "E75 B10" column would really reflect 100 + 100 + 75 epochs.
for epochs in (100, 75, 50):
    for batch_size in (10, 32):
        # clone_model rebuilds the architecture with newly initialised weights.
        candidate = keras.models.clone_model(cnn)
        candidate.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss='mean_squared_error',
            metrics=[
                tf.keras.metrics.MeanAbsoluteError(),
                tf.keras.metrics.MeanAbsolutePercentageError(),
            ],
        )
        candidate.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs)
        # evaluate() returns [loss, MAE, MAPE] measured on the held-out set.
        results[f"CNN E{epochs} B{batch_size}"] = candidate.evaluate(x=x_val, y=y_val, verbose=0)

# Keep `cnn` bound to a trained model (the last configuration), as it was before.
# Re-running this cell is still safe because cloning re-initialises the weights.
cnn = candidate

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

In [None]:
# Display the evaluation table collected so far (one column per trained configuration).
results

In [None]:
# Try LSTM with batch sizes 10,32 and epochs 50,75 and 100
# Each configuration is trained on a fresh, compiled copy of the LSTM and the
# scores come from that same copy. Evaluating `cnn` here would store CNN scores
# under LSTM column names, and re-fitting one model would carry weights over
# from one configuration to the next.
for epochs in (100, 75, 50):
    for batch_size in (10, 32):
        # clone_model rebuilds the architecture with newly initialised weights.
        candidate = keras.models.clone_model(lstm)
        # The clone must be compiled before fit()/evaluate(); same settings as the CNN.
        candidate.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss='mean_squared_error',
            metrics=[
                tf.keras.metrics.MeanAbsoluteError(),
                tf.keras.metrics.MeanAbsolutePercentageError(),
            ],
        )
        candidate.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs)
        # evaluate() returns [loss, MAE, MAPE] measured on the held-out set.
        results[f"LSTM E{epochs} B{batch_size}"] = candidate.evaluate(x=x_val, y=y_val, verbose=0)

# Keep `lstm` bound to a trained model (the last configuration).
# Re-running this cell is still safe because cloning re-initialises the weights.
lstm = candidate

In [None]:
# Display the evaluation table collected so far (one column per trained configuration).
results

In [None]:
# Try RNN with batch sizes 10,32 and epochs 50,75 and 100
# Each configuration is trained on a fresh, compiled copy of the RNN and the
# scores come from that same copy. Evaluating `cnn` here would store CNN scores
# under RNN column names, and re-fitting one model would carry weights over
# from one configuration to the next.
for epochs in (100, 75, 50):
    for batch_size in (10, 32):
        # clone_model rebuilds the architecture with newly initialised weights.
        candidate = keras.models.clone_model(rnn)
        # The clone must be compiled before fit()/evaluate(); same settings as the CNN.
        candidate.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss='mean_squared_error',
            metrics=[
                tf.keras.metrics.MeanAbsoluteError(),
                tf.keras.metrics.MeanAbsolutePercentageError(),
            ],
        )
        candidate.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs)
        # evaluate() returns [loss, MAE, MAPE] measured on the held-out set.
        results[f"RNN E{epochs} B{batch_size}"] = candidate.evaluate(x=x_val, y=y_val, verbose=0)

# Keep `rnn` bound to a trained model (the last configuration).
# Re-running this cell is still safe because cloning re-initialises the weights.
rnn = candidate

In [None]:
# Display the evaluation table collected so far (one column per trained configuration).
results