In [1]:
# Input 
import pandas as pd
import numpy as np
import bert
import os
from sklearn.model_selection import train_test_split
import tensorflow
from bert import bert_tokenization
from tensorflow.keras.optimizers import SGD

max_seq_length = 256
adapter_size = 64

In [2]:
# Import data
data = pd.read_csv("../../data/mturk_experiment_2.csv",encoding='unicode_escape')
labels = data["Formality"]
samples = data["Sentence"]


train_samples, test_samples, train_labels,test_labels = train_test_split(samples, labels, test_size=0.2)

train_samples = np.array(train_samples)
test_samples = np.array(test_samples)
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [3]:
model_name = "uncased_L-12_H-768_A-12"
model_dir = bert.fetch_google_bert_model(model_name, ".models")
model_ckpt = os.path.join(model_dir, "bert_model.ckpt")

bert_params = bert.params_from_pretrained_ckpt(model_dir)
l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")
max_seq_len = 100
l_input_ids = tensorflow.keras.layers.Input(shape=(1,), dtype='int32')

model_dir + "\ vocab.txt"

Already  fetched:  uncased_L-12_H-768_A-12.zip
already unpacked at: .models\uncased_L-12_H-768_A-12


'.models\\uncased_L-12_H-768_A-12\\uncased_L-12_H-768_A-12\\ vocab.txt'

In [4]:
tokenizer = bert_tokenization.FullTokenizer(vocab_file=model_dir + "/vocab.txt")

tokenised_train_samples = []

for sample in train_samples:
    tokenised_train_samples.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample)))
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample))

max_length = max(map(len, tokenised_train_samples))
max_length = 5625

train_x = []

for sample in tokenised_train_samples:
    sample = np.array(sample)
    if len(sample) < max_length:
        values_to_add = max_length - len(sample)
        train_x.append(np.pad(sample,(0,values_to_add),'constant'))
    else:
        train_x.append(sample)

In [5]:
tokenizer = bert_tokenization.FullTokenizer(vocab_file=model_dir + "/vocab.txt")

tokenised_test_samples = []

for sample in test_samples:
    tokenised_test_samples.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample)))
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample))


max_length = max(map(len, tokenised_train_samples))
max_length = 1407

test_x = []

for sample in tokenised_test_samples:
    sample = np.array(sample)
    if len(sample) < max_length:
        values_to_add = max_length - len(sample)
        test_x.append(np.pad(sample,(0,values_to_add),'constant'))
    else:
        test_x.append(sample)

In [6]:
bertLayer = l_bert(l_input_ids)
x = tensorflow.keras.layers.Conv1D(128, 1, activation="relu")(bertLayer)
x = tensorflow.keras.layers.MaxPooling1D(1)(x)
x = tensorflow.keras.layers.Conv1D(128, 1, activation="relu")(x)
x = tensorflow.keras.layers.MaxPooling1D(1)(x)
x = tensorflow.keras.layers.Conv1D(128, 1, activation="relu")(x)
x = tensorflow.keras.layers.GlobalMaxPooling1D()(x)
x = tensorflow.keras.layers.Dense(128, activation="relu")(x)
x = tensorflow.keras.layers.Dropout(0.5)(x)
preds = tensorflow.keras.layers.Dense(units=1)(x)
model = tensorflow.keras.Model(l_input_ids, preds)
model.summary()


model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
bert (BertModelLayer)        (None, None, 768)         108890112 
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         98432     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 128)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         16512     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         16512 

In [7]:
model.compile(optimizer=SGD(learning_rate=0.01),loss='mean_absolute_error',metrics=[tensorflow.keras.metrics.MeanSquaredError(),tensorflow.keras.losses.MeanAbsoluteError(),tensorflow.keras.losses.MeanAbsolutePercentageError()])

In [8]:
model.fit(x=train_x,y=train_labels,batch_size=10,epochs=1)

Train on 5625 samples


<tensorflow.python.keras.callbacks.History at 0x19af9a61e48>

In [9]:
scores = model.evaluate(x=test_x,y=test_labels)
scores



[1.0947403808222989, 1.8591206, 1.0946693, 37.186615]