In [None]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy import stats
import datetime

In [None]:
# Maps each genre name to the spreadsheet holding its tokenized sentences.
# NOTE(review): 'news' previously pointed at 'tokenized_email.xlsx' (a copy of
# the 'emails' entry), so the "news" run silently reused the email data. The
# filename below follows the naming pattern of the other entries — confirm it
# matches the file on disk.
data_dict = {'answers': 'tokenized_answers.xlsx',
             'blogs': 'tokenized_blog.xlsx',
             'emails': 'tokenized_email.xlsx',
             'news': 'tokenized_news.xlsx'}
# Folder (relative to the notebook) that contains the spreadsheets above.
data_folder = '../data/formality'

In [None]:
# Relative paths handed to hub.KerasLayer in build_classifier_model:
# one for the BERT encoder, one for the text-preprocessing model.
tfhub_handle_encoder = '../models/bert_en_cased_L-12_H-768_A-12_3'
tfhub_handle_preprocess = '../models/bert_en_cased_preprocess_3'

In [None]:
def build_classifier_model():
    """Assemble the BERT-based Keras model that maps raw text to one score.

    A scalar string input is run through the TF-Hub preprocessing layer and
    the trainable BERT encoder. The encoder's ``pooled_output`` then passes
    through six (Dropout 0.1 -> linear Dense) pairs of shrinking width, the
    last of which is a single unit named ``classifier``.

    Returns:
        tf.keras.Model: uncompiled model from the text input to the final
        one-unit output.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    preprocess = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    bert_inputs = preprocess(text_input)

    bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    head = bert(bert_inputs)['pooled_output']

    # (units, layer name) for every Dense layer of the head, in order.
    dense_spec = (
        (384, 'dense_1'),
        (192, 'dense_2'),
        (96, 'dense_3'),
        (48, 'dense_4'),
        (32, 'dense_5'),
        (1, 'classifier'),
    )
    for units, layer_name in dense_spec:
        # Each Dense layer is preceded by its own dropout layer.
        head = tf.keras.layers.Dropout(0.1)(head)
        head = tf.keras.layers.Dense(units, activation='linear', name=layer_name)(head)

    return tf.keras.Model(text_input, head)

In [None]:
# Wall-clock timestamp captured when this cell runs, printed for reference.
start_time = datetime.datetime.now()
print(f"Training start time: {start_time}")

In [None]:
# Training configuration shared by every genre. These values never change
# between iterations, so they are defined once instead of inside the loop.
epochs = 30
batch = 32
lr = 3e-5
split_seed = 11  # random_state used for both train_test_split calls


def _evaluate_split(model, features, targets):
    """Score ``model`` on one data split.

    Args:
        model: fitted Keras model; its predictions are flattened to 1-D
            before scoring.
        features: input array handed to ``model.predict``.
        targets: 1-D array of reference scores aligned with ``features``.

    Returns:
        tuple: ``(mse, spearman_rho)``.
    """
    predictions = model.predict(features).ravel()
    mse = mean_squared_error(targets, predictions)
    # spearmanr returns a (correlation, p-value) result; keep only the
    # coefficient so the results table stores a number rather than the
    # repr of the whole result object.
    rho, _ = stats.spearmanr(targets, predictions)
    return mse, rho


results = []
genres = ['answers', 'blogs', 'emails', 'news']
for genre in genres:
    # A fresh BERT model is built for every genre; reset Keras' global state
    # first so state from the previous iteration does not pile up in memory.
    tf.keras.backend.clear_session()

    data_file_path = os.path.join(data_folder, data_dict[genre])
    data_df = pd.read_excel(data_file_path)
    X_df = data_df[['text']]
    y_df = data_df['score']

    # 80/20 train/test split, then 80/20 of the training part again to carve
    # out a validation set (64% / 16% / 20% overall).
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
        X_df, y_df, test_size=0.20, random_state=split_seed)
    X_train_df, X_val_df, y_train_df, y_val_df = train_test_split(
        X_train_df, y_train_df, test_size=0.20, random_state=split_seed)

    X_train_np, y_train_np = X_train_df.values, y_train_df.values
    X_val_np, y_val_np = X_val_df.values, y_val_df.values
    X_test_np, y_test_np = X_test_df.values, y_test_df.values

    model = build_classifier_model()
    model.compile(optimizer=tf.keras.optimizers.Adam(lr),
                  loss=tf.keras.losses.MeanSquaredError())

    # Stop once val_loss has not improved for 5 epochs and roll back to the
    # best weights seen so far.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=5,
                                                      restore_best_weights=True)
    history = model.fit(x=X_train_np,
                        y=y_train_np,
                        batch_size=batch,
                        validation_data=(X_val_np, y_val_np),
                        epochs=epochs,
                        callbacks=[early_stopping])

    train_mse, train_spearman_r = _evaluate_split(model, X_train_np, y_train_np)
    val_mse, val_spearman_r = _evaluate_split(model, X_val_np, y_val_np)
    test_mse, test_spearman_r = _evaluate_split(model, X_test_np, y_test_np)

    # One row per genre, in the order: train, validation, test.
    results.append([train_mse, train_spearman_r,
                    val_mse, val_spearman_r,
                    test_mse, test_spearman_r])

In [None]:
# Wall-clock timestamp captured when this cell runs, printed for reference.
end_time = datetime.datetime.now()
print(f"Training end time: {end_time}")

In [None]:
# Collect the per-genre metrics into a table: one row per genre (index) and
# one column per metric, then write it to CSV.
columns = ['train_mse', 'train_spearman_r', 'val_mse', 'val_spearman_r', 'test_mse', 'test_spearman_r']
results_df = pd.DataFrame(results, index=genres, columns=columns)
results_path = '../results/formality/results_4_genres_bert_linear_1.csv'
# Create the output folder first: to_csv fails on a missing directory, which
# would throw away the metrics of the whole training run.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_csv(results_path)