Combine training samples from TDT and XED, but keep their labels separate. This does not seem to benefit either data set in evaluation.

In [1]:
# Set the file paths here
# All four files are read with load_fields(), which splits every line on tabs.
# TDT sentiment data: the training file and the evaluation (dev) file.
tdt_train_fn = '/content/tdt-sentiment-151020-train-clean.tsv'
tdt_eval_fn = '/content/tdt-sentiment-151020-dev.tsv'
# XED data: sentences annotated with labels, and a separate file of neutral sentences.
xed_nonneutrals_fn = '/content/fi-annotated.tsv'
xed_neutrals_fn = '/content/neu_fi.txt'

In [None]:
!pip install transformers

In [4]:
# Choose model and set up input

from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast
import tensorflow as tf
import random

def transpose(l):
  """Swap the rows and columns of a list of sequences.

  Returns one list per column. The rows are combined with zip(), so the
  result only has as many columns as the shortest row.
  """
  return list(map(list, zip(*l)))

def load_fields(fn, encoding=None):
  """Read a tab-separated file and return its contents column by column.

  Args:
    fn: path of the file to read.
    encoding: text encoding passed to open(); None keeps the platform default.

  Returns:
    A list with one list per column (see transpose()). Because the columns are
    built with zip(), a row with fewer fields than the others shortens the
    result to that row's field count.
  """
  # The context manager closes the file handle as soon as the rows are read.
  with open(fn, encoding=encoding) as f:
    rows = [line.rstrip('\n').split('\t') for line in f]
  return transpose(rows)

def label_vector(num_labels, labels):
  """Turn lists of label indices into multi-hot vectors.

  Args:
    num_labels: length of every output vector.
    labels: iterable of index collections, one collection per sample.

  Returns:
    A list with one list of 0/1 values per sample, where position i is 1
    when index i occurs in that sample's collection.
  """
  def encode(active):
    row = [0] * num_labels
    for idx in active:
      row[idx] = 1
    return row

  return [encode(active) for active in labels]

# TDT: fields 1 and 2 of every row are the sentence and its sentiment label.
tdt_train_texts, tdt_train_labels_raw = load_fields(tdt_train_fn)[1:3]
# TDT labels use indices 9-12 of the shared label vector; indices 0-8 belong to XED.
tdt_label_dictionary = {'positive': 9, 'negative': 10, 'other': 11, 'neutral': 12}
tdt_train_labels = [[tdt_label_dictionary[l]] for l in tdt_train_labels_raw]

tdt_eval_texts, tdt_eval_labels_raw = load_fields(tdt_eval_fn)[1:3]
tdt_eval_labels = [[tdt_label_dictionary[l]] for l in tdt_eval_labels_raw]

# XED neutral sentences: field 1 is the text; every sentence gets label index 8.
neutral_texts = load_fields(xed_neutrals_fn)[1]
# One fresh list per sentence, so the rows do not all alias the same object.
neutral_labels = [[8] for _ in neutral_texts]

# XED annotated sentences: field 0 is the text, field 1 a comma-separated list of labels.
nonneutral_texts, labels_raw = load_fields(xed_nonneutrals_fn)[:2]
# Raw label 8 is moved to index 0 because index 8 is used for the neutral sentences.
# The remap compares parsed integers, so a multi-digit label containing an 8
# would not be rewritten by accident.
nonneutral_labels = [[0 if int(s) == 8 else int(s) for s in l.split(',')] for l in labels_raw]

xed_texts = neutral_texts + nonneutral_texts
xed_labels = neutral_labels + nonneutral_labels

# Size of the combined label vector: 9 XED labels followed by 4 TDT labels.
num_labels = 13

tdt_train_label_vectors = label_vector(num_labels, tdt_train_labels)
tdt_eval_label_vectors = label_vector(num_labels, tdt_eval_labels)
xed_label_vectors = label_vector(num_labels, xed_labels)

# XED has no separate evaluation file here, so 10% of it is held out.
# A fixed seed keeps the held-out sentences the same on every run.
split_seed = 42
xed_train_texts, xed_eval_texts, xed_train_label_vectors, xed_eval_label_vectors = train_test_split(
    xed_texts, xed_label_vectors, test_size=0.1, random_state=split_seed)

# Combined sets: TDT samples first, XED samples after them.
train_texts = tdt_train_texts + xed_train_texts
train_label_vectors = tf.constant(tdt_train_label_vectors + xed_train_label_vectors, dtype='float32')

eval_texts = tdt_eval_texts + xed_eval_texts
eval_label_vectors = tf.constant(tdt_eval_label_vectors + xed_eval_label_vectors, dtype='float32')

# Pretrained checkpoint to fine-tune; the cased variant is kept for easy switching.
#model_name = "TurkuNLP/bert-base-finnish-cased-v1"
model_name = "TurkuNLP/bert-base-finnish-uncased-v1"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Maximum number of tokens per sentence; longer inputs are truncated.
input_size = 128

# Same tokenizer settings for the training and the evaluation texts.
tokenizer_options = dict(truncation=True, padding='longest', max_length=input_size)
train_encodings = tokenizer(train_texts, **tokenizer_options)
eval_encodings = tokenizer(eval_texts, **tokenizer_options)

In [5]:
# Set up training
from transformers import TFBertForSequenceClassification, optimization_tf

# Learning rate passed as init_lr to the optimizer factory in train().
init_lr = 2e-5

# Number of epochs passed to model.fit() and used to size the learning-rate schedule.
epochs = 2
# Batch size passed to model.fit() in train().
batch_size_train = 16
# NOTE(review): not referenced in any of the visible cells -- confirm whether it is still needed.
batch_size_eval = 16

def train(model, t, train_labels, eval):
  """Compile `model` with a warmup/decay optimizer and fit it on the training data.

  Reads the module-level settings `init_lr`, `epochs` and `batch_size_train`.

  Args:
    model: the Keras model to train (callers pass a TFBertForSequenceClassification).
    t: model inputs, passed to `model.fit` as `x`.
    train_labels: training targets; `len(train_labels)` is taken as the sample count.
    eval: validation data handed to `model.fit` unchanged (callers pass an
      `(inputs, labels)` tuple). The name shadows the builtin but is kept so
      existing callers keep working.

  Returns:
    The same `model` object, after fitting.
  """
  size_train = len(train_labels)
  # fit() also runs the final, partial batch of every epoch, so round up;
  # flooring would make the schedule shorter than the real number of steps.
  steps_per_epoch = (size_train + batch_size_train - 1) // batch_size_train
  steps_train = steps_per_epoch * epochs
  # Warm up over roughly the first 10% of the training steps.
  steps_warmup = int(epochs * size_train * 0.1 / batch_size_train)
  optimizer, _ = optimization_tf.create_optimizer(init_lr=init_lr,
                                                  num_train_steps=steps_train,
                                                  num_warmup_steps=steps_warmup,
                                                  weight_decay_rate=0.01)
  # Sigmoid cross-entropy on the raw outputs scores every label independently (multi-label).
  model.compile(optimizer=optimizer, loss=tf.nn.sigmoid_cross_entropy_with_logits, metrics=[])
  model.fit(t,
            train_labels,
            validation_data=eval,
            batch_size=batch_size_train,
            epochs=epochs)
  return model

In [6]:
# Evaluate
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np

def train_evaluate(train_x, train_y, eval_x, eval_y, num_labels, run_count):
  """Fine-tune a fresh model `run_count` times and collect its evaluation outputs.

  Every run loads `model_name` from scratch, trains it with train() using
  `(eval_x, eval_y)` as validation data, and predicts on `eval_x`.

  Returns:
    A list with one entry per run: the first element of the model's
    prediction output (callers apply a sigmoid to it).
  """
  validation = (eval_x, eval_y)
  outputs_per_run = []
  for _run in range(run_count):
    model = train(
        TFBertForSequenceClassification.from_pretrained(model_name,
                                                        num_labels=num_labels),
        train_x,
        train_y,
        validation)
    outputs_per_run.append(model.predict(eval_x)[0])
  return outputs_per_run

def format_floats(l):
  """Render numbers with four decimals, separated by a comma and a space."""
  return ', '.join(map('{:.4f}'.format, l))

def print_results(train_x, train_y, eval_x, eval_y, num_labels, run_count):
  """Train `run_count` models and print per-run and averaged evaluation metrics.

  A label counts as predicted when the sigmoid of its output is at least 0.5.
  All metrics are computed against `eval_y`; earlier they were computed against
  the global `eval_label_vectors`, so the argument was silently ignored.
  The per-label report is printed for the run with the highest accuracy.

  Args:
    train_x, train_y: training inputs and targets, forwarded to train_evaluate().
    eval_x, eval_y: evaluation inputs and gold label vectors.
    num_labels: number of output labels of the model. The label names in the
      report are fixed to the 13 labels used in this notebook.
    run_count: number of independent training runs.
  """
  runs = train_evaluate(train_x, train_y, eval_x, eval_y, num_labels, run_count)
  preds = [(tf.math.sigmoid(r) >= 0.5).numpy().tolist() for r in runs]
  print(f"Model: {model_name}, initial learning rate = {init_lr}, input size = {input_size}, batch size = {batch_size_train}, epochs = {epochs}")
  # Count the sentences for which no label passed the threshold.
  unpredicted_formatted = [f"{sum(1 not in v for v in p)} out of {len(p)}" for p in preds]
  print(f"Number of sentences with no predicted labels: {unpredicted_formatted}")
  accuracy = [accuracy_score(eval_y, p) for p in preds]
  weighted_f1 = [f1_score(eval_y, p, average='weighted') for p in preds]
  print(f"Accuracy: {format_floats(accuracy)}")
  print(f"Weighted F-score: {format_floats(weighted_f1)}")
  print(f'Average accuracy: {np.mean(accuracy):.4f}, stdev: {np.std(accuracy):.4f}')
  print(f'Average weighted F-score: {np.mean(weighted_f1):.4f}, stdev: {np.std(weighted_f1):.4f}')
  # Detailed report for the best run only.
  max_i = accuracy.index(max(accuracy))
  print(classification_report(eval_y, preds[max_i], target_names=['H:trust', 'H:anger', 'H:anticipation', 'H:disgust', 'H:fear', 'H:joy', 'H:sadness', 'H:surprise', 'H:neutral', 'T:positive', 'T:negative', 'T:other', 'T:neutral'], digits=4))

In [None]:
# Evaluate with no source label in sentences

# The three tokenizer outputs handed to the model, in this order.
input_keys = ('input_ids', 'attention_mask', 'token_type_ids')

t = [tf.constant(train_encodings.data[key]) for key in input_keys]
e = [tf.constant(eval_encodings.data[key]) for key in input_keys]

# Three independent training runs on the combined TDT+XED data.
print_results(t, train_label_vectors, e, eval_label_vectors, num_labels, 3)

Some layers from the model checkpoint at TurkuNLP/bert-base-finnish-uncased-v1 were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-uncased-v1 and are newly initialized: ['classifier', 'dropout_379']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some layers from the model ch

Model: TurkuNLP/bert-base-finnish-uncased-v1, initial learning rate = 2e-05, input size = 128, batch size = 16, epochs = 1
Number of sentences with no predicted labels: ['0 out of 3706', '0 out of 3706', '0 out of 3706']
Accuracy: 0.0000, 0.0000, 0.0000
Weighted F-score: 0.1755, 0.0819, 0.1409
Average accuracy: 0.0000, stdev: 0.0000
Average weighted F-score: 0.1327, stdev: 0.0387
                precision    recall  f1-score   support

       H:trust     0.0347    0.1129    0.0530       248
       H:anger     0.0304    0.0579    0.0399       311
H:anticipation     0.0586    0.8073    0.1094       218
     H:disgust     0.0204    0.0047    0.0076       215
        H:fear     0.0523    0.4054    0.0927       222
         H:joy     0.0244    0.0084    0.0125       239
     H:sadness     0.0560    0.9798    0.1060       198
    H:surprise     0.0000    0.0000    0.0000       185
     H:neutral     0.4003    0.2726    0.3244      1104
    T:positive     0.0274    0.8962    0.0532       106


In [None]:
# Evaluate with source label in sentences

from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np

# Prefix every sentence with a marker for the data set it came from:
# 'T:' for TDT sentences and 'H:' for XED sentences.
tdt_eval_texts_source = ['T:' + text for text in tdt_eval_texts]
xed_eval_texts_source = ['H:' + text for text in xed_eval_texts]

train_texts_source = ['T:' + text for text in tdt_train_texts] + ['H:' + text for text in xed_train_texts]
eval_texts_source = tdt_eval_texts_source + xed_eval_texts_source

encode_options = dict(truncation=True, padding='longest', max_length=input_size)

# NOTE: this rebinds train_encodings / eval_encodings from the earlier tokenization cell.
train_encodings = tokenizer(train_texts_source, **encode_options)
eval_encodings = tokenizer(eval_texts_source, **encode_options)

# Separate encodings so each data set can also be evaluated on its own.
tdt_eval_encodings = tokenizer(tdt_eval_texts_source, **encode_options)
xed_eval_encodings = tokenizer(xed_eval_texts_source, **encode_options)

# The three tokenizer outputs handed to the model, in this order.
model_input_keys = ('input_ids', 'attention_mask', 'token_type_ids')

t_source = [tf.constant(train_encodings.data[key]) for key in model_input_keys]
e_source = [tf.constant(eval_encodings.data[key]) for key in model_input_keys]
e_tdt = [tf.constant(tdt_eval_encodings.data[key]) for key in model_input_keys]
e_xed = [tf.constant(xed_eval_encodings.data[key]) for key in model_input_keys]

# Evaluation inputs and gold labels in matching order: TDT only, XED only, combined.
eval_xs = [e_tdt, e_xed, e_source]
eval_ys = [tf.constant(vectors, dtype='float32')
           for vectors in (tdt_eval_label_vectors, xed_eval_label_vectors)] + [eval_label_vectors]

# Train three independent models on the source-prefixed data. For each model,
# keep its outputs on every evaluation input in eval_xs.
runs = []
for i in range(3):
  bert = train(
      TFBertForSequenceClassification.from_pretrained(model_name,
                                                      num_labels=num_labels),
      t_source,
      train_label_vectors,
      (e_source, eval_label_vectors))
  runs.append([bert.predict(inputs)[0] for inputs in eval_xs])

# Regroup from [run][evaluation set] to [evaluation set][run].
runs = transpose(runs)

# Same helper as the format_floats defined in an earlier cell.
def format_floats(l):
  """Render numbers with four decimals, separated by a comma and a space."""
  return ', '.join(map('{:.4f}'.format, l))

def fallback_to_top_label(scores, labels):
  """Return `labels` unchanged unless none is set; then mark the top-scoring label(s).

  `scores` are the raw model outputs for one sentence and `labels` the
  thresholded predictions for the same sentence.
  """
  if 1 in labels:
    return labels
  top = max(scores)  # computed once per sentence instead of once per label
  return [score == top for score in scores]

print(f"Model: {model_name}, initial learning rate = {init_lr}, input size = {input_size}, batch size = {batch_size_train}, epochs = {epochs}")
# One report per evaluation set; `result` holds the outputs of all runs on that set.
for eval_y, result, name in zip(eval_ys, runs, ['TDT', 'XED', 'TDT+XED']):
  print(f'Results for evaluation on {name}')
  # A label counts as predicted when the sigmoid of its output is at least 0.5.
  preds_raw = [(tf.math.sigmoid(r) >= 0.5).numpy().tolist() for r in result]
  # Every sentence gets at least one label: fall back to the highest-scoring one.
  preds = [[fallback_to_top_label(r, p) for r, p in zip(run.tolist(), pred)] for run, pred in zip(result, preds_raw)]
  accuracy = [accuracy_score(eval_y, p) for p in preds]
  weighted_f1 = [f1_score(eval_y, p, average='weighted') for p in preds]
  print(f"Accuracy: {format_floats(accuracy)}")
  print(f"Weighted F-score: {format_floats(weighted_f1)}")
  print(f'Average accuracy: {np.mean(accuracy):.4f}, stdev: {np.std(accuracy):.4f}')
  print(f'Average weighted F-score: {np.mean(weighted_f1):.4f}, stdev: {np.std(weighted_f1):.4f}')
  # Detailed per-label report for the most accurate run only.
  max_i = accuracy.index(max(accuracy))
  print(classification_report(eval_y, preds[max_i], target_names=['H:trust', 'H:anger', 'H:anticipation', 'H:disgust', 'H:fear', 'H:joy', 'H:sadness', 'H:surprise', 'H:neutral', 'T:positive', 'T:negative', 'T:other', 'T:neutral'], digits=4))

Some layers from the model checkpoint at TurkuNLP/bert-base-finnish-uncased-v1 were not used when initializing TFBertForSequenceClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-uncased-v1 and are newly initialized: ['classifier', 'dropout_265']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2


Some layers from the model checkpoint at TurkuNLP/bert-base-finnish-uncased-v1 were not used when initializing TFBertForSequenceClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-uncased-v1 and are newly initialized: ['classifier', 'dropout_303']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2


Some layers from the model checkpoint at TurkuNLP/bert-base-finnish-uncased-v1 were not used when initializing TFBertForSequenceClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-uncased-v1 and are newly initialized: ['classifier', 'dropout_341']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2
Model: TurkuNLP/bert-base-finnish-uncased-v1, initial learning rate = 2e-05, input size = 128, batch size = 16, epochs = 2
Results for evaluation on TDT
Accuracy: 0.8772, 0.8755, 0.8679
Weighted F-score: 0.8745, 0.8703, 0.8648
Average accuracy: 0.8736, stdev: 0.0041
Average weighted F-score: 0.8699, stdev: 0.0040
                precision    recall  f1-score   support

       H:trust     0.0000    0.0000    0.0000         0
       H:anger     0.0000    0.0000    0.0000         0
H:anticipation     0.0000    0.0000    0.0000         0
     H:disgust     0.0000    0.0000    0.0000         0
        H:fear     0.0000    0.0000    0.0000         0
         H:joy     0.0000    0.0000    0.0000         0
     H:sadness     0.0000    0.0000    0.0000         0
    H:surprise     0.0000    0.0000    0.0000         0
     H:neutral     0.0000    0.0000    0.0000         0
    T:positive     0.7982    0.8585    0.8273       106
    T:negative     0.8021    0.6016    0.6875   

  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6333, 0.6400, 0.6303
Weighted F-score: 0.6294, 0.6331, 0.6280
Average accuracy: 0.6346, stdev: 0.0041
Average weighted F-score: 0.6302, stdev: 0.0022
                precision    recall  f1-score   support

       H:trust     0.5198    0.4163    0.4623       221
       H:anger     0.5238    0.3507    0.4201       345
H:anticipation     0.5463    0.2479    0.3410       238
     H:disgust     0.5419    0.4236    0.4755       229
        H:fear     0.4653    0.3300    0.3862       203
         H:joy     0.6332    0.5800    0.6054       250
     H:sadness     0.5000    0.4465    0.4717       215
    H:surprise     0.4940    0.1806    0.2645       227
     H:neutral     0.6907    0.7940    0.7387      1063
    T:positive     0.7706    0.7925    0.7814       106
    T:negative     0.7812    0.5859    0.6696       128
       T:other     0.7937    0.5882    0.6757        85
     T:neutral     0.9037    0.9582    0.9302       862

     micro avg     0.6931    0.6225    0.6559      4

In [None]:
# Detailed results for the combined TDT+XED evaluation set (third entry of runs and eval_ys).
result = runs[2]
# Bind the matching gold labels explicitly instead of relying on the value the
# loop variable eval_y was left with by the previous cell.
eval_y = eval_ys[2]

# A label counts as predicted when the sigmoid of its output is at least 0.5.
preds_raw = [(tf.math.sigmoid(r) >= 0.5).numpy().tolist() for r in result]
# Sentences without any predicted label fall back to their highest-scoring label.
preds = [[p if 1 in p else [e == max(r) for e in r] for r, p in zip(run.tolist(), pred)] for run, pred in zip(result, preds_raw)]
# NOTE(review): indexes and unpredicted_formatted are not read anywhere in the visible cells.
indexes = random.sample(range(len(preds_raw[0])), 10)
unpredicted_formatted = [f"{sum([1 if not 1 in v else 0 for v in p])} out of {len(p)}" for p in preds]
accuracy = [accuracy_score(eval_y, p) for p in preds]
weighted_f1 = [f1_score(eval_y, p, average='weighted') for p in preds]
print(f"Accuracy: {format_floats(accuracy)}")
print(f"Weighted F-score: {format_floats(weighted_f1)}")
print(f'Average accuracy: {np.mean(accuracy):.4f}, stdev: {np.std(accuracy):.4f}')
print(f'Average weighted F-score: {np.mean(weighted_f1):.4f}, stdev: {np.std(weighted_f1):.4f}')
# Detailed per-label report for the most accurate run only.
max_i = accuracy.index(max(accuracy))
tdt_label_list = ['T:positive', 'T:negative', 'T:other', 'T:neutral']
xed_label_list = ['H:trust', 'H:anger', 'H:anticipation', 'H:disgust', 'H:fear', 'H:joy', 'H:sadness', 'H:surprise', 'H:neutral']
print(classification_report(eval_y, preds[max_i], target_names=xed_label_list + tdt_label_list, digits=4))
# Micro-averaged F-score restricted to each data set's own label indices.
tdt_micro_f1 = [f1_score(eval_y, p, labels=[9,10,11,12], average='micro') for p in preds]
xed_micro_f1 = [f1_score(eval_y, p, labels=list(range(9)), average='micro') for p in preds]
print(f'Micro-average F-score for TDT: {format_floats(tdt_micro_f1)}')
print(f'Average: {np.mean(tdt_micro_f1):.4f}, stdev: {np.std(tdt_micro_f1):.4f}\n')
print(f'Micro-average F-score for XED: {format_floats(xed_micro_f1)}')
print(f'Average: {np.mean(xed_micro_f1):.4f}, stdev: {np.std(xed_micro_f1):.4f}')

[[1, 3, 7], [1, 3, 4], [1], [1], [1, 4]]
[3, 3, 1, 1, 2]
14449
3381
Accuracy: 0.6333, 0.6400, 0.6303
Weighted F-score: 0.6294, 0.6331, 0.6280
Average accuracy: 0.6346, stdev: 0.0041
Average weighted F-score: 0.6302, stdev: 0.0022
                precision    recall  f1-score   support

       H:trust     0.5198    0.4163    0.4623       221
       H:anger     0.5238    0.3507    0.4201       345
H:anticipation     0.5463    0.2479    0.3410       238
     H:disgust     0.5419    0.4236    0.4755       229
        H:fear     0.4653    0.3300    0.3862       203
         H:joy     0.6332    0.5800    0.6054       250
     H:sadness     0.5000    0.4465    0.4717       215
    H:surprise     0.4940    0.1806    0.2645       227
     H:neutral     0.6907    0.7940    0.7387      1063
    T:positive     0.7706    0.7925    0.7814       106
    T:negative     0.7812    0.5859    0.6696       128
       T:other     0.7937    0.5882    0.6757        85
     T:neutral     0.9037    0.9582    0.