In [1]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, LabelAccuracyEvaluator
from sentence_transformers.readers import InputExample

import pandas as pd
import numpy as np
import torch
import math

In [2]:
# Prefer the first CUDA GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [3]:
# Raw competition files, read from the working directory.
trainRaw = pd.read_csv("train.csv")
testRaw = pd.read_csv("test.csv")

# Encoder stack: transformer token embeddings -> pooling -> 256-d Tanh projection.
# NOTE(review): the data preview further down shows French and Thai rows while
# this checkpoint is 'bert-base-uncased' -- confirm an English-only model is intended.
embed_model = models.Transformer('bert-base-uncased', max_seq_length=256)
word_dim = embed_model.get_word_embedding_dimension()
pool_model = models.Pooling(word_dim)
sentence_dim = pool_model.get_sentence_embedding_dimension()
dense_model = models.Dense(
    in_features=sentence_dim,
    out_features=256,
    activation_function=torch.nn.Tanh(),
)
model = SentenceTransformer(modules=[embed_model, pool_model, dense_model])

# Training hyperparameters read by the DataLoader, warm-up and fit cells.
epochs = 16
batch_size = 16

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Cast the label column to float and recode class 2 as -1.0; classes 0 and 1
# keep their numeric value (0.0 and 1.0).
# NOTE(review): these values are later used as cosine-similarity targets by
# CosineSimilarityLoss. If the dataset encodes 0=entailment, 1=neutral,
# 2=contradiction, entailment is being pushed towards 0 and neutral towards 1,
# which looks swapped -- confirm the label encoding before trusting the score.
trainRaw = trainRaw.assign(
    label=trainRaw['label'].astype('float').replace(2.0, -1.0)
)
trainRaw.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0.0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,-1.0
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0.0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0.0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1.0


In [5]:
# Shuffle once with a fixed seed so the 60/20/20 train/validation/test split
# (and every metric computed from it) is reproducible after Restart & Run All.
SPLIT_SEED = 42
shuffled = trainRaw.sample(frac=1, random_state=SPLIT_SEED)

# Cut points at 60% and 80% of the shuffled rows; plain positional slices give
# the same three partitions as np.split without routing a DataFrame through NumPy.
train_end = int(.6 * len(shuffled))
valid_end = int(.8 * len(shuffled))
training = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:valid_end]
testing = shuffled.iloc[valid_end:]

In [6]:
# Relative label frequencies for each split, shown as a 3-tuple in the order
# (training, validation, testing) to eyeball class balance.
tuple(
    part['label'].value_counts() / len(part)
    for part in (training, validation, testing)
)

( 0.0    0.345435
 -1.0    0.329345
  1.0    0.325220
 Name: label, dtype: float64,
  0.0    0.363036
 -1.0    0.329208
  1.0    0.307756
 Name: label, dtype: float64,
 -1.0    0.359323
  0.0    0.323432
  1.0    0.317244
 Name: label, dtype: float64)

In [7]:
def _to_examples(frame):
    """Wrap every row of `frame` in an InputExample.

    Each example carries the row's 'premise' and 'hypothesis' as its two texts
    and the row's 'label' as its label.
    """
    examples = []
    for _, row in frame.iterrows():
        pair = [row['premise'], row['hypothesis']]
        examples.append(InputExample(texts=pair, label=row['label']))
    return examples


# One example list per split.
training_data = _to_examples(training)
validation_data = _to_examples(validation)
testing_data = _to_examples(testing)

# Dataset wrapper around the training examples, bound to the model.
train_dataset = SentencesDataset(training_data, model)

In [8]:
# Shuffled mini-batches over the raw training examples.
# NOTE(review): the loader is fed `training_data`, not the `train_dataset`
# built in the previous cell -- confirm which one is intended.
train_dataloader = DataLoader(
    training_data,
    shuffle=True,
    batch_size=batch_size,
)

# Training objective: cosine-similarity loss attached to the model.
train_loss = losses.CosineSimilarityLoss(model)

# Evaluator built from the validation examples; passed to fit() further down.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    validation_data,
    name='validation_data',
)

In [9]:
# Warm up over the first 10% of all optimisation steps (batches per epoch x epochs),
# rounded up to a whole step.
total_steps = len(train_dataloader) * epochs
warmup_steps = math.ceil(total_steps * 0.1)

In [10]:
# Move the model onto the device chosen in the device cell.
model = model.to(device)

In [11]:
# Train with the cosine-similarity objective, running the validation evaluator
# every 1000 steps and writing to ./bert_base_uncased.
# NOTE(review): the saved progress output below shows 5 epochs while `epochs`
# is set to 16 in the config cell -- the output looks stale; re-run to refresh.
train_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=train_objectives,
    evaluator=evaluator,
    epochs=epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path='./bert_base_uncased',
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/455 [00:00<?, ?it/s]

Iteration:   0%|          | 0/455 [00:00<?, ?it/s]

Iteration:   0%|          | 0/455 [00:00<?, ?it/s]

Iteration:   0%|          | 0/455 [00:00<?, ?it/s]

Iteration:   0%|          | 0/455 [00:00<?, ?it/s]

In [13]:
# Score the in-memory model on the held-out testing examples; the evaluator's
# return value is the cell output.
# NOTE(review): this evaluates whatever weights `model` holds after fit(),
# which may differ from the checkpoint written to ./bert_base_uncased -- confirm
# which one should be reported.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    testing_data,
    name='testing_data',
)
test_evaluator(model, output_path='./bert_base_uncased')

0.25154581389620273