In [50]:
# Test notebook for semantic sentence comparison.
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
from datetime import datetime
import numpy as np
import pandas as pd
import math
import gzip
import csv

In [51]:
# Run configuration: base checkpoint, batch size, epoch count and output directory.
model_name = 'all-roberta-large-v1'
train_batch_size = 16
num_epochs = 1

# The output directory name ends with a timestamp taken when this cell runs,
# so each execution of the notebook writes to a fresh location.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = '-'.join(('continuedLearning' + model_name, run_stamp))

In [52]:
# Instantiate the sentence-embedding model named in the parameter cell.
model = SentenceTransformer(model_name_or_path=model_name)

In [53]:
# Load data and make train/dev/test split
dataset = 'mrpcDataset'
trainPath = dataset + '/train.csv'
devPath = dataset + '/dev.csv'
testPath = dataset + '/test.csv'


def load_examples(path):
    """Read one CSV split into a list of InputExample sentence pairs.

    Args:
        path: Path to a CSV file whose header row contains the columns
            'sentence1', 'sentence2' and 'label'.

    Returns:
        A list with one InputExample per data row. Each example carries the
        two sentences as `texts` and the label converted to float.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if one of the required columns is missing.
        ValueError: if a label is not a valid float literal.
    """
    # The context manager closes the file handle as soon as the split has been
    # read (the previous bare open() calls were never closed). newline='' is
    # what the csv module asks for so that quoted fields parse correctly.
    with open(path, newline='') as csv_file:
        return [
            InputExample(texts=[row['sentence1'], row['sentence2']], label=float(row['label']))
            for row in csv.DictReader(csv_file)
        ]


# One list of InputExample objects per split.
train = load_examples(trainPath)
dev = load_examples(devPath)
test = load_examples(testPath)

In [54]:
# Setup Train set
# shuffle=True re-orders the training examples every epoch; without it the
# model always sees the pairs in the order they appear in train.csv.
trainDataloader = DataLoader(train, shuffle=True, batch_size=train_batch_size)

# Loss computed on the cosine similarity of the two sentence embeddings
# against the float label of each InputExample.
train_loss = losses.CosineSimilarityLoss(model=model)

In [55]:
# Build the evaluator that scores the model on the dev split during training.
# NOTE(review): labels are parsed as floats from the 'label' column; if they are
# binary paraphrase flags, a classification-style evaluator may be a better fit — confirm.
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(dev, name='mrpc-dev')

In [56]:
# Fine-tune the model on the training split.

# Warm-up covers 10% of all optimisation steps (batches per epoch * epochs), rounded up.
total_training_steps = len(trainDataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)

# NOTE(review): evaluation_steps=1000 is larger than the 230 iterations shown in
# this run's progress bar, so presumably the dev evaluator only fires at the end
# of the epoch — confirm against the fit() documentation.
objectives = [(trainDataloader, train_loss)]
model.fit(
    train_objectives=objectives,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/230 [00:00<?, ?it/s]

In [57]:
# Reload the model that training wrote to model_save_path and score it on the test split.
model = SentenceTransformer(model_save_path)

test_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    test,
    name='mrpc-test',
)

# Bare call as the last expression so the notebook displays the returned score;
# output_path is passed so the evaluator can write its results next to the model.
test_evaluator(model, output_path=model_save_path)

0.6147395224559882