# Setup

to learn:<br/>
warmup steps

install

In [1]:
pip install -q sentence_transformers

[K     |████████████████████████████████| 79 kB 3.6 MB/s 
[K     |████████████████████████████████| 4.2 MB 31.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 40.3 MB/s 
[K     |████████████████████████████████| 84 kB 3.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 33.5 MB/s 
[K     |████████████████████████████████| 596 kB 28.4 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


imports

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation
from torch.utils.data import DataLoader
import numpy as np
import random
import torch
import math
import os
from datetime import datetime
import json

settings

In [3]:
RANDOM_STATE = 42
DATASET_FOLDER_PATH = 'drive/MyDrive/Colab Notebooks/nn/dataset/WikiQACorpus/WikiQACorpus'
MODEL_FOLDER_PATH = 'drive/MyDrive/Colab Notebooks/nn/models'
EVALUATION_FOLDER_PATH = 'drive/MyDrive/Colab Notebooks/nn/files/evaluations'

In [4]:
torch.manual_seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

utility functions

In [5]:
def parse_data(df: pd.DataFrame) -> tuple:
    questions = df['Question'].tolist()
    answers = df['Sentence'].tolist()
    labels = df['Label'].tolist()
    return questions, answers, [float(l) for l in labels]

def get_data_loader(model: SentenceTransformer, train_batch_size: int,
                    questions: list, answers: list, labels: list, shuffle: bool = True) -> DataLoader:
    examples = [InputExample(texts=[q, a], label=float(l)) 
                for q, a, l in zip(questions, answers, labels)]
    dataset = SentencesDataset(examples, model)
    return DataLoader(dataset, shuffle=shuffle, batch_size=train_batch_size)

def get_data_loader(model: SentenceTransformer, train_batch_size: int,
                    df: pd.DataFrame, shuffle: bool = True) -> DataLoader:
    questions, answers, labels = parse_data(df)
    examples = [InputExample(texts=[q, a], label=float(l)) 
                for q, a, l in zip(questions, answers, labels)]
    dataset = SentencesDataset(examples, model)
    return DataLoader(dataset, shuffle=shuffle, batch_size=train_batch_size)

def write_json(path: str, data):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

load dataset

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
train_dataset_path = os.path.join(DATASET_FOLDER_PATH, 'WikiQA-train.tsv') # drive path
dev_dataset_path = os.path.join(DATASET_FOLDER_PATH, 'WikiQA-dev.tsv')
#train_dataset_path = 'C:\\some_folder' # local path

In [8]:
df_train = pd.read_csv(train_dataset_path, delimiter='\t')
df_dev = pd.read_csv(dev_dataset_path, delimiter='\t')

In [9]:
df_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed?,D1,Glacier cave,D1-0,A partly submerged glacier cave on Perito More...,0
1,Q1,how are glacier caves formed?,D1,Glacier cave,D1-1,The ice facade is approximately 60 m high,0
2,Q1,how are glacier caves formed?,D1,Glacier cave,D1-2,Ice formations in the Titlis glacier cave,0
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed?,D1,Glacier cave,D1-4,"Glacier caves are often called ice caves , but...",0


prepare model

training settings

In [10]:
SETTINGS = {
    'MODEL_NAME': 'all-mpnet-base-v2',
    # MODEL_NAME = 'multi-qa-mpnet-base-dot-v1'
    'TRAIN_BATCH_SIZE': 16,
    'NO_EPOCHS': 3,
    'WARMUP_STEPS_SCALE_FACTOR': 0.1,  # 10% of train data for warm-up
    'NO_EVALUATION_STEPS': 100
}

In [11]:
questions_dev, answers_dev, labels_dev = parse_data(df_dev)
evaluator_dev = evaluation.EmbeddingSimilarityEvaluator(questions_dev, answers_dev, labels_dev)

model = SentenceTransformer(SETTINGS['MODEL_NAME'])
train_dataloader = get_data_loader(model, SETTINGS['TRAIN_BATCH_SIZE'], df_train)
train_loss = losses.CosineSimilarityLoss(model=model)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [12]:
folder_name = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
model_save_path = os.path.join(MODEL_FOLDER_PATH, folder_name)
os.mkdir(model_save_path)

for i in range(SETTINGS['NO_EPOCHS']):
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        warmup_steps=math.ceil(len(train_dataloader) * SETTINGS['NO_EPOCHS'] * SETTINGS['WARMUP_STEPS_SCALE_FACTOR']),
        evaluator=evaluator_dev,
        evaluation_steps=SETTINGS['NO_EVALUATION_STEPS'],
        output_path=model_save_path,
        show_progress_bar=True,
        save_best_model=True)

write_json(os.path.join(model_save_path, 'settings.json'), SETTINGS)



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1273 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1273 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1273 [00:00<?, ?it/s]

In [16]:
loaded_model = SentenceTransformer(model_save_path)
evaluator_dev = evaluation.EmbeddingSimilarityEvaluator(questions_dev, answers_dev, labels_dev, write_csv=False)
evaluator_dev(loaded_model)

0.32250496000681234

In [18]:
old_model_save_path = os.path.join(MODEL_FOLDER_PATH, '2022_05_15_18_45_59')
old_loaded_model = SentenceTransformer(old_model_save_path)
evaluator_dev(old_loaded_model)

0.3125354811345472