# Setup

to learn:<br/>
warmup steps<br/>
check if next(DataLoader) returns random batch or first in a (batched) dataset<br/>
f1 micro, macro, weighted - in depth understanding<br/>
f1 micro is same as accuracy for binary classification :/ - investigate why

install

In [3]:
pip install -q sentence_transformers

imports

In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
import numpy as np
from scipy import spatial
import random
import torch
import math
import os
from datetime import datetime
import json
from copy import deepcopy
from sentence_transformers.losses import TripletDistanceMetric
from utils import parse_data, get_data_loader, get_macro_f1_for_threshold, find_optimum_values, predict_similarity, get_classification_report, write_json, get_maximum_point
import warnings
warnings.filterwarnings("ignore")

settings

In [5]:
RANDOM_STATE = 42
DATASET_FOLDER_PATH = 'drive/MyDrive/Colab Notebooks/nn/dataset/WikiQACorpus/WikiQACorpus'
MODEL_FOLDER_PATH = 'drive/MyDrive/Colab Notebooks/nn/models'
EVALUATION_FOLDER_PATH = 'drive/MyDrive/Colab Notebooks/nn/files/evaluations'

In [6]:
torch.manual_seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Dataset

load dataset

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
train_dataset_path = os.path.join(DATASET_FOLDER_PATH, 'WikiQA-train.tsv') # drive path
dev_dataset_path = os.path.join(DATASET_FOLDER_PATH, 'WikiQA-dev.tsv')
#train_dataset_path = 'C:\\some_folder' # local path

In [9]:
df_train = pd.read_csv(train_dataset_path, delimiter='\t')
df_dev = pd.read_csv(dev_dataset_path, delimiter='\t')

In [None]:
df_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed?,D1,Glacier cave,D1-0,A partly submerged glacier cave on Perito More...,0
1,Q1,how are glacier caves formed?,D1,Glacier cave,D1-1,The ice facade is approximately 60 m high,0
2,Q1,how are glacier caves formed?,D1,Glacier cave,D1-2,Ice formations in the Titlis glacier cave,0
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed?,D1,Glacier cave,D1-4,"Glacier caves are often called ice caves , but...",0


# Train

## cosine loss

In [23]:
SETTINGS = {
    'MODEL_NAME': 'all-mpnet-base-v2',
    #'MODEL_NAME': 'multi-qa-mpnet-base-dot-v1',
    'TRAIN_BATCH_SIZE': 16,  # higher values will cause GPU failure
    'NO_EPOCHS': 10,
    'WARMUP_STEPS_SCALE_FACTOR': 0.1,  # 10% of train data for warm-up
    'EARLY_STOPPING': True
}

In [24]:
questions_train, answers_train, labels_train = parse_data(df_train)
questions_dev, answers_dev, labels_dev = parse_data(df_dev)

model = SentenceTransformer(SETTINGS['MODEL_NAME'])
train_dataloader = get_data_loader(model, SETTINGS['TRAIN_BATCH_SIZE'], df_train)
train_loss = losses.CosineSimilarityLoss(model=model)
#train_loss = losses.MultipleNegativesRankingLoss(model=model)

# use this function when dot product similarity is needed
def find_optimum_values2(model, questions, answers, labels, similarity='dot'):
    predicted_similarity = predict_similarity(model, questions, answers, similarity)

    classification_reports = []
    thresholds = [l for l in list(range(0, 100, 1))]
    for threshold in thresholds:
        classification_reports.append(get_classification_report(labels, threshold, predicted_similarity))

    return get_maximum_point(thresholds, classification_reports, 'macro avg', 'f1-score')


In [25]:
best_no_epochs = 0

for epoch in range(SETTINGS['NO_EPOCHS']):
    print('\n' + '='*100)
    print(f'Epoch >>> {epoch}')
    previous_model = deepcopy(model)

    if best_no_epochs == 0:
        #previous_threshold_dev, previous_f1_macro_dev = find_optimum_values2(previous_model, questions_dev, answers_dev, labels_dev)
        previous_threshold_dev, previous_f1_macro_dev = find_optimum_values(previous_model, questions_dev, answers_dev, labels_dev)
        #previous_f1_macro_train = get_macro_f1_for_threshold(previous_model, questions_train, answers_train, labels_train, previous_threshold_dev, similarity='dot')
        previous_f1_macro_train = get_macro_f1_for_threshold(previous_model, questions_train, answers_train, labels_train, previous_threshold_dev, similarity='cosine')
    
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        warmup_steps=math.ceil(len(train_dataloader) * SETTINGS['NO_EPOCHS'] * SETTINGS['WARMUP_STEPS_SCALE_FACTOR']),
        show_progress_bar=True,
        save_best_model=False)
    
    #threshold_dev, f1_macro_dev = find_optimum_values2(model, questions_dev, answers_dev, labels_dev)
    threshold_dev, f1_macro_dev = find_optimum_values(model, questions_dev, answers_dev, labels_dev)
    #f1_macro_train = get_macro_f1_for_threshold(model, questions_train, answers_train, labels_train, threshold_dev, similarity='dot')
    f1_macro_train = get_macro_f1_for_threshold(model, questions_train, answers_train, labels_train, threshold_dev, similarity='cosine')

    print('Previous macro train f1  >>>', previous_f1_macro_train)
    print('Current macro train f1   >>>', f1_macro_train)
    print('Previous macro dev f1    >>>', previous_f1_macro_dev)
    print('Current macro dev f1     >>>', f1_macro_dev)
    print('Previous dev threshold   >>>', previous_threshold_dev)
    print('Current dev threshold    >>>', threshold_dev)
    if SETTINGS['EARLY_STOPPING'] and f1_macro_dev < previous_f1_macro_dev:
        break

    best_no_epochs = epoch
    previous_threshold_dev = threshold_dev
    previous_f1_macro_dev = f1_macro_dev
    previous_f1_macro_train = f1_macro_train

if best_no_epochs == 0:
    print('Out-of-the-box model is the best! No need for training.')
    print(f'best dev threshold >>> {previous_threshold_dev}, best dev macro f1 >>> {previous_f1_macro_dev}, best no epochs >>> {best_no_epochs}, best train macro f1 >>> {previous_f1_macro_train}')
else:
    folder_name = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    model_save_path = os.path.join(MODEL_FOLDER_PATH, folder_name)
    os.mkdir(model_save_path)

    previous_model.save(model_save_path)
    write_json(os.path.join(model_save_path, 'settings.json'), SETTINGS)
    with open(os.path.join(model_save_path, 'training_output.txt'), 'w') as f:
        f.write(f'best dev threshold >>> {previous_threshold_dev}, best dev macro f1 >>> {previous_f1_macro_dev}, best no epochs >>> {best_no_epochs}, best train macro f1 >>> {previous_f1_macro_train}')


Epoch >>> 0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1273 [00:00<?, ?it/s]

Previous macro train f1  >>> 0.6951290086991893
Current macro train f1   >>> 0.815612409244503
Previous macro dev f1    >>> 0.6951290086991893
Current macro dev f1     >>> 0.7434684696914807
Previous dev threshold   >>> 0.7
Current dev threshold    >>> 0.43

Epoch >>> 1


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1273 [00:00<?, ?it/s]

Previous macro train f1  >>> 0.7434684696914807
Current macro train f1   >>> 0.8807291219938942
Previous macro dev f1    >>> 0.7434684696914807
Current macro dev f1     >>> 0.7214319690140207
Previous dev threshold   >>> 0.43
Current dev threshold    >>> 0.36
Out-of-the-box model is the best! No need for training.
best dev threshold >>> 0.43, best dev macro f1 >>> 0.7434684696914807, best no epochs >>> 0, best train macro f1 >>> 0.815612409244503


## triplet loss

In [10]:
SETTINGS = {
    'MODEL_NAME': 'all-mpnet-base-v2',
    #'MODEL_NAME': 'multi-qa-mpnet-base-dot-v1',
    'TRAIN_BATCH_SIZE': 16,  # higher values will cause GPU failure
    'NO_EPOCHS': 10,
    'WARMUP_STEPS_SCALE_FACTOR': 0.1,  # 10% of train data for warm-up
    'EARLY_STOPPING': True
}

In [11]:
def get_questions_dictionary(df: pd.DataFrame) -> dict:
    # fill dictionary
    questions_dictionary = {}
    for idx, row in df_train.iterrows():
        if row['Question'] not in questions_dictionary:
            questions_dictionary[row['Question']] = []
        questions_dictionary[row['Question']].append((row['Sentence'], row['Label']))

    # remove questions without answers
    keys_to_remove = []
    
    for key, value in questions_dictionary.items():
        flag = False
        for item in value:
            if item[1] == 1.0:
                flag = True
                break
        if not flag:
            keys_to_remove.append(key)

    for key in keys_to_remove:
        del questions_dictionary[key]

    return questions_dictionary

In [12]:
class Example:
    question: str = None
    answer: str = None
    sentences = list = None

    def __init__(self, question, answer, sentences):
        self.question = question
        self.answer = answer
        self.sentences = sentences

def generate_examples(questions_dictionary):
    for key, value in questions_dictionary.items():
        valid_answers = [candidate[0] for candidate in value if candidate[1] == 1.0]
        invalid_answers = [candidate[0] for candidate in value if candidate[1] == 0.0]
        if len(invalid_answers) == 0:
            continue
        for answer in valid_answers:
            yield Example(key, answer, invalid_answers)

In [13]:
def get_triplet_data_loader(model: SentenceTransformer, train_batch_size: int,
                    df: pd.DataFrame, shuffle: bool = True, repeat_questions: bool = False) -> DataLoader:
    examples = list(generate_examples(get_questions_dictionary(df)))
    input_examples = []
    for example in examples:
        #                     InputExample(texts=['Anchor 1', 'Positive 1', 'Negative 1'])
        #random.seed(RANDOM_STATE)
        #input_examples.append(InputExample(texts=[example.question, example.answer, random.choice(example.sentences)]))
        for sentence in example.sentences:
            input_examples.append(InputExample(texts=[example.question, example.answer, sentence]))

    dataset = SentencesDataset(input_examples, model)
    return DataLoader(dataset, shuffle=shuffle, batch_size=train_batch_size)

questions_train, answers_train, labels_train = parse_data(df_train)
questions_dev, answers_dev, labels_dev = parse_data(df_dev)
#'''
model = SentenceTransformer(SETTINGS['MODEL_NAME'])
train_dataloader = get_triplet_data_loader(model, SETTINGS['TRAIN_BATCH_SIZE'], df_train, shuffle=False)
train_loss = losses.TripletLoss(model=model)
#'''


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [20]:
examples = list(generate_examples(get_questions_dictionary(df_train)))
input_examples = []
for example in examples:
    #                     InputExample(texts=['Anchor 1', 'Positive 1', 'Negative 1'])
    #random.seed(RANDOM_STATE)
    #input_examples.append(InputExample(texts=[example.question, example.answer, random.choice(example.sentences)]))
    for sentence in example.sentences:
        input_examples.append([example.question, example.answer, sentence])

In [14]:
loss_mapping = {
    'cosine': TripletDistanceMetric.COSINE, 
    'euclidean': TripletDistanceMetric.EUCLIDEAN, 
    'manhattan': TripletDistanceMetric.MANHATTAN
}
'''
for triplet_margin in [1, 3, 5, 7, 10]:
    print('TRIPLET MARGIN >>>', triplet_margin)
    for loss_metric in ['cosine', 'euclidean', 'manhattan']:
        print('LOSS METRIC >>>', loss_metric)

        model = SentenceTransformer(SETTINGS['MODEL_NAME'])
        train_dataloader = get_triplet_data_loader(model, SETTINGS['TRAIN_BATCH_SIZE'], df_train, shuffle=False)
        train_loss = losses.TripletLoss(model=model, distance_metric=loss_mapping[loss_metric], triplet_margin=triplet_margin)
'''
best_no_epochs = 0

for epoch in range(SETTINGS['NO_EPOCHS']):
    print('\n' + '='*100)
    print(f'Epoch >>> {epoch}')
    print('best_no_epochs >>>', best_no_epochs)
    previous_model = deepcopy(model)

    if best_no_epochs == 0:
        previous_threshold_dev, previous_f1_macro_dev = find_optimum_values(previous_model, questions_dev, answers_dev, labels_dev)
        previous_f1_macro_train = get_macro_f1_for_threshold(previous_model, questions_train, answers_train, labels_train, previous_threshold_dev, similarity='cosine')
    
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        warmup_steps=math.ceil(len(train_dataloader) * SETTINGS['NO_EPOCHS'] * SETTINGS['WARMUP_STEPS_SCALE_FACTOR']),
        show_progress_bar=True,
        save_best_model=False)
    
    threshold_dev, f1_macro_dev = find_optimum_values(model, questions_dev, answers_dev, labels_dev)
    f1_macro_train = get_macro_f1_for_threshold(model, questions_train, answers_train, labels_train, threshold_dev, similarity='cosine')

    print('Previous macro train f1  >>>', previous_f1_macro_dev)
    print('Current macro train f1   >>>', f1_macro_train)
    print('Previous macro dev f1    >>>', previous_f1_macro_dev)
    print('Current macro dev f1     >>>', f1_macro_dev)
    print('Previous dev threshold   >>>', previous_threshold_dev)
    print('Current dev threshold    >>>', threshold_dev)
    if SETTINGS['EARLY_STOPPING'] and f1_macro_dev < previous_f1_macro_dev:
        break

    best_no_epochs = epoch
    previous_threshold_dev = threshold_dev
    previous_f1_macro_dev = f1_macro_dev
    previous_f1_macro_train = f1_macro_train

if best_no_epochs == 0:
    print('Out-of-the-box model is the best! No need for training.')
    print(f'best dev threshold >>> {previous_threshold_dev}, best dev macro f1 >>> {previous_f1_macro_dev}, best no epochs >>> {best_no_epochs}, best train macro f1 >>> {previous_f1_macro_train}')
else:
    folder_name = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    model_save_path = os.path.join(MODEL_FOLDER_PATH, folder_name)
    os.mkdir(model_save_path)
    previous_model.save(model_save_path)
    write_json(os.path.join(model_save_path, 'settings.json'), SETTINGS)
    with open(os.path.join(model_save_path, 'training_output.txt'), 'w') as f:
        f.write(f'best dev threshold >>> {previous_threshold_dev}, best dev macro f1 >>> {previous_f1_macro_dev}, best no epochs >>> {best_no_epochs}, best train macro f1 >>> {previous_f1_macro_train}, loss=Triplet')
        print(f'best dev threshold >>> {previous_threshold_dev}, best dev macro f1 >>> {previous_f1_macro_dev}, best no epochs >>> {best_no_epochs}, best train macro f1 >>> {previous_f1_macro_train}, loss=Triplet')


Epoch >>> 0
best_no_epochs >>> 0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/563 [00:00<?, ?it/s]

Previous macro train f1  >>> 0.6951290086991893
Current macro train f1   >>> 0.6030390064589498
Previous macro dev f1    >>> 0.6951290086991893
Current macro dev f1     >>> 0.6138745469339448
Previous dev threshold   >>> 0.7
Current dev threshold    >>> 0.99
Out-of-the-box model is the best! No need for training.
best dev threshold >>> 0.7, best dev macro f1 >>> 0.6951290086991893, best no epochs >>> 0, best train macro f1 >>> 0.6863829656107483


# Test

In [None]:
test_dataset_path = os.path.join(DATASET_FOLDER_PATH, 'WikiQA-test.tsv')
df_test = pd.read_csv(test_dataset_path, delimiter='\t')
df_test.head()