In [1]:
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, models, losses
from torch import nn

import random

In [12]:
# first, build a Dataset (the DataLoader is wrapped around it later) for the IMDB62 dataset
class IMDB62_AV_Dataset(Dataset):
    """Dataset for Author Verification on the IMDB62 Dataset.

    Every item is an ``InputExample`` holding two texts and a float label:
    1.0 when both texts carry the same user id, 0.0 otherwise.
    """

    def __init__(self, data_file, dataset_size=None, seed=None):
        """
        Args:
            data_file (string): the path to the IMDB62 Dataset txt file. Each
                non-blank line is split on tabs; the second field is used as
                the user id and the last field as the text.
            dataset_size (int, optional): total number of pairs to build. The
                first ``n - 1`` pairs are the adjacent records (i, i + 1) of
                the ``n`` records in the file; the remainder are random pairs.
                Defaults to the number of records in the file.
            seed (int, optional): seed for a private random generator used for
                the random pairs. When None, the module-level ``random`` state
                is used, so a global ``random.seed(...)`` is still honoured.

        Raises:
            ValueError: if a non-blank line has fewer than two tab-separated
                fields, if ``dataset_size`` is smaller than the number of
                adjacent pairs, or if random pairs are requested but the file
                holds fewer than two distinct records.
        """
        self.data_file = data_file
        self.dataset_size = dataset_size

        # read the file into a convenient data structure: (userid, content) tuples
        raw_data = []
        with open(self.data_file, 'r') as f:
            for line_number, line in enumerate(f, start=1):
                # blank lines (e.g. a trailing empty line) carry no record
                if not line.strip():
                    continue
                # drop the line terminator so it does not end up inside the text
                fields = line.rstrip('\n').split('\t')
                if len(fields) < 2:
                    raise ValueError(
                        f'Line {line_number} of {self.data_file} has no tab-separated user id field'
                    )
                raw_data.append((fields[1], fields[-1]))

        num_adjacent = max(len(raw_data) - 1, 0)
        if self.dataset_size is None:
            self.dataset_size = len(raw_data)
        elif self.dataset_size < num_adjacent:
            # a real exception rather than an assert, so the check survives `python -O`
            raise ValueError(
                f'The desired dataset size must be at least the number of adjacent pairs which is: {num_adjacent}'
            )

        # first do a pass through the data with (n, n+1, label) pairs
        # so that every record is seen at least once
        self.data = [
            self._make_example(raw_data[i], raw_data[i + 1])
            for i in range(num_adjacent)
        ]

        # now randomly sample to reach the requested size - there are roughly n^2
        # possible pairs, so a random subset is all that can be used in practice
        num_random = self.dataset_size - len(self.data)
        if num_random > 0 and len(set(raw_data)) < 2:
            # the rejection loop below could never find two different records
            raise ValueError(
                'Random pair sampling needs at least two distinct (user id, text) records'
            )

        rng = random if seed is None else random.Random(seed)
        for _ in range(num_random):
            dp1, dp2 = rng.choice(raw_data), rng.choice(raw_data)
            # make sure points aren't the same (same user id and same text)
            while dp1 == dp2:
                dp1, dp2 = rng.choice(raw_data), rng.choice(raw_data)
            self.data.append(self._make_example(dp1, dp2))

    @staticmethod
    def _make_example(dp1, dp2):
        """Build one labelled pair from two (userid, content) tuples."""
        label = 1.0 if dp1[0] == dp2[0] else 0.0
        return InputExample(texts=[dp1[1], dp2[1]], label=label)

    def __len__(self):
        """Return the number of pairs in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair stored at position ``idx``."""
        return self.data[idx]

In [13]:
# Smoke-test the dataset class: build the train and test pair sets,
# then show the size of the training set and its first example.
train_dataset = IMDB62_AV_Dataset(
    '/home/jtyo/Projects/Authorship Attribution/IMDB/imdb62_train.txt',
    dataset_size=200_000,
)
test_dataset = IMDB62_AV_Dataset(
    '/home/jtyo/Projects/Authorship Attribution/IMDB/imdb62_test.txt',
)

# a single newline-separated print emits the same two lines as two separate prints
print(len(train_dataset), train_dataset[0], sep='\n')

200000
<InputExample> label: 1.0, texts: I caught glimpses of this show which feature a gay male couple and their dog , Liberace . First of all , they come across as the most stereotyped gay couple that I have seen and yet they are real . I couldn't watch them with their dog as they took this tiny lap dog ( I mean the dog weighed under 10 pounds and was not fully developed ) and pushed into these contests . I thought Showbiz Moms and Dads were ridiculous but this young gay couple are obviously immature and neglectful of Liberace . I understand that some people want to place their animals for show but don't put Liberace on for show like a doll . Liberace is a dog and a small one . I had a rabbit who weighed more than this dog and I wouldn't put it in contests . Liberace isn't even attractive . Look I know people love their animals , I still can't get over the loss of my rabbit but even I have a good sense about what she wanted . I don't think Liberace wants to be paraded around like sho

In [14]:
# Assemble the model that will be trained, as three stacked modules:
# a bert-base-cased transformer (inputs capped at 512 tokens), a pooling layer
# sized to the transformer's word embeddings, and a dense ReLU layer with 256 outputs.
word_embedding_model = models.Transformer(
    model_name_or_path='bert-base-cased',
    max_seq_length=512,
)
pooling_model = models.Pooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
)
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.ReLU(),
)

model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dense_model],
)

In [15]:
# Wrap the training pairs in a shuffling DataLoader that yields 2 pairs per batch
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
)

# Training objective: the cosine-similarity loss bound to the model built above
train_loss = losses.CosineSimilarityLoss(model=model)

# Tune the model for 5 epochs with 100 warm-up steps
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    warmup_steps=100,
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100000 [00:00<?, ?it/s]

KeyboardInterrupt: 