In [16]:
# !pip install lightning
# !pip install datasets

In [17]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split, TensorDataset
import pytorch_lightning as pl
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import pandas as pd
from scipy.stats import spearmanr, pearsonr
from sklearn.model_selection import train_test_split
import os
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import random
from sklearn.model_selection import KFold
from pytorch_lightning.callbacks import EarlyStopping
from datasets import Dataset

import re
import string

import nltk
from nltk.corpus import wordnet as wn

In [18]:
class STR_DataModule(pl.LightningDataModule):
    def __init__(self, train_data : pd.DataFrame, test_data : pd.DataFrame, batch_size=8, syn_replace = False, change_random_letter = False, test_size : float = 0.2, no_validation = False, model_type : str = 'distilbert-base-uncased', seed = None, data_sep = '\n'):
        super().__init__()
        self.batch_size = batch_size
        self.train_data = train_data
        self.test_data = test_data
        self.syn_replace = False
        self.change_random_letter = False

        self.syn_replace_input = syn_replace
        self.change_random_input = change_random_letter

        self.model_type = model_type
        self.test_size = test_size
        self.no_validation = no_validation
        self.tokenizer = AutoTokenizer.from_pretrained(model_type)
        self.syn_replace = syn_replace
        self.change_random_letter = change_random_letter
        self.seed = seed
        self.data_sep = data_sep


    def setup(self, stage: str):

        if stage == "fit":
            self.syn_replace = self.syn_replace_input
            self.change_random_letter = self.change_random_input

        if stage == "test":
            self.syn_replace = False
            self.change_random_letter = False



    def prepare_data_per_node(self):
    # Return True if you want prepare_data to be called on each node
        return False


    def prepare_data(self):
        nltk.download('wordnet')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('punkt')


        dataset = Dataset.from_pandas(self.train_data.drop('PairID', axis = 1))
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.train_dataset = dataset.with_format("torch", device=device)

        self.train_dataset.set_transform(self.encode)

        dataset = Dataset.from_pandas(self.test_data.drop('PairID', axis = 1))
        self.test_dataset = dataset.with_format("torch", device=device)

        self.test_dataset.set_transform(self.encode)


    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=16,num_workers = 2, shuffle=True)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size = 16, num_workers = 2,shuffle = True)


    def get_random_word(self,seq : str, min_len : int = 3,seed : int = None, sep = '\n', syn_replace : bool = False) -> tuple:
            #random.seed(seed)
            seq = seq.replace(sep, '')
            tokens = nltk.word_tokenize(seq)
            pos_tags = nltk.pos_tag(tokens)
            if syn_replace:
                pos_tags = self.syn_replace_choice(pos_tags)

            pos_tags = list(set(pos_tags))
            candidates = [word for word in pos_tags if len(word[0]) >= min_len]
            if(candidates == []):
                return (None,None)
            return random.choice(candidates)

    #takes a pos_tagged sentence as function argument
    def syn_replace_choice(self,pos_tags : list):
        replacable_tags = ['JJ', 'JJR', 'JJS','RB', 'RBR', 'RBS']
        filtered_data = [item for item in pos_tags if item[1] in replacable_tags]
        return filtered_data

    def get_synonym(self, word : str, pos : str, seed : int = None) -> str:
        synonyms = []
        if pos == None:
            return word
        for syn in wn.synsets(word, pos = pos):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        #random.seed(seed)
        if len(synonyms) <= 1:
            return word
        return random.choice(list(set(synonyms)))

    def find_word_type(self,target_tag):
        word_type_dict = {wn.ADJ : ['JJ', 'JJR', 'JJS'],wn.ADV : ['RB', 'RBR', 'RBS']}
        for key, tag_list in word_type_dict.items():
            if target_tag in tag_list:
                return key
        return None  # Tag not found in any list technically not possible

    def apply_syn_replacement(self,batch : list, sep : str = '\n', seed : int = 42, p : float = 0.3):
        batch_aug = []
        for seq in batch:
            word = self.get_random_word(seq, sep = sep, syn_replace= True, seed = seed)
            if word[0] != None:
                batch_aug += [seq.replace(word[0],self.get_synonym(word[0], pos = self.find_word_type(word[1]), seed = seed))]
            else:
                batch_aug += [seq]
        return batch_aug


    #changes one random letter of the word
    #dont interchange first or last letter
    def replace_letter(self,word : str, seed : int = None)-> str:
        #random.seed(seed)
        if len(word) <=2:
            return word
        idx = random.randint(1,len(word)-2)
        mod_word = word[:idx] + random.choice(string.ascii_lowercase) + word[idx + 1:]
        return mod_word

    def apply_change_letter(self,batch : list, p : int = 0.3, sep :str = '\n', seed = None):
        batch_aug = []
        for seq in batch:
            seq = seq.split(sep)
            word = self.get_random_word(seq[0])[0]
            # random.seed(42)
            if word == None or random.random() < p:
                batch_aug +=  [seq[0] + sep + seq[1]]
                continue
            batch_aug += [seq[0].replace(word,self.replace_letter(word,seed = seed)) + sep + seq[1]]
        return batch_aug

    def encode(self,batch):
        batch_aug = batch['Text']
        if self.syn_replace:
            batch_aug = self.apply_syn_replacement(batch_aug)
        if self.change_random_letter:
            batch_aug = self.apply_change_letter(batch_aug)
        tokenized = self.tokenizer(batch_aug,return_tensors="pt", padding=True,truncation=True)
        tokenized =  {'input_ids' : tokenized['input_ids'].type(torch.LongTensor),'attention_mask' :tokenized['attention_mask'].type(torch.LongTensor)}
        tokenized['labels'] = torch.tensor(batch['Score'], dtype=torch.float32)
        return tokenized

In [19]:
class SentenceSimilarityModel(pl.LightningModule):
    def __init__(self,model_type = 'distilbert-base-uncased', lr=1.737e-6):
        super(SentenceSimilarityModel, self).__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_type,num_labels=1)

        self.loss_fn = torch.nn.MSELoss()
        self.lr = lr

    # def forward(self, input_ids,attention_mask):
    #     outputs = self.model(input_ids, attention_mask=attention_mask)

    #     logits = outputs.logits
    #     return logits

    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):

        outputs = self(**batch)
        score = batch['labels']
        loss = outputs.loss

        spearman = spearmanr(outputs.logits.detach().cpu().numpy(),score.detach().cpu().numpy()).statistic

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_spearman", spearman,on_step = False, on_epoch=True, prog_bar=True)


        return loss


    # def training_step(self, batch, batch_idx):
    #     input_ids, attention_mask, score = batch
    #     logits = self(input_ids,attention_mask)


    #     loss = F.mse_loss(logits, score)
    #     spearman = spearmanr(logits.detach().cpu().numpy(),score.detach().cpu().numpy()).statistic

    #     self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
    #     self.log("train_spearman", spearman,on_step = False, on_epoch=True, prog_bar=True)

    #     return loss

    # def validation_step(self, batch, batch_idx):
    #     input_ids, attention_mask, score = batch
    #     outputs = self(input_ids, attention_mask)
    #     logits = outputs.logits

    #     loss = F.mse_loss(logits, score)

    #     spearman = spearmanr(logits.detach().cpu().numpy(),score.detach().cpu().numpy()).statistic

    #     self.log("val_loss", loss,on_epoch = True, prog_bar = True)
    #     self.log("val_spearman",spearman,on_epoch = True, prog_bar = True)


    def test_step(self, batch, batch_idx):
        outputs = self(**batch)
        score = batch['labels']
        loss = outputs.loss
        # print("Score:", score)
        # print('Score_var:', score.var())

        spearman = spearmanr(outputs.logits.detach().cpu().numpy(),score.detach().cpu().numpy()).statistic

        self.log("test_loss", loss, on_epoch=True, prog_bar=True)
        self.log("test_spearman", spearman, on_epoch=True, prog_bar=True)


    def configure_optimizers(self):
        self.optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr, fused = True)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-8)

        return {'optimizer': self.optimizer, 'lr_scheduler': {'scheduler': scheduler, 'monitor': 'train_loss'}}



In [20]:
if __name__ == '__main__':

    kf = KFold(n_splits=5, shuffle=True, random_state=42)


    train_data = pd.read_csv('/content/eng_train.csv')

    all_spearman_corrs = []

    model_type = 'facebook/bart-base'

    for fold, (train_idx, test_idx) in enumerate(kf.split(train_data)):

      str_datamodule = STR_DataModule(train_data = train_data.iloc[train_idx].reset_index(drop = True),
                                      test_data = train_data.iloc[test_idx].reset_index(drop = True),
                                      batch_size = 16,
                                      model_type = model_type,
                                      syn_replace = True,
                                      change_random_letter = True,
                                      no_validation = True)

      model = SentenceSimilarityModel(model_type = model_type, lr = 5e-5)



#  callbacks=[EarlyStopping(monitor="train_loss", patience=2, mode="min")]

      trainer = pl.Trainer(max_epochs=10,
                           precision="16-mixed",
                           accelerator='gpu',
                           limit_val_batches=0)

      # tuner = pl.tuner.Tuner(trainer)
      # lr_finder = tuner.lr_find(model, datamodule = str_datamodule, min_lr = 1e-8,max_lr= 1e-3)

      # # Pick point based on plot, or get suggestion
      # new_lr = lr_finder.suggestion()

      # #  update hparams of the model
      # model.lr = new_lr

      trainer.fit(model, datamodule=str_datamodule)


      trainer.test(ckpt_path="best", datamodule = str_datamodule)


      # Calculate and print the average Spearman correlation
      average_spearman_corr = trainer.callback_metrics["test_spearman"]
      print(
          f"Average Spearman Correlation for Fold {fold + 1}: {average_spearman_corr}"
      )

      all_spearman_corrs.append(average_spearman_corr)


    # Calculate and print the overall average Spearman correlation
    overall_average_spearman_corr = sum(all_spearman_corrs) / len(all_spearman_corrs)
    print(
        f"Overall Average Spearman Correlation across all folds: {overall_average_spearman_corr}"
     )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightn

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_23/checkpoints/epoch=9-step=2750.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_23/checkpoints/epoch=9-step=2750.ckpt
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:49

Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 1: 0.836669385433197


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightn

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_24/checkpoints/epoch=9-step=2750.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_24/checkpoints/epoch=9-step=2750.ckpt
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:49

Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 2: 0.8147494196891785


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightn

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_25/checkpoints/epoch=9-step=2750.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_25/checkpoints/epoch=9-step=2750.ckpt
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:49

Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 3: 0.7863326072692871


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightn

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_26/checkpoints/epoch=9-step=2750.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_26/checkpoints/epoch=9-step=2750.ckpt
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:49

Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 4: 0.8156422972679138


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightn

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_27/checkpoints/epoch=9-step=2750.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_27/checkpoints/epoch=9-step=2750.ckpt
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:49

Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 5: 0.8158584833145142
Overall Average Spearman Correlation across all folds: 0.8138504028320312
