In [1]:
# !pip install lightning
# !pip install protobuf
# !pip install transformers
# !pip install sentencepiece

In [2]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl
from transformers import DebertaTokenizer, DebertaForSequenceClassification,AutoTokenizer
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from sklearn.model_selection import train_test_split
import os

In [3]:
class SentenceSimilarityModel(pl.LightningModule):
    def __init__(self, learning_rate=2e-5):
        super(SentenceSimilarityModel, self).__init__()
        self.deberta = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-v3-small',ignore_mismatched_sizes=True,num_labels=1)
        self.learning_rate = learning_rate

    def forward(self, input_ids,attention_mask):

        outputs = self.deberta(input_ids, attention_mask=attention_mask)
        return torch.sigmoid(outputs.logits)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, score = batch
        logits = self(input_ids,attention_mask)

        # Assuming a binary classification task
        loss = F.binary_cross_entropy_with_logits(torch.sigmoid(logits), score.float())
        spearman = spearmanr(torch.sigmoid(logits).detach().cpu().numpy(),score.detach().cpu().numpy()).statistic

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("spearman", spearman, on_step=True, on_epoch=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, score = batch
        logits = self(input_ids, attention_mask)

        # Assuming a binary classification task
        loss = F.binary_cross_entropy_with_logits(torch.sigmoid(logits), score.float())

        spearman = spearmanr(torch.sigmoid(logits).detach().cpu().numpy(),score.detach().cpu().numpy()).statistic

        self.log("val_loss", loss)
        self.log("spearman",spearman)


    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)


In [4]:
class str_dataset(torch.utils.data.Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Extract the features and target from the DataFrame
        # Adjust this based on your DataFrame structure
        features = self.dataframe['input'].loc[idx]
        token = self.tokenizer(features, return_tensors='pt', truncation='longest_first', padding='max_length',max_length=768)

        input_ids = token['input_ids'].squeeze()
        attention_mask = token['attention_mask'].squeeze()

        score = self.dataframe['Score'].loc[idx]

        score = torch.tensor(score, dtype=torch.float32).unsqueeze(dim=0)  # Adjust the dtype as needed

        return input_ids, attention_mask, score

In [5]:
class STR_DataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, val_dataset, batch_size=32):
        super().__init__()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size,num_workers = 1)


In [6]:
train_data = pd.read_csv('/content/eng_train.csv')
train_data.head()

Unnamed: 0,PairID,Text,Score
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0
3,ENG-train-0003,If he is good looking and has a good personali...,1.0
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0


In [7]:
sep = '[SEP]'
test = train_data['Text'].loc[0].replace('\n',sep)
test

'It that happens, just pull the plug.[SEP]if that ever happens, just pull the plug.'

In [8]:
train_data['input'] = train_data.apply(lambda row : row['Text'].replace('\n',sep),axis = 1)
train_data.head()

Unnamed: 0,PairID,Text,Score,input
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"It that happens, just pull the plug.[SEP]if th..."
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,A black dog running through water.[SEP]A black...
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,I've been searchingthe entire abbey for you.[S...
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"She does not hate you, she is just annoyed wit..."


In [9]:
#train test split

train_df, val_df = train_test_split(train_data, test_size=0.2, random_state=42)

train_dataset = str_dataset(train_df.reset_index(drop=True))
val_dataset = str_dataset(val_df.reset_index(drop=True))

str_datamodule = STR_DataModule(train_dataset=train_dataset, val_dataset= val_dataset, batch_size=4)



In [10]:
print(train_dataset.__getitem__(0)[0].shape)
print(train_dataset.__getitem__(1)[0].shape)

torch.Size([768])
torch.Size([768])


In [11]:
model = SentenceSimilarityModel()

trainer = pl.Trainer(max_epochs=2,accelerator='gpu')
trainer.fit(model, datamodule=str_datamodule)

You are using a model of type deberta-v2 to instantiate a model of type deberta. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['deberta.encoder.layer.3.attention.self.q_bias', 'deberta.encoder.layer.4.attention.self.v_bias', 'deberta.encoder.layer.4.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.3.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.5.attention.self.q_bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.0.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.4.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.5.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.weight', 'classifier.weight', 'deberta.encoder.layer.5.attention.self.pos_proj.weight', 'classif

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
