In [None]:
import numpy as np 
import pandas as pd 
import torch 
from torch import nn 
from transformers import BertTokenizer, BertModel , BertConfig

In [None]:
# Load the question-pairs CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/question-pairs-dataset/questions.csv")

In [None]:
# Peek at the first rows to see which columns the file provides.
df.head()

In [None]:
# Count the missing values in each column to see what needs cleaning.
df.isna().sum()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
class BertTrainingData:
    """Question-pair dataset that tokenizes each question separately for BERT.

    Stores three parallel, index-aligned sequences (first questions, second
    questions, labels) and a ``bert-base-uncased`` tokenizer that encodes a
    pair on demand.
    """

    def __init__(self, q1, q2, target, max_length=512):
        """Keep the raw sequences and load the tokenizer.

        Args:
            q1: Indexable sequence of first questions.
            q2: Indexable sequence of second questions, aligned with ``q1``.
            target: Indexable sequence of labels, aligned with ``q1``.
            max_length: Length every encoded question is padded/truncated to.
                Defaults to 512, the value that was previously hard-coded.
        """
        self.q1 = q1
        self.q2 = q2
        self.target = target
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of question pairs (the length of ``q1``)."""
        return len(self.q1)

    def __preprocess__(self, s):
        """Coerce a raw question value to ``str`` before tokenization."""
        return str(s)

    def __gettarget__(self, item):
        """Return the label stored at index ``item`` as a Python ``int``."""
        return int(self.target[item])

    def _encode(self, text):
        """Tokenize one question, padded and truncated to ``self.max_length``."""
        # `padding="max_length"` is the current spelling of the deprecated
        # `pad_to_max_length=True` flag.
        return self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

    def __getpair__(self, item):
        """Encode both questions of the pair at index ``item``.

        Returns:
            dict with keys ``input_ids1``, ``attention_mask1``,
            ``token_type_ids1`` and the matching ``...2`` keys. Each value is
            a ``torch.long`` tensor with a leading dimension of size 1 added
            by ``unsqueeze(0)``.
        """
        first = self._encode(self.__preprocess__(self.q1[item]))
        # The second encoding must come from q2; encoding q1 twice would make
        # both halves of the pair identical.
        second = self._encode(self.__preprocess__(self.q2[item]))

        features = {}
        for suffix, encoded in (("1", first), ("2", second)):
            for field in ("input_ids", "attention_mask", "token_type_ids"):
                features[field + suffix] = torch.tensor(
                    encoded[field], dtype=torch.long
                ).unsqueeze(0)
        return features
        

In [None]:
# Pull the two question columns and the duplicate label out as plain Python lists.
q1, q2, target = (
    df[column].tolist()
    for column in ("question1", "question2", "is_duplicate")
)

In [None]:
# Wrap the three lists in the dataset helper; constructing it also loads the BERT tokenizer.
data=BertTrainingData(q1,q2,target)

In [None]:
class BertClassifierModel(nn.Module):
    """Two-encoder BERT model that projects each question of a pair separately.

    Each question goes through its own pretrained ``bert-base-uncased``
    encoder; the pooled output is passed through dropout and a linear layer
    that is shared by both questions.
    """

    def __init__(self, output_dim=1):
        """Load both encoders and build the shared projection head.

        Args:
            output_dim: Size of the projected vector for each question.
                Defaults to 1, the previously hard-coded value.
                NOTE(review): with a single output feature, a cosine
                similarity taken over dim=1 of the two outputs can only be
                +1 or -1; pass a larger value to get a graded similarity.
        """
        super().__init__()
        config = BertConfig.from_pretrained('bert-base-uncased')
        self.model1 = BertModel.from_pretrained("bert-base-uncased", config=config)
        self.model2 = BertModel.from_pretrained("bert-base-uncased", config=config)
        self.dropout = nn.Dropout(0.2)
        # Read the encoder width from the config rather than hard-coding 768.
        self.output = nn.Linear(config.hidden_size, output_dim)

    def forward(self, ids1, ids2, mask1, mask2, tti1, tti2):
        """Encode both questions and project their pooled representations.

        Args:
            ids1, ids2: Token-id tensors for question 1 and question 2.
            mask1, mask2: Attention masks for question 1 and question 2.
            tti1, tti2: Token-type (segment) ids for question 1 and question 2.

        Returns:
            Tuple ``(o1, o2)`` holding the linear projection of each
            question's pooled encoder output.
        """
        # Keyword arguments are required here: BertModel.forward takes
        # (input_ids, attention_mask, token_type_ids) positionally, so passing
        # (ids, tti, mask) by position would swap the mask and segment ids.
        _, pooled1 = self.model1(
            input_ids=ids1,
            attention_mask=mask1,
            token_type_ids=tti1,
            return_dict=False,
        )
        _, pooled2 = self.model2(
            input_ids=ids2,
            attention_mask=mask2,
            token_type_ids=tti2,
            return_dict=False,
        )
        o1 = self.output(self.dropout(pooled1))
        o2 = self.output(self.dropout(pooled2))
        return o1, o2

In [None]:
# Instantiate the model; its constructor loads two pretrained bert-base-uncased encoders.
model=BertClassifierModel()

In [None]:
def predict(data, model):
    """Sum the per-pair MSE between cosine similarity and the duplicate label.

    For every index in ``data`` the two encoded questions are run through
    ``model``, the cosine similarity of the two outputs is taken over dim=1,
    and the squared error against the pair's label is added to a running
    total.

    Args:
        data: Object exposing ``__len__``, ``__getpair__(i)`` (a dict of
            input tensors) and ``__gettarget__(i)`` (a numeric label).
        model: ``nn.Module`` called with the keyword arguments ``ids1``,
            ``ids2``, ``mask1``, ``mask2``, ``tti1`` and ``tti2``, returning
            two tensors.

    Returns:
        0-dim tensor holding the summed loss (zero when ``data`` is empty).
    """
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    loss = nn.MSELoss()
    total_loss = torch.zeros(())

    # Evaluate with dropout disabled so the result is deterministic, then
    # restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for a pure evaluation pass.
        with torch.no_grad():
            for i in range(len(data)):
                inputs = data.__getpair__(i)
                o1, o2 = model(
                    ids1=inputs["input_ids1"],
                    ids2=inputs["input_ids2"],
                    mask1=inputs["attention_mask1"],
                    mask2=inputs["attention_mask2"],
                    tti1=inputs["token_type_ids1"],
                    tti2=inputs["token_type_ids2"],
                )
                similarity = cos(o1, o2)
                # Build the target with the same shape, dtype and device as
                # the prediction so MSELoss neither broadcasts nor mixes
                # integer and float tensors.
                target = torch.full_like(similarity, float(data.__gettarget__(i)))
                total_loss = total_loss + loss(similarity, target)
    finally:
        model.train(was_training)
    return total_loss
# Run the loss computation over every pair in the dataset (one forward pass per pair).
total_loss=predict(data,model)

In [None]:
# Display the accumulated loss returned by predict.
total_loss