# Imports

In [None]:
!pip install -q rouge_score bert_score
!pip install evaluate

In [12]:
from transformers import MBartForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import AlbertTokenizer, AutoTokenizer
from sklearn.model_selection import train_test_split
import pickle as pkl
import pandas as pd
import numpy as np
from datasets import Dataset
import torch
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm.notebook import tqdm
from datasets import load_metric
import wandb

# Custom Dataset

In [13]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves the rows of a pandas DataFrame.

    Implements the two methods PyTorch's map-style ``Dataset`` protocol
    requires (``__len__`` and ``__getitem__``) so instances work with
    ``len()``, integer indexing, and ``torch.utils.data.DataLoader``.

    Args:
        dataframe: Table whose rows are the samples; stored as ``self.df``.

    NOTE(review): rows are returned as ``pandas.Series``; a ``DataLoader``
    using the default collate function may need a custom ``collate_fn`` to
    batch them -- confirm against the training code.
    """

    def __init__(self, dataframe):
        self.df = dataframe

    def __len__(self):
        """Return the number of rows (samples) in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the row at integer position ``index`` (via ``DataFrame.iloc``)."""
        return self.df.iloc[index]

In [15]:
df = pd.read_csv("/kaggle/input/hindidataset/HindiNews/HindiNews_train_v2/hindi_train.csv")

In [16]:
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
df_train = pd.read_csv("/kaggle/input/hindidataset/HindiNews/HindiNews_train_v2/hindi_train.csv")
df_test = pd.read_csv("/kaggle/input/hindidataset/HindiNews/HindiNews_test.csv")

In [None]:
train_dataset = MyDataset(df_train)
test_dataset = MyDataset(df_test)

# Similarity Seq2Seq Model

In [None]:
import torch
from torch import nn
from transformers import AutoModelForSeq2SeqLM

class CustomModel(nn.Module):
    """Seq2seq wrapper that scales hidden states with a learned sigmoid gate.

    Holds a pretrained ``AutoModelForSeq2SeqLM`` plus one linear layer
    (``hidden_size -> 1``) whose sigmoid output multiplies the hidden states.

    NOTE(review): several steps in ``forward`` do not appear to match the
    apparent intent (see the inline notes) -- confirm before relying on it.
    """

    def __init__(self, model_path):
        """Load the pretrained seq2seq model and create the gating layer.

        Args:
            model_path: Value forwarded to
                ``AutoModelForSeq2SeqLM.from_pretrained``.
        """
        super(CustomModel, self).__init__()
        self.seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        # Maps a hidden_size-wide vector to one scalar gate logit.
        self.similarity_attention = nn.Linear(self.seq2seq.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, similarity_scores):
        """Run the seq2seq model and return gated hidden-state slices.

        Args:
            input_ids: Token ids passed to the wrapped model
                (assumes a 2-D tensor, dim 1 = sequence -- TODO confirm).
            attention_mask: Attention mask passed to the wrapped model.
            similarity_scores: Indexed as ``[i, :length]`` below; its values
                are not used in the returned tensor.

        Returns:
            The gated hidden-state slices concatenated along dim 0.
        """
        # NOTE(review): no decoder inputs or labels are passed; confirm the
        # loaded architecture accepts a call with encoder inputs only.
        outputs = self.seq2seq(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE(review): seq2seq LM-head models in transformers normally return
        # a Seq2SeqLMOutput (logits, encoder_last_hidden_state, ...); verify
        # that `last_hidden_state` actually exists on this output.
        last_hidden_state = outputs.last_hidden_state
        
        # Every id equal to 0 is replaced by 843, then the tensor is split
        # along dim 1. With an int argument, Tensor.split cuts dim 1 into
        # consecutive chunks of 843 columns; it does NOT split at positions
        # whose value is 843.
        # NOTE(review): presumably 0 is the pad id and 843 a sentence
        # separator id -- confirm, and confirm chunk-by-size is intended.
        sentences = input_ids.masked_fill(input_ids == 0, 843).split(843, dim=1)
        num_sentences = len(sentences)
        
        # One sigmoid gate value per hidden-state vector (last dim becomes 1).
        similarity_attention_weights = torch.sigmoid(self.similarity_attention(last_hidden_state))
        
        attended_outputs = []
        # NOTE(review): `i` counts the chunks produced above, yet it is used
        # below as an index into dim 0 of the other tensors -- confirm.
        for i in range(num_sentences):
            sentence = sentences[i]
            # Counts the non-zero entries of the whole chunk; because zeros
            # were replaced by 843 above, this equals the chunk's element count.
            sentence_length = torch.sum(sentence != 0).item()
            # Computed but never used afterwards.
            sentence_similarity_scores = similarity_scores[i, :sentence_length].unsqueeze(-1)
            sentence_attention_weights = similarity_attention_weights[i, :sentence_length]
            # Gate broadcasts over the last dimension of the hidden states.
            sentence_attended_output = sentence_attention_weights * last_hidden_state[i, :sentence_length]
            attended_outputs.append(sentence_attended_output)
        
        # Stack all gated slices along dim 0.
        attended_output = torch.cat(attended_outputs, dim=0)
        
        return attended_output

In [None]:
from transformers import AdamW


# NOTE(review): CustomModel is instantiated with no arguments. The class
# defined in the cell above requires `model_path`, while a later cell
# redefines CustomModel with a no-argument constructor -- confirm which
# definition is expected to be live when this cell runs.
model = CustomModel()
# Move all parameters to the GPU (device is hard-coded to 'cuda').
model = model.to('cuda')  


# NOTE(review): AdamW is imported from transformers here; recent releases
# steer users to torch.optim.AdamW -- confirm for the installed version.
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 10
# Shuffled mini-batches of 8 samples drawn from train_dataset.
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)

def compute_loss(outputs, labels):
    """Return the mean-squared error between predictions and targets.

    Args:
        outputs: Tensor of model predictions.
        labels: Tensor of target values, compared element-wise to ``outputs``.

    Returns:
        A scalar tensor: the squared differences averaged over all elements
        (the default ``reduction`` of ``mse_loss``).
    """
    mse = torch.nn.functional.mse_loss
    return mse(input=outputs, target=labels)


# Training loop: for each epoch, iterate over every batch, compute the loss,
# backpropagate, apply the optimizer update and then clear the gradients.
for epoch in range(num_epochs):
    for batch in dataloader:
        # Unpack the four batch items first, then move each one to the GPU.
        input_ids, attention_mask, similarity_scores, labels = batch
        input_ids, attention_mask, similarity_scores, labels = (
            tensor.to('cuda')
            for tensor in (input_ids, attention_mask, similarity_scores, labels)
        )

        # Forward pass through the custom model.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            similarity_scores=similarity_scores,
        )
        loss = compute_loss(outputs, labels)

        # Backward pass, parameter update, then reset gradients so they do
        # not accumulate into the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f'Epoch {epoch+1} completed')


# Testing on samples (Sanity Check)

In [None]:
import torch
from torch import nn
from transformers import AutoModelForSeq2SeqLM

class CustomModel(nn.Module):
    """Seq2seq wrapper that scales hidden states by a gate built from similarity scores.

    Redefines the ``CustomModel`` name used earlier in this file, this time
    with a no-argument constructor that loads a fixed checkpoint path.
    """

    def __init__(self):
        """Load the checkpoint at '/kaggle/working/finetuned_model' and create the gate layer."""
        super(CustomModel, self).__init__()
        self.seq2seq = AutoModelForSeq2SeqLM.from_pretrained('/kaggle/working/finetuned_model')
        # Maps a hidden_size-wide vector to one scalar gate logit.
        self.similarity_attention = nn.Linear(self.seq2seq.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, similarity_scores):
        """Run the seq2seq model and return its hidden states scaled by the gate.

        Args:
            input_ids: Token ids passed to the wrapped model.
            attention_mask: Attention mask passed to the wrapped model.
            similarity_scores: Fed through the ``hidden_size -> 1`` linear
                layer, so its last dimension must equal ``hidden_size``.

        Returns:
            ``last_hidden_state`` multiplied by the sigmoid gate.
        """
        # NOTE(review): no decoder inputs or labels are passed; confirm the
        # loaded architecture accepts a call with encoder inputs only.
        outputs = self.seq2seq(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE(review): seq2seq LM-head models in transformers normally return
        # a Seq2SeqLMOutput (logits, encoder_last_hidden_state, ...); verify
        # that `last_hidden_state` actually exists on this output.
        last_hidden_state = outputs.last_hidden_state

        # The linear layer yields a trailing dimension of size 1; sigmoid
        # squashes it into (0, 1).
        similarity_attention_weights = torch.sigmoid(self.similarity_attention(similarity_scores))
        # unsqueeze(-1) appends one more size-1 dimension before the
        # broadcasted multiply.
        # NOTE(review): confirm the resulting shape broadcasts against
        # last_hidden_state as intended.
        attended_output = last_hidden_state * similarity_attention_weights.unsqueeze(-1)

        return attended_output

In [None]:
# Archive the saved model directory into file.zip (notebook shell magic).
# NOTE(review): this zips 'finetuned_summary_model', while CustomModel above
# loads from 'finetuned_model' -- confirm both directories are intended.
!zip -r file.zip /kaggle/working/finetuned_summary_model
from IPython.display import FileLink
# As the cell's last expression, this renders a download link for the archive.
FileLink(r'file.zip')