In [None]:
!pip install -q rouge_score bert_score
!pip install evaluate

In [43]:
from transformers import MBartForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import AlbertTokenizer, AutoTokenizer
import pickle as pkl
import pandas as pd
import numpy as np
from datasets import Dataset
import torch
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from tqdm.notebook import tqdm
from bert_score import score
from datasets import load_metric
import wandb

In [None]:
wandb.login(key = "788dd34f5b4737da2945fd15125f904c0649fb24")

# Data Analysis

In [None]:
similarity = pkl.load(open("/kaggle/input/hindidataset/similarities.pkl","rb"))

In [None]:
df = pd.read_csv("/kaggle/input/hindidataset/HindiNews/HindiNews_train_v2/hindi_train.csv")

In [None]:
df.iloc[1]["Article"].split("।")

In [None]:
df.iloc[1]["Heading"]

In [None]:
torch.nn.functional.softmax(input = torch.tensor(similarity[1]), dim = 0)

# Setting up the Dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBARTSS", do_lower_case=False, use_fast=False, keep_accents=True)
checkpoint = "ai4bharat/IndicBARTSS"

In [None]:
train_df = pd.read_csv("/kaggle/input/hindidataset/HindiNews/HindiNews_train_v2/hindi_train.csv")
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.remove_columns(["Id"])

test_df = pd.read_csv("/kaggle/input/hindidataset/HindiNews/HindiNews_test.csv")
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.remove_columns(["id"])

In [58]:
dataset = train_dataset.train_test_split(test_size=0.3, shuffle = False)

In [None]:
def preprocess_data(example):
    inputs = tokenizer(example["Article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    outputs = tokenizer(example["Summary"], padding="max_length", truncation=True, max_length=64, return_tensors="pt")
    return {"input_ids": inputs["input_ids"].tolist(), "attention_mask": inputs["attention_mask"].tolist(), "labels": outputs["input_ids"].tolist()}

In [None]:
batched_dataset = dataset.map(preprocess_data, batched = True, batch_size = 16)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
batched_dataset

# Evaluation Metrics

In [None]:
rouge = evaluate.load("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Training

In [None]:
seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/model",
    num_train_epochs=1,
    logging_dir="/kaggle/working/logs",
    logging_steps=500,
    overwrite_output_dir=True,
    save_steps=1000,
    eval_steps=500,
    save_total_limit=3,
)

trainer = Seq2SeqTrainer(
    model=seq2seq_model,
    args=training_args,
    train_dataset=batched_dataset["train"],
    
    eval_dataset=batched_dataset["test"],
    data_collator=data_collator,
)

trainer.train()
seq2seq_model.save_pretrained("/kaggle/working/finetuned_summary_model")
tokenizer.save_pretrained("/kaggle/working/finetuned_summary_model")

In [44]:
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/finetuned_summary_model").to('cuda')
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/finetuned_summary_model", do_lower_case=False, use_fast=False, keep_accents=True)

def generate_heading(article):
    inputs = tokenizer(article, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to('cuda')
    output = model.generate(inputs["input_ids"], max_length=100, num_beams=4, early_stopping=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)

predictions = [generate_heading(article) for article in tqdm(batched_dataset["test"][:100]["Article"])]

metric_rouge = load_metric("rouge")
references = batched_dataset["test"][:100]["Summary"]
rouge_scores = metric_rouge.compute(predictions=predictions, references=references)

P, R, F1 = score(predictions, references, lang='hi', verbose=True)

with open("rouge_scores.txt", "w") as f:
    f.write(str(rouge_scores))

with open("bert_scsAaores.txt", "w") as f:
    f.write(f"P: {P.mean()}\nR: {R.mean()}\nF1: {F1.mean()}")

  metric_rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.03 seconds, 96.83 sentences/sec


In [100]:
print(P.mean(), R.mean())

tensor(0.7976) tensor(0.8187)


In [54]:
model.get_decoder()

MBartDecoder(
  (embed_tokens): Embedding(64015, 1024, padding_idx=0)
  (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
  (layers): ModuleList(
    (0-5): 6 x MBartDecoderLayer(
      (self_attn): MBartAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder_attn): MBartAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
     

In [48]:
rouge_scores

{'rouge1': AggregateScore(low=Score(precision=0.2645527777777777, recall=0.1485628459453343, fmeasure=0.16433847644910538), mid=Score(precision=0.3513333333333334, recall=0.22113956893639553, fmeasure=0.23474194653030972), high=Score(precision=0.43181249999999993, recall=0.29594341199950713, fmeasure=0.30731551856296435)),
 'rouge2': AggregateScore(low=Score(precision=0.12899455266955268, recall=0.0689343135826592, fmeasure=0.07615594874681562), mid=Score(precision=0.1953225108225108, recall=0.12663205258793492, fmeasure=0.12961318622038434), high=Score(precision=0.269300505050505, recall=0.19312302713626245, fmeasure=0.189653580518209)),
 'rougeL': AggregateScore(low=Score(precision=0.26306944444444447, recall=0.15159578806556356, fmeasure=0.16513981259118302), mid=Score(precision=0.3453333333333333, recall=0.21982793367623082, fmeasure=0.23095734785995922), high=Score(precision=0.4323361111111112, recall=0.2971100534871046, fmeasure=0.3033035367246011)),
 'rougeLsum': AggregateScore(

In [None]:
import torch
from torch import nn
from transformers import AutoModelForSeq2SeqLM

class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.seq2seq = AutoModelForSeq2SeqLM.from_pretrained('path_to_your_fine_tuned_model')
        self.similarity_attention = nn.Linear(self.seq2seq.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, similarity_scores):
        # Pass the input through Seq2Seq model
        outputs = self.seq2seq(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # Apply sentence similarity scores as attention
        similarity_attention_weights = torch.sigmoid(self.similarity_attention(similarity_scores))
        attended_output = last_hidden_state * similarity_attention_weights.unsqueeze(-1)

        return attended_output


In [None]:
!zip -r file.zip /kaggle/working/finetuned_summary_model
from IPython.display import FileLink
FileLink(r'file.zip')

In [73]:
def train_epoch(seq2seq_model , tokenizer, similarity_scores, train_dataset):
    for data in tqdm(train_dataset):
        article = data["Article"]
        heading = data["Heading"]
        summary = data["Summary"]
        print(summary)
        break

In [74]:
similarity_scores = pkl.load(open("/kaggle/input/hindidataset/similarities.pkl",'rb'))
train_epoch(seq2seq_model, tokenizer, similarity_scores, dataset["train"])
    

  0%|          | 0/14857 [00:00<?, ?it/s]

Kerala Minor Girl Rape Case - केरल के एर्नाकुलम जिले में 5 साल की बच्ची से रेप के बाद गला दबाकर हत्या कर दी गई। आरोपी ने बच्ची का शव बोरे में डालकर डंपिंग ग्राउंड में फेंक दिया
