In [2]:
!pip install -q rouge_score bert_score

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainingArguments, pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_metric, Dataset
from tqdm import tqdm
from bert_score import score
import wandb

In [4]:
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=False, use_fast=False, keep_accents=True)
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBART").to('cuda')
target = "Summary"

tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [5]:
train_data = pd.read_csv("/kaggle/input/hindi-dataset/hindi_train.csv")
# train_data.drop(columns=["Summary"], inplace=True)

# test_data = pd.read_csv("/kaggle/input/hindi-dataset/HindiNews_test.csv")
# test_data.rename(columns={"id": "Id"}, inplace=True)
# data = pd.concat([train_data, test_data], axis=0)
data = train_data

train_dataset, test_dataset = train_test_split(data, test_size=0.1, shuffle=False)
train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.1, shuffle=False)

In [6]:
def preprocess_data(example):
    inputs = tokenizer(example["Article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt").to('cuda')
    outputs = tokenizer(example[target], padding="max_length", truncation=True, max_length=64, return_tensors="pt").to('cuda')
    return {"input_ids": inputs["input_ids"].tolist(), "attention_mask": inputs["attention_mask"].tolist(), "labels": outputs["input_ids"].tolist()}

train_dataset = Dataset.from_pandas(train_dataset)
train_dataset = train_dataset.map(preprocess_data, batched=True, batch_size=96)
val_dataset = Dataset.from_pandas(val_dataset)
val_dataset = val_dataset.map(preprocess_data, batched=True, batch_size=96)
test_dataset = Dataset.from_pandas(test_dataset)
test_dataset = test_dataset.map(preprocess_data, batched=True, batch_size=96)

Map:   0%|          | 0/17191 [00:00<?, ? examples/s]

Map:   0%|          | 0/1911 [00:00<?, ? examples/s]

Map:   0%|          | 0/2123 [00:00<?, ? examples/s]

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/model",
    num_train_epochs=1,
    logging_dir="/kaggle/working/logs",
    logging_steps=500,
    overwrite_output_dir=True,
    save_steps=1000,
    eval_steps=500,
    save_total_limit=3,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()
model.save_pretrained("/kaggle/working/finetuned_model")
tokenizer.save_pretrained("/kaggle/working/finetuned_model")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,2.3133
1000,1.3485


Non-default generation parameters: {'forced_eos_token_id': 2}


In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/finetuned_model").to('cuda')
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/finetuned_model", do_lower_case=False, use_fast=False, keep_accents=True)

def generate_heading(article):
    inputs = tokenizer(article, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to('cuda')
    output = model.generate(inputs["input_ids"], max_length=100, num_beams=4, early_stopping=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)

predictions = [generate_heading(article) for article in tqdm(test_dataset["Article"])]

metric_rouge = load_metric("rouge")
references = test_dataset[target]
rouge_scores = metric_rouge.compute(predictions=predictions, references=references)

P, R, F1 = score(predictions, references, lang='hi', verbose=True)

In [None]:
with open("rouge_scores.txt", "w") as f:
    f.write(str(rouge_scores))

with open("bert_scores.txt", "w") as f:
    f.write(f"P: {P.mean()}\nR: {R.mean()}\nF1: {F1.mean()}")

In [None]:
rouge_scores

In [None]:
print(P.mean())
print(R.mean())
print(F1.mean())

In [None]:
!zip -r file.zip /kaggle/working/finetuned_model

In [None]:
from IPython.display import FileLink
FileLink(r'file.zip')

In [None]:
# model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/finetuned_model").to('cuda')
# tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/finetuned_model", do_lower_case=False, use_fast=False, keep_accents=True)

# def generate_heading(article):
#     inputs = tokenizer(article, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to('cuda')
#     output = model.generate(inputs["input_ids"], max_length=64, num_beams=4, early_stopping=True)
#     return tokenizer.decode(output[0], skip_special_tokens=True)

# predictions = [generate_heading(article) for article in test_dataset["Article"][:1]]

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("Someman/bart-hindi", do_lower_case=False, use_fast=False, keep_accents=True)
# model = AutoModelForSeq2SeqLM.from_pretrained("Someman/bart-hindi").to('cuda')