In [None]:
def preprocess_function(text, summary):
    """Tokenize a single (text, summary) pair into fixed-length model features.

    Relies on a module-level ``tokenizer``. Both arguments are passed through
    ``str()`` before tokenization, so non-string values are tokenized as their
    string representation.

    Note: a later cell of this notebook defines another ``preprocess_function``
    that takes a batch dict; once that cell has run, it shadows this one.

    Args:
        text: Source document.
        summary: Reference summary for ``text``.

    Returns:
        dict of 1-D tensors: ``input_ids`` and ``attention_mask`` for the text
        (padded/truncated to 512 tokens) and ``labels`` for the summary
        (padded/truncated to 128 tokens). Padding positions in ``labels`` are
        set to -100.
    """
    inputs = tokenizer(str(text), max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    targets = tokenizer(str(summary), max_length=128, padding="max_length", truncation=True, return_tensors="pt")

    label_ids = targets.input_ids.flatten()
    # Hugging Face seq2seq models ignore label id -100 in the loss. Leaving the
    # pad token id in place would train the model to predict padding.
    label_ids = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)

    return {
        "input_ids": inputs.input_ids.flatten(),
        "attention_mask": inputs.attention_mask.flatten(),
        "labels": label_ids,
    }


In [None]:
from datasets import Dataset

# Collect each feature column of preprocessed_data as a plain Python list
# (a dict of lists, keyed by column name).
# NOTE(review): preprocessed_data is not defined in any earlier cell of this
# notebook; this cell only works if it is already in the kernel.
dataset_dict = {
    column: preprocessed_data[column].tolist()
    for column in ("input_ids", "attention_mask", "labels")
}

# Wrap the column lists in a Hugging Face Dataset.
dataset = Dataset.from_dict(dataset_dict)


In [None]:
# Tokenize every row with the two-argument preprocess_function(text, summary).
# NOTE(review): `data` is not defined in any earlier cell of this notebook;
# presumably a DataFrame with 'Text' and 'Summary' columns — confirm.
preprocessed_data = data.apply(
    lambda row: preprocess_function(row['Text'], row['Summary']),
    axis=1,
)

# apply(axis=1) yields one feature dict per row, but Dataset.from_dict expects
# a mapping of column name -> list of values, so pivot the rows into columns.
dataset = Dataset.from_dict({
    column: [row_features[column].tolist() for row_features in preprocessed_data]
    for column in ("input_ids", "attention_mask", "labels")
})


In [None]:
import pandas as pd
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

**Initialize model and tokenizer**

In [None]:
# Pretrained mT5-small checkpoint used as the starting point for fine-tuning.
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

# Matching tokenizer for the same checkpoint.
# NOTE(review): src_lang / tgt_lang look like mBART-style language codes;
# presumably the mT5 tokenizer does not act on them — confirm.
tokenizer = MT5Tokenizer.from_pretrained(
    "google/mt5-small",
    src_lang="te_IN",
    tgt_lang="te_IN",
)

**Loading train dataset**

In [None]:
from datasets import Dataset
import pandas as pd

# Load the training CSV.
df = pd.read_csv("/kaggle/input/final-dataset/final_train.csv")

# Drop rows with a missing text or summary. Without this, astype(str) below
# turns NaN into the literal string "nan", which would be trained on as if it
# were real content.
df = df.dropna(subset=["Text", "Summary"]).reset_index(drop=True)

# Force both columns to str so the tokenizer never receives non-string values.
df["Text"] = df["Text"].astype(str)
df["Summary"] = df["Summary"].astype(str)

# Column-oriented dict with the lowercase keys preprocess_function reads.
dataset_dict = {"text": df["Text"].tolist(), "summary": df["Summary"].tolist()}

# Create the Hugging Face Dataset used for training.
dataset = Dataset.from_dict(dataset_dict)

**Defining training arguments**

In [None]:
training_args = TrainingArguments(
    # Directory that receives checkpoints and trainer outputs.
    output_dir="./results",
    # Total number of passes over the training set.
    num_train_epochs=5,
    # Examples per device in each training step.
    per_device_train_batch_size=4,
    # Number of update steps between checkpoint saves.
    # NOTE(review): presumably larger than the total number of training steps,
    # in which case no intermediate checkpoint is written — confirm intent.
    save_steps=100000,
    # Keep at most this many checkpoints on disk.
    save_total_limit=2,
)

**Defining function to preprocess the dataset**

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples into fixed-length seq2seq features.

    Written for ``Dataset.map(..., batched=True)``; relies on a module-level
    ``tokenizer``.

    Args:
        examples: Mapping with a "text" entry (source documents) and a
            "summary" entry (reference summaries), each a list of strings.

    Returns:
        dict with "input_ids" and "attention_mask" for the texts
        (padded/truncated to 512 tokens) and "labels" for the summaries
        (padded/truncated to 128 tokens). Padding positions in "labels" are
        set to -100.
    """
    inputs = tokenizer(examples["text"], return_tensors="pt", max_length=512, padding='max_length', truncation=True)
    targets = tokenizer(examples["summary"], return_tensors="pt", max_length=128, padding='max_length', truncation=True)

    # Hugging Face seq2seq models ignore label id -100 in the loss. Leaving the
    # pad token id in place would train the model to predict padding.
    labels = targets.input_ids.masked_fill(targets.input_ids == tokenizer.pad_token_id, -100)

    return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": labels}


**Preprocessing the dataset**

In [None]:
# Tokenize the training set in batches; preprocess_function receives a dict of
# column lists because batched=True.
train_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

**Initializing the trainer**

In [None]:
# Trainer wiring: the mT5 model, the training arguments defined above and the
# tokenized training set. No evaluation dataset or metric function is attached.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)


**Finetuning the model**

In [None]:
# Run fine-tuning with the settings in training_args (5 epochs, batch size 4 per device).
trainer.train()

**Save the model**

In [None]:
# Persist the fine-tuned model to ./mt5_finetuned_model (relative to the current working directory).
trainer.save_model("./mt5_finetuned_model")

In [None]:
# Archive the saved model directory so it can be downloaded as a single file.
# NOTE(review): assumes the working directory is /kaggle/working, so that the
# "./mt5_finetuned_model" written by save_model resolves to this path — confirm.
!zip -r mt5_model.zip /kaggle/working/mt5_finetuned_model

In [None]:
# Archive the generated summaries CSV for download.
# NOTE(review): generated_summaries.csv is only written by the later
# "Store the summaries in a csv file" cell, so on a fresh top-to-bottom run
# the file does not exist yet when this cell executes. Run this cell after
# that one (or move it below it).
!zip -r 1mt5_latest_gensum.zip /kaggle/working/generated_summaries.csv

In [None]:
from IPython.display import FileLink
# Display a link to the zipped summaries archive created in the previous cell
# (the path is relative to the current working directory).
FileLink(r'1mt5_latest_gensum.zip')

**Load the test dataset**

In [None]:
from datasets import Dataset
import pandas as pd

# Read the held-out test split.
df2 = pd.read_csv("/kaggle/input/final-dataset/final_test.csv")

# Cast both columns to str in one step so the tokenizer only ever sees strings.
# NOTE(review): astype(str) turns missing values into the literal string "nan";
# rows are deliberately kept so the output stays aligned with the input file.
df2 = df2.astype({"Text": str, "Summary": str})

# Column-oriented dict with the lowercase keys preprocess_function reads.
dataset_dict2 = {
    "text": df2["Text"].tolist(),
    "summary": df2["Summary"].tolist(),
}

# Wrap the columns in a Hugging Face Dataset.
dataset2 = Dataset.from_dict(dataset_dict2)

**Preprocess the test dataset**

In [None]:
# Tokenize the test set with the same batched preprocessing used for training.
test_dataset = dataset2.map(
    preprocess_function,
    batched=True,
)

**Generate the summaries**

In [None]:
import torch

# Upper bound on generated length, matching the 128-token cap applied to the
# reference summaries in preprocess_function. Without an explicit limit,
# generate() falls back to the model's default generation length.
MAX_SUMMARY_TOKENS = 128

# Generated summaries, in the same order as test_dataset.
generated_summaries = []

# Move the model to GPU when available, otherwise stay on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Switch to inference mode: training leaves dropout enabled, which would make
# the generated text noisy and non-deterministic.
model.eval()

for example in test_dataset:
    # Each example holds plain lists; wrap them in a batch dimension of 1.
    input_ids = torch.tensor([example["input_ids"]]).to(device)
    attention_mask = torch.tensor([example["attention_mask"]]).to(device)

    # Generate the summary token ids for this single example.
    prediction = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=MAX_SUMMARY_TOKENS,
    )

    # Decode back to text, dropping pad/eos and other special tokens.
    generated_summary = tokenizer.decode(prediction[0], skip_special_tokens=True)
    generated_summaries.append(generated_summary)

# Number of examples processed (kept for parity with the original counter).
count = len(generated_summaries)

**Store the summaries in a csv file**

In [None]:
import csv

# Destination for the generated summaries (Kaggle working directory).
csv_file = '/kaggle/working/generated_summaries.csv'

# Write one summary per row in a single column. newline='' lets the csv module
# control line endings; UTF-8 keeps non-ASCII text intact.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows([summary] for summary in generated_summaries)

print("File created and data stored successfully.")


**Print the summaries**

In [None]:
# Show up to the first 10 test examples next to their generated summaries.
# zip() stops at the shortest input, so this no longer raises when fewer than
# 10 rows or summaries are available.
PREVIEW_ROWS = 10
preview = zip(
    df2["Text"].head(PREVIEW_ROWS),
    df2["Summary"].head(PREVIEW_ROWS),
    generated_summaries,
)
for text, reference, generated in preview:
    print(f"Text: {text}")
    print(f"Original Summary: {reference}")
    print(f"Generated Summary: {generated}\n")

**Function to calculate BLEU score**

In [None]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_summary(reference, generated):
    """Compute the sentence-level BLEU score of a generated summary.

    ``sentence_bleu`` works on token sequences. Passing raw strings makes it
    iterate character by character, which yields a character-level score, so
    string inputs are split on whitespace first. Inputs that are already
    token lists are passed through unchanged.

    Args:
        reference: Reference summary, as a string or a list of tokens.
        generated: Generated summary, as a string or a list of tokens.

    Returns:
        The value returned by ``sentence_bleu`` for the single reference.
    """
    reference_tokens = reference.split() if isinstance(reference, str) else reference
    generated_tokens = generated.split() if isinstance(generated, str) else generated

    # sentence_bleu takes a list of references, hence the extra list level.
    bleu_score = sentence_bleu([reference_tokens], generated_tokens)

    return bleu_score

**Avg BLEU score calculation**

In [None]:
# Average BLEU over up to the first EVAL_SAMPLES test examples.
EVAL_SAMPLES = 10

# zip() stops at the shortest input, so fewer than EVAL_SAMPLES rows or
# generated summaries no longer raises.
bleu_scores = [
    evaluate_summary(reference_summary, generated_summary)
    for reference_summary, generated_summary in zip(
        df2["Summary"].head(EVAL_SAMPLES), generated_summaries
    )
]

# Divide by the number of pairs actually scored; guard against an empty list.
avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
print(avg_bleu)