In [1]:
import pandas as pd
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Trainer, TrainingArguments
from datasets import Dataset

2024-05-01 12:47:37.508551: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-01 12:47:37.508653: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-01 12:47:37.635331: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


**Initialize model and tokenizer**

In [None]:
# Initialize model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="te_IN", tgt_lang="te_IN")

**Loading train dataset**

In [6]:
from datasets import Dataset
import pandas as pd

# Load CSV dataset
df = pd.read_csv("/kaggle/input/final-dataset/final_train.csv")

# Ensure "Text" and "Summary" columns are string type
df["Text"] = df["Text"].astype(str) 
df["Summary"] = df["Summary"].astype(str)

# Create a dictionary from DataFrame
dataset_dict = {"text": df["Text"].tolist(), "summary": df["Summary"].tolist()}

# Create Hugging Face Dataset
dataset = Dataset.from_dict(dataset_dict)



**Defining training arguments**

In [7]:
training_args = TrainingArguments(
    output_dir="./results",  # output directory
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    save_steps=100000,  # number of updates steps before checkpoint saves
    save_total_limit=2,  # limit the total amount of checkpoints
)


****

**Defining function to preprocess the dataset**

In [8]:
def preprocess_function(examples):
    inputs = tokenizer(examples["text"], return_tensors="pt", max_length=512, padding='max_length', truncation=True)
    targets = tokenizer(examples["summary"], return_tensors="pt", max_length=128, padding='max_length', truncation=True)
    return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": targets.input_ids}


**preprocessing the dataset**

In [9]:
train_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/10414 [00:00<?, ? examples/s]

**Defining function to compute metrics**

In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return {"bleu": sacrebleu.corpus_bleu(decoded_preds, [decoded_labels]).score}

**Initializing the trainer**

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


**Finetuning the model**

In [12]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,0.043
1000,0.0485
1500,0.0466
2000,0.0526
2500,0.048
3000,0.0537
3500,0.0609
4000,0.0614
4500,0.0694
5000,0.0859


KeyboardInterrupt: 

**Save the model**

In [None]:
trainer.save_model("./finetuned_model")

In [None]:
!zip -r mbart50_latest_finetuned.zip /kaggle/working/finetuned_model

In [None]:
from IPython.display import FileLink
FileLink(r'mbart50_latest_finetuned.zip')

**Load test dataset**

In [15]:
from datasets import Dataset
import pandas as pd

# Load CSV dataset
df2 = pd.read_csv("/kaggle/input/final-dataset/final_test.csv")

# Ensure "Text" and "Summary" columns are string type
df2["Text"] = df2["Text"].astype(str) 
df2["Summary"] = df2["Summary"].astype(str)

# Create a dictionary from DataFrame
dataset_dict2 = {"text": df2["Text"].tolist(), "summary": df2["Summary"].tolist()}

# Create Hugging Face Dataset
dataset2 = Dataset.from_dict(dataset_dict2)

**Preprocess the test dataset**

In [16]:
test_dataset = dataset2.map(preprocess_function, batched=True)

Map:   0%|          | 0/2604 [00:00<?, ? examples/s]

**Generate the summaries**

In [20]:
import torch

# List to store generated summaries
generated_summaries = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Iterate over the tokenized test dataset
count=0
for example in test_dataset:
    # Convert input_ids and attention_mask to tensors and move to appropriate device
    input_ids = torch.tensor([example["input_ids"]]).to(device)
    attention_mask = torch.tensor([example["attention_mask"]]).to(device)
    
    # Generate prediction for the current example
    prediction = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    
    # Decode the prediction and append it to the list
    generated_summary = tokenizer.decode(prediction[0], skip_special_tokens=True)
    generated_summaries.append(generated_summary)
    count=count+1
    #if(count>10):
        #break


#for i in range(10):
 #   print(f"Text: {test_texts[i]}")
  #  print(f"Original Summary: {test_summaries[i]}")
   # print(f"Generated Summary: {generated_summaries[i]}\n")
    

**Printing the summaries**

In [21]:
for i in range(10):
    print(f"Text: {df2['Text'][i]}")
    print(f"Original Summary: {df2['Summary'][i]}")
    print(f"Generated Summary: {generated_summaries[i]}\n")
    #print(f"Generated Summary: {generated_summaries_telugu[i]}\n")

Text: ఖమ్మంక్రైం: వీరిద్దరే కాదు జిల్లాలో ఎన్నికల కోడ్‌ అమల్లోకి వచ్చిన నాటి నుంచి వ్యాపారులు, చాలా మంది ప్రజలు ఈ పరిస్థితి ఎదుర్కొంటున్నారు. అవసరాలు, కారణాలు సరిగ్గా పరిశీలించకుండా.. వివరాలు తెలుసుకోకుండా కనిపించిన నగదును సీజ్‌ చేస్తుండంతో ఇబ్బంది పడుతున్నారు. బ్యాంకులు, వ్యాపార కూడళ్ల వద్ద మఫ్టీలో ఉంటున్న పోలీసులు రూ.50 వేలకు మించి ఎవరి వద్ద నగదు ఉన్నా స్వాధీనం చేసుకుంటున్నారు. అన్ని పత్రాలు ఉండాలని అధికారులు చెబుతున్నా.. రోజువారీ లావాదేవీలు, వ్యాపార, కుటుంబ అవసరాల కోసం నగదు తీసుకెళ్తున్న వారు చిక్కుల్లో పడుతున్నారు. బ్యాంకుల వద్ద మకాం ఎన్నికల కోడ్‌ అమలులోకి వచ్చిన దగ్గర నుంచి చెక్‌పోస్ట్‌ల వద్ద తనిఖీలు ముమ్మరం చేసిన పోలీసులు.. గతంలో ఎన్నడూ లేని విధంగా బ్యాంకులు, వ్యాపార లావాదేవీలు జరిగే ప్రాంతంలో తిష్ట వేయడం గమనార్హం. నిత్యం వ్యాపారాలు చేసే వారికి రూ.లక్షల్లో నగదు లావాదేవీలు తప్పనిసరి. ఇంకోపక్క వివాహాల సీజన్‌ కావడంతో దుస్తులు, వెండి, బంగారం కొనుగోలుకు వస్తున్న వారు సైతం పోలీసుల చేతుల్లో చిక్కుకుంటున్నారు. దీంతో పోలీసులను చూస్తే భయపడాల్సిన పరిస్థితులు ఎదురవుతున్నాయి. ఒకవేళ ఎన్నికల కో

**Function to calculate Bleu score**

In [22]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_summary(reference, generated):
    # BLEU score
    bleu_score = sentence_bleu([reference], generated)

    # ROUGE score

    return bleu_score

# if __name__ == "__main__":
#     # Telugu Text
#     #telugu_text = "టెక్నాలజీ ఒక వ్యాసం చాలా జరిగింది. ఇది మన జీవనాన్ని ప్రభావితం చేసింది. కొన్ని దేశాల్లో ఇప్పటివరకు జరిగిన ప్రధాన క్రియాశీలతలు టెక్నాలజీ ఫలితాలు."

#     # Generated Summary
#     generated_summary = "The guard arrived late because it was raining"

#     # Dummy reference summary (since you don't have a reference, this is just for demonstration)
#     reference_summary = "The guard arrived late because of the rain"


#     bleu = evaluate_summary(reference_summary, generated_summary)
#     print("\nEvaluation Scores:")
#     print(f"BLEU Score: {bleu}")


**Avg Bleu score calculation**

In [23]:
avg_bleu = 0
bleu = 0
for i in range(len(df2["Summary"])):
    reference_summary = df2["Summary"][i]
    generated_summary = generated_summaries[i]
    bleu = evaluate_summary(reference_summary, generated_summary)
    avg_bleu += bleu
print(avg_bleu/len(df2["Summary"]))
    

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.4088327204369504


**Store the summaries in a csv file**

In [24]:
import csv

# Assuming you have a list of generated summaries called generated_summaries
#generated_summaries = ['Summary 1', 'Summary 2', 'Summary 3', ...]

# Specify the file name
csv_file = '/kaggle/working/generated_summaries.csv'  # Saving in the working directory in Kaggle

# Open the CSV file in write mode and write the summaries
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    # Create a CSV writer object
    writer = csv.writer(file)
    
    # Write each summary in the list as a row with a single column
    for summary in generated_summaries:
        writer.writerow([summary])

print("File created and data stored successfully.")

File created and data stored successfully.


In [25]:
!zip -r 1mbart50_latest_gensum.zip /kaggle/working/generated_summaries.csv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/generated_summaries.csv (deflated 77%)


In [26]:
from IPython.display import FileLink
FileLink(r'1mbart50_latest_gensum.zip')