In [8]:
import os

os.chdir("/content/drive/MyDrive/NLP Project")

In [9]:
%ls

config.json  generation_config.json  HindiNews_test.csv  hindi_train.csv  model.safetensors


In [None]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from torch.utils.data import Dataset, DataLoader


class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_source_length=512, max_target_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text = self.data['Article'][idx]
        target_text = self.data['Summary'][idx]

        source_tokens = self.tokenizer.encode_plus(
            source_text,
            max_length=self.max_source_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target_tokens = self.tokenizer.encode(
            target_text,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': source_tokens['input_ids'].flatten(),
            'attention_mask': source_tokens['attention_mask'].flatten(),
            'decoder_input_ids': target_tokens.flatten()[:-1],  # ignore last token (<eos>)
            'labels': target_tokens.flatten()[1:]  # ignore first token (<bos>)
        }


train_df = pd.read_csv("hindi_train.csv")


tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
config = BartConfig.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn", config=config)


train_dataset = CustomDataset(train_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 1

for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


model.save_pretrained("/content/drive/MyDrive/NLP Project")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
# making predictions using the trained model

In [None]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration


test_df = pd.read_csv("HindiNews_test.csv")


model = BartForConditionalGeneration.from_pretrained("/content/drive/MyDrive/NLP Project")


tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')


test_inputs = tokenizer.batch_encode_plus(
    test_df['Article'].tolist(),
    max_length=1024,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)


input_ids = test_inputs['input_ids'].to(model.device)
attention_mask = test_inputs['attention_mask'].to(model.device)
summaries = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

ummaries
decoded_summaries = tokenizer.batch_decode(summaries, skip_special_tokens=True)


for i, summary in enumerate(decoded_summaries):
    print(f"Article {i} Summary: {summary}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# above code crashing even in t4, trying to make predciitons just on one instance of data

In [None]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration


test_df = pd.read_csv("HindiNews_test.csv")


model = BartForConditionalGeneration.from_pretrained("/content/drive/MyDrive/NLP Project")


tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')


test_input = tokenizer.encode(test_df['Article'][0], return_tensors='pt', max_length=1024, truncation=True)


summary_ids = model.generate(test_input, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)


summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)


print("Summary of the first article:")
print(summary)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Summary of the first article:
अमृतपाल को वारिस पंजाब दे का पीएम में कहा, ‘‘जल्लाना था’ जोरी दर सकता हूं। उनके ‘वहन’, मराजुमादी, �


In [None]:
# printing summaries of first 10 articles

In [None]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration


test_df = pd.read_csv("HindiNews_test.csv")


model = BartForConditionalGeneration.from_pretrained("/content/drive/MyDrive/NLP Project")


tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')


for i in range(5):
    test_input = tokenizer.encode(test_df['Article'][i], return_tensors='pt', max_length=1024, truncation=True)


    summary_ids = model.generate(test_input, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)


    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)


    print(f"Summary of article {i+1}:")
    print(summary)
    print()


Summary of article 1:
अमृतपाल को वारिस पंजाब दे का पीएम में कहा, ‘‘जल्लाना था’ जोरी दर सकता हूं। उनके ‘वहन’, मराजुमादी, �

Summary of article 2:
Rajasthan Assembly (Vidhan Sabha) Budget Session 2023 Latest News Update; Follow Rajasthan Vidhan Sabha Chunav (Assembly) Session LIVE Updates with Dainik Bhaskar (दैनिक भास्कर) कांग्‍रेस ने राहुल की मौसम से ‘मीड’ महारण’, जलान है कि

Summary of article 3:
Aam Aadmi Party (AAP) Chief Sharad Pawar Attacks On BJP Leader Rahul Gandhi. संसद परिस्टी ने कहा, ‘‘राघव कौआ बैठ गया। उस सेनाओं मोहन का मामला हुए हाईएम हर जनता रह

Summary of article 4:
Rajasthan Vidhan Sabha (Assembly) Budget Session 2023 Latest News Update. Follow Rajasthan Budget Session Latest News, Reports and Updates On Dainik Bhaskar (दैनिक भास्कर)  राषीय लोकेपर पायलटी बनाने को कहा- पहुंचा है। इसका कारण मे

Summary of article 5:
बचपन में कश्मीर से हूं। बायोलॉजिकलनी के ‘तुम मजान’ मामला कार्’यहां’, लड़का था भी, इसको लाइन, जवा



In [None]:
# for making predicitions on 5 summaries, code took 4 minutes on using t4, training was done in almost 2 hours, for just 1 epoch

In [3]:
!pip install rouge bert-score seqeval

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
# above score is for iloc[:3]

In [None]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from rouge import Rouge
from bert_score import score


test_df = pd.read_csv("hindi_train.csv").iloc[:4]


model = BartForConditionalGeneration.from_pretrained("/content/drive/MyDrive/NLP Project")


tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')


rouge1_scores = []
rouge2_scores = []
rouge4_scores = []
bert_scores = []


for index, row in test_df.iterrows():
    #heading = row['Heading']
    article = row['Article']
    target_summary = row['Summary']


    input_ids = tokenizer.encode_plus(
        #heading,
        article,
        return_tensors='pt',
        max_length=1024,
        truncation=True,
    )


    summary_ids = model.generate(input_ids['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)


    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)


    if generated_summary.strip() == "" or target_summary.strip() == "":
        print(f"One of the summaries is empty for index {index}.")
    else:

        rouge = Rouge()
        rouge_scores = rouge.get_scores(generated_summary, target_summary)[0]


        rouge4 = (rouge_scores['rouge-1']['f'] * rouge_scores['rouge-2']['f']) ** (1/2)


        _, _, bert_score = score([generated_summary], [target_summary], lang='hi', verbose=False)


        rouge1_scores.append(rouge_scores['rouge-1']['f'])
        rouge2_scores.append(rouge_scores['rouge-2']['f'])
        rouge4_scores.append(rouge4)
        bert_scores.append(bert_score.mean().item())


        print("\nGenerated Summary:", generated_summary)
        print("Target Summary:", target_summary)


avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
avg_rouge4 = sum(rouge4_scores) / len(rouge4_scores)
avg_bert = sum(bert_scores) / len(bert_scores)

print("\nAverage ROUGE-1 F-score:", avg_rouge1)
print("Average ROUGE-2 F-score:", avg_rouge2)
print("Average ROUGE-4 F-score:", avg_rouge4)
print("Average BERTScore:", avg_bert)
