In [2]:
# Mount Google Drive at /content/drive. Later cells read the training CSV from
# /content/drive/MyDrive and save a fine-tuned model back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install transformers[torch]


In [None]:
pip install accelerate -U


In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes (article, summary) pairs on access.

    Args:
        articles: Indexable collection of article texts.
        summaries: Indexable collection of summary texts, aligned with
            ``articles`` position by position.
        tokenizer: Callable invoked as ``tokenizer(article, summary, ...)``.
        max_length: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, articles, summaries, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.articles = articles
        self.summaries = summaries

    def __len__(self):
        # The article collection alone defines the dataset size.
        return len(self.articles)

    def __getitem__(self, idx):
        """Return the tokenizer output for the pair stored at position ``idx``."""
        text_pair = (self.articles[idx], self.summaries[idx])
        # return_tensors="pt" makes the tokenizer hand back PyTorch tensors.
        encode_options = {
            "truncation": True,
            "padding": "max_length",
            "max_length": self.max_length,
            "return_tensors": "pt",
        }
        return self.tokenizer(*text_pair, **encode_options)

# ---------------------------------------------------------------------------
# Fine-tune multilingual BERT (masked-LM head) on (article, summary) pairs.
# ---------------------------------------------------------------------------
df_train = pd.read_csv("/content/drive/MyDrive/hindi_train.csv")
train_articles = df_train["Article"].tolist()
train_summaries = df_train["Summary"].tolist()

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-uncased")

# Train on the GPU when the runtime has one; every batch is moved to the same
# device inside the loop.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_dataset = CustomDataset(train_articles, train_summaries, tokenizer)

batch_size = 1
epochs = 1
# torch.optim.AdamW replaces the deprecated transformers.AdamW and is the same
# optimizer class the later training cell of this notebook uses.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model.train()
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for batch in train_loader:
        optimizer.zero_grad()

        # The dataset tokenizes with return_tensors="pt", so each field is
        # collated to (batch, 1, seq_len); squeeze(1) drops the singleton axis.
        input_ids = batch["input_ids"].squeeze(1).to(device)
        attention_mask = batch["attention_mask"].squeeze(1).to(device)
        # Segment ids let BERT tell the article tokens from the summary tokens
        # of the encoded pair.
        token_type_ids = batch["token_type_ids"].squeeze(1).to(device)

        # NOTE(review): the labels are a copy of the unmasked inputs, so the
        # loss is computed on tokens the model can already see at the same
        # position — confirm this is the intended training objective.
        labels = input_ids.clone()
        # Label -100 is ignored by the loss, so padding contributes nothing.
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Batch Loss: {loss.item()}")


model.save_pretrained("./bert-summarization")




In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch.nn as nn
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

class CustomDataset(Dataset):
    """Seq2seq dataset built from a frame with Heading, Article and Summary columns.

    The heading and the article are encoded separately, each padded/truncated
    to ``max_source_length``, and concatenated, so ``input_ids`` and
    ``attention_mask`` are ``2 * max_source_length`` long. The summary is
    encoded to ``max_target_length`` tokens and split into teacher-forcing
    inputs and next-token labels.

    Args:
        dataframe: pandas DataFrame providing the three text columns.
        tokenizer: Object exposing ``encode_plus`` and ``encode``.
        max_source_length: Token budget for the heading and for the article
            (each, not combined).
        max_target_length: Token budget for the summary.
    """

    def __init__(self, dataframe, tokenizer, max_source_length=512, max_target_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx`` as a dict of 1-D tensors."""
        # Positional lookup: samplers hand out 0..len-1, which only coincides
        # with the index *labels* for a default RangeIndex. `.iloc` keeps the
        # dataset working on filtered, shuffled or re-indexed frames.
        heading = self.data['Heading'].iloc[idx]
        article = self.data['Article'].iloc[idx]
        target_text = self.data['Summary'].iloc[idx]

        heading_tokens = self.tokenizer.encode_plus(
            heading,
            max_length=self.max_source_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        article_tokens = self.tokenizer.encode_plus(
            article,
            max_length=self.max_source_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Both encodings are (1, max_source_length); join them along the token axis.
        input_ids = torch.cat((heading_tokens['input_ids'], article_tokens['input_ids']), dim=1)
        attention_mask = torch.cat((heading_tokens['attention_mask'], article_tokens['attention_mask']), dim=1)

        target_tokens = self.tokenizer.encode(
            target_text,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ).flatten()

        return {
            'input_ids': input_ids.flatten(),
            'attention_mask': attention_mask.flatten(),
            # Teacher forcing: the decoder sees tokens [0, n-1) and is trained
            # to predict tokens [1, n).
            'decoder_input_ids': target_tokens[:-1],
            'labels': target_tokens[1:]
        }

# ---------------------------------------------------------------------------
# Seq2seq fine-tuning: (heading, article) -> summary
# ---------------------------------------------------------------------------
# Imported here because this block is the only user of the class.
from transformers import EncoderDecoderModel

CHECKPOINT = 'bert-base-multilingual-uncased'

train_df = pd.read_csv("/content/drive/MyDrive/hindi_train.csv")

tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)

# The dataset yields `decoder_input_ids` and `labels`, i.e. an encoder-decoder
# training signal. BertForMaskedLM.forward() has no `decoder_input_ids`
# parameter, so the same checkpoint is used for both halves of an
# EncoderDecoderModel instead (the decoder's cross-attention layers are newly
# initialised and learned during fine-tuning).
model = EncoderDecoderModel.from_encoder_decoder_pretrained(CHECKPOINT, CHECKPOINT)
# Special-token ids the encoder-decoder wrapper needs for label shifting and
# for generate().
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# CustomDataset concatenates a heading segment and an article segment of
# `max_source_length` tokens each, so each one gets half of the encoder's
# position budget; the combined input then fits the position embeddings.
max_source_length = model.config.encoder.max_position_embeddings // 2

train_dataset = CustomDataset(train_df, tokenizer, max_source_length=max_source_length)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)


class CustomAttention(nn.Module):
    """Attention-pool heading and article encodings and stack the two results.

    Each input is scored token by token with its own linear layer; the scores
    are softmax-normalised along dim 1 and used as weights for a weighted sum
    of that input's encodings. The two pooled tensors are concatenated along
    dim 1.
    """

    def __init__(self, hidden_size):
        super(CustomAttention, self).__init__()
        self.hidden_size = hidden_size
        self.heading_attention = nn.Linear(hidden_size, 1)
        self.article_attention = nn.Linear(hidden_size, 1)

    def forward(self, heading_encodings, article_encodings):
        # Inputs must be 3-D (torch.bmm requires it) with a last dimension of
        # `hidden_size`; presumably (batch, seq_len, hidden_size) — confirm
        # against the caller once this module is wired in.
        heading_attention_weights = torch.softmax(self.heading_attention(heading_encodings), dim=1)
        article_attention_weights = torch.softmax(self.article_attention(article_encodings), dim=1)
        # Weights are permuted so the batched matmul collapses dim 1.
        heading_weighted = torch.bmm(heading_attention_weights.permute(0, 2, 1), heading_encodings)
        article_weighted = torch.bmm(article_attention_weights.permute(0, 2, 1), article_encodings)
        combined_encodings = torch.cat((heading_weighted, article_weighted), dim=1)
        return combined_encodings


# NOTE(review): this module is instantiated but not used in the forward pass
# below, and its parameters are not handed to the optimizer. Wire it in or
# remove it once the intended architecture is settled.
attention = CustomAttention(model.config.encoder.hidden_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 1

for epoch in range(num_epochs):
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        decoder_input_ids = batch['decoder_input_ids'].to(device)
        labels = batch['labels'].to(device)
        # Label -100 is ignored by the loss, so the padded tail of each summary
        # does not contribute to it.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels,
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

model.save_pretrained("/content/drive/MyDrive/NLP_project")


In [None]:
pip install datasets rouge-score bert-score

In [None]:
pip install Rouge

In [None]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from rouge import Rouge
from bert_score import score


# ---------------------------------------------------------------------------
# Evaluation: generate a summary per row and score it with ROUGE / BERTScore.
# ---------------------------------------------------------------------------
# Imported here so the cell does not depend on names leaked from the training
# cells.
from transformers import BertForMaskedLM, BertTokenizer

# NOTE(review): this reads the *training* CSV; point it at a held-out split if
# one exists, otherwise the scores measure memorisation.
test_df = pd.read_csv("/content/drive/MyDrive/hindi_train.csv")

# NOTE(review): generate() with beam search is called on this model below —
# confirm that the checkpoint at this path is meant to be loaded as
# BertForMaskedLM and not as a seq2seq model.
fine_tuned_model = BertForMaskedLM.from_pretrained("/content/drive/MyDrive/NLP_BertMaskLM")

# Loaded once, before first use (previously assigned only after the first
# tokenizer call and then reloaded on every row).
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
rouge = Rouge()

# Do not hand the model more tokens than it has position embeddings for; the
# previous fixed limit of 1024 is kept as the upper bound.
max_input_length = min(1024, getattr(fine_tuned_model.config, "max_position_embeddings", 1024))

rouge1_scores = []
rouge2_scores = []
rouge4_scores = []
bert_scores = []
generated_summaries = []
reference_summaries = []


for index, row in test_df.iterrows():
    heading = row['Heading']
    article = row['Article']
    target_summary = row['Summary']
    # A missing CSV cell is read as NaN (a float); treat it as an empty summary
    # so it is reported below instead of crashing on .strip().
    if not isinstance(target_summary, str):
        target_summary = ""

    inputs = tokenizer(heading, article, return_tensors="pt", max_length=max_input_length, truncation=True)
    summary_ids = fine_tuned_model.generate(inputs['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)

    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    if generated_summary.strip() == "" or target_summary.strip() == "":
        print(f"One of the summaries is empty for index {index}.")
        continue

    rouge_scores = rouge.get_scores(generated_summary, target_summary)[0]
    # NOTE(review): despite its name, this is the geometric mean of the ROUGE-1
    # and ROUGE-2 F-scores, not a 4-gram ROUGE.
    rouge4 = (rouge_scores['rouge-1']['f'] * rouge_scores['rouge-2']['f']) ** (1/2)
    rouge1_scores.append(rouge_scores['rouge-1']['f'])
    rouge2_scores.append(rouge_scores['rouge-2']['f'])
    rouge4_scores.append(rouge4)
    generated_summaries.append(generated_summary)
    reference_summaries.append(target_summary)

if not rouge1_scores:
    # Avoids a ZeroDivisionError when every row was skipped.
    print("No non-empty summary pairs were produced; nothing to average.")
else:
    # Score all pairs in one call: the scorer model is loaded once instead of
    # once per row. The third return value holds one F1 value per pair.
    _, _, bert_f1 = score(generated_summaries, reference_summaries, lang='hi', verbose=False)
    bert_scores = bert_f1.tolist()

    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
    avg_rouge4 = sum(rouge4_scores) / len(rouge4_scores)
    avg_bert = sum(bert_scores) / len(bert_scores)

    print("\nAverage ROUGE-1 F-score:", avg_rouge1)
    print("Average ROUGE-2 F-score:", avg_rouge2)
    print("Average ROUGE-4 F-score:", avg_rouge4)
    print("Average BERTScore:", avg_bert)
