In [1]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

df = pd.read_csv("TOSDR_labeled_with_summaries.csv")

train_texts = df["Text"].tolist()
train_summaries = df["eng_summary"].tolist()

class CustomDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_source_length=512, max_target_length=128):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]

        input_ids = self.tokenizer(text, max_length=self.max_source_length, truncation=True, padding='max_length', return_tensors='pt')['input_ids']
        output_ids = self.tokenizer(summary, max_length=self.max_target_length, truncation=True, padding='max_length', return_tensors='pt')['input_ids']

        return input_ids.squeeze(0), output_ids.squeeze(0)


def fine_tune_model(model, train_loader, optimizer, epochs, device):
    model.train()
    model.to(device)
    for epoch in range(epochs):
        total_loss = 0
        for source_ids, target_ids in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
            optimizer.zero_grad()
            source_ids = source_ids.to(device)
            target_ids = target_ids.to(device)
            outputs = model(input_ids=source_ids, labels=target_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')


tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
train_dataset = CustomDataset(texts=train_texts, summaries=train_summaries, tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

fine_tune_model(model, train_loader, optimizer, epochs=3, device=device)

# model.save_pretrained("fine_tuned_bart_model")


  from .autonotebook import tqdm as notebook_tqdm
Epoch 1/3: 100%|██████████| 209/209 [10:47:56<00:00, 186.01s/it]   


Epoch 1/3, Loss: 0.9085857429002461


Epoch 2/3: 100%|██████████| 209/209 [1:51:54<00:00, 32.13s/it]   


Epoch 2/3, Loss: 0.3716584734226528


Epoch 3/3: 100%|██████████| 209/209 [1:37:49<00:00, 28.08s/it]

Epoch 3/3, Loss: 0.2889743675455522





In [2]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration


bart_model_name = "fine_tuned_bart_model_eng_to_engSummary"
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained(bart_model_name)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def generate_summary(text, max_length=20):

    inputs = tokenizer(text, max_length=max_length, return_tensors="pt", truncation=True)
    input_ids = inputs.input_ids.to(device)


    summary_ids = model.generate(input_ids, max_length=100, min_length=10, length_penalty=2.0, num_beams=5, early_stopping=True)
    

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    
    return summary


text_to_summarize = "Please note that if you request the erasure of your personal information: We may retain some of your personal information as necessary for our legitimate business interests, such as fraud detection and prevention and enhancing safety.For example, if we suspend an Airbnb Account for fraud or safety reasons, we may retain certain information from that Airbnb Account to prevent that Member from opening a new Airbnb Account in the future.We may retain and use your personal information to the extent necessary to comply with our legal obligations.For example, Airbnb and Airbnb Payments may keep some of your information for tax, legal reporting and auditing obligations."
# text_to_summarize = "When signing up for and using our service, we will collect the following information about you:Information that you provide when creating an Account to become a User.This information includes, but is not limited to:Information, such as your name and address, that uniquely identifies you as a natural or legal person"
# text_to_summarize = "We have the right to monitor, terminate, suspend, or delete any User Account at any time for any reason, or no reason.It is our policy not to comment on any reasons for termination and we have no obligation to provide you with a reason for termination."
# text_to_summarize = "Our computer systems and third party hosting provider systems are currently based in the United States and may be located in other countries, so your personal data will be processed by us in the U.S.and other countries where data protection and privacy regulations may not offer the same level of protection as in other parts of the world, such as the European Union.If you create a user account with the Site as a visitor from outside the United States, by using the Site you agree to this Privacy Policy and you consent to the transfer of all such information to the United States, which may not offer an equivalent level of protection of that required in the European Union or certain other countries, and to the processing of that information as described in this Privacy Policy."
# text_to_summarize = "We may suspend or terminate your rights to use the Parsec Properties (including your Account) at any time for any reason at our sole discretion, including for any use of the Parsec Properties in violation of these Terms.Upon termination of your rights under these Terms, your Account and right to access and use the Parsec Properties will terminate immediately."
generated_summary = generate_summary(text_to_summarize)
print("Generated Summary:", generated_summary)


OSError: fine_tuned_bart_model_eng_to_engSummary is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [10]:
# model.save_pretrained("fine_tuned_bart_model_eng_to_engSummary")


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
import pandas as pd


df = pd.read_csv("TOSDR_labeled_with_summaries.csv")


def generate_summary_for_row(row):
    text = row["Text"]
    generated_summary = generate_summary(text)
    return generated_summary


df["gen_summary"] = df.apply(generate_summary_for_row, axis=1)


df.to_csv("TOSDR_labeled_with_summaries_and_gen.csv", index=False)
