In [1]:
pip install transformers torch pandas

Note: you may need to restart the kernel to use updated packages.


In [28]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [29]:
# Assuming we have a dataset with "review_text" and "spoiler_free_text" columns
# Load the dataset (modify this part based on your actual dataset)
reviews = pd.read_json('/kaggle/input/imdb-spoiler-dataset/IMDB_reviews.json', lines=True)
reviews = reviews[:5000]
# For this example, we assume that we somehow have a column `spoiler_free_text` containing spoiler-free versions of the reviews.
# Normally, you would need to manually create or source this data.
reviews['spoiler_free_text'] = reviews['review_text'].apply(lambda x: "Your spoiler-free version here.")  # Placeholder

In [30]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(reviews, test_size=0.2, random_state=42)

In [31]:
# Define the SpoilerDataset class
class SpoilerDataset(Dataset):
    def __init__(self, tokenizer, source_texts, target_texts, max_len=512):
        self.tokenizer = tokenizer
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.max_len = max_len

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, index):
        source_text = str(self.source_texts[index])
        target_text = str(self.target_texts[index])

        source = self.tokenizer(
            source_text, 
            max_length=self.max_len, 
            padding="max_length", 
            truncation=True, 
            return_tensors="pt"
        )
        
        target = self.tokenizer(
            target_text, 
            max_length=self.max_len, 
            padding="max_length", 
            truncation=True, 
            return_tensors="pt"
        )

        labels = target["input_ids"].clone().detach()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': source['input_ids'].squeeze(), 
            'attention_mask': source['attention_mask'].squeeze(), 
            'labels': labels.squeeze()
        }

In [32]:
# Load BART tokenizer and model
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [33]:
# Create datasets
train_dataset = SpoilerDataset(tokenizer, train_data['review_text'].tolist(), train_data['spoiler_free_text'].tolist())
test_dataset = SpoilerDataset(tokenizer, test_data['review_text'].tolist(), test_data['spoiler_free_text'].tolist())

In [35]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True
)




In [36]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [37]:
# Train and evaluate
trainer.train()

Step,Training Loss,Validation Loss
500,0.6088,0.645583
1000,1.9635,2.6336
1500,1.7879,3.151528
2000,1.717,2.809023
2500,1.4728,4.766207
3000,1.4441,4.297481
3500,1.3191,4.659945
4000,1.2906,4.861274
4500,1.5409,4.08712
5000,1.3538,5.220618


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

TrainOutput(global_step=6000, training_loss=1.424491034063365, metrics={'train_runtime': 4617.2373, 'train_samples_per_second': 2.599, 'train_steps_per_second': 1.299, 'total_flos': 1.3002627612672e+16, 'train_loss': 1.424491034063365, 'epoch': 3.0})

In [38]:
# Save model and tokenizer
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')
torch.save(model.state_dict, 'bart_generation_epoch3.pth')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [44]:
torch.cuda.empty_cache()

In [53]:
device = torch.device("cpu")

In [54]:
device

device(type='cpu')

In [55]:
from datasets import load_metric
import numpy as np

# Load the metrics
rouge = load_metric("rouge")
bleu = load_metric("bleu")

def compute_metrics(preds, labels):
    """Compute BLEU and ROUGE scores for predictions and labels"""
    
    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a different format, so we provide the references and predictions separately
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = {key: value.mid.fmeasure for key, value in rouge_result.items()}

    # BLEU expects references to be list of list of tokens, so we tokenize the decoded labels
    decoded_labels = [label.split() for label in decoded_labels]
    decoded_preds = [pred.split() for pred in decoded_preds]
    bleu_result = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    
    # Combine the results
    result = {
        'bleu': bleu_result['bleu'],
        **rouge_result
    }
    
    return result

# Generate predictions
def evaluate_model(trainer, test_dataset):
    """Function to generate predictions and evaluate the model using ROUGE and BLEU metrics"""
    
    # Generate predictions
    raw_preds = trainer.predict(test_dataset)
    
    # Convert predictions to integer
    preds = np.argmax(raw_preds.predictions, axis=-1)
    
    # Compute metrics
    result = compute_metrics(preds, raw_preds.label_ids)
    
    return result

# Evaluate the model
evaluation_result = evaluate_model(trainer, test_dataset)

# Print the evaluation results
print("Evaluation Results:")
print(f"BLEU Score: {evaluation_result['bleu']}")
print(f"ROUGE-1 F1 Score: {evaluation_result['rouge1']}")
print(f"ROUGE-2 F1 Score: {evaluation_result['rouge2']}")
print(f"ROUGE-L F1 Score: {evaluation_result['rougeL']}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 590.00 MiB. GPU 0 has a total capacty of 15.89 GiB of which 585.12 MiB is free. Process 2293 has 15.31 GiB memory in use. Of the allocated memory 14.34 GiB is allocated by PyTorch, and 695.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [42]:
!pip install rouge_score

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=40e9855e783ec8ebd84e25bbcc9ba142f2128d39fb5c2ef72fb0ca5cd2f4177c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [24]:
# Define compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(labels, preds)}

In [25]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True
)



In [26]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [27]:
# Train and evaluate
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [18]:
# Train and evaluate
trainer.train()


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


RuntimeError: stack expects each tensor to be equal size, but got [178] at entry 0 and [153] at entry 1

In [None]:
torch.sav

In [None]:
results = trainer.evaluate()
print(results)

In [11]:
# Create datasets
train_dataset = SpoilerDataset(train_encodings, train_data['is_spoiler'])
test_dataset = SpoilerDataset(test_encodings, test_data['is_spoiler'])

In [None]:

# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Define compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(labels, preds)}

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate
trainer.train()
results = trainer.evaluate()
print(results)