In [1]:
!pip install datasets
!pip install transformers
!pip install transformers[torch]
!pip install accelerate
!pip install rouge_score
!pip install evaluate

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-a

In [2]:
# Mount Google Drive at /content/drive; the dataset, model and tokenizer paths
# used in the following cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from datasets import load_dataset
# Location of the test-split CSV on the mounted Drive.
test_csv_path = "/content/drive/My Drive/spring24/NLP499group/data/recipe_dataset_test_1k.csv"

# Load the CSV as a DatasetDict whose only split is named 'test'.
dataset = load_dataset('csv', data_files={'test': test_csv_path})

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned BART model and its tokenizer from the mounted Drive.
# Both are loaded with local_files_only=True so that a missing or mistyped
# directory fails as a local-path error instead of the path being treated as
# a Hugging Face Hub repository id.
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/My Drive/bart10k_basic_model", local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/bart10k_basic_tokenizer", local_files_only=True)

In [5]:
# Define our preprocessing function
def preprocess_function(examples):
    """Tokenize the source and target text columns of a batch of examples.

    Args:
        examples: Mapping with "input_text" and "target_text" entries
            (presumably lists of strings, since the function is mapped with
            batched=True below).

    Returns:
        The tokenizer output for the inputs (padded/truncated to 128 tokens)
        with an extra "labels" entry holding the token ids of the targets
        (padded/truncated to 512 tokens).
    """
    # Encoder side: the prompt text the model conditions on.
    encoded = tokenizer(
        examples["input_text"],
        max_length=128,
        padding='max_length',
        truncation=True,
    )

    # Decoder side: the reference text, tokenized as a target.
    target_encoding = tokenizer(
        text_target=examples["target_text"],
        max_length=512,
        padding='max_length',
        truncation=True,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize the whole test split in batches; the mapped dataset keeps the
# original text columns and gains the tokenizer outputs plus "labels".
test_dataset = dataset['test'].map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
# Show the mapped dataset's summary (feature names and row count) as the cell output.
test_dataset

Dataset({
    features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [7]:
# model.eval() is the last expression in the cell, so the value it returns
# (the model itself) is echoed as the cell output below.
model.eval()  # Set the model to evaluation mode

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), 

In [8]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Batch the tokenized test set for generation: 8 examples per batch, collated
# into tensors by the default Hugging Face collator.
data_loader = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    collate_fn=default_data_collator,
)

In [10]:
import torch
from tqdm import tqdm
from collections import Counter

# Function to calculate n-grams
def ngrams(text, n=3):
    """Return the word-level n-grams of ``text`` in order of appearance.

    The text is split on whitespace and every run of ``n`` consecutive words
    is re-joined with a single space. A text with fewer than ``n`` words
    yields an empty list.
    """
    tokens = text.split()
    grams = []
    for start in range(len(tokens) - n + 1):
        grams.append(' '.join(tokens[start:start + n]))
    return grams

# Function to calculate precision, recall, and F1 score
def precision_recall_f1(generated, reference, n=3):
    """Score ``generated`` against ``reference`` by n-gram overlap.

    Args:
        generated: Candidate text.
        reference: Reference text.
        n: Size of the word n-grams to compare.

    Returns:
        A ``(precision, recall, f1)`` tuple. Each value is 0 when its
        denominator would be zero (no n-grams on that side, or no overlap).
    """
    candidate_counts = Counter(ngrams(generated, n))
    reference_counts = Counter(ngrams(reference, n))

    # Counter intersection keeps, per n-gram, the smaller of the two counts,
    # so a repeated n-gram is only credited as often as both texts contain it.
    overlap = candidate_counts & reference_counts
    true_positives = sum(overlap.values())
    candidate_total = sum(candidate_counts.values())
    reference_total = sum(reference_counts.values())

    if candidate_total > 0:
        precision = true_positives / candidate_total
    else:
        precision = 0

    if reference_total > 0:
        recall = true_positives / reference_total
    else:
        recall = 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

# Evaluate the model
# Run generation on the GPU when one is available; without one this falls back
# to the CPU, which is where the model and the batches live by default.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# NOTE(review): references are tokenized with max_length=512 in
# preprocess_function, while generation below is capped at max_length=256, so
# very long references can never be fully matched -- confirm this is intended.
results = []  # one (precision, recall, f1) tuple per test example
for batch in tqdm(data_loader, desc="Generating"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels']  # only decoded into reference text, so it stays on the CPU

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=256)

    # skip_special_tokens drops special tokens (such as the padding added during
    # preprocessing) so they do not end up inside the compared n-grams.
    generated_texts = tokenizer.batch_decode(outputs.cpu(), skip_special_tokens=True)
    reference_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Calculate precision, recall, and F1 for each item in the batch
    for generated_text, reference_text in zip(generated_texts, reference_texts):
        precision, recall, f1 = precision_recall_f1(generated_text, reference_text, n=3)
        results.append((precision, recall, f1))

# Fail with a clear message instead of a bare ZeroDivisionError below.
if not results:
    raise ValueError("No examples were evaluated; is the test dataset empty?")

# Average the results (macro average: every example has equal weight)
avg_precision = sum(x[0] for x in results) / len(results)
avg_recall = sum(x[1] for x in results) / len(results)
avg_f1 = sum(x[2] for x in results) / len(results)

print(f"Average Precision: {avg_precision:.3f}")
print(f"Average Recall: {avg_recall:.3f}")
print(f"Average F1 Score: {avg_f1:.3f}")

Generating: 100%|██████████| 125/125 [35:28<00:00, 17.02s/it]

Average Precision: 0.109
Average Recall: 0.072
Average F1 Score: 0.082



