In [1]:
!pip install transformers datasets nltk scikit-learn peft

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [2]:
!pip install datasets




In [3]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import get_peft_model, LoraConfig, TaskType
from transformers import DataCollatorForSeq2Seq

In [4]:
# Load SQuAD dataset
dataset = load_dataset("squad")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [42]:
train_dataset = dataset["train"].shuffle(seed=42).select(range(5000))
val_dataset = dataset["validation"].shuffle(seed=42).select(range(500))

In [43]:
# Load tokenizer and model
model_name = "vblagoje/bart_lfqa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [44]:
# Define PEFT config
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=4,  # Reduced from 8
    lora_alpha=16,  # Reduced from 32
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]  # Only apply LoRA to specific modules
)

In [52]:
# Wrap model with PEFT
model = get_peft_model(model, peft_config)

In [46]:
# Tokenization function
def preprocess_function(examples):
    inputs = [f"question: {q} context: {c}" for q, c in zip(examples["question"], examples["context"])]
    targets = [answer['text'][0] if answer['text'] else "" for answer in examples["answers"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [47]:
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [48]:
tokenized_train.set_format(type="torch")
tokenized_val.set_format(type="torch")

In [49]:
# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=10,
    learning_rate=1e-4,  # Increased learning rate
    per_device_train_batch_size=8,  # Increased batch size
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Reduced to 1 epoch
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=10,
    save_total_limit=2,
    #no_cuda=True,  # Force CPU usage
    max_steps=1000,  # Explicitly set the maximum number of training steps
)



In [50]:
# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

max_steps is given, it will override any value given in num_train_epochs


In [51]:
# Start training
trainer.train()

Step,Training Loss,Validation Loss
10,17.3391,16.373373
20,15.3287,15.019848
30,13.9921,13.874462
40,13.1278,13.002161
50,12.5837,12.333863
60,12.0081,11.693999
70,11.4538,11.074698
80,10.8394,10.560956
90,10.3124,9.800616
100,9.7287,8.784869


Step,Training Loss,Validation Loss
10,17.3391,16.373373
20,15.3287,15.019848
30,13.9921,13.874462
40,13.1278,13.002161
50,12.5837,12.333863
60,12.0081,11.693999
70,11.4538,11.074698
80,10.8394,10.560956
90,10.3124,9.800616
100,9.7287,8.784869


TrainOutput(global_step=1000, training_loss=6.664714323043823, metrics={'train_runtime': 5400.9188, 'train_samples_per_second': 1.481, 'train_steps_per_second': 0.185, 'total_flos': 8682913923072000.0, 'train_loss': 6.664714323043823, 'epoch': 1.6})

In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=5d88c2169fcbf48e3398948d3fcbb2b93f81ca4c214019f74044568d8a9a5073
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import get_peft_model, LoraConfig, TaskType
from transformers import DataCollatorForSeq2Seq
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Evaluation functions
def calculate_faithfulness(prediction, context):
    pred_tokens = word_tokenize(prediction.lower())
    context_tokens = word_tokenize(context.lower())

    stop_words = set(stopwords.words('english'))
    pred_tokens = [w for w in pred_tokens if w not in stop_words]
    context_tokens = [w for w in context_tokens if w not in stop_words]

    faithful_tokens = [token for token in pred_tokens if token in context_tokens]
    faithfulness_score = len(faithful_tokens) / len(pred_tokens) if pred_tokens else 0

    return faithfulness_score

def calculate_context_recall(prediction, context):
    pred_tokens = word_tokenize(prediction.lower())
    context_tokens = word_tokenize(context.lower())

    stop_words = set(stopwords.words('english'))
    pred_tokens = [w for w in pred_tokens if w not in stop_words]
    context_tokens = [w for w in context_tokens if w not in stop_words]

    recalled_tokens = [token for token in context_tokens if token in pred_tokens]
    recall_score = len(recalled_tokens) / len(context_tokens) if context_tokens else 0

    return recall_score

def calculate_relevance(prediction, question, context):
    full_context = question + " " + context

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([prediction, full_context])

    relevance_score = cosine_similarity(vectors[0], vectors[1])[0][0]

    return relevance_score

def calculate_rouge(prediction, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(prediction, reference)
    return scores

# Custom evaluation function
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    results = {
        'faithfulness': [],
        'context_recall': [],
        'relevance': [],
        'rouge1': [],
        'rouge2': [],
        'rougeL': []
    }

    for pred, label, example in zip(decoded_preds, decoded_labels, val_dataset):
        results['faithfulness'].append(calculate_faithfulness(pred, example['context']))
        results['context_recall'].append(calculate_context_recall(pred, example['context']))
        results['relevance'].append(calculate_relevance(pred, example['question'], example['context']))

        rouge_scores = calculate_rouge(pred, label)
        results['rouge1'].append(rouge_scores['rouge1'].fmeasure)
        results['rouge2'].append(rouge_scores['rouge2'].fmeasure)
        results['rougeL'].append(rouge_scores['rougeL'].fmeasure)

    return {k: np.mean(v) for k, v in results.items()}

In [53]:
# Save the fine-tuned model
model.save_pretrained("./squad_fine_tuned_model")
tokenizer.save_pretrained("./squad_fine_tuned_model")

('./squad_fine_tuned_model/tokenizer_config.json',
 './squad_fine_tuned_model/special_tokens_map.json',
 './squad_fine_tuned_model/vocab.json',
 './squad_fine_tuned_model/merges.txt',
 './squad_fine_tuned_model/added_tokens.json',
 './squad_fine_tuned_model/tokenizer.json')

In [54]:
# Quantization
print("Quantizing model...")
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

Quantizing model...


In [55]:
# Save the quantized model
print("Saving quantized model...")
#quantized_model_path = "./squad_quantized_model"
torch.save(quantized_model.state_dict(), f"quantized_model.pth")

Saving quantized model...


In [56]:
# Save the model configuration
model.config.save_pretrained('.')

print(f"Quantized model saved to {quantized_model_path}")

Non-default generation parameters: {'forced_eos_token_id': 2}


NameError: name 'quantized_model_path' is not defined

In [58]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [59]:
model.config.save_pretrained('/content/drive/My Drive/')
print("Quantized model saved to /content/drive/My Drive/")

Non-default generation parameters: {'forced_eos_token_id': 2}


Quantized model saved to /content/drive/My Drive/


In [60]:
import torch
import zipfile

# Assuming you have the quantized model saved as 'quantized_model.pth'
quantized_model_path = 'quantized_model.pth'

# Save the quantized model (example)
# torch.save(quantized_model, quantized_model_path)

# Name of the zip file
zip_filename = 'quantized_model.zip'

# Create a zip file and add the quantized model file to it
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    zipf.write(quantized_model_path)

print(f"Quantized model saved and zipped as {zip_filename}.")

Quantized model saved and zipped as quantized_model.zip.


In [None]:
# Final evaluation
final_results = trainer.evaluate()

print("Final Evaluation Results:")
for metric, value in final_results.items():
    print(f"{metric}: {value:.4f}")

Final Evaluation Results:
eval_loss: 16.3382
eval_runtime: 803.3339
eval_samples_per_second: 13.1580
eval_steps_per_second: 1.6460
epoch: 0.0018


Final Evaluation Results:
eval_loss: 16.3382
eval_runtime: 803.3339
eval_samples_per_second: 13.1580
eval_steps_per_second: 1.6460
epoch: 0.0018

In [62]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [None]:
# Add these imports if not already present
import evaluate
from tqdm import tqdm
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt', quiet=True)


In [66]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=8f92aa216f755491208de6c6dbf007fcc48705a8d681c0fc6beaad2edb277d82
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [67]:
import evaluate
from torch.utils.data import DataLoader
from tqdm import tqdm
import warnings

# Ignore the UserWarning about `as_target_tokenizer`
warnings.filterwarnings("ignore", category=UserWarning, message=".*`as_target_tokenizer` is deprecated.*")

# Evaluation dataset
eval_dataset = dataset["validation"].select(range(100))

# Tokenize evaluation dataset
tokenized_eval = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)
tokenized_eval.set_format(type="torch")

# Create DataLoader
eval_dataloader = DataLoader(tokenized_eval, batch_size=8, shuffle=False)

# Prediction function
def predict(batch):
    inputs = {k: v.to(model.device) for k, v in batch.items() if k != "labels"}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Evaluation loop
model.eval()
predictions = []
references = []
contexts = []

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch_predictions = predict(batch)
    batch_references = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

    predictions.extend(batch_predictions)
    references.extend(batch_references)

    # Get the corresponding contexts
    batch_indices = batch["input_ids"].cpu().numpy()
    batch_contexts = [eval_dataset[int(i)]["context"] for i in batch_indices[:, 0]]  # Assuming the first token is the example index
    contexts.extend(batch_contexts)

# Compute metrics
rouge = evaluate.load("rouge")
rouge_scores = rouge.compute(predictions=predictions, references=references)

faithfulness = compute_faithfulness(predictions, contexts)
context_recall = compute_context_recall(predictions, references)
relevance = compute_relevance(predictions, references)

# Print results
print("Evaluation Results:")
print(f"Faithfulness: {faithfulness:.4f}")
print(f"Context Recall: {context_recall:.4f}")
print(f"Relevance: {relevance:.4f}")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

Evaluating: 100%|██████████| 13/13 [00:30<00:00,  2.33s/it]


Evaluation Results:
Faithfulness: 0.4836
Context Recall: 0.5783
Relevance: 0.1104
ROUGE-1: 0.1099
ROUGE-2: 0.0558
ROUGE-L: 0.1104


In [68]:
# Load pre-trained tokenizer and model
model_name = "vblagoje/bart_lfqa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenization function
def preprocess_function(examples):
    inputs = [f"question: {q} context: {c}" for q, c in zip(examples["question"], examples["context"])]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    return model_inputs

# Evaluation functions
def compute_faithfulness(predictions, contexts):
    faithfulness_scores = []
    for pred, context in zip(predictions, contexts):
        pred_words = set(word_tokenize(pred.lower()))
        context_words = set(word_tokenize(context.lower()))
        faithfulness = len(pred_words.intersection(context_words)) / len(pred_words) if pred_words else 1.0
        faithfulness_scores.append(faithfulness)
    return sum(faithfulness_scores) / len(faithfulness_scores)

def compute_context_recall(predictions, references):
    recall_scores = []
    for pred, ref in zip(predictions, references):
        pred_words = set(word_tokenize(pred.lower()))
        ref_words = set(word_tokenize(ref.lower()))
        recall = len(pred_words.intersection(ref_words)) / len(ref_words) if ref_words else 1.0
        recall_scores.append(recall)
    return sum(recall_scores) / len(recall_scores)

def compute_relevance(predictions, references):
    rouge = evaluate.load("rouge")
    results = rouge.compute(predictions=predictions, references=references)
    return results["rougeL"]

# Prediction function
def predict(batch):
    inputs = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Evaluation loop
model.eval()
predictions = []
references = []
contexts = []

for i in tqdm(range(0, len(eval_dataset), 8), desc="Evaluating"):
    batch = eval_dataset[i:i+8]
    inputs = preprocess_function(batch)
    batch_predictions = predict(inputs)

    predictions.extend(batch_predictions)
    references.extend([answer['text'][0] if answer['text'] else "" for answer in batch["answers"]])
    contexts.extend(batch["context"])

# Compute metrics
rouge = evaluate.load("rouge")
rouge_scores = rouge.compute(predictions=predictions, references=references)

faithfulness = compute_faithfulness(predictions, contexts)
context_recall = compute_context_recall(predictions, references)
relevance = compute_relevance(predictions, references)

# Print results
print("Pre-trained BART-LFQA Evaluation Results on SQuAD:")
print(f"Faithfulness: {faithfulness:.4f}")
print(f"Context Recall: {context_recall:.4f}")
print(f"Relevance: {relevance:.4f}")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

Evaluating: 100%|██████████| 13/13 [00:24<00:00,  1.87s/it]


Pre-trained BART-LFQA Evaluation Results on SQuAD:
Faithfulness: 0.6527
Context Recall: 0.5783
Relevance: 0.1104
ROUGE-1: 0.1099
ROUGE-2: 0.0558
ROUGE-L: 0.1104
