In [37]:
# Install necessary libraries (run this cell once)
!pip install transformers datasets peft evaluate scikit-learn nltk accelerate

# Import required libraries
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
from peft import get_peft_model, LoraConfig, TaskType, PeftModelForSequenceClassification
import re
from nltk.corpus import stopwords
import nltk
from sklearn.metrics import f1_score, precision_score, recall_score

# Download NLTK stopwords
nltk.download('stopwords')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nidhipatel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
# Step 1: Text Preprocessing Enhancements
def preprocess_text(text: str) -> str:
    """
    Preprocess the text by lowercasing, removing punctuation, and filtering stopwords.
    """
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text_tokens = text.split()
    filtered_tokens = [word for word in text_tokens if word not in stop_words]

    # Rejoin tokens into a single string
    return ' '.join(filtered_tokens)

# Sample sentence to demonstrate preprocessing
sample_sentence = "The quick brown fox jumps over the lazy dog!"
processed_sentence = preprocess_text(sample_sentence)
print(f"Processed Sentence: {processed_sentence}")

Processed Sentence: quick brown fox jumps lazy dog


In [39]:
# Step 2: Load Dataset and Apply Preprocessing
dataset = load_dataset("emotion")

# Apply preprocessing to the dataset
def preprocess_dataset(dataset):
    return [preprocess_text(sentence) for sentence in dataset['text']]

# Preprocess the training and test sets
dataset = dataset.map(lambda x: {'text': preprocess_dataset(x)}, batched=True)


In [40]:
# Step 3: Tokenization
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 6767.69 examples/s]


In [52]:
# Step 4: Load Pre-Trained Model
device = torch.device("cpu")
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)

# Step 5: Define Metrics (F1, Precision, Recall, and Accuracy)
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)['accuracy']
    f1 = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
# Step 6: Evaluate the Base Model
training_args = TrainingArguments(
    output_dir="./results_base_model",
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    evaluation_strategy="epoch",
    do_train=False,
    do_eval=True
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Evaluate the base model before fine-tuning
print("Evaluating the base model...")
base_results = trainer.evaluate()
print(f"Base Model Results: {base_results}")



Evaluating the base model...


Base Model Results: {'eval_loss': 1.7597534656524658, 'eval_model_preparation_time': 0.0014, 'eval_accuracy': 0.145, 'eval_f1': 0.08822817790373491, 'eval_precision': 0.23849116796695127, 'eval_recall': 0.145, 'eval_runtime': 93.0177, 'eval_samples_per_second': 21.501, 'eval_steps_per_second': 1.344}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [54]:
# Step 7: Fine-Tune Using PEFT (LoRA)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_lin", "v_lin"]  # Targeting attention layers in DistilBERT
)

peft_model = get_peft_model(base_model, peft_config)

# Select a smaller subset for quick fine-tuning
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(500))


In [55]:
# Step 8: Fine-Tuning
fine_tune_args = TrainingArguments(
    output_dir="./results_peft_model",
    per_device_train_batch_size=4,  # Reduce batch size to fit in memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Use gradient accumulation to simulate a larger batch size
    num_train_epochs=3,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    logging_dir="./logs_peft"
)

# Trainer for fine-tuning
peft_trainer = Trainer(
    model=peft_model,
    args=fine_tune_args,
    train_dataset=small_train_dataset,
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune the model
peft_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.690976,0.3545,0.219639,0.223004,0.3545
1,No log,1.657593,0.354,0.21338,0.222593,0.354
2,No log,1.648153,0.354,0.212086,0.223542,0.354


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=93, training_loss=1.7034879294774865, metrics={'train_runtime': 406.4393, 'train_samples_per_second': 3.691, 'train_steps_per_second': 0.229, 'total_flos': 200520359018496.0, 'train_loss': 1.7034879294774865, 'epoch': 2.976})

In [None]:
# Step 9: Evaluate the Fine-Tuned Model
print("Evaluating the fine-tuned model...")
peft_results = peft_trainer.evaluate()
print(f"Fine-Tuned Model Results: {peft_results}")

Evaluating the fine-tuned model...


In [None]:
# Step 10: Save the Fine-Tuned Model
save_directory = "./peft_fine_tuned_model"
peft_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


In [None]:
from peft import PeftModel
# Step 11: Load the Fine-Tuned Model using PeftModelForSequenceClassification
# Load the PEFT model on top of the base model
loaded_model = PeftModel.from_pretrained(base_model, save_directory).to(device)

# Load the tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Preprocess and perform inference
def perform_inference(sentence: str):
    preprocessed_sentence = preprocess_text(sentence)
    inputs = loaded_tokenizer(preprocessed_sentence, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    return predictions.item()

# Example inference
sentence = "I love programming!"
predicted_label = perform_inference(sentence)
print(f"Predicted label: {predicted_label}")

In [None]:
# Step 12: Compare Base and Fine-Tuned Model Results
print("\nComparison of Base and Fine-Tuned Model:")
print(f"Base Model Accuracy: {base_results['eval_accuracy']}")
print(f"Fine-Tuned Model Accuracy: {peft_results['eval_accuracy']}")
print(f"Base Model F1 Score: {base_results.get('eval_f1', 'N/A')}")
print(f"Fine-Tuned Model F1 Score: {peft_results.get('eval_f1', 'N/A')}")