In [95]:

!pip install datasets
!pip install transformers
!pip install peft
!pip install evaluate




In [96]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
AutoTokenizer,
AutoConfig,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
Trainer
)
from peft import PeftModel, PeftConfig,get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [97]:
model_checkpoint = 'distilbert-base-uncased' #67M base parameter

# define Label maps
id2label = {1:"Negative", 0:"Positive"}
label2id = {"Negative":1,"Positive":0}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
  model_checkpoint, num_labels = 2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [98]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [99]:
# Load dataset
from datasets import load_dataset

dataset = load_dataset("shawhin/imdb-truncated")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [100]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space = True)

# Create tokenize function
def tokenize_function(examples):
  # extract text
  text = examples["text"]
  #tokenize and truncate text
  tokenizer. truncation_side="left"
  tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=12

  )

  return tokenized_inputs

# add pad token if none exists
if tokenizer.pad_token is None:
  tokenizer. add_special_tokens({'pad_token':'[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

# tokenizer training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [101]:
# Create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# **Evaluation Metrics**

In [102]:
# import accuracy evaluation metric
accuracy = evaluate. load("accuracy")

# define an evaluation function to pass into trainer later
def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions,axis=1)

  return {"accuracy": accuracy.compute(predictions=predictions,references=labels)}

In [103]:
### define List of examples
text_list = ["It was good", "Not a fan, dont recommend", "Better than first one", "this one is pass"]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good - Positive
Not a fan, dont recommend - Negative
Better than first one - Positive
this one is pass - Positive


In [104]:
### define List of examples
text_list = ["It was  not good", "it's a fan,recommend it ", "Better than first one", "this one is a failure"]

print("Untrained model prediction")
print("----------------------------------------------------")

for text in text_list:
  # tokenize text
  inputs = tokenizer.encode(text, return_tensors="pt")
  # Compute Logits
  logits = model(inputs).logits
  # Convert Logits to Labels
  prediction = torch.argmax(logits)
  print(text + " - " +id2label[prediction.tolist()])

Untrained model prediction
----------------------------------------------------
It was  not good - Positive
it's a fan,recommend it  - Positive
Better than first one - Positive
this one is a failure - Positive


# **PEFT** (Parameterized Entangled Friendly Transformations):
PEFT deals with the entanglement and transformation of parameters inside the model architecture in the context of language models such as LLMs. It's a technique meant to increase language models' expressiveness and effectiveness by adding entangled transformations that allow for more productive parameter interactions inside the model. PEFT is a method that improves language model performance or learning dynamics by utilising ideas inspired by quantum mechanics.



---

# **LoRA** (Low Rank Adaptation):
LoRA,  is the term used to describe "Low Rank Adaptation" in the context of language models. This technique focuses on compressing and modifying the parameters of large language models. By utilising low-rank approximations, this method seeks to lower the computational overhead and memory requirements of LLMs. LoRA helps make these models more efficient while largely maintaining their performance by approximating the parameters using low-rank matrices or tensors.

PEFT and LoRA are two approaches or strategies that are used to large language models (LLMs) to address issues with efficiency, computational demands, and performance enhancement. While LoRA concentrates on low-rank adaptations for parameter compression and efficiency without appreciably sacrificing performance, PEFT uses entangled transformations to increase expressiveness.

By making language models more scalable, easier to train, and computationally efficient, these techniques seek to maximise and enhance the capabilities of language models for a range of natural language processing (NLP) applications.




# Fine Tuning (LoRA)

In [105]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [106]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})

In [107]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [108]:

# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10



# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [93]:

# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.587119,{'accuracy': 0.694}
2,0.606300,0.748566,{'accuracy': 0.666}
3,0.606300,0.991663,{'accuracy': 0.685}
4,0.409500,1.171465,{'accuracy': 0.689}
5,0.409500,1.648588,{'accuracy': 0.673}
6,0.189600,1.874697,{'accuracy': 0.678}
7,0.189600,2.103611,{'accuracy': 0.674}
8,0.105800,2.212976,{'accuracy': 0.682}
9,0.105800,2.297841,{'accuracy': 0.676}
10,0.074400,2.363646,{'accuracy': 0.682}


Trainer is attempting to log a value of "{'accuracy': 0.694}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.666}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.685}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.689}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.673}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This i

TrainOutput(global_step=2500, training_loss=0.2770958160400391, metrics={'train_runtime': 1043.2584, 'train_samples_per_second': 9.585, 'train_steps_per_second': 2.396, 'total_flos': 31499922240000.0, 'train_loss': 0.2770958160400391, 'epoch': 10.0})

# Generate prediction

In [94]:

model.to('cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good - Negative
Not a fan, dont recommend - Positive
Better than first one - Positive
this one is pass - Positive
