# Sentiment Analysis

In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

E:\Anaconda\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
E:\Anaconda\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
import os
os.environ['CURL_CA_BUNDLE'] = ''

# Loading Dataset

In [5]:
dataset = load_dataset('shawhin/imdb-truncated')
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

# Initializing Pre-Trained Model

In [6]:
model_checkpoint = 'distilbert-base-uncased'

id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

model = AutoModelForSequenceClassification.from_pretrained(
                                                            model_checkpoint, 
                                                            num_labels = 2, 
                                                            id2label   = id2label, 
                                                            label2id   = label2id
                                                            )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Preprocessing

In [7]:
# creating tokenizer object
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# adding pad token if none exists
if tokenizer.pad_token is None:
    
    tokenizer.add_special_tokens({
                                    'pad_token': '[PAD]'
                                  }
                                )
    model.resize_token_embeddings(len(tokenizer))

In [8]:
def tokenize_function(examples):
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [9]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

# Results of pretrained model without Fine-Tuning

In [11]:
# define list of examples
text_list = ["worst", "Not a fan, don't recommed.", "worst experience of my life", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
worst - Positive
Not a fan, don't recommed. - Positive
worst experience of my life - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


# Fine Tunning

In [12]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [13]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [14]:
# hyperparameters
lr = 1e-3          #learning rate
batch_size = 4
num_epochs = 10

In [15]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [16]:
tokenized_dataset["validation"].select(range(50))

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [21]:
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(300)),
    eval_dataset=tokenized_dataset["validation"].select(range(300)),
    tokenizer=tokenizer,
    data_collator=data_collator, # to dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.34722,{'accuracy': 0.85}
2,No log,0.331369,{'accuracy': 0.88}
3,No log,0.524064,{'accuracy': 0.87}
4,No log,0.661274,{'accuracy': 0.87}
5,No log,0.753557,{'accuracy': 0.8766666666666667}
6,No log,0.826571,{'accuracy': 0.88}
7,0.193800,0.856227,{'accuracy': 0.87}
8,0.193800,0.846259,{'accuracy': 0.87}
9,0.193800,0.864889,{'accuracy': 0.8733333333333333}
10,0.193800,0.86706,{'accuracy': 0.87}


Trainer is attempting to log a value of "{'accuracy': 0.85}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.88}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.87}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.87}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8766666666666667}" of type <class 'dict'> for key "eval/accuracy" as a scala

TrainOutput(global_step=750, training_loss=0.1311811679204305, metrics={'train_runtime': 3689.4215, 'train_samples_per_second': 0.813, 'train_steps_per_second': 0.203, 'total_flos': 341348907353760.0, 'train_loss': 0.1311811679204305, 'epoch': 10.0})