In [1]:
!pip install -q -U git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
!pip install -q -U git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
!pip install bitsandbytes==0.41.3



In [5]:
!pip install -U datasets



In [6]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

2024-04-13 05:33:35.912937: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-13 05:33:35.973916: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
print(f"pytorch version {torch.__version__}")

pytorch version 2.2.2+cu121


In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


In [9]:
filename = "/home/anjaliraj/Amit/BTP2/IMDB Dataset.csv"

In [10]:
df = pd.read_csv(filename,encoding="utf-8", encoding_errors="replace")

In [11]:
print(df.shape)
df.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [12]:
sentiment_counts = df['sentiment'].value_counts()

# Print the counts
print("Number of Positive samples:", sentiment_counts['positive'])
print("Number of Negative samples:", sentiment_counts['negative'])

Number of Positive samples: 25000
Number of Negative samples: 25000


Here, we can see that the number of positive and negative samples in the dataset is balanced.
Now, we are creating our training and testing dataset which contains 300 each positive and negative samples which are randomly chosen from the original dataset. 

In [13]:
X_train = list()
X_test = list()

for sentiment in ["positive","negative"]:
    train,test = train_test_split(df[df.sentiment == sentiment],train_size = 500,test_size = 250,random_state = 42)
    X_train.append(train)
    X_test.append(test)

In [14]:
print(X_train[0])

                                                  review sentiment
37398  This movie should go down as one of the funnie...  positive
27471  It's a genuine shame that this spin-off TV ser...  positive
6701   A have a female friend who is currently being ...  positive
38259  The jazz soundtrack makes this seem like a Cli...  positive
4375   Before I comment about this movie, you should ...  positive
...                                                  ...       ...
17647  wonderful movie with good story great humour (...  positive
7195   While I had wanted to se this film since the f...  positive
45920  I saw this movie only after hearing raves abou...  positive
18461  If you are looking for a modern film version o...  positive
24164  It may be a little creaky now, and it certainl...  positive

[500 rows x 2 columns]


In [15]:
X_train = pd.concat(X_train).sample(frac=1, random_state=27)
X_test = pd.concat(X_test)

In [16]:
X_train

Unnamed: 0,review,sentiment
37619,If this is all the Watchowski's have to offer ...,negative
32952,This is the most disturbing film I have ever s...,negative
7262,I'm not a Steve Carell fan however I like this...,positive
47622,I love the Thackeray novel on which this film ...,positive
28632,'They All Laughed' is a superb Peter Bogdanovi...,positive
...,...,...
40499,MGM tried pairing up and coming young men with...,negative
37869,Someone told me that this was one of the best ...,negative
41409,"This kid is rather bad, but in no way do they ...",negative
25300,Steven Seagal movies have never been Oscar mat...,negative


In [17]:
eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval.groupby('sentiment',group_keys = False).apply(lambda x:x.sample(n=250,random_state = 10,replace = True)))
X_train = X_train.reset_index(drop=True)

In [18]:
# Creating evaluation dataset which contains each 50 positive and negative samples.
X_eval

Unnamed: 0,review,sentiment
36421,This movie is one of the worse examples of hyp...,negative
14858,It's unlikely that anyone except those who ado...,negative
36552,The Christmas Secret was touted as a wonderful...,negative
14856,"This is a terrible film, and not one scene has...",negative
19161,Yesterday was one of those days we decided to ...,negative
...,...,...
5770,From today's point of view it is quite ridicul...,positive
29450,Crispin Glovers' way of acting (and not only h...,positive
44728,This movie has a special way of telling the st...,positive
45647,"I was in the film too, but i don't know if the...",positive


In [19]:
def generate_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["review"]}] = {data_point["sentiment"]}
            """.strip()

def generate_test_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["review"]}] = """.strip()

In [20]:
X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), 
                       columns=["review"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), 
                      columns=["review"])

y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["review"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [21]:
print(len(train_data['review']))
len(eval_data['review'])

1000


500

In [22]:
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}
    def map_func(x):
        return mapping.get(x, 1)
    
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [23]:
# 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
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

compute_dtype = getattr(torch, "float16")

In [25]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["review"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens = 1, 
                        temperature = 0.0,
                        do_sample = False
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("=")[-1]
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        else:
            y_pred.append("none")
    return y_pred

In [27]:
y_pred = predict(test, model, tokenizer)

100%|█████████████████████████████████████████████████████████████████████████████████| 500/500 [02:13<00:00,  3.74it/s]


In [28]:
evaluate(y_true, y_pred)

Accuracy: 0.758
Accuracy for label 0: 0.740
Accuracy for label 2: 0.776

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.74      0.83       250
           1       0.00      0.00      0.00         0
           2       0.89      0.78      0.83       250

    accuracy                           0.76       500
   macro avg       0.61      0.51      0.55       500
weighted avg       0.92      0.76      0.83       500


Confusion Matrix:
[[185  41  24]
 [  0   0   0]
 [ 11  45 194]]


In [29]:
output_dir="trained_weigths"

peft_config = LoraConfig(
        lora_alpha=16, 
        lora_dropout=0.1,
        r=64,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=3,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 10 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch"               # save checkpoint every epoch
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="review",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [30]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.9025,2.019888
2,1.829,2.02486
3,1.6178,2.061196


TrainOutput(global_step=375, training_loss=1.885148915608724, metrics={'train_runtime': 2317.1199, 'train_samples_per_second': 1.295, 'train_steps_per_second': 0.162, 'total_flos': 4.671768223833293e+16, 'train_loss': 1.885148915608724, 'epoch': 3.0})

In [31]:
# Save trained model and tokenizer
trainer.save_model()
tokenizer.save_pretrained(output_dir)

('trained_weigths/tokenizer_config.json',
 'trained_weigths/special_tokens_map.json',
 'trained_weigths/tokenizer.json')

In [32]:
import gc

del [model, tokenizer, peft_config, trainer, train_data, eval_data, bnb_config, training_arguments]
del [df, X_train, X_eval]
del [TrainingArguments, SFTTrainer, LoraConfig, BitsAndBytesConfig]

In [33]:
for _ in range(100):
    torch.cuda.empty_cache()
    gc.collect()

In [34]:
!nvidia-smi

Sat Apr 13 06:15:38 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A40                     Off |   00000000:CA:00.0 Off |                    0 |
|  0%   41C    P0             76W /  300W |    9591MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [35]:
from peft import AutoPeftModelForCausalLM

finetuned_model = "./trained_weigths/"
compute_dtype = getattr(torch, "float16")
tokenizer = AutoTokenizer.from_pretrained("/home/anjaliraj/Amit/BTP2/trained_weigths")

model = AutoPeftModelForCausalLM.from_pretrained(
     finetuned_model,
     torch_dtype=compute_dtype,
     return_dict=False,
     low_cpu_mem_usage=True,
     device_map=device,
)

merged_model = model.merge_and_unload()
merged_model.save_pretrained("./merged_model",safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained("./merged_model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('./merged_model/tokenizer_config.json',
 './merged_model/special_tokens_map.json',
 './merged_model/tokenizer.json')

In [36]:
y_pred = predict(test, merged_model, tokenizer)
evaluate(y_true, y_pred)

100%|█████████████████████████████████████████████████████████████████████████████████| 500/500 [00:45<00:00, 11.09it/s]

Accuracy: 0.980
Accuracy for label 0: 0.972
Accuracy for label 2: 0.988

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       250
           2       0.97      0.99      0.98       250

    accuracy                           0.98       500
   macro avg       0.98      0.98      0.98       500
weighted avg       0.98      0.98      0.98       500


Confusion Matrix:
[[243   0   7]
 [  0   0   0]
 [  3   0 247]]



