In [1]:
# Install the fine-tuning stack into the running environment.
# NOTE(review): only torch and tensorboard are pinned; every other package is
# upgraded (-U) to whatever is current at run time, so a later re-run can
# resolve different library versions than the ones that produced the outputs
# recorded in this notebook.
!pip install -q -U torch=="2.4.1" 
!pip install -q -U -i https://pypi.org/simple/ bitsandbytes
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q -U peft
!pip install -q -U tensorboard=="2.17"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m86.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.6/336.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.7/450.7 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os

# Expose only GPU 0 to this process and turn off tokenizers' parallelism.
# Both are set together, before any CUDA/tokenizer library is imported.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [3]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, which makes API drift harder to notice.
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format, SFTConfig
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [5]:
print(f"pytorch version {torch.__version__}")

pytorch version 2.4.1+cu121


In [6]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"working on {device}")

working on cuda:0


In [7]:
# Turn off the memory-efficient and flash scaled-dot-product-attention kernels.
# NOTE(review): the reason is not stated in the notebook — presumably a
# compatibility workaround for the GPU in use; confirm it is still needed.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [8]:
filename = "/kaggle/input/pinterest-snapshot-of-popularity-and-engagement/pinterest_finalised.csv"

# Read the CSV as UTF-8, replacing undecodable bytes instead of failing, and
# keep only the rows in which no column is missing.
df = pd.read_csv(filename, encoding="utf-8", encoding_errors="replace").dropna()


def map_popularity_to_sentiment(popularity):
    """Bucket a numeric popularity value into a sentiment label.

    Returns "positive" for values above 10, "neutral" for values from 1 to 10
    inclusive, and "negative" for everything else.
    """
    if popularity > 10:
        return "positive"
    in_neutral_band = 1 <= popularity <= 10
    return "neutral" if in_neutral_band else "negative"
# Derive the label column from each row's repin count.
df["sentiment"] = df["repin_count"].apply(map_popularity_to_sentiment)

# Split the data: per class, the same number of rows goes to train and test.
max_per_split = 550
X_train = []
X_test = []

for sentiment in ["positive", "neutral", "negative"]:
    class_rows = df[df.sentiment == sentiment]
    # Cap each split at half of the class so train_size + test_size can never
    # exceed the rows available. The previous ">= 600" threshold requested
    # 550 + 550 rows from classes holding 600-1099 rows, which
    # train_test_split rejects with a ValueError.
    n_per_split = min(max_per_split, len(class_rows) // 2)
    train, test = train_test_split(
        class_rows,
        train_size=n_per_split,
        test_size=n_per_split,
        random_state=42,
    )
    X_train.append(train)
    X_test.append(test)

# Shuffle the training rows so the classes are interleaved; test order is kept.
X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

# Evaluation set: rows used by neither split, resampled (with replacement) to
# 50 rows for every class that still has rows left.
# A set lookup replaces the per-row rebuild of two index lists.
used_idx = set(X_train.index) | set(X_test.index)
X_eval = df[~df.index.isin(used_idx)]
X_eval = (
    X_eval.groupby('sentiment', group_keys=False)
    .apply(lambda x: x.sample(n=50, random_state=10, replace=True))
)

X_train = X_train.reset_index(drop=True)

def generate_prompt(data_point):
    """Build the training prompt for one row.

    The instruction text is followed by the bracketed ``title`` and the row's
    ``sentiment`` label, so the expected answer is part of the returned text.
    Leading/trailing whitespace is stripped; the indentation of the interior
    lines is part of the prompt and is kept.
    """
    title = data_point['title']
    label = data_point['sentiment']
    prompt = f"""
            Analyze the sentiment of the Pinterest pin title enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{title}] = {label}
            """
    return prompt.strip()

def generate_test_prompt(data_point):
    """Build the inference prompt for one row.

    Same instruction text as the training prompt, but the text stops right
    after the "=" that follows the bracketed ``title`` so the model has to
    supply the label. The final strip removes the space after that "=".
    """
    title = data_point['title']
    prompt = f"""
            Analyze the sentiment of the Pinterest pin title enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{title}] = """
    return prompt.strip()

# Keep the gold labels of the test rows before X_test is replaced by prompts.
y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

# Train and eval rows become single-column frames of label-bearing prompts.
X_train, X_eval = (
    pd.DataFrame(frame.apply(generate_prompt, axis=1), columns=["text"])
    for frame in (X_train, X_eval)
)

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)



In [9]:
def evaluate(y_true, y_pred):
    """Print accuracy, per-class accuracy, a classification report and a
    confusion matrix for string sentiment labels.

    Args:
        y_true: iterable of gold labels ("positive" / "neutral" / "negative").
        y_pred: iterable of predicted labels, aligned with ``y_true``.

    Labels are encoded as negative=0, neutral=1, positive=2. Any value outside
    that vocabulary (for example the "none" placeholder used when a generation
    could not be parsed) is still encoded as neutral (1), but the number of
    such predictions is now printed so they are not silently mistaken for
    genuine "neutral" answers.

    Returns:
        None; every result is printed.
    """
    mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
    fallback = mapping['neutral']

    def encode(values):
        # Plain comprehension instead of np.vectorize: same mapping, no
        # per-call vectorize wrapper.
        return np.array([mapping.get(v, fallback) for v in values], dtype=int)

    y_pred = list(y_pred)
    n_unmapped = sum(1 for v in y_pred if v not in mapping)
    if n_unmapped:
        print(f'Warning: {n_unmapped} of {len(y_pred)} predictions are not a known '
              f'label and are counted as neutral')

    y_true = encode(y_true)
    y_pred = encode(y_pred)

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Per-class accuracy: share of rows of each gold class predicted as that class.
    for label in np.unique(y_true):
        in_class = y_true == label
        label_accuracy = np.mean(y_pred[in_class] == label)
        print(f'Accuracy for label {label}: {label_accuracy:.3f}')

    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [10]:
model_name = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"
max_seq_length = 512 #2048

# 4-bit NF4 quantisation without double quantisation, computing in float16.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the quantised model directly onto the selected device.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map=device,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# NOTE(review): `max_seq_length` is forwarded to the tokenizer as-is; confirm
# the tokenizer actually honours this keyword (its documented length limit is
# `model_max_length`).
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)
# Padding reuses the end-of-sequence token id.
tokenizer.pad_token_id = tokenizer.eos_token_id


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
def predict(test, model, tokenizer):
    """Generate one sentiment label per prompt with ``model``.

    Args:
        test: DataFrame whose "text" column holds the prompts. The function
            used to ignore this argument and always read the notebook-level
            ``X_test``; to stay compatible with existing calls that pass an
            unrelated frame, ``X_test`` is still used when ``test`` has no
            "text" column.
        model: causal language model given to the text-generation pipeline.
        tokenizer: tokenizer given to the same pipeline.

    Returns:
        list of str, one entry per prompt: "positive", "negative" or
        "neutral" (checked in that order against the text after the last "="),
        or "none" when none of them occurs.
    """
    prompts = test if "text" in getattr(test, "columns", ()) else X_test

    # Build the pipeline once: it does not depend on the prompt, and creating
    # it inside the loop repeated the setup for every row.
    # NOTE(review): temperature=0.0 is kept exactly as before; confirm the
    # installed transformers version treats it as greedy decoding.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens = 1,
                    temperature = 0.0,
                   )

    candidate_labels = ("positive", "negative", "neutral")
    y_pred = []
    for i in tqdm(range(len(prompts))):
        prompt = prompts.iloc[i]["text"]
        result = pipe(prompt)
        # generated_text contains the prompt too; the answer follows the last "=".
        answer = result[0]['generated_text'].split("=")[-1]
        y_pred.append(next((label for label in candidate_labels if label in answer), "none"))
    return y_pred

In [12]:
# Pass the prompt frame explicitly: `test` is only the loop variable left over
# from the per-class split (the last class's raw rows), not the test prompts.
y_pred = predict(X_test, model, tokenizer)

  0%|          | 0/736 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/736 [00:01<15:15,  1.25s/it]Device set to use cuda:0
  0%|          | 2/736 [00:01<08:14,  1.49it/s]Device set to use cuda:0
  0%|          | 3/736 [00:01<05:59,  2.04it/s]Device set to use cuda:0
  1%|          | 4/736 [00:02<04:55,  2.48it/s]Device set to use cuda:0
  1%|          | 5/736 [00:02<04:38,  2.63it/s]Device set to use cuda:0
  1%|          | 6/736 [00:02<04:10,  2.91it/s]Device set to use cuda:0
  1%|          | 7/736 [00:02<03:53,  3.12it/s]Device set to use cuda:0
  1%|          | 8/736 [00:03<03:57,  3.06it/s]Device set to use cuda:0
  1%|          | 9/736 [00:03<03:44,  3.23it/s]Device set to use cuda:0
  1%|▏         | 10/736 [00:03<03:36,  3.36it/s]Device set to use cuda:0
  1%|▏         | 11/736 [00:04<03:30,  3.44it/s]Device set to use cuda:0
  2%|▏         | 12/736 [00:04<03:26,  3.51it/s]Device set to use cuda:0
  2%|▏         | 13/736 [00:04<03:23,  3.56it/s]Device set to use cud

In [13]:
# Baseline: score the predictions made before any fine-tuning has been run.
evaluate(y_true, y_pred)

Accuracy: 0.357
Accuracy for label 0: 0.000
Accuracy for label 1: 0.338
Accuracy for label 2: 0.819

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        92
           1       0.79      0.34      0.47       550
           2       0.15      0.82      0.26        94

    accuracy                           0.36       736
   macro avg       0.32      0.39      0.24       736
weighted avg       0.61      0.36      0.39       736


Confusion Matrix:
[[  0  32  60]
 [  0 186 364]
 [  0  17  77]]


In [14]:
from sklearn.metrics import (accuracy_score, 
                             recall_score, 
                             precision_score, 
                             f1_score)

from transformers import EarlyStoppingCallback, IntervalStrategy

def compute_metrics(p):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores on its last axis; the arg-max over that axis is the
            predicted id.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".

    Notes:
        * Precision/recall/F1 previously used scikit-learn's default
          ``average="binary"``, which raises for targets with more than two
          classes; they are now macro-averaged (for strictly binary targets
          this reports the class average rather than the positive class only).
        * When predictions and labels are both 2-D with the same shape they are
          treated as token-level outputs: predictions are shifted by one
          position against the labels and positions labelled -100 are dropped.
          This assumes Hugging Face causal-LM conventions — confirm for the
          trainer in use.
    """
    pred, labels = p
    # Last axis is the class/vocabulary axis for both (batch, classes) and
    # (batch, seq, vocab) inputs; for 2-D scores this equals the old axis=1.
    pred = np.argmax(pred, axis=-1)
    labels = np.asarray(labels)

    if pred.ndim == 2 and labels.shape == pred.shape:
        # Token-level case: align "prediction at t" with "label at t+1",
        # then flatten to the 1-D arrays scikit-learn expects.
        pred, labels = pred[:, :-1], labels[:, 1:]
        keep = labels != -100
        pred, labels = pred[keep], labels[keep]

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [15]:
# NOTE(review): "trained_weigths" is a misspelling of "trained_weights". It is
# left as-is here so the location where outputs are written does not change.
output_dir="trained_weigths"

# LoRA adapter configuration for causal-LM fine-tuning: adapters are attached
# to the listed attention (q/k/v/o) and MLP (gate/up/down) projection modules,
# with rank r=64, lora_alpha=16, no dropout and no bias training.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
)

# Training hyper-parameters for the supervised fine-tuning run.
training_arguments = SFTConfig(
    output_dir=output_dir,                    # directory where training outputs are written
    num_train_epochs=5,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # micro-batches accumulated before each optimizer update
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 25 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,                                # half-precision training; bf16 is explicitly off
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    max_seq_length=max_seq_length,            # sequence length limit, shared with the tokenizer cell
    # Evaluation-related options are currently disabled:
    #evaluation_strategy="steps",              # evaluate on a step schedule
    #load_best_model_at_end = True,
    #eval_steps = 25,
    #metric_for_best_model = 'accuracy',
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    },
)

# Build the SFT trainer on the prompt dataset with the LoRA configuration.
# Evaluation data, metric computation and early stopping are all commented
# out below, so eval_data, compute_metrics and EarlyStoppingCallback are not
# used by this trainer.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    #eval_dataset=eval_data,
    peft_config=peft_config,
    # dataset_text_field="text",
    tokenizer=tokenizer,
    
    # packing / dataset_kwargs are supplied through SFTConfig instead:
    # packing=False,
    # dataset_kwargs={
    #     "add_special_tokens": False,
    #     "append_concat_token": False,
    # },
    #compute_metrics=compute_metrics,
    #callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

Map:   0%|          | 0/736 [00:00<?, ? examples/s]

In [16]:
trainer.train()

Step,Training Loss
25,12.6999
50,4.7491
75,4.5567
100,4.2814
125,3.4646
150,3.4389
175,3.4248
200,2.5027
225,2.2358
250,2.0581


TrainOutput(global_step=460, training_loss=2.9834440874016805, metrics={'train_runtime': 4131.4972, 'train_samples_per_second': 0.891, 'train_steps_per_second': 0.111, 'total_flos': 1.066474168860672e+16, 'train_loss': 2.9834440874016805, 'epoch': 5.0})

In [17]:
# Write the trained model through the trainer, then save the tokenizer files
# into output_dir.
trainer.save_model()
tokenizer.save_pretrained(output_dir)

('trained_weigths/tokenizer_config.json',
 'trained_weigths/special_tokens_map.json',
 'trained_weigths/tokenizer.json')

In [18]:
# Re-score the same test prompts after fine-tuning. The prompt frame is passed
# explicitly: `test` is only the loop variable left over from the per-class
# split and does not hold the prompts.
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)


  0%|          | 0/736 [00:00<?, ?it/s]Device set to use cuda:0
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  0%|          | 1/736 [00:00<04:19,  2.83it/s]Device set to use cuda:0
  0%|          | 2/736 [00:00<04:02,  3.03it/s]Device set to use cuda:0
  0%|          | 3/736 [00:00<03:55,  3.11it/s]Device set to use cuda:0
  1%|          | 4/736 [00:01<03:53,  3.14it/s]Device set to use cuda:0
  1%|          | 5/736 [00:01<04:10,  2.92it/s]Device set to use cuda:0
  1%|          | 6/736 [00:01<04:02,  3.01it/s]Device set to use cuda:0
  1%|          | 7/736 [00:02<03:58,  3.06it/s]Device set to use cuda:0
  1%|          | 8/736 [00:02<04:11,  2.90it/s]Device set to use cuda:0
  1%|          | 9/736 [00:03<04:03,  2.98it/s]Device set to use cuda:0
  1%|▏         | 10/736 [00:03<03:58,  3.04it/s]Device set to use cuda:0
  1%|▏         | 11/736 [00:03<03:54,  3.09it/s]Device set to use cuda:0
  2%|▏         | 12/736 [00:03<03:51,  3.12it/s]Devic

Accuracy: 0.747
Accuracy for label 0: 0.000
Accuracy for label 1: 1.000
Accuracy for label 2: 0.000

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        92
           1       0.75      1.00      0.86       550
           2       0.00      0.00      0.00        94

    accuracy                           0.75       736
   macro avg       0.25      0.33      0.29       736
weighted avg       0.56      0.75      0.64       736


Confusion Matrix:
[[  0  92   0]
 [  0 550   0]
 [  0  94   0]]





In [19]:
%load_ext tensorboard
%tensorboard --logdir logs/runs

<IPython.core.display.Javascript object>

In [20]:
!wget -q -O ngrok.zip https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.zip
!unzip -q ngrok.zip

In [21]:
!./ngrok config add-authtoken **

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
