In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BitsAndBytesConfig, AutoConfig
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported

max_seq_length = 512
dtype = None 
load_in_4bit = True 

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [9]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 512
dtype = None 
load_in_4bit = True 

# Load the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

Unsloth: You passed in `unsloth/Phi-3-mini-4k-instruct` and `load_in_4bit = True`.
We shall load `unsloth/Phi-3-mini-4k-instruct-bnb-4bit` for 4x faster loading.


==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj", "lm_head"],
    lora_alpha = 16,
    lora_dropout = 0, #
    bias = "none",    
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,
    use_rslora = False,  
    loftq_config = {},
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [11]:
model.print_trainable_parameters()

trainable params: 119,537,664 || all params: 3,940,617,216 || trainable%: 3.03347565743366


In [12]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

Unsloth: Will map <|im_end|> to EOS = <|endoftext|>.


In [3]:
# Set up our dataset
import pandas as pd
# Load the data
data = pd.read_parquet("./humor_classification/data/processed/pretraining_data.parquet")

# Subsample to test the code
data = data.sample(frac=0.3, random_state=42)

# Make all columns objects
data["text"] = data["text"].astype("object")
data["label"] = data["label"].astype(int)

# Set up schema
schema = {
    "type": "object",
    "properties": {
        "rating": {
            "type": "number",
            "minimum": 0,
            "maximum": 4,
            "description": "The rating of the joke, from 0 to 5.",
        }
    },
}

# Make labels JSON format
data["label"] = data["label"].apply(lambda x: f'{{"rating": {x}}}')

# Set up prompt format
data["conversations"] = [
    [
        {
            "role": "system",
            "content": f"You are a joke evaluator that answers in JSON. Here's the json schema you must adhere to:\n{schema}",
        },
        {"role": "user",
            "content": f""" Your task is to evaluate jokes based on their funniness on a scale from 0 to 4, where 0 represents the least funny and 4 represents the most funny. Consider the humor, originality, and overall impact of the joke when making your assessment: \n "{joke}" """},
        {
            "role": "assistant", 
            "content": f"{label}"
        }
    ] for joke, label in zip(data["text"], data["label"])
]


# Split the data
train, test = train_test_split(data, test_size=0.2, random_state=42, shuffle= True, stratify=data["label"])
test, val = train_test_split(test, test_size=0.5, random_state=42, stratify=test["label"])

In [14]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(val)

# Format the prompts
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
#val_dataset = val_dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/70144 [00:00<?, ? examples/s]

In [15]:
train_dataset[0]

{'text': '<|im_start|>system\nYou are a joke evaluator that answers in JSON. Here\'s the json schema you must adhere to:\n{\'type\': \'object\', \'properties\': {\'rating\': {\'type\': \'number\', \'minimum\': 0, \'maximum\': 4, \'description\': \'The rating of the joke, from 0 to 5.\'}}}<|im_end|>\n<|im_start|>user\n Your task is to evaluate jokes based on their funniness on a scale from 0 to 4, where 0 represents the least funny and 4 represents the most funny. Consider the humor, originality, and overall impact of the joke when making your assessment: \n "My wife is like an animal, she has her needs.... like ..her need to not have sex that often. EDIT: Sorry I thought of this today. We just had a kid and it\'s been a while." <|im_end|>\n<|im_start|>assistant\n{"rating": 3}<|im_end|>\n',
 'label': '{"rating": 3}',
 'conversations': [{'content': "You are a joke evaluator that answers in JSON. Here's the json schema you must adhere to:\n{'type': 'object', 'properties': {'rating': {'typ

In [16]:
tokenizer.padding_side = 'right'

In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 16,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 4,
        num_train_epochs = 1,
        max_grad_norm=0.3,
        learning_rate = 2e-4,
        bf16 = True,
        logging_steps = 1,
        optim = "paged_adamw_32bit",
        weight_decay = 0.001,
        lr_scheduler_type = "constant",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=16):   0%|          | 0/11690 [00:00<?, ? examples/s]

In [11]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 11,690 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 16 | Gradient Accumulation steps = 4
\        /    Total batch size = 64 | Total steps = 182
 "-____-"     Number of trainable parameters = 119,537,664


Step,Training Loss
1,2.3244
2,2.1236
3,1.9342
4,1.8499
5,1.6746
6,1.4661
7,1.2837
8,1.1365
9,0.9451
10,0.8079


In [2]:
model, tokenizer = FastLanguageModel.from_pretrained("lora_model_full")
model.save_pretrained_gguf("Phi-3-mini-4k-instruct-humor-full-clf-gguf", tokenizer, quantization_method = "q4_k_m")

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 42.44 out of 60.46 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 36.32it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting mistral model. Can use fast conversion = True.
Unsloth: Extending Phi-3-mini-4k-instruct-humor-full-clf-gguf/tokenizer.model with added_tokens.json.
Originally tokenizer.model is of size (32000).
But we need to extend to sentencepiece vocab size (32011).


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GUUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to q4_k_m will take 20 minutes.
 "-____-"     In total, you will have to wait around 26 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at Phi-3-mini-4k-instruct-humor-full-clf-gguf into f16 GGUF format.
The output location will be ./Phi-3-mini-4k-instruct-humor-full-clf-gguf-unsloth.F16.gguf
This will take 3 minutes...
INFO:convert:Loading model file Phi-3-mini-4k-instruct-humor-full-clf-gguf/model-00001-of-00002.safetensors
INFO:convert:Loading model file Phi-3-mini-4k-instruct-humor-full-clf-gguf/model-00001-of-00002.safetensors
INFO:convert:Loading model file Phi-3-mini-4k-instruct-humor-full-clf-gguf/model-00002-of-00002.safetensors
INFO:convert:model parameters count : 3821079552 (4B)
INFO:conver

In [43]:
# get pad t
tokenizer.pad_token_id

32009

In [12]:
model.save_pretrained("lora_model_json_2") 

In [2]:
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "./models/checkpoint-1000", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
FastLanguageModel.for_inference(model)

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [14]:
# Set up the test dataset
from datasets import Dataset
# Set up test prompt format
test["conversations"] = [
    [
        {
            "role": "system",
            "content": f"You are a joke evaluator that answers in JSON. Here's the json schema you must adhere to:\n{schema}",
        },
        {"role": "user",
            "content": f""" Your task is to evaluate jokes based on their funniness on a scale from 0 to 4, where 0 represents the least funny and 4 represents the most funny. Consider the humor, originality, and overall impact of the joke when making your assessment: \n "{joke}" """},
    ] for joke in test["text"]
]

test_dataset = Dataset.from_pandas(test)

# Format for generations:
def formatting_prompts_func_gen(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = True, add_generation_prompt = True, return_tensors = "pt") for convo in convos]
    return { "text" : texts, }

# Format the prompts
test_dataset = test_dataset.map(formatting_prompts_func_gen, batched=True)

FastLanguageModel.for_inference(model)
# Evaluate the model
def evaluate (model, test_dataset): 
    preds = []
    for i in range(len(test_dataset)):
        inputs = torch.tensor(test_dataset[i]["text"]).to('cuda')
        outputs = model.generate(inputs, max_length = 512)
        preds.append(tokenizer.decode(outputs[0], skip_special_tokens = True))
    return preds

# Get the predictions
# time the evaluation
preds = evaluate(model, test_dataset)

Map:   0%|          | 0/8768 [00:00<?, ? examples/s]

In [15]:
# Get labels from the test set and compear to the predictions
labels = test_dataset["label"]
correct = 0
for i in range(len(labels)):
    if labels[i] == preds_clean[i]:
        correct += 1

NameError: name 'preds_clean' is not defined

In [5]:
preds

['<|im_start|>system\nYou are a joke evaluator that answers in JSON. Here\'s the json schema you must adhere to:\n{\'type\': \'object\', \'properties\': {\'rating\': {\'type\': \'number\', \'minimum\': 0, \'maximum\': 4, \'description\': \'The rating of the joke, from 0 to 5.\'}}} \n<|im_start|>user\n Your task is to evaluate jokes based on their funniness on a scale from 0 to 4, where 0 represents the least funny and 4 represents the most funny. Consider the humor, originality, and overall impact of the joke when making your assessment: \n "Blonde and Barn What do a blonde and a barn have in common? They always have a cock in them."  \n<|im_start|>assistant\n{"rating": 0}',
 '<|im_start|>system\nYou are a joke evaluator that answers in JSON. Here\'s the json schema you must adhere to:\n{\'type\': \'object\', \'properties\': {\'rating\': {\'type\': \'number\', \'minimum\': 0, \'maximum\': 4, \'description\': \'The rating of the joke, from 0 to 5.\'}}} \n<|im_start|>user\n Your task is 

In [15]:
import re
from sklearn.metrics import accuracy_score

# Extract the predictions
preds_clean = [re.search(r'<\|im_start\|>assistant\n(.*)', pred).group(1) for pred in preds]

print(test["label"][0:5])
print(preds_clean[0:5])
# Compute the accuracy
accuracy_score(test["label"], preds_clean)


21342     {"rating": 0}
4946      {"rating": 0}
93564     {"rating": 1}
180534    {"rating": 3}
52918     {"rating": 0}
Name: label, dtype: object
['{"rating": 0}', '{"rating": 3}', '{"rating": 0}', '{"rating": 3}', '{"rating": 4}']


0.25558850364963503

In [16]:
preds_clean_df = pd.DataFrame(preds_clean, columns = ["preds"])

preds_clean_df["preds"].value_counts()

preds
{"rating": 0}    4334
{"rating": 3}    3302
{"rating": 4}    1132
Name: count, dtype: int64

In [17]:
# Count unique values in list
from collections import Counter

Counter(test_dataset["label"])

Counter({'{"rating": 4}': 1765,
         '{"rating": 2}': 1756,
         '{"rating": 1}': 1752,
         '{"rating": 3}': 1752,
         '{"rating": 0}': 1743})

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

def evaluate(y_true, y_pred):
    mapping = {0: "Not funny at all", 1: "Not funny", 2: "Funny", 3: "Very funny", 4: "Hilarious"}
    reverse_mapping = {v: k for k, v in mapping.items()}  # Reverse mapping for confusion matrix

    # Ensure y_true is string labels
    if isinstance(y_true[0], (int, np.integer)):
        map_func = np.vectorize(mapping.get)
        y_true = map_func(y_true)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    y_true_num = np.vectorize(reverse_mapping.get)(y_true)  # Convert back to numerical labels
    y_pred_num = np.vectorize(reverse_mapping.get)(y_pred)
    conf_matrix = confusion_matrix(y_true=y_true_num, y_pred=y_pred_num, labels=list(mapping.keys()))
    print('\nConfusion Matrix:')
    print(conf_matrix)

evaluate(test_dataset["label"], preds_clean)

Accuracy: 0.256
Accuracy for label {"rating": 3}: 0.402
Accuracy for label {"rating": 2}: 0.000
Accuracy for label {"rating": 1}: 0.000
Accuracy for label {"rating": 4}: 0.220
Accuracy for label {"rating": 0}: 0.659

Classification Report:
               precision    recall  f1-score   support

{"rating": 0}       0.26      0.66      0.38      1743
{"rating": 1}       0.00      0.00      0.00      1752
{"rating": 2}       0.00      0.00      0.00      1756
{"rating": 3}       0.21      0.40      0.28      1752
{"rating": 4}       0.34      0.22      0.27      1765

     accuracy                           0.26      8768
    macro avg       0.16      0.26      0.18      8768
 weighted avg       0.16      0.26      0.18      8768



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: unknown is not supported