# LLM Fine Tuning

Process steps:
- Prepare and clean the data
- Tokenize the data
- Fine-tune the model
- Evaluate the model

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

data_dict = {
    "text": [
        "  The staff was very kind and attentive to my needs!!!  ",
        "The waiting time was too long, and the staff was rude. Visit us at http://hospitalreviews.com",
        "The doctor answered all my questions...but the facility was outdated.   ",
        "The nurse was compassionate & made me feel comfortable!! :) ",
        "I had to wait over an hour before being seen.  Unacceptable service! #frustrated",
        "The check-in process was smooth, but the doctor seemed rushed. Visit https://feedback.com",
        "Everyone I interacted with was professional and helpful.  "
    ],
    "label": ["positive", "negative", "neutral", "positive", "negative", "neutral", "positive"]
}

# Convert to pandas DataFrame
data = pd.DataFrame(data_dict)

# Clean the text
import re

def clean_text(text):
    text = text.lower().strip() # Lowercase and strip whitespace
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text) # Remove special characters
    return text

# Apply the cleaning function
data['cleaned_text'] = data['text'].apply(clean_text)

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
data

Unnamed: 0,text,label,cleaned_text
0,The staff was very kind and attentive to my ...,positive,the staff was very kind and attentive to my needs
1,"The waiting time was too long, and the staff w...",negative,the waiting time was too long and the staff wa...
2,The doctor answered all my questions...but the...,neutral,the doctor answered all my questionsbut the fa...
3,The nurse was compassionate & made me feel com...,positive,the nurse was compassionate made me feel comf...
4,I had to wait over an hour before being seen. ...,negative,i had to wait over an hour before being seen ...
5,"The check-in process was smooth, but the docto...",neutral,the checkin process was smooth but the doctor ...
6,Everyone I interacted with was professional an...,positive,everyone i interacted with was professional an...


In [3]:
# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data
def tokenize_function(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=128)

# Apply tokenization
data['tokenized'] = data['cleaned_text'].apply(tokenize_function)

# Extract tokenized features
data["input_ids"] = data["tokenized"].apply(lambda x: x["input_ids"])
data["attention_mask"] = data["tokenized"].apply(lambda x: x["attention_mask"])

# Drop old tokenized column
data = data.drop(columns=["tokenized"])

print(data.head())

                                                text     label  \
0    The staff was very kind and attentive to my ...  positive   
1  The waiting time was too long, and the staff w...  negative   
2  The doctor answered all my questions...but the...   neutral   
3  The nurse was compassionate & made me feel com...  positive   
4  I had to wait over an hour before being seen. ...  negative   

                                        cleaned_text  \
0  the staff was very kind and attentive to my needs   
1  the waiting time was too long and the staff wa...   
2  the doctor answered all my questionsbut the fa...   
3  the nurse was compassionate  made me feel comf...   
4  i had to wait over an hour before being seen  ...   

                                           input_ids  \
0  [101, 1996, 3095, 2001, 2200, 2785, 1998, 2012...   
1  [101, 1996, 3403, 2051, 2001, 2205, 2146, 1998...   
2  [101, 1996, 3460, 4660, 2035, 2026, 3980, 8569...   
3  [101, 1996, 6821, 2001, 29353, 2081, 20

In [4]:
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

# Split into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["text", "cleaned_text"])
test_dataset = test_dataset.remove_columns(["text", "cleaned_text"])

#print(train_dataset)

# Enable dynamic padding for batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",  
    save_strategy="epoch" 
)
# Load pre-trained BERT model (3-class classification)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Generate predictions
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)
labels = test_dataset['label']

# Calculate metrics
accuracy = accuracy_score(labels, preds)
f1 = f1_score(labels, preds, average='weighted')

print(f"Accuracy: {accuracy}, F1 Score: {f1}")