<a href="https://colab.research.google.com/github/IsuruMahakumara/microsoft-ai-ml-engineering/blob/main/Walkthrough_LLM_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import pandas as pd

# Create a noisy sample dataset
data_dict = {
    "text": [
        "  The staff was very kind and attentive to my needs!!!  ",
        "The waiting time was too long, and the staff was rude. Visit us at http://hospitalreviews.com",
        "The doctor answered all my questions...but the facility was outdated.   ",
        "The nurse was compassionate & made me feel comfortable!! :) ",
        "I had to wait over an hour before being seen.  Unacceptable service! #frustrated",
        "The check-in process was smooth, but the doctor seemed rushed. Visit https://feedback.com",
        "Everyone I interacted with was professional and helpful. ðŸ˜Š  "
    ],
    "label": ["positive", "negative", "neutral", "positive", "negative", "neutral", "positive"]
}

# Convert to a DataFrame
data = pd.DataFrame(data_dict)

# Function to clean the text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    return text

# Apply the cleaning function
data['cleaned_text'] = data['text'].apply(clean_text)
data[['cleaned_text', 'label']].head()

Unnamed: 0,cleaned_text,label
0,the staff was very kind and attentive to my needs,positive
1,the waiting time was too long and the staff wa...,negative
2,the doctor answered all my questionsbut the fa...,neutral
3,the nurse was compassionate made me feel comfo...,positive
4,i had to wait over an hour before being seen u...,negative


In [5]:
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the cleaned text
def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, return_tensors="pt", max_length=128)

# Apply tokenization
data['tokenized'] = data['cleaned_text'].apply(tokenize_function)
data[['tokenized', 'label']].head()

Unnamed: 0,tokenized,label
0,"[input_ids, token_type_ids, attention_mask]",positive
1,"[input_ids, token_type_ids, attention_mask]",negative
2,"[input_ids, token_type_ids, attention_mask]",neutral
3,"[input_ids, token_type_ids, attention_mask]",positive
4,"[input_ids, token_type_ids, attention_mask]",negative


In [4]:
 data['tokenized'].head()

Unnamed: 0,tokenized
0,"[input_ids, token_type_ids, attention_mask]"
1,"[input_ids, token_type_ids, attention_mask]"
2,"[input_ids, token_type_ids, attention_mask]"
3,"[input_ids, token_type_ids, attention_mask]"
4,"[input_ids, token_type_ids, attention_mask]"


In [6]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Prepare tensors for fine-tuning
input_ids = torch.cat([token['input_ids'] for token in data['tokenized']], dim=0)
attention_masks = torch.cat([token['attention_mask'] for token in data['tokenized']], dim=0)
labels = torch.tensor([0 if label == "negative" else 1 if label == "neutral" else 2 for label in data['label']])

# Create DataLoader
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

print("DataLoader created successfully!")

DataLoader created successfully!


In [7]:
from sklearn.model_selection import train_test_split

# Split data into training, validation, and test sets
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, labels, test_size=0.2, random_state=42
)

# Create DataLoader objects
train_dataset = TensorDataset(train_inputs, train_labels)
test_dataset = TensorDataset(test_inputs, test_labels)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

print("Data splitting successful!")

Data splitting successful!
