In [25]:
! pip install accelerate>=0.26.0 datasets



[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from torch.utils.data import Dataset
import json
import numpy as np
from sklearn.model_selection import train_test_split
from uuid import uuid4
import transformers

# Check transformers version for compatibility
print(f"Transformers version: {transformers.__version__}")

# Initialize tokenizer and models
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
intent_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
qa_model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
intent_labels = ["greeting", "small_talk", "symptom_query"]

# Custom Dataset for Intent Classification
class IntentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Custom Dataset for Question Answering
class QADataset(Dataset):
    def __init__(self, questions, contexts, start_positions, end_positions):
        self.questions = questions
        self.contexts = contexts
        self.start_positions = start_positions
        self.end_positions = end_positions
        self.encodings = tokenizer(questions, contexts, padding=True, truncation=True, return_tensors="pt")

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["start_positions"] = torch.tensor(self.start_positions[idx])
        item["end_positions"] = torch.tensor(self.end_positions[idx])
        return item

# Load and prepare data
def load_data():
    with open("symptom_data.json", "r") as f:
        data = json.load(f)

    # Intent classification data
    intent_texts = []
    intent_labels_idx = []
    for intent in data["intents"]:
        for example in intent["examples"]:
            intent_texts.append(example)
            intent_labels_idx.append(intent_labels.index(intent["type"]) if intent["type"] in intent_labels else 2)

    # Question-answering data (extended for QA format)
    qa_questions = []
    qa_contexts = []
    qa_start_positions = []
    qa_end_positions = []
    for intent in data["intents"]:
        if intent["type"] == "symptom_query":
            context = intent["response"]
            for example in intent["examples"]:
                qa_questions.append(example)
                qa_contexts.append(context)
                # Approximate start and end positions for the answer (simplified)
                tokens = tokenizer.encode(context, add_special_tokens=False)
                answer_start = 0  # Start of the context (simplified for demo)
                answer_end = len(tokens) - 1
                qa_start_positions.append(answer_start)
                qa_end_positions.append(answer_end)

    return intent_texts, intent_labels_idx, qa_questions, qa_contexts, qa_start_positions, qa_end_positions

# Fine-tuning function
def fine_tune_models():
    # Load data
    intent_texts, intent_labels_idx, qa_questions, qa_contexts, qa_start_positions, qa_end_positions = load_data()

    # Split data for intent classification
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        intent_texts, intent_labels_idx, test_size=0.2, random_state=42
    )

    # Split data for question answering
    qa_train_indices, qa_val_indices = train_test_split(
        range(len(qa_questions)), test_size=0.2, random_state=42
    )
    train_qa_questions = [qa_questions[i] for i in qa_train_indices]
    val_qa_questions = [qa_questions[i] for i in qa_val_indices]
    train_qa_contexts = [qa_contexts[i] for i in qa_train_indices]
    val_qa_contexts = [qa_contexts[i] for i in qa_val_indices]
    train_qa_start_positions = [qa_start_positions[i] for i in qa_train_indices]
    val_qa_start_positions = [qa_start_positions[i] for i in qa_val_indices]
    train_qa_end_positions = [qa_end_positions[i] for i in qa_train_indices]
    val_qa_end_positions = [qa_end_positions[i] for i in qa_val_indices]

    # Create datasets
    train_intent_dataset = IntentDataset(train_texts, train_labels)
    val_intent_dataset = IntentDataset(val_texts, val_labels)
    train_qa_dataset = QADataset(train_qa_questions, train_qa_contexts, train_qa_start_positions, train_qa_end_positions)
    val_qa_dataset = QADataset(val_qa_questions, val_qa_contexts, val_qa_start_positions, val_qa_end_positions)

    # Training arguments for intent model
    intent_training_args = TrainingArguments(
        output_dir="./intent_results",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./intent_logs",
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    # Trainer for intent model
    intent_trainer = Trainer(
        model=intent_model,
        args=intent_training_args,
        train_dataset=train_intent_dataset,
        eval_dataset=val_intent_dataset,
    )

    # Train intent model
    intent_trainer.train()
    intent_model.save_pretrained("fine_tuned_intent_model")
    tokenizer.save_pretrained("fine_tuned_intent_model")
    print("Intent model fine-tuning completed and saved.")

    # Training arguments for QA model
    qa_training_args = TrainingArguments(
        output_dir="./qa_results",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./qa_logs",
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    # Trainer for QA model
    qa_trainer = Trainer(
        model=qa_model,
        args=qa_training_args,
        train_dataset=train_qa_dataset,
        eval_dataset=val_qa_dataset,  # Added evaluation dataset
    )

    # Train QA model
    qa_trainer.train()
    qa_model.save_pretrained("fine_tuned_qa_model")
    tokenizer.save_pretrained("fine_tuned_qa_model")
    print("QA model fine-tuning completed and saved.")

if __name__ == "__main__":
    fine_tune_models()


Transformers version: 4.52.4


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.088458
2,1.083800,1.071702
3,1.083800,1.041938


Intent model fine-tuning completed and saved.


Epoch,Training Loss,Validation Loss
1,No log,4.122265
2,4.598600,4.104264
3,4.598600,4.075503


QA model fine-tuning completed and saved.


In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset
import torch
import json
import numpy as np
import shutil
import os
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load symptom data
try:
    with open("symptom_data.json", "r") as f:
        data = json.load(f)
except FileNotFoundError:
    logger.error("symptom_data.json not found in the current directory")
    raise

# Define intent labels
intent_labels = ["greeting", "small_talk", "symptom_query", "out_of_scope"]

# Prepare dataset for intent classification
intent_texts = []
intent_labels_idx = []
for intent in data["intents"]:
    for example in intent["examples"]:
        intent_texts.append(example)
        intent_labels_idx.append(intent_labels.index(intent["type"]) if intent["type"] in intent_labels else 3)

# Check dataset balance
label_counts = {label: intent_labels_idx.count(i) for i, label in enumerate(intent_labels)}
logger.info(f"Intent dataset distribution: {label_counts}")
if label_counts["greeting"] < 5:
    logger.warning("Too few greeting examples. Adding more examples.")
    intent_texts.extend(["hello there", "hi bot", "greetings", "hey bot", "good morning"])
    intent_labels_idx.extend([0, 0, 0, 0, 0])

# Prepare dataset for question answering
qa_questions = []
qa_contexts = []
qa_answers = []
for intent in data["intents"]:
    if intent["type"] == "symptom_query":
        for example in intent["examples"]:
            qa_questions.append(example)
            qa_contexts.append(intent["response"])
            answer_text = intent["response"].split(".")[0]
            start_pos = intent["response"].find(answer_text)
            qa_answers.append({"text": answer_text, "start": start_pos, "end": start_pos + len(answer_text)})

# Create datasets
intent_dataset = Dataset.from_dict({"text": intent_texts, "label": intent_labels_idx})
qa_dataset = Dataset.from_dict({"question": qa_questions, "context": qa_contexts, "answers": qa_answers})

# Load tokenizer
try:
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
except Exception as e:
    logger.error(f"Error loading tokenizer: {e}")
    raise

# Tokenize intent classification dataset
def tokenize_intent(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

intent_dataset = intent_dataset.map(tokenize_intent, batched=True)
intent_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Tokenize QA dataset
def tokenize_qa(examples):
    encodings = tokenizer(
        examples["question"],
        examples["context"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_offsets_mapping=True
    )
    start_positions = []
    end_positions = []
    for i, offset in enumerate(encodings["offset_mapping"]):
        start_char = examples["answers"][i]["start"]
        end_char = examples["answers"][i]["end"]
        start_pos = None
        end_pos = None
        for j, (start, end) in enumerate(offset):
            if start <= start_char < end:
                start_pos = j
            if start < end_char <= end:
                end_pos = j
        start_positions.append(start_pos if start_pos is not None else 0)
        end_positions.append(end_pos if end_pos is not None else 0)
    encodings["start_positions"] = start_positions
    encodings["end_positions"] = end_positions
    return encodings

qa_dataset = qa_dataset.map(tokenize_qa, batched=True)
qa_dataset.set_format("torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])

# Load models
try:
    intent_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(intent_labels))
    qa_model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
except Exception as e:
    logger.error(f"Error loading models: {e}")
    raise

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,  # Increased for better convergence
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize trainers
intent_trainer = Trainer(
    model=intent_model,
    args=training_args,
    train_dataset=intent_dataset,
)

qa_trainer = Trainer(
    model=qa_model,
    args=training_args,
    train_dataset=qa_dataset,
)

# Fine-tune models
try:
    logger.info("Starting intent model training...")
    intent_trainer.train()
    logger.info("Intent model training completed.")
except Exception as e:
    logger.error(f"Error during intent model training: {e}")
    raise

try:
    logger.info("Starting QA model training...")
    qa_trainer.train()
    logger.info("QA model training completed.")
except Exception as e:
    logger.error(f"Error during QA model training: {e}")
    raise

# Save fine-tuned models with error handling
def save_model_with_retry(model, save_path, retries=3):
    for attempt in range(retries):
        try:
            if os.path.exists(save_path):
                logger.info(f"Removing existing directory: {save_path}")
                shutil.rmtree(save_path)
            os.makedirs(save_path, exist_ok=True)
            model.save_pretrained(save_path)
            logger.info(f"Model saved successfully to {save_path}")
            return
        except OSError as e:
            logger.error(f"Attempt {attempt + 1} failed to save model to {save_path}: {e}")
            if attempt == retries - 1:
                raise OSError(f"Failed to save model to {save_path} after {retries} attempts: {e}")
print(intent_texts, intent_labels_idx)
try:
    save_model_with_retry(intent_model, "fine_tuned_intent_model")
    save_model_with_retry(tokenizer, "fine_tuned_intent_model")
    save_model_with_retry(qa_model, "fine_tuned_qa_model")
except OSError as e:
    logger.error(f"Error saving models: {e}")
    raise

INFO:__main__:Intent dataset distribution: {'greeting': 0, 'small_talk': 3, 'symptom_query': 49, 'out_of_scope': 2}
Map: 100%|██████████| 59/59 [00:00<00:00, 1768.96 examples/s]
Map: 100%|██████████| 49/49 [00:00<00:00, 789.65 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Starting intent model training...


Step,Training Loss
10,1.3647
20,1.3428
30,1.2937
40,1.2059


INFO:__main__:Intent model training completed.
INFO:__main__:Starting QA model training...


Step,Training Loss
10,6.2502
20,6.2073
30,6.1352


INFO:__main__:QA model training completed.
INFO:__main__:Model saved successfully to fine_tuned_intent_model
INFO:__main__:Removing existing directory: fine_tuned_intent_model
INFO:__main__:Model saved successfully to fine_tuned_intent_model
INFO:__main__:Model saved successfully to fine_tuned_qa_model
