In [1]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import Dataset
import os
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')

class IntentDataset(Dataset):
    """
    Map-style dataset that tokenizes one (text, label) pair per lookup.

    Tokenization happens lazily in ``__getitem__``; the tokenizer is asked to
    truncate and pad to ``max_length`` and to return PyTorch tensors.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One sample per text entry.
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])  # coerce non-string entries to text
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each sample is a 1-D tensor;
        # batching is left to the DataLoader / Trainer.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

def load_intent_data(json_file_path: str) -> tuple:
    """
    Load intent examples from a JSON file.

    Expected format: [{"text": "example", "intent": "intent_name"}, ...]

    Args:
        json_file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        A ``(texts, intents)`` tuple of two parallel lists, in file order.

    Raises:
        Exception: If the file is missing, is not valid JSON, is not a JSON
            list, or contains an item without the ``text``/``intent`` keys.
            The underlying error is attached as ``__cause__``.
    """
    # Keep the try body limited to the I/O and parsing that can actually fail,
    # and chain the original error so the root cause stays in the traceback.
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError as err:
        raise Exception(f"JSON file not found: {json_file_path}") from err
    except json.JSONDecodeError as err:
        raise Exception(f"Invalid JSON format in file: {json_file_path}") from err

    # A top-level object/string would otherwise fail later with an opaque
    # TypeError while indexing its elements.
    if not isinstance(data, list):
        raise Exception(
            f"Expected a JSON list of examples in file: {json_file_path} "
            f"(got {type(data).__name__})"
        )

    texts = []
    intents = []
    for index, item in enumerate(data):
        try:
            text, intent = item['text'], item['intent']
        except KeyError as err:
            raise Exception(f"Missing key in JSON data: {err}") from err
        except TypeError as err:
            # The item is not a mapping (e.g. a bare string or number).
            raise Exception(
                f"Invalid example at index {index} in file: {json_file_path} "
                f"(expected an object with 'text' and 'intent')"
            ) from err
        texts.append(text)
        intents.append(intent)

    print(f"Loaded {len(texts)} examples")
    print(f"Intents found: {set(intents)}")
    return texts, intents

def prepare_training_data(
    texts: List[str],
    intents: List[str],
    test_size: float = 0.2,
    random_state: int = 42,
):
    """
    Encode intent labels and split the data into train/validation sets.

    Args:
        texts: Input sentences.
        intents: Intent name for each entry of ``texts`` (same order).
        test_size: Fraction of the data held out for validation; passed
            straight to ``train_test_split``.
        random_state: Seed for the split, so runs are reproducible.

    Returns:
        ``(X_train, X_val, y_train, y_val, label_encoder)`` where the ``y``
        values are the integer-encoded labels and ``label_encoder`` is the
        fitted ``LabelEncoder`` needed to map predictions back to names.

    Note:
        The split is stratified on the encoded labels. scikit-learn is
        expected to reject a stratified split when a class is too small to
        appear on both sides -- confirm against the installed version.
    """
    # Encode labels as consecutive integers
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(intents)

    # Stratified split keeps the class proportions similar in both subsets
    X_train, X_val, y_train, y_val = train_test_split(
        texts,
        labels_encoded,
        test_size=test_size,
        random_state=random_state,
        stratify=labels_encoded,
    )

    print(f"Training samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")
    print(f"Class distribution - Train: {np.bincount(y_train)}")
    print(f"Class distribution - Val: {np.bincount(y_val)}")

    return X_train, X_val, y_train, y_val, label_encoder

def compute_metrics(eval_pred):
    """
    Turn an evaluation (scores, labels) pair into an accuracy metric dict.

    The predicted class is the arg-max over axis 1 of the scores.
    """
    scores, gold = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(gold, predicted_classes)
    return {'accuracy': accuracy}

def train_bert_intent_classifier(json_file_path: str, output_dir: str = "bert_intent_model"):
    """
    Train a BERT intent classifier with the Hugging Face ``Trainer``.

    Loads the examples, fine-tunes ``bert-base-uncased`` with periodic
    evaluation, then writes the model, tokenizer and label artifacts to
    ``output_dir`` and prints the final validation accuracy.

    Args:
        json_file_path: Path to the JSON training data (see
            ``load_intent_data`` for the expected format).
        output_dir: Directory for checkpoints, logs and the final model.

    Returns:
        ``(model, tokenizer, label_encoder)``.
    """
    import inspect  # local import: only used to probe TrainingArguments

    # Load data
    print("Loading training data...")
    texts, intents = load_intent_data(json_file_path)

    # Prepare data
    X_train, X_val, y_train, y_val, label_encoder = prepare_training_data(texts, intents)

    # Initialize tokenizer and model
    print("Initializing BERT model...")
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    num_labels = len(label_encoder.classes_)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

    # Create datasets
    train_dataset = IntentDataset(X_train, y_train, tokenizer)
    val_dataset = IntentDataset(X_val, y_val, tokenizer)

    # The evaluation-schedule keyword was renamed from `evaluation_strategy`
    # to `eval_strategy` in newer transformers releases. Pick whichever name
    # the installed version accepts so this works on both.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in accepted_args:
        eval_strategy_key = "eval_strategy"
    else:
        eval_strategy_key = "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=10,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f'{output_dir}/logs',
        logging_steps=10,
        eval_steps=50,
        save_steps=100,
        save_strategy="steps",
        # Restoring the best checkpoint needs evaluation and saving to use the
        # same strategy, hence "steps" for both.
        load_best_model_at_end=True,
        metric_for_best_model="eval_accuracy",
        greater_is_better=True,
        save_total_limit=2,
        **{eval_strategy_key: "steps"},  # Use "steps" instead of "epoch"
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train
    print("Starting training...")
    trainer.train()

    # Save model and tokenizer
    print("Saving model...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Save the index -> intent-name mapping (JSON turns the int keys into strings)
    label_mapping = {i: label for i, label in enumerate(label_encoder.classes_)}
    with open(f'{output_dir}/label_mapping.json', 'w', encoding='utf-8') as f:
        json.dump(label_mapping, f, indent=2)

    # Save label encoder classes for inference
    np.save(f'{output_dir}/label_classes.npy', label_encoder.classes_)

    print(f"Model saved to {output_dir}")
    print(f"Label mapping: {label_mapping}")

    # Evaluate
    print("Final evaluation:")
    eval_results = trainer.evaluate()
    print(f"Validation accuracy: {eval_results['eval_accuracy']:.4f}")

    return model, tokenizer, label_encoder

# Alternative simple training function without Trainer
def train_bert_simple(
    json_file_path: str,
    output_dir: str = "bert_intent_model",
    num_epochs: int = 10,
    batch_size: int = 16,
    learning_rate: float = 2e-5,
):
    """
    Simple training function without Trainer for maximum compatibility.

    Runs a plain PyTorch fine-tuning loop over ``bert-base-uncased``, printing
    the mean training loss and the validation accuracy after every epoch, then
    saves the model, tokenizer and label artifacts to ``output_dir``.

    Args:
        json_file_path: Path to the JSON training data (see
            ``load_intent_data`` for the expected format).
        output_dir: Directory the model and label files are written to.
        num_epochs: Number of passes over the training set.
        batch_size: Batch size for both the training and validation loaders.
        learning_rate: AdamW learning rate.

    Returns:
        ``(model, tokenizer, label_encoder)``.

    Note:
        The weights saved are those from the last epoch, not from the epoch
        with the best validation accuracy.
    """
    import torch.optim as optim
    from torch.utils.data import DataLoader

    # Load data
    print("Loading training data...")
    texts, intents = load_intent_data(json_file_path)

    # Prepare data
    X_train, X_val, y_train, y_val, label_encoder = prepare_training_data(texts, intents)

    # Initialize tokenizer and model
    print("Initializing BERT model...")
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    num_labels = len(label_encoder.classes_)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

    # Create datasets and dataloaders
    train_dataset = IntentDataset(X_train, y_train, tokenizer)
    val_dataset = IntentDataset(X_val, y_val, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Training setup. No separate criterion is needed: passing `labels` to the
    # model makes it return the loss directly.
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Training loop
    print("Starting training...")
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validation
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                predictions = torch.argmax(outputs.logits, dim=1)
                correct += (predictions == labels).sum().item()
                total += labels.size(0)

        accuracy = correct / total
        print(
            f'Epoch {epoch+1}/{num_epochs} - '
            f'Loss: {total_loss/len(train_loader):.4f} - Val Accuracy: {accuracy:.4f}'
        )

    # Save model and tokenizer
    print("Saving model...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Save the index -> intent-name mapping (JSON turns the int keys into strings)
    label_mapping = {i: label for i, label in enumerate(label_encoder.classes_)}
    with open(os.path.join(output_dir, 'label_mapping.json'), 'w', encoding='utf-8') as f:
        json.dump(label_mapping, f, indent=2)

    # Also save the label encoder classes, matching what the Trainer-based
    # function writes, so inference code can rely on the same file set.
    np.save(os.path.join(output_dir, 'label_classes.npy'), label_encoder.classes_)

    print(f"Model saved to {output_dir}")
    return model, tokenizer, label_encoder

def predict_intent(text: str, model, tokenizer, label_encoder, max_length=128):
    """
    Classify one text and return ``(intent_label, confidence)``.

    The confidence is the softmax probability of the winning class; the label
    is recovered from the class index via ``label_encoder.inverse_transform``.
    """
    model.eval()
    # Run on whatever device the model's weights already live on.
    target_device = next(model.parameters()).device

    encoded = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    model_inputs = {
        name: encoded[name].to(target_device)
        for name in ('input_ids', 'attention_mask')
    }

    with torch.no_grad():
        logits = model(**model_inputs).logits
        probabilities = torch.softmax(logits, dim=-1)
        best_idx = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][best_idx].item()

    predicted_label = label_encoder.inverse_transform([best_idx])[0]
    return predicted_label, confidence

if __name__ == "__main__":
    # Where the training data is read from and where the model is written to.
    JSON_FILE_PATH = "/content/intent.json"  # Update with your JSON file path
    MODEL_OUTPUT_DIR = "/content/bert_intent_model"

    try:
        # The plain PyTorch loop is used here rather than the Trainer-based
        # function, for maximum compatibility.
        print("Using simple training method for compatibility...")
        trained = train_bert_simple(JSON_FILE_PATH, MODEL_OUTPUT_DIR)
        model, tokenizer, label_encoder = trained

        # Quick smoke test on a few hand-written sentences.
        test_texts = [
            "Hello there!",
            "What are the side effects of aspirin?",
            "What's the weather like today?",
            "Dosage for metformin",
        ]

        print("\nTest predictions:")
        for text in test_texts:
            intent, confidence = predict_intent(text, model, tokenizer, label_encoder)
            print(f"Text: '{text}' -> Intent: {intent} (confidence: {confidence:.3f})")

    except Exception as e:
        # Top-level boundary: report the failure with its full traceback.
        import traceback
        print(f"Error: {e}")
        traceback.print_exc()

Using simple training method for compatibility...
Loading training data...
Loaded 902 examples
Intents found: {'irrelevant', 'drug_medical', 'greeting'}
Training samples: 721
Validation samples: 181
Class distribution - Train: [626  36  59]
Class distribution - Val: [157   9  15]
Initializing BERT model...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...
Epoch 1/10 - Loss: 0.3912 - Val Accuracy: 0.9669
Epoch 2/10 - Loss: 0.1045 - Val Accuracy: 0.9779
Epoch 3/10 - Loss: 0.0468 - Val Accuracy: 0.9779
Epoch 4/10 - Loss: 0.0187 - Val Accuracy: 1.0000
Epoch 5/10 - Loss: 0.0068 - Val Accuracy: 1.0000
Epoch 6/10 - Loss: 0.0042 - Val Accuracy: 0.9834
Epoch 7/10 - Loss: 0.0029 - Val Accuracy: 0.9834
Epoch 8/10 - Loss: 0.0023 - Val Accuracy: 0.9834
Epoch 9/10 - Loss: 0.0019 - Val Accuracy: 0.9834
Epoch 10/10 - Loss: 0.0016 - Val Accuracy: 0.9834
Saving model...
Model saved to /content/bert_intent_model

Test predictions:
Text: 'Hello there!' -> Intent: greeting (confidence: 0.997)
Text: 'What are the side effects of aspirin?' -> Intent: drug_medical (confidence: 0.999)
Text: 'What's the weather like today?' -> Intent: irrelevant (confidence: 0.996)
Text: 'Dosage for metformin' -> Intent: drug_medical (confidence: 0.999)
