In [None]:
# Mount Google Drive into the Colab filesystem; the dataset CSV is read from it below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
!pip install -U transformers datasets scikit-learn seaborn

import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset

# Optional: switch off Weights & Biases logging through its environment flag.
import os

os.environ.update({"WANDB_DISABLED": "true"})

# Read the labelled examples from Drive and add an encoded copy of the `label` column.
DATASET_CSV = '/content/drive/MyDrive/Colab Notebooks/dental_health_dataset.csv'
df = pd.read_csv(DATASET_CSV)

# The fitted encoder is kept around: it is needed later to map predictions back to names.
label_encoder = LabelEncoder()
raw_labels = df['label']
df['label_encoded'] = label_encoder.fit_transform(raw_labels)

# Tokenizer and classifier both start from the same pretrained DistilBERT checkpoint;
# the classification head is sized to the number of distinct labels seen by the encoder.
BASE_CHECKPOINT = 'distilbert-base-uncased'
num_classes = len(label_encoder.classes_)

tokenizer = DistilBertTokenizerFast.from_pretrained(BASE_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(
    BASE_CHECKPOINT,
    num_labels=num_classes,
)

# Tokenization
def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to a fixed length of 128 tokens.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(batch['text'], **encoding_options)

# Build an 80/20 train/test split (fixed seed), tokenize both splits, and rename the
# encoded label column to "labels".
feature_columns = ['text', 'label_encoded']
dataset = Dataset.from_pandas(df[feature_columns]).train_test_split(test_size=0.2, seed=42)

tokenized_ds = (
    dataset
    .map(tokenize, batched=True)
    .rename_column("label_encoded", "labels")
)
# Only the model inputs and the labels are exposed, as torch tensors.
tokenized_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Hyperparameters and output locations for the fine-tuning run.
training_options = dict(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    logging_dir='./logs',
)
training_args = TrainingArguments(**training_options)

# Wire the model, arguments and both splits into a Trainer, then fine-tune.
train_split = tokenized_ds["train"]
eval_split = tokenized_ds["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

trainer.train()

# Run the fine-tuned model on the held-out split and take the arg-max class per example.
predictions = trainer.predict(tokenized_ds["test"])
logits = predictions.predictions
y_true = predictions.label_ids
y_pred = logits.argmax(axis=1)

# Restrict reporting to the classes that actually occur in the true or predicted labels,
# so the label ids and their display names stay aligned.
used_labels = unique_labels(y_true, y_pred)
used_class_names = label_encoder.inverse_transform(used_labels)

# Per-class precision/recall/F1 for the classes selected above.
print("\nClassification Report:")
report_text = classification_report(
    y_true,
    y_pred,
    labels=used_labels,
    target_names=used_class_names,
)
print(report_text)

# Confusion matrix drawn as an annotated heatmap (rows: true label, columns: predicted).
cm = confusion_matrix(y_true, y_pred, labels=used_labels)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=used_class_names,
    yticklabels=used_class_names,
    cmap="Blues",
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()
plt.show()

In [None]:
pip install gradio joblib

In [None]:
# Persist the fitted label encoder so the inference cell can map class ids back to names.
import joblib

joblib.dump(value=label_encoder, filename="label_encoder.joblib")

In [None]:
# Write the fine-tuned model to the directory the inference cell loads it from.
trainer.save_model(output_dir="./results")

In [None]:
import gradio as gr
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import joblib

# Label encoder written with joblib.dump during training; maps class ids back to label names.
label_encoder = joblib.load("label_encoder.joblib")

# Fine-tuned weights come from the local save directory; the tokenizer is the stock
# pretrained one.
MODEL_DIR = "./results"
model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def predict(text):
    """Classify a query and return the confidence of every known class.

    The output component is ``gr.Label(num_top_classes=3)``, which ranks the entries of
    a label -> confidence mapping. Returning only the arg-max class (as before) meant the
    top-3 view could never show more than one entry, so the full distribution is returned.

    Args:
        text: The raw query string typed by the user.

    Returns:
        dict mapping each class name (as ``str``) to its softmax probability, rounded to
        three decimals. The highest-scoring entry is the predicted class.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():  # inference only: no gradients needed
        outputs = model(**inputs)
    # A single input string gives a batch of one; take its row of class probabilities.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    # LabelEncoder assigns id i to classes_[i], so classes_ lines up with the logit order.
    return {
        str(class_name): round(float(prob), 3)
        for class_name, prob in zip(label_encoder.classes_, probs.tolist())
    }

# UI components: a two-line text box for the query and a ranked-label view for the result.
query_box = gr.Textbox(lines=2, placeholder="Enter a dental query...")
label_view = gr.Label(num_top_classes=3)

interface = gr.Interface(
    fn=predict,
    inputs=query_box,
    outputs=label_view,
    title="Dental Query Chatbot",
    description="Enter a dental-related question and the model will classify it.",
)

interface.launch()