<a href="https://colab.research.google.com/github/Jdimarucut/Assembly-ImageConverter/blob/main/MachineLearningPIT(reworked).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import torch
import numpy as np
import pandas as pd
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import word_tokenize
import random
import re
from sklearn.metrics import accuracy_score

In [None]:
# Download necessary NLTK data
nltk.download('punkt')

# Set random seed for reproducibility
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Mount Google Drive to access files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path to your JSON file - update this to your actual file path
JSON_FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/revised_intent.json'  # Path to your JSON file
print(f"JSON file path set to: {JSON_FILE_PATH}")

JSON file path set to: /content/drive/MyDrive/Colab Notebooks/revised_intent.json


In [None]:
def load_and_preprocess_data(json_file):
    """
    Load the JSON file and preprocess for training

    Expected JSON format:
    [
        {
            "input": "user message",
            "response": "bot response"
        },
        ...
    ]
    OR
    {
        "intents": [
            {
                "tag": "greeting",
                "patterns": ["Hello", "Hi", ...],
                "responses": ["Hi there", "Hello", ...]
            },
            ...
        ]
    }
    """
    print("Loading dataset...")

    # Remove any trailing slash that might cause errors
    json_file = json_file.rstrip('/')

    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"File not found: {json_file}")
        return None
    except json.JSONDecodeError:
        print(f"Invalid JSON format in file: {json_file}")
        return None

    # Extract inputs, intent tags, and response mappings
    inputs = []
    intent_tags = []
    intent_responses = {}

    # Check the structure of your JSON data and process accordingly
    if isinstance(data, list):
        # Format: list of dictionaries with "input" and "response" keys
        # Convert to intent-based format
        unique_responses = []
        for item in data:
            if "input" in item and "response" in item:
                inputs.append(item["input"])
                if item["response"] not in unique_responses:
                    unique_responses.append(item["response"])
                intent_tags.append(unique_responses.index(item["response"]))

        # Create intent_responses mapping
        for i, response in enumerate(unique_responses):
            intent_responses[i] = [response]

    elif isinstance(data, dict) and "intents" in data:
        # Format: dict with "intents" key containing a list of intent objects
        for i, intent in enumerate(data["intents"]):
            # Store the intent tag and its responses
            if "patterns" in intent and "responses" in intent:
                tag = intent.get("tag", f"intent_{i}")
                if tag not in intent_responses:
                    intent_responses[tag] = intent["responses"]

                # Add each pattern with its corresponding intent tag
                for pattern in intent["patterns"]:
                    inputs.append(pattern)
                    intent_tags.append(tag)
    else:
        print("Unrecognized JSON format. Please check your data structure.")
        return None

    if not inputs or not intent_tags:
        print("No valid input-intent pairs found in the data.")
        return None

    # Create numerical labels for each unique intent tag
    unique_intents = list(set(intent_tags))
    intent_to_label = {intent: idx for idx, intent in enumerate(unique_intents)}
    labels = [intent_to_label[intent] for intent in intent_tags]

    # Store the mapping from label to responses
    label_to_responses = {}
    for intent, label in intent_to_label.items():
        label_to_responses[label] = intent_responses[intent]

    print(f"Found {len(unique_intents)} unique intents/classes")
    print(f"Processed {len(inputs)} training examples")

    return inputs, labels, label_to_responses

In [None]:
# Make sure your file path is correct and doesn't have a trailing slash
print(f"Attempting to load file from: {JSON_FILE_PATH}")
inputs, labels, label_to_responses = load_and_preprocess_data(JSON_FILE_PATH)

# Print some examples to verify data loading
print("\nSample data:")
for i in range(min(3, len(inputs))):
    print(f"Input: {inputs[i]}")
    print(f"Intent label: {labels[i]}")
    print(f"Possible responses: {label_to_responses[labels[i]][:1]}...")  # Show just one response
    print()


Attempting to load file from: /content/drive/MyDrive/Colab Notebooks/revised_intent.json
Loading dataset...
Found 88 unique intents/classes
Processed 680 training examples

Sample data:
Input: Hi
Intent label: 61
Possible responses: ['Hello! How can I assist you regarding USTP?']...

Input: Hello
Intent label: 61
Possible responses: ['Hello! How can I assist you regarding USTP?']...

Input: Hey
Intent label: 61
Possible responses: ['Hello! How can I assist you regarding USTP?']...



In [None]:
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    inputs, labels, test_size=0.2, random_state=RANDOM_SEED
)

print(f"Training examples: {len(train_inputs)}")
print(f"Validation examples: {len(val_inputs)}")

Training examples: 544
Validation examples: 136


In [None]:
# Initialize the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
print("Tokenizer initialized")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Tokenizer initialized


In [None]:
def tokenize_data(texts):
    """Tokenize text data using the DistilBERT tokenizer"""
    # Add basic text cleaning
    cleaned_texts = []
    for text in texts:
        # Convert to lowercase
        text = text.lower()
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters (keep letters, numbers, and basic punctuation)
        text = re.sub(r'[^\w\s.,?!]', '', text)
        cleaned_texts.append(text)

    return tokenizer(
        cleaned_texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )


In [None]:
class ChatbotDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.encodings = tokenize_data(inputs)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [None]:
# Create the datasets
train_dataset = ChatbotDataset(train_inputs, train_labels)
val_dataset = ChatbotDataset(val_inputs, val_labels)
print("Datasets created")

Datasets created


In [None]:
# Get the number of unique intent classes
num_intent_classes = len(set(labels))

# Initialize the DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_intent_classes,
)

# Move model to the specified device
model.to(device)
print("Model initialized with", num_intent_classes, "output classes (intents)")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized with 88 output classes (intents)


In [None]:
training_args = TrainingArguments(
    output_dir='./output',
    do_train=True,
    do_eval=True,
    num_train_epochs=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.05,
    logging_strategy='steps',
    logging_dir='./multi-class-logs',
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    load_best_model_at_end=True
)

In [None]:
# Define compute_metrics function for evaluation during training
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Initialize the Trainer with metrics and early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stop if no improvement for 3 epochs
)
print("Trainer initialized with evaluation metrics and early stopping")

Trainer initialized with evaluation metrics and early stopping


In [None]:
# Train the model
print("Starting model training...")
trainer.train()
print("Training completed!")

Starting model training...




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtocle-gabrielluke[0m ([33mtocle-gabrielluke-university-of-science-and-technology-o[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
50,4.4598,4.41316,0.058824
100,4.0116,3.759887,0.352941
150,2.9021,2.849568,0.470588
200,1.82,2.205715,0.595588
250,1.0196,1.778056,0.654412
300,0.5754,1.509081,0.669118
350,0.2879,1.336828,0.705882
400,0.1571,1.232166,0.676471
450,0.0828,1.172504,0.705882
500,0.0556,1.187533,0.691176


Training completed!


In [None]:
# Save the model and tokenizer
model_save_path = './chatbot_model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

# Save the intent-response mapping for future use
response_mapping_path = './intent_responses_mapping.json'
with open(response_mapping_path, 'w') as f:
    # Convert the label keys from int to string for JSON
    mapping_for_json = {str(label): responses for label, responses in label_to_responses.items()}
    json.dump(mapping_for_json, f, indent=2)
print(f"Intent-response mapping saved to {response_mapping_path}")

Model and tokenizer saved to ./chatbot_model
Intent-response mapping saved to ./intent_responses_mapping.json


In [None]:
# Function to get chatbot response
def get_chatbot_response(text, model, tokenizer, label_to_responses):
    """Generate a response for user input using the trained model"""
    # Tokenize input text
    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    ).to(device)

    # Set model to evaluation mode
    model.eval()

    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted class and its probability
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    confidence = probabilities[0][predicted_class].item()

    # Print debug info
    print(f"Debug - Predicted Intent Class: {predicted_class}, Confidence: {confidence:.4f}")

    # Get a response from the predicted class
    possible_responses = label_to_responses[predicted_class]
    response = random.choice(possible_responses)

    return response

In [None]:
# Test the model with some example queries
test_queries = [
    "What is USTP Handbook?",
    "What are the attributes of a USTP graduate?",
    "How can i qualify for dean's list?"
]

print("\nTesting the chatbot:")
for query in test_queries:
    response = get_chatbot_response(query, model, tokenizer, label_to_responses)
    print(f"User: {query}")
    print(f"Bot: {response}\n")


Testing the chatbot:
Debug - Predicted Intent Class: 80, Confidence: 0.9961
User: What is USTP Handbook?
Bot: The USTP Student Handbook (2023 Edition) contains the policies and procedures for the academic and non-academic engagements of the students. As primary constituents of this university, students are expected to rigorously follow all provisions in this handbook. It serves as a guide for all university stakeholders in bringing to life the university vision and mission of preparing students to become lifelong learners and nation builders. Approved by the Board on September 13, 2018 (BOR Resolution No. 62, s. 2018), it was reviewed and revised on December 2, 2020 (BOR Resolution No. 168, s. 2020), and again under BOR Resolution No. 53, s. 2023, to produce this 2023 edition.

Debug - Predicted Intent Class: 37, Confidence: 0.9945
User: What are the attributes of a USTP graduate?
Bot: A USTP graduate is an innovative and entrepreneurial thought leader and game changer in academia and

In [None]:
# Create a simple interactive demo with confidence threshold
def interactive_chat(confidence_threshold=0.3):
    """Interactive chatbot demo with confidence checking"""
    print("\n=== Interactive Chatbot Demo ===")
    print("Type 'exit' to end the conversation")
    print(f"Using confidence threshold: {confidence_threshold}\n")

    # Add a default response if confidence is too low
    default_responses = [
        "I'm not sure I understand. Could you rephrase that?",
        "I don't have enough information to respond to that properly.",
        "Could you provide more details about what you're asking?",
        "I'm still learning. Could you try asking in a different way?"
    ]

    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("Bot: Goodbye!")
            break

        # Get tokenized input
        inputs = tokenizer(
            user_input,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        ).to(device)

        # Set model to evaluation mode
        model.eval()

        # Get prediction with confidence
        with torch.no_grad():
            outputs = model(**inputs)

        # Get predicted class and confidence
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_class].item()

        print(f"[Debug] Predicted Intent: {predicted_class}, Confidence: {confidence:.4f}")

        # Check confidence threshold
        if confidence >= confidence_threshold:
            # Get a response from the predicted class
            possible_responses = label_to_responses[predicted_class]
            response = random.choice(possible_responses)
        else:
            # Use default response if confidence is too low
            response = random.choice(default_responses)
            print(f"[Low confidence response triggered: {confidence:.4f} < {confidence_threshold}]")

        print(f"Bot: {response}")

# Run the interactive demo
interactive_chat(confidence_threshold=0.3)



=== Interactive Chatbot Demo ===
Type 'exit' to end the conversation
Using confidence threshold: 0.3

You: 
[Debug] Predicted Intent: 61, Confidence: 0.9792
Bot: Hello! How can I assist you regarding USTP?
You: 
[Debug] Predicted Intent: 61, Confidence: 0.9792
Bot: Hello! How can I assist you regarding USTP?
You: a
[Debug] Predicted Intent: 61, Confidence: 0.9920
Bot: Greetings! Do you want to know more about USTP and its policies?
You: Hi
[Debug] Predicted Intent: 61, Confidence: 0.9967
Bot: Hello! How can I assist you regarding USTP?
You: Sup
[Debug] Predicted Intent: 61, Confidence: 0.9966
Bot: Greetings! Do you want to know more about USTP and its policies?
You: Waddup dog
[Debug] Predicted Intent: 61, Confidence: 0.9885
Bot: Greetings! Do you want to know more about USTP and its policies?
You: How to enrol for freshman
[Debug] Predicted Intent: 49, Confidence: 0.8747
Bot: Incoming Freshmen must submit:

a. A duly accomplished Application Form (available at the Admissions and Scho

KeyboardInterrupt: Interrupted by user

In [None]:
# Optional: Evaluate the model performance
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model():
    """Evaluate model performance on validation set"""
    val_dataloader = DataLoader(val_dataset, batch_size=16)

    # Store predictions and true labels
    all_predictions = []
    all_labels = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}

            # Get model predictions
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=1)

            # Store predictions and labels
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_predictions)
    print(f"Validation accuracy: {accuracy:.4f}")

    # Print detailed classification report
    print("\nClassification Report:")
    print(classification_report(all_labels, all_predictions))

# Run the evaluation
evaluate_model()

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Validation accuracy: 0.6912

Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         1
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         2
           5       1.00      0.67      0.80         3
           6       0.00      0.00      0.00         2
           7       1.00      0.50      0.67         2
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.75      1.00      0.86         3
          11       0.83      0.83      0.83         6
          12       1.00      1.00      1.00         7
          13       0.67      1.00      0.80         2
          14       1.00      1.00      1.00         2
          15       1.00      0.33      0.50         3
          16       1.00      1.00      1.00         1
          17       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
