In [1]:
import numpy as np
import json

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import nlp_utils
from NN_model import NeuralNetwork

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.is_available()

True

In [3]:
# Load JSON file
def load_intents(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Save JSON file
def save_intents(data, file_path):
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

In [4]:
# # File paths
# input_file = "intents_augmented_1.json"
# output_file = "intents_augmented.json"

# # Load, Augment, and Save
# intents = load_intents(input_file)
# augmented_intents = nlp_utils.augment_intents(intents)
# save_intents(augmented_intents, output_file)

# print(f"✅ Data augmentation complete! Saved as {output_file}")

In [5]:
patterns = []  # Collect all patterns
pattern_tags = []  # Store corresponding tags

augmented_intents = load_intents("intents_augmented.json")

# Process each pattern
for intent in augmented_intents["intents"]:
    for pattern in intent["patterns"]:
        patterns.append(pattern)  
        pattern_tags.append(intent["tag"])

In [6]:
# text = ["hello, How Are You, Friend?", "Tell me a funny joke:)"]

# processed_pattern = nlp_utils.remove_punctuation(text)
# print("removed_punc:", processed_pattern)

# processed_pattern = nlp_utils.tokenize(processed_pattern)
# print("tokenized:", processed_pattern)

# processed_pattern = nlp_utils.remove_stopwords(processed_pattern)
# print("removed_stp:", processed_pattern)

# processed_pattern = nlp_utils.stem(processed_pattern)
# print("stemmed:", processed_pattern)

# processed_pattern = nlp_utils.join_tokens(processed_pattern)
# print("joined:", processed_pattern)

# processed_pattern = nlp_utils.get_embedding(processed_pattern)
# print("embedded:", len(processed_pattern))

In [7]:
pipeline = nlp_utils.create_pipeline()  # Load pipeline once
processed_patterns = pipeline.transform(patterns)  # Transform all at once

# word2vec_model = nlp_utils.train_word2vec(processed_patterns)

# embedded_patterns = nlp_utils.word2vec_embeddings(processed_patterns, word2vec_model)

# Apply GloVe embeddings to patterns
# embedded_patterns = np.array([get_embedding(" ".join(pattern)) for pattern in processed_patterns])



In [8]:
tag_to_index = {tag: idx for idx, tag in enumerate(sorted(set(pattern_tags)))}
index_to_tag = {idx: tag for tag, idx in tag_to_index.items()}


In [9]:
# Convert training data into vectors
X_data = np.array(processed_patterns)
y_data = np.array([tag_to_index[tag] for tag in pattern_tags])


In [10]:
scaler = StandardScaler()
X_data = scaler.fit_transform(X_data)  # Normalize input data


In [11]:
X_train, X_val, y_train, y_val = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42
)

In [12]:
X_train.shape

(1060, 200)

In [13]:
# Hyper-parameters 
num_epochs = 1000
input_size = 200
hidden_size = 128
output_size = len(set(y_train))

class ChatDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.n_samples = len(X_data)
        self.xs = torch.tensor(X_data, dtype=torch.float32).to(device)
        self.ys = torch.tensor(y_data, dtype=torch.long).to(device)
        
    def __getitem__(self, idx):
        return self.xs[idx], self.ys[idx]

    def __len__(self):
        return self.n_samples


# Create dataset objects
train_dataset = ChatDataset(X_train, y_train)
val_dataset = ChatDataset(X_val, y_val)

# Create DataLoader objects
train_loader = DataLoader(dataset=train_dataset, shuffle=True, num_workers=0)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, num_workers=0)  # No shuffle for validation


In [14]:
model = NeuralNetwork(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-4)  # More stable optimizer
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=35, factor=0.5, verbose=True)




In [15]:
# Early stopping setup
patience = 25
best_val_loss = np.inf
epochs_no_improve = 0

for epoch in range(num_epochs):
    model.train()
    epoch_loss, epoch_accuracy, total_samples = 0.0, 0.0, 0

    for words, labels in train_loader:
        words, labels = words.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(words)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        batch_accuracy = (outputs.argmax(dim=1) == labels).sum().item() / batch_size
        epoch_loss += loss.item() * batch_size
        epoch_accuracy += batch_accuracy * batch_size
        total_samples += batch_size

    epoch_loss /= total_samples
    epoch_accuracy /= total_samples

    # Validation Phase
    model.eval()
    val_loss, val_accuracy, val_samples = 0.0, 0.0, 0

    with torch.no_grad():
        for words, labels in val_loader:
            words, labels = words.to(device), labels.to(device)

            outputs = model(words)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            batch_accuracy = (outputs.argmax(dim=1) == labels).sum().item() / batch_size
            val_loss += loss.item() * batch_size
            val_accuracy += batch_accuracy * batch_size
            val_samples += batch_size

    val_loss /= val_samples
    val_accuracy /= val_samples

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), "best_model.pth")  # Save best model
    else:
        epochs_no_improve += 1

    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], '
              f'Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, '
              f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs!")
        break

    scheduler.step(val_loss)  # Adjust LR

print("Training complete! Best validation loss:", best_val_loss)

Epoch [5/1000], Loss: 2.0289, Accuracy: 0.5019, Val Loss: 1.4334, Val Accuracy: 0.7293
Epoch [10/1000], Loss: 1.1244, Accuracy: 0.7349, Val Loss: 0.6690, Val Accuracy: 0.8647
Epoch [15/1000], Loss: 0.6742, Accuracy: 0.8387, Val Loss: 0.4195, Val Accuracy: 0.8985
Epoch [20/1000], Loss: 0.4993, Accuracy: 0.8679, Val Loss: 0.3433, Val Accuracy: 0.9098
Epoch [25/1000], Loss: 0.4374, Accuracy: 0.8792, Val Loss: 0.3212, Val Accuracy: 0.9098
Epoch [30/1000], Loss: 0.3559, Accuracy: 0.8972, Val Loss: 0.2835, Val Accuracy: 0.9211
Epoch [35/1000], Loss: 0.3377, Accuracy: 0.9123, Val Loss: 0.2696, Val Accuracy: 0.9248
Epoch [40/1000], Loss: 0.2964, Accuracy: 0.9160, Val Loss: 0.2533, Val Accuracy: 0.9173
Epoch [45/1000], Loss: 0.2578, Accuracy: 0.9302, Val Loss: 0.2646, Val Accuracy: 0.9211
Epoch [50/1000], Loss: 0.2607, Accuracy: 0.9208, Val Loss: 0.2204, Val Accuracy: 0.9474
Epoch [55/1000], Loss: 0.2790, Accuracy: 0.9198, Val Loss: 0.2343, Val Accuracy: 0.9361
Epoch [60/1000], Loss: 0.2346, Ac

In [18]:
data = {
"model_state": model.state_dict(),
"input_size": input_size,
"hidden_size": hidden_size,
"output_size": output_size,
"all_words": patterns,
"tags": sorted(set(pattern_tags))
}

FILE = f"data_{round(epoch_accuracy, 2)}_{round(val_accuracy, 2)}.pth"
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

training complete. file saved to data_0.94_0.94.pth


In [19]:
import joblib

joblib.dump(pipeline, "pipeline.pkl")
print("Pipeline has been saved successfully!")


joblib.dump(scaler, "scaler.pkl")
print("Scaler saved successfully!")


Pipeline has been saved successfully!
Scaler saved successfully!
