# Intent Recognition & Context Management Integration

*This notebook integrates existing text cleaning and processing pipeline with intent recognition using Word2Vec-style embeddings (from the pre-trained GloVe model) and context management. We use an expanded dataset of common intents to train a classifier. The predicted intent and updated context for each input are printed without generating a response.*

#### Imports and Setup

In [2]:
import numpy as np
import re
import string
import nltk
import yaml
import gensim.downloader as api
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from numpy.linalg import norm
from sklearn.linear_model import LogisticRegression


# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Load pre-trained GloVe model (using gensim)
print("Loading GloVe embeddings...")
pretrained_model = api.load("glove-wiki-gigaword-100")  # 100-dimensional embeddings
embed_dim = pretrained_model.vector_size

# Load a pre-trained word embedding model.
# pretrained_model = api.load("glove-wiki-gigaword-100")

# Load the Google News Word2Vec model (300-dimensional)
# pretrained_model = api.load("word2vec-google-news-300")

# Load the GloVe Twitter model (e.g., 25-dimensional)
# pretrained_model = api.load("glove-twitter-25")

# Load the FastText model (300-dimensional with subword information)
# pretrained_model = api.load("fasttext-wiki-news-subwords-300")

# Load the ConceptNet Numberbatch model (300-dimensional)
# pretrained_model = api.load("conceptnet-numberbatch-17-06-300")

english_stop_words = list(set(stopwords.words("english")))
# print(english_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading GloVe embeddings...


#### Text Pre-Processing Functions

In [3]:
def to_lowercase(prompt):
    return prompt.lower()

def delete_stopwords(prompt):
    return " ".join([word for word in prompt.split() if word not in english_stop_words])

def text_cleaning(prompt):
    ignore_character = list(string.punctuation)
    pattern = f"[{re.escape(''.join(ignore_character))}]"
    cleaned_prompt = re.sub(pattern, " ", prompt)
    cleaned_prompt = re.sub(r"\b[a-z]\b", "", cleaned_prompt)
    cleaned_prompt = re.sub(r"\s+", " ", cleaned_prompt)
    return cleaned_prompt.strip()

def tokenization(sentence):
    return word_tokenize(sentence)

def lemmatization(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def nettoyage_corpus(corpus):
    """
    Process a list of documents: lowercasing, stopword deletion,
    cleaning, tokenization, and lemmatization.
    """
    cleaned_conversations = [
        lemmatization(tokenization(text_cleaning(delete_stopwords(to_lowercase(doc)))))
        for doc in corpus
    ]
    return cleaned_conversations

#### Sentence Embedding Function

In [4]:
def get_sentence_embedding(model, sentence):
    tokens = word_tokenize(sentence.lower())
    valid_tokens = [token for token in tokens if token in model]
    if not valid_tokens:
        return np.zeros(model.vector_size)
    embeddings = [model[token] for token in valid_tokens]
    return np.mean(embeddings, axis=0)

#### Expanded Intent Dataset and Classifier Training

In [5]:
intent_phrases = {
    "greeting": [
        "Hello", "Hi", "Hey there", "Good morning", "Good afternoon", "Good evening",
        "What's up", "Greetings", "Howdy", "Hi, how are you?", "Hey", "Hello there",
        "Hey, what's going on?", "Yo", "Hiya", "Hello, nice to see you!", "Hey buddy",
        "Good to see you", "Hi, hope you're well", "Hello, how do you do?"
    ],
    "goodbye": [
        "Goodbye", "Bye", "See you later", "Talk to you soon", "Farewell", "Take care",
        "Catch you later", "See ya", "Bye bye", "Adios", "Later", "So long", "Good night",
        "I'm off", "Peace out", "Ciao", "Until next time", "Farewell for now",
        "See you around", "Later alligator"
    ],
    "get_time": [
        "What time is it?", "Tell me the current time", "Could you give me the time?",
        "I need to know the time", "Time please", "Do you know what time it is?",
        "Can you tell me the time?", "What's the time now?", "Please share the time",
        "Current time?", "Time update", "What's the clock saying?", "Show me the time",
        "Time check", "What's the time, please?", "May I know the time?",
        "Could you update me with the time?", "Time now?", "Let me know the time", "Time?"
    ],
    "get_weather": [
        "What's the weather like today?", "Tell me the weather forecast", "How is the weather?",
        "Is it going to rain?", "Weather update please", "What's the temperature outside?",
        "Do I need an umbrella today?", "Weather report", "Current weather conditions?",
        "How's the weather outside?", "Forecast for today?", "Is it sunny or rainy?",
        "Weather status", "What's the climate like today?", "Do I need a jacket today?",
        "How's the weather looking?", "Any rain expected today?", "Weather check",
        "Let me know today's weather", "Weather update"
    ],
    "thanks": [
        "Thank you", "Thanks a lot", "Much appreciated", "Thanks", "Thank you very much",
        "I appreciate it", "Thanks a million", "Thank you so much", "Cheers", "Thanks a bunch",
        "Many thanks", "I'm grateful", "Thank you kindly", "I owe you one", "Appreciate it",
        "Thanks for everything", "Thanks, that was helpful", "Thank you, really appreciate it",
        "Thanks a ton", "Sincere thanks"
    ],
    "apology": [
        "I'm sorry", "My apologies", "Sorry for that", "I apologize", "Please forgive me",
        "Sorry about that", "My bad", "I didn't mean that", "I am really sorry", "Apologies",
        "I regret that", "So sorry", "Excuse me", "Pardon me", "I beg your pardon",
        "I sincerely apologize", "Forgive me, please", "I apologize for any inconvenience",
        "I'm truly sorry", "Sorry, my mistake"
    ],
    "unknown": [
        "I don't know", "Can you repeat that?", "What do you mean?", "I don't understand",
        "Could you say that again?", "Not sure what you mean", "I'm confused", "What?",
        "Huh?", "I have no idea", "Could you clarify?", "I didn't catch that",
        "Sorry, what did you say?", "I am not sure I follow", "Please explain",
        "I don't follow", "Could you rephrase that?", "I don't comprehend", "Unclear to me", "Not sure"
    ]
}

texts = []
labels = []
for intent, phrases in intent_phrases.items():
    for phrase in phrases:
        texts.append(phrase)
        labels.append(intent)

X = np.array([get_sentence_embedding(pretrained_model, text) for text in texts])
y = np.array(labels)

clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)

def ml_intent(sentence):
    embedding = get_sentence_embedding(pretrained_model, sentence).reshape(1, -1)
    return clf.predict(embedding)[0]

#### Context Manager Class

In [6]:
class ContextManager:
    def __init__(self):
        self.user_context = {}
    def update_context(self, user_id, intent):
        self.user_context[user_id] = intent
    def get_context(self, user_id):
        return self.user_context.get(user_id, None)

#### Retrieval-Based Response Generation

In [7]:
# Predefined candidate responses for each intent.
candidate_responses = {
    "greeting": [
        "Hello! How can I help you today?",
        "Hi there! What can I do for you?",
        "Hey! How's it going?"
    ],
    "goodbye": [
        "Goodbye! Have a great day.",
        "Bye! Take care.",
        "See you later!"
    ],
    "get_time": [
        "The current time is 3:45 PM.",  # placeholder text; in practice, call a time function
        "It's 3:45 in the afternoon right now.",
        "Right now, it's 3:45 PM."
    ],
    "get_weather": [
        "It's sunny and 25°C outside.",  # placeholder text; in practice, call a weather API
        "Currently, it's sunny with a temperature of 25°C.",
        "The weather is clear and warm at 25°C."
    ],
    "thanks": [
        "You're welcome!",
        "No problem, happy to help!",
        "Anytime!"
    ],
    "apology": [
        "No worries, it's okay.",
        "Apology accepted.",
        "Don't worry about it."
    ],
    "unknown": [
        "I'm not sure I understand. Could you please clarify?",
        "Sorry, I didn't catch that. Can you rephrase?",
        "I don't understand. Can you explain a bit more?"
    ]
}

# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    if norm(vec1) == 0 or norm(vec2) == 0:
        return 0.0
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

# Function to select the best candidate response based on cosine similarity of embeddings.
def select_response(user_input, predicted_intent):
    input_embedding = get_sentence_embedding(pretrained_model, user_input)
    responses = candidate_responses.get(predicted_intent, candidate_responses["unknown"])
    best_response = None
    best_similarity = -1
    for response in responses:
        resp_embedding = get_sentence_embedding(pretrained_model, response)
        similarity = cosine_similarity(input_embedding, resp_embedding)
        if similarity > best_similarity:
            best_similarity = similarity
            best_response = response
    return best_response

#### Integration: Full Pipeline Including Response Generation

In [8]:
# Optionally load conversation data from YAML (if available)
try:
    with open("conversations.yml", "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    conversations = data.get("conversations", {})
    print("Loaded Conversations:", conversations)
except FileNotFoundError:
    print("conversations.yml not found; skipping YAML load.")

# Simulated user inputs (these would come from your UI in practice)
user_inputs = [
    "Hi there!",
    "Can you tell me what time it is?",
    "What's the weather like outside?",
    "Thanks for your help!",
    "I'm sorry, I didn't understand that.",
    "Bye!"
]

context_manager = ContextManager()
user_id = "user123"

for user_input in user_inputs:
    # Process the input through the cleaning pipeline
    cleaned_corpus = nettoyage_corpus([user_input])
    cleaned_text = " ".join(cleaned_corpus[0])
    
    # Predict the intent using our ML-based classifier
    predicted_intent = ml_intent(cleaned_text)
    context_manager.update_context(user_id, predicted_intent)
    
    # Use the predicted intent and existing pipeline to select a candidate response.
    response = select_response(cleaned_text, predicted_intent)
    
    # Print the results: input, cleaned text, predicted intent, and selected response.
    print(f"Input: {user_input}")
    print(f"Cleaned Text: {cleaned_text}")
    print(f"Predicted Intent: {predicted_intent}")
    print(f"Selected Response: {response}")
    print(f"Current Context for {user_id}: {context_manager.get_context(user_id)}")
    print("-" * 50)

Loaded Conversations: [['Good morning, how are you?', 'I am doing well, how about you?', "I'm also good.", "That's good to hear.", 'Yes it is.'], ['Hello', 'Hi', 'How are you doing?', 'I am doing well.', 'That is good to hear', 'Yes it is.', 'Can I help you with anything?', 'Yes, I have a question.', 'What is your question?', 'Could I borrow a cup of sugar?', "I'm sorry, but I don't have any.", 'Thank you anyway', 'No problem'], ['How are you doing?', 'I am doing well, how about you?', 'I am also good.', "That's good."], ['Have you heard the news?', 'What good news?'], ['What is your favorite book?', "I can't read.", "So what's your favorite color?", 'Blue'], ['Who are you?', 'Who? Who is but a form following the function of what', 'What are you then?', 'A man in a mask.', 'I can see that.', "It's not your powers of observation I doubt, but merely the paradoxical nature of asking a masked man who is. But tell me, do you like music?", 'I like seeing movies.', 'What kind of movies do you

#### Seq2Seq Model

In [9]:
# Define a small conversational dataset

# Sample input-response pairs
pairs = [
    ("hello", "hi there"),
    ("hi", "hello, how can I help you?"),
    ("good morning", "good morning! how can I assist you today?"),
    ("hey", "hey there, what can I do for you?"),
    ("how are you", "i am fine, thanks for asking."),
    ("what's up", "not much, how about you?"),
    ("what is your name", "i am a chatbot, here to assist you."),
    ("who are you", "i am your virtual assistant, ready to help."),
    ("goodbye", "see you later, take care!"),
    ("bye", "goodbye, have a nice day!"),
    ("thanks", "you're welcome!"),
    ("thank you", "no problem, happy to help!"),
    ("i need help", "sure, what do you need help with?"),
    ("can you help me", "of course, how can i assist you?"),
    ("what time is it", "the current time is 3:45 pm."),
    ("what's the weather", "it's sunny and 25°C outside."),
    ("i am having a technical issue", "i'm sorry to hear that, can you describe the problem?"),
    ("i want to check my order status", "please provide your order number so I can check."),
    ("i would like a refund", "i'm sorry for the inconvenience. please share your order number for processing."),
    ("i don't understand", "could you please rephrase that?"),
    ("can you repeat that", "sure, let me repeat that for you."),
    ("what is your purpose", "i am here to assist you with any questions or tasks."),
    ("tell me a joke", "why did the scarecrow win an award? because he was outstanding in his field!"),
    ("what can you do", "i can help answer your questions, provide information, and assist with tasks."),
    ("how can i reset my password", "you can reset your password by clicking on 'forgot password' on the login page."),
    ("i am bored", "maybe try a new hobby, or i can share a fun fact with you."),
    ("tell me a fun fact", "did you know that honey never spoils?"),
    ("i need some advice", "what kind of advice are you looking for?"),
    ("what is the meaning of life", "that's a deep question! some say it's 42."),
    ("do you know any good restaurants", "i can recommend some if you tell me your location."),
    ("i want to book a flight", "sure, i can help with that. can you provide your travel dates?"),
    ("can i talk to a human", "i can connect you with a human agent, please hold on."),
]


def clean_text(text, remove_stopwords=False):
    # If text is a list, apply clean_text to each element
    if isinstance(text, list):
        return [clean_text(t, remove_stopwords) for t in text]
    
    # Process single string
    text = text.lower()
    text = re.sub(r"[{}]".format(re.escape(string.punctuation)), " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = word_tokenize(text)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    if remove_stopwords:
        tokens = [token for token in tokens if token not in english_stop_words]
    
    return tokens



# Separate inputs and targets
input_texts = [pair[0] for pair in pairs]
target_texts = [pair[1] for pair in pairs]

# Preprocess the texts using provided functions
# -------------------------------
# Experiment here removing stop words for Encoder layer or not ?
input_tokens = clean_text(input_texts, remove_stopwords=False)
target_tokens = clean_text(target_texts)

# Add special tokens for decoder
START_TOKEN = "<start>"
END_TOKEN = "<end>"

# For the target, add start and end tokens
target_tokens = [[START_TOKEN] + tokens + [END_TOKEN] for tokens in target_tokens]

#### Build Vocabulary from the dataset

In [10]:
def build_vocab(tokenized_texts, min_freq=1):
    freq = {}
    for tokens in tokenized_texts:
        for token in tokens:
            freq[token] = freq.get(token, 0) + 1
    vocab = {token for token, count in freq.items() if count >= min_freq}
    vocab = sorted(list(vocab))
    # Create word2idx and idx2word
    word2idx = {word: idx+2 for idx, word in enumerate(vocab)}  # reserve 0 for PAD, 1 for UNK
    word2idx["<pad>"] = 0
    word2idx["<unk>"] = 1
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

# Build vocab for both encoder and decoder (you can share the same vocab here)
all_tokens = input_tokens + target_tokens
word2idx, idx2word = build_vocab(all_tokens)

vocab_size = len(word2idx)
print("Vocabulary size:", vocab_size)

Vocabulary size: 167


#### Create Embedding Matrix using GloVe

In [11]:
embedding_matrix = np.zeros((vocab_size, embed_dim))
for word, idx in word2idx.items():
    if word in pretrained_model:
        embedding_matrix[idx] = pretrained_model[word]
    else:
        # Random initialization for words not found in GloVe
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embed_dim,))

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float)

#### Utility functions

In [12]:
def tokens_to_indices(tokens, word2idx):
    return [word2idx.get(token, word2idx["<unk>"]) for token in tokens]

def pad_sequence(seq, max_len):
    return seq + [word2idx["<pad>"]] * (max_len - len(seq))

# Convert all sentences to indices
encoder_inputs = [tokens_to_indices(tokens, word2idx) for tokens in input_tokens]
decoder_inputs = [tokens_to_indices(tokens, word2idx) for tokens in target_tokens]

# For simplicity, use the maximum length in the batch
encoder_max_len = max(len(seq) for seq in encoder_inputs)
decoder_max_len = max(len(seq) for seq in decoder_inputs)

encoder_inputs = [pad_sequence(seq, encoder_max_len) for seq in encoder_inputs]
decoder_inputs = [pad_sequence(seq, decoder_max_len) for seq in decoder_inputs]

encoder_inputs = torch.tensor(encoder_inputs, dtype=torch.long)
decoder_inputs = torch.tensor(decoder_inputs, dtype=torch.long)

#### Define the Seq2Seq Model (Encoder & Decoder)

In [13]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, embedding_matrix):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding.weight = nn.Parameter(embedding_matrix)
        # Freeze embedding weights if desired:
        # self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
    
    def forward(self, x):
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, embedding_matrix):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding.weight = nn.Parameter(embedding_matrix)
        # self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, input_token, hidden, cell):
        # input_token shape: (batch_size) -> add time dimension
        input_token = input_token.unsqueeze(1)
        embedded = self.embedding(input_token)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell

# Hyperparameters
hidden_size = 256
num_epochs = 300
learning_rate = 0.001
batch_size = encoder_inputs.size(0)  # using all data in one batch for simplicity

encoder = Encoder(vocab_size, embed_dim, hidden_size, embedding_matrix)
decoder = Decoder(vocab_size, embed_dim, hidden_size, embedding_matrix)

criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<pad>"])
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)

def generate_response(input_sentence, max_len=20):
    encoder.eval()
    decoder.eval()
    # Preprocess input sentence using the same pipeline
    tokens = clean_text([input_sentence])[0]
    indices = tokens_to_indices(tokens, word2idx)
    indices = pad_sequence(indices, encoder_max_len)
    input_tensor = torch.tensor([indices], dtype=torch.long)
    
    with torch.no_grad():
        hidden, cell = encoder(input_tensor)
        decoder_input = torch.tensor([word2idx[START_TOKEN]], dtype=torch.long)  # initial token
        output_sentence = []
        for _ in range(max_len):
            output, hidden, cell = decoder(decoder_input, hidden, cell)
            predicted_idx = output.argmax(1).item()
            if predicted_idx == word2idx.get(END_TOKEN, None):
                break
            output_sentence.append(idx2word.get(predicted_idx, "<unk>"))
            decoder_input = torch.tensor([predicted_idx], dtype=torch.long)
    return " ".join(output_sentence)

#### Training Loop

In [14]:
print("Training the Seq2Seq model...")

for epoch in range(1, num_epochs+1):
    encoder.train()
    decoder.train()
    optimizer.zero_grad()
    
    # Encoder forward
    hidden, cell = encoder(encoder_inputs)
    
    # Decoder forward with teacher forcing
    # Use the first token of decoder_inputs as the initial input for decoder
    decoder_input = decoder_inputs[:, 0]  # start tokens for each sample
    loss = 0
    for t in range(1, decoder_max_len):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        # target at time step t is decoder_inputs[:, t]
        loss += criterion(output, decoder_inputs[:, t])
        # Teacher forcing: feed the target as the next input
        decoder_input = decoder_inputs[:, t]
    
    loss.backward()
    optimizer.step()
    
    if epoch % 50 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item()/ (decoder_max_len-1):.4f}")

Training the Seq2Seq model...
Epoch [50/300], Loss: 2.3911
Epoch [100/300], Loss: 0.4855
Epoch [150/300], Loss: 0.0971
Epoch [200/300], Loss: 0.0364
Epoch [250/300], Loss: 0.0202
Epoch [300/300], Loss: 0.0133


#### Test the model

In [15]:
test_sentences = [
    "hello",
    "What's up ?",
    "good morning",
    "how are you today?",
    "what is your name?",
    "can you help me with my order?",
    "i need a refund",
    "i don't understand",
    "tell me a joke",
    "what's the weather like?",
    "i am having a technical issue",
    "can i talk to a human?",
    "goodbye"
]


print("\nGenerated responses:")
for sent in test_sentences:
    response = generate_response(sent)
    print(f"User: {sent}\nBot: {response}\n")


Generated responses:
User: hello
Bot: hi there

User: What's up ?
Bot: not much how about you

User: good morning
Bot: good morning how can i assist you today

User: how are you today?
Bot: i am your virtual assistant ready to help

User: what is your name?
Bot: i am a chatbot here to assist you

User: can you help me with my order?
Bot: of course can i can i assist you

User: i need a refund
Bot: i m sorry for the inconvenience please share your order for processing

User: i don't understand
Bot: could you please rephrase that

User: tell me a joke
Bot: why did the scarecrow win an award because he wa outstanding in his field

User: what's the weather like?
Bot: it s sunny and 25°c outside

User: i am having a technical issue
Bot: i m sorry to hear that can you describe the problem

User: can i talk to a human?
Bot: i can connect you with a human agent please hold on

User: goodbye
Bot: see you later take care



### RNN Based Response Generation

In [16]:
# Define the Encoder using a vanilla RNN
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, embedding_matrix):
        super(EncoderRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding.weight = nn.Parameter(embedding_matrix)
        # Replace LSTM with a vanilla RNN
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
    
    def forward(self, x):
        embedded = self.embedding(x)
        # The RNN returns outputs and hidden state (no cell state)
        outputs, hidden = self.rnn(embedded)
        return hidden  # shape: (num_layers, batch, hidden_size)

# Define the Decoder using a vanilla RNN
class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, embedding_matrix):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding.weight = nn.Parameter(embedding_matrix)
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, input_token, hidden):
        # input_token shape: (batch_size), add time dimension for RNN: (batch_size, 1)
        input_token = input_token.unsqueeze(1)
        embedded = self.embedding(input_token)
        # Forward pass through the RNN; we use the hidden state from the encoder (or previous step)
        output, hidden = self.rnn(embedded, hidden)
        # Predict the next token from the RNN output
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden

# Instantiate the RNN-based encoder and decoder
hidden_size = 256  # or your chosen size
encoder_rnn = EncoderRNN(vocab_size, embed_dim, hidden_size, embedding_matrix)
decoder_rnn = DecoderRNN(vocab_size, embed_dim, hidden_size, embedding_matrix)

criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<pad>"])
optimizer = optim.Adam(list(encoder_rnn.parameters()) + list(decoder_rnn.parameters()), lr=learning_rate)

# Training Loop using the RNN-based Seq2Seq model
print("Training the RNN-based Seq2Seq model...")

for epoch in range(1, num_epochs+1):
    encoder_rnn.train()
    decoder_rnn.train()
    optimizer.zero_grad()
    
    # Encoder forward: get the final hidden state from the encoder
    hidden = encoder_rnn(encoder_inputs)
    
    # Decoder forward with teacher forcing
    decoder_input = decoder_inputs[:, 0]  # start tokens for each sample
    loss = 0
    for t in range(1, decoder_max_len):
        # Use the hidden state from previous time step
        output, hidden = decoder_rnn(decoder_input, hidden)
        # target at time step t is decoder_inputs[:, t]
        loss += criterion(output, decoder_inputs[:, t])
        # Teacher forcing: feed the ground truth token as the next input
        decoder_input = decoder_inputs[:, t]
    
    loss.backward()
    optimizer.step()
    
    if epoch % 50 == 0:
        avg_loss = loss.item() / (decoder_max_len - 1)
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {avg_loss:.4f}")

# Response Generation function using the RNN-based model
def generate_response_rnn(input_sentence, max_len=20):
    encoder_rnn.eval()
    decoder_rnn.eval()
    # Preprocess input sentence using the same pipeline
    tokens = clean_text([input_sentence])[0]
    indices = tokens_to_indices(tokens, word2idx)
    indices = pad_sequence(indices, encoder_max_len)
    input_tensor = torch.tensor([indices], dtype=torch.long)
    
    with torch.no_grad():
        hidden = encoder_rnn(input_tensor)
        decoder_input = torch.tensor([word2idx[START_TOKEN]], dtype=torch.long)  # initial token
        output_sentence = []
        for _ in range(max_len):
            output, hidden = decoder_rnn(decoder_input, hidden)
            predicted_idx = output.argmax(1).item()
            if predicted_idx == word2idx.get(END_TOKEN, None):
                break
            output_sentence.append(idx2word.get(predicted_idx, "<unk>"))
            decoder_input = torch.tensor([predicted_idx], dtype=torch.long)
    return " ".join(output_sentence)

# Testing the RNN-based Seq2Seq model
test_sentences = [
    "hello",
    "what's up?",
    "good morning",
    "how are you today?",
    "what is your name?",
    "can you help me with my order?",
    "i need a refund",
    "i don't understand",
    "tell me a joke",
    "what's the weather like?",
    "i am having a technical issue",
    "can i talk to a human?",
    "goodbye"
]

print("\nGenerated responses using RNN:")
for sent in test_sentences:
    response = generate_response_rnn(sent)
    print(f"User: {sent}\nBot: {response}\n")


Training the RNN-based Seq2Seq model...
Epoch [50/300], Loss: 1.0209
Epoch [100/300], Loss: 0.1647
Epoch [150/300], Loss: 0.0414
Epoch [200/300], Loss: 0.0147
Epoch [250/300], Loss: 0.0088
Epoch [300/300], Loss: 0.0061

Generated responses using RNN:
User: hello
Bot: hi there

User: what's up?
Bot: not much how about you

User: good morning
Bot: good morning how can i assist you today

User: how are you today?
Bot: i am a chatbot here to assist you

User: what is your name?
Bot: i am a chatbot here to assist you

User: can you help me with my order?
Bot: of course how can i assist you

User: i need a refund
Bot: what kind of advice are you looking for

User: i don't understand
Bot: could you please rephrase that

User: tell me a joke
Bot: why did the scarecrow win an award because he wa outstanding in his field

User: what's the weather like?
Bot: it s sunny and 25°c outside

User: i am having a technical issue
Bot: i m sorry to hear that can you describe the problem

User: can i talk 

In [None]:
# Testing the RNN-based Seq2Seq model
test_sentences = [
    "hello",
    "what's up?",
    "good morning",
    "how are you today?",
    "what is your name?",
    "can you help me with my order?",
    "i need a refund",
    "i don't understand",
    "tell me a joke",
    "what's the weather like?",
    "i am having a technical issue",
    "can i talk to a human?",
    "goodbye"
]

print("\nGenerated responses using RNN:")
for sent in test_sentences:
    response = generate_response_rnn(sent)
    print(f"User: {sent}\nBot: {response}\n")


### Neural Net Auto Regressive

In [20]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import re
import string
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK resources
nltk.download("punkt")

# Expanded sample corpus: a larger set of sentences to improve training.
corpus = [
    "Hello how are you doing today",
    "I am doing well thank you",
    "How about you what are you doing",
    "I am reading a book about machine learning",
    "The weather is sunny and pleasant today",
    "I hope you have a great day ahead",
    "It is a beautiful day for a walk in the park",
    "What are your plans for the weekend",
    "I plan to visit a museum and then have dinner",
    "Learning new things every day is exciting",
    "Practice makes perfect so keep on training",
    "Artificial intelligence is changing the world",
    "I enjoy coding in Python and solving problems",
    "Music and art enrich our lives in many ways",
    "Traveling broadens the mind and inspires creativity",
    "Healthy eating and exercise are important for a happy life",
    "Reading helps expand your vocabulary and knowledge",
    "Good communication skills are essential in every field",
    "Technology is evolving rapidly every day",
    "Stay curious and never stop exploring new ideas"
]

# ---------------------
# Preprocessing & Vocabulary Building
# ---------------------

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[{}]".format(re.escape(string.punctuation)), "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Tokenize each sentence and clean it
tokenized_corpus = [word_tokenize(clean_text(sentence)) for sentence in corpus]

# Build vocabulary from corpus
def build_vocab(tokenized_texts, min_freq=1):
    freq = {}
    for tokens in tokenized_texts:
        for token in tokens:
            freq[token] = freq.get(token, 0) + 1
    vocab = {token for token, count in freq.items() if count >= min_freq}
    vocab = sorted(list(vocab))
    # Reserve 0 for PAD and 1 for UNK
    word2idx = {word: idx+2 for idx, word in enumerate(vocab)}
    word2idx["<pad>"] = 0
    word2idx["<unk>"] = 1
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

word2idx, idx2word = build_vocab(tokenized_corpus)
vocab_size = len(word2idx)
print("Vocabulary size:", vocab_size)

# Prepare training data:
# Concatenate all tokens from all sentences into one long sequence.
all_tokens = [token for tokens in tokenized_corpus for token in tokens]
data_indices = [word2idx.get(token, word2idx["<unk>"]) for token in all_tokens]

# Create input-target pairs using a sliding window (context_length = sequence length)
sequence_length = 3  # use previous 3 words to predict the next word
inputs = []
targets = []
for i in range(len(data_indices) - sequence_length):
    inputs.append(data_indices[i:i+sequence_length])
    targets.append(data_indices[i+sequence_length])

inputs = torch.tensor(inputs, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)
print("Number of training samples:", inputs.size(0))

# ---------------------
# Define the RNN Language Model
# ---------------------

class RNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers=1):
        super(RNNLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=word2idx["<pad>"])
        self.rnn = nn.RNN(embed_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, x, hidden):
        # x: (batch_size, seq_length)
        embedded = self.embedding(x)  # (batch_size, seq_length, embed_dim)
        out, hidden = self.rnn(embedded, hidden)  # out: (batch_size, seq_length, hidden_size)
        # Use the last output for prediction
        last_output = out[:, -1, :]  # (batch_size, hidden_size)
        logits = self.fc(last_output)  # (batch_size, vocab_size)
        return logits, hidden

# Hyperparameters
embed_dim = 50
hidden_size = 128
num_layers = 1
num_epochs = 300
learning_rate = 0.005
batch_size = 16

model = RNNLanguageModel(vocab_size, embed_dim, hidden_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# ---------------------
# Training Loop
# ---------------------

num_samples = inputs.size(0)
print("Training the RNN Language Model with an expanded dataset...")
for epoch in range(1, num_epochs+1):
    permutation = torch.randperm(num_samples)
    epoch_loss = 0.0
    for i in range(0, num_samples, batch_size):
        indices = permutation[i:i+batch_size]
        batch_inputs = inputs[indices]
        batch_targets = targets[indices]
        
        # Initialize hidden state: (num_layers, batch_size, hidden_size)
        hidden = torch.zeros(num_layers, batch_inputs.size(0), hidden_size)
        
        optimizer.zero_grad()
        outputs, hidden = model(batch_inputs, hidden)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    if epoch % 50 == 0:
        avg_loss = epoch_loss / (num_samples / batch_size)
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {avg_loss:.4f}")

# ---------------------
# Text Generation (Autoregressive Sampling)
# ---------------------

def generate_text(seed_text, gen_length=10):
    model.eval()
    tokens = word_tokenize(clean_text(seed_text))
    # Convert seed text to indices (if token not in vocab, use <unk>)
    current_indices = [word2idx.get(token, word2idx["<unk>"]) for token in tokens]
    # Ensure we have exactly sequence_length tokens as input: pad or trim as needed
    if len(current_indices) < sequence_length:
        current_indices = [word2idx["<pad>"]] * (sequence_length - len(current_indices)) + current_indices
    else:
        current_indices = current_indices[-sequence_length:]
    
    generated = tokens.copy()
    
    for _ in range(gen_length):
        input_tensor = torch.tensor([current_indices], dtype=torch.long)
        hidden = torch.zeros(num_layers, 1, hidden_size)
        with torch.no_grad():
            logits, hidden = model(input_tensor, hidden)
            # You can also sample from a softmax distribution for more variety.
            predicted_idx = logits.argmax(1).item()
        predicted_word = idx2word.get(predicted_idx, "<unk>")
        generated.append(predicted_word)
        current_indices = current_indices[1:] + [predicted_idx]
    
    return " ".join(generated)

# ---------------------
# Testing Text Generation
# ---------------------
seed = "hello how are"
generated_text = generate_text(seed, gen_length=15)
print("\nGenerated Text:")
print(generated_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Vocabulary size: 101
Number of training samples: 150
Training the RNN Language Model with an expanded dataset...
Epoch [50/300], Loss: 0.0210
Epoch [100/300], Loss: 0.0117
Epoch [150/300], Loss: 0.0116
Epoch [200/300], Loss: 0.0159
Epoch [250/300], Loss: 0.0107
Epoch [300/300], Loss: 0.0498

Generated Text:
hello how are you doing today i am doing well thank you how about you what are you
