# Intent Recognition & Context Management Integration

*This notebook integrates the existing text-cleaning and processing pipeline with intent recognition and context management. Intent recognition uses Word2Vec-style word embeddings taken from a pre-trained GloVe model, and a classifier is trained on an expanded dataset of common intents. For each input, the predicted intent and the updated context are printed; no response is generated.*

#### Imports and Setup

In [18]:
import numpy as np
import re
import string
import nltk
import yaml
import gensim.downloader as api
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from numpy.linalg import norm
from sklearn.linear_model import LogisticRegression


# Fetch the NLTK resources used below: the stopword list, the "punkt"
# tokenizer data and WordNet.
for nltk_resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(nltk_resource)

# Load the pre-trained GloVe vectors (100-dimensional) through gensim's downloader.
# Other downloader models that can be swapped in on the api.load(...) line:
#   "word2vec-google-news-300"          Google News Word2Vec, 300-dimensional
#   "glove-twitter-25"                  GloVe Twitter, e.g. 25-dimensional
#   "fasttext-wiki-news-subwords-300"   FastText, 300-dimensional with subword information
#   "conceptnet-numberbatch-17-06-300"  ConceptNet Numberbatch, 300-dimensional
print("Loading GloVe embeddings...")
pretrained_model = api.load("glove-wiki-gigaword-100")
embed_dim = pretrained_model.vector_size

# English stopwords, de-duplicated through a set and stored as a list.
english_stop_words = list(set(stopwords.words("english")))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading GloVe embeddings...


#### Text Pre-Processing Functions

In [19]:
def to_lowercase(prompt):
    """Return `prompt` with all cased characters converted to lowercase."""
    lowered = prompt.lower()
    return lowered

def delete_stopwords(prompt):
    """
    Remove stopwords from `prompt`.

    The prompt is split on whitespace; every word that appears in the
    module-level `english_stop_words` (exact, case-sensitive match) is
    dropped and the remaining words are re-joined with single spaces.
    """
    kept_words = []
    for candidate in prompt.split():
        if candidate in english_stop_words:
            continue
        kept_words.append(candidate)
    return " ".join(kept_words)

def text_cleaning(prompt):
    """
    Strip punctuation and stray single letters from `prompt`.

    Every character of `string.punctuation` is replaced by a space, isolated
    single lowercase letters (a-z) are removed, runs of whitespace are
    collapsed to one space, and leading/trailing whitespace is trimmed.
    """
    # Map each punctuation character to a single space.
    punctuation_to_space = str.maketrans({symbol: " " for symbol in string.punctuation})
    without_punctuation = prompt.translate(punctuation_to_space)
    # Drop stand-alone lowercase letters, e.g. the "s" left behind by "what's".
    without_single_letters = re.sub(r"\b[a-z]\b", "", without_punctuation)
    # split() + join collapses whitespace runs and trims both ends in one step.
    return " ".join(without_single_letters.split())

def tokenization(sentence):
    """Split `sentence` into tokens using NLTK's `word_tokenize`."""
    tokens = word_tokenize(sentence)
    return tokens

def lemmatization(tokens):
    """
    Lemmatize every token with NLTK's `WordNetLemmatizer`.

    A new lemmatizer is created on each call; the result is a list holding
    one lemma per input token, in the same order.
    """
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas

def nettoyage_corpus(corpus):
    """
    Clean every document of `corpus` ("nettoyage" is French for "cleaning").

    Each document goes through, in this order: lowercasing, stopword
    deletion, text cleaning, tokenization and lemmatization. Returns one
    list of lemmatized tokens per document, in corpus order.
    """
    cleaned_conversations = []
    for doc in corpus:
        lowered = to_lowercase(doc)
        without_stopwords = delete_stopwords(lowered)
        cleaned_text = text_cleaning(without_stopwords)
        tokens = tokenization(cleaned_text)
        cleaned_conversations.append(lemmatization(tokens))
    return cleaned_conversations

#### Sentence Embedding Function

In [20]:
def get_sentence_embedding(model, sentence):
    """
    Embed `sentence` as the mean of its known word vectors.

    The sentence is lowercased and tokenized with `word_tokenize`; tokens
    not present in `model` are skipped. If no token is known, a zero vector
    of length `model.vector_size` is returned instead.
    """
    known_vectors = [
        model[word]
        for word in word_tokenize(sentence.lower())
        if word in model
    ]
    if known_vectors:
        # Element-wise average over the per-token vectors.
        return np.mean(known_vectors, axis=0)
    return np.zeros(model.vector_size)

#### Expanded Intent Dataset and Classifier Training

In [21]:
# Training data for the intent classifier: maps each of the seven intent labels
# (greeting, goodbye, get_time, get_weather, thanks, apology, unknown) to a
# list of example phrases expressing that intent.
intent_phrases = {
    "greeting": [
        "Hello", "Hi", "Hey there", "Good morning", "Good afternoon", "Good evening",
        "What's up", "Greetings", "Howdy", "Hi, how are you?", "Hey", "Hello there",
        "Hey, what's going on?", "Yo", "Hiya", "Hello, nice to see you!", "Hey buddy",
        "Good to see you", "Hi, hope you're well", "Hello, how do you do?"
    ],
    "goodbye": [
        "Goodbye", "Bye", "See you later", "Talk to you soon", "Farewell", "Take care",
        "Catch you later", "See ya", "Bye bye", "Adios", "Later", "So long", "Good night",
        "I'm off", "Peace out", "Ciao", "Until next time", "Farewell for now",
        "See you around", "Later alligator"
    ],
    "get_time": [
        "What time is it?", "Tell me the current time", "Could you give me the time?",
        "I need to know the time", "Time please", "Do you know what time it is?",
        "Can you tell me the time?", "What's the time now?", "Please share the time",
        "Current time?", "Time update", "What's the clock saying?", "Show me the time",
        "Time check", "What's the time, please?", "May I know the time?",
        "Could you update me with the time?", "Time now?", "Let me know the time", "Time?"
    ],
    "get_weather": [
        "What's the weather like today?", "Tell me the weather forecast", "How is the weather?",
        "Is it going to rain?", "Weather update please", "What's the temperature outside?",
        "Do I need an umbrella today?", "Weather report", "Current weather conditions?",
        "How's the weather outside?", "Forecast for today?", "Is it sunny or rainy?",
        "Weather status", "What's the climate like today?", "Do I need a jacket today?",
        "How's the weather looking?", "Any rain expected today?", "Weather check",
        "Let me know today's weather", "Weather update"
    ],
    "thanks": [
        "Thank you", "Thanks a lot", "Much appreciated", "Thanks", "Thank you very much",
        "I appreciate it", "Thanks a million", "Thank you so much", "Cheers", "Thanks a bunch",
        "Many thanks", "I'm grateful", "Thank you kindly", "I owe you one", "Appreciate it",
        "Thanks for everything", "Thanks, that was helpful", "Thank you, really appreciate it",
        "Thanks a ton", "Sincere thanks"
    ],
    "apology": [
        "I'm sorry", "My apologies", "Sorry for that", "I apologize", "Please forgive me",
        "Sorry about that", "My bad", "I didn't mean that", "I am really sorry", "Apologies",
        "I regret that", "So sorry", "Excuse me", "Pardon me", "I beg your pardon",
        "I sincerely apologize", "Forgive me, please", "I apologize for any inconvenience",
        "I'm truly sorry", "Sorry, my mistake"
    ],
    "unknown": [
        "I don't know", "Can you repeat that?", "What do you mean?", "I don't understand",
        "Could you say that again?", "Not sure what you mean", "I'm confused", "What?",
        "Huh?", "I have no idea", "Could you clarify?", "I didn't catch that",
        "Sorry, what did you say?", "I am not sure I follow", "Please explain",
        "I don't follow", "Could you rephrase that?", "I don't comprehend", "Unclear to me", "Not sure"
    ]
}

# Flatten the intent -> phrases mapping into two parallel lists:
# texts[i] is an example phrase and labels[i] is the intent it belongs to.
texts, labels = [], []
for intent, phrases in intent_phrases.items():
    texts.extend(phrases)
    labels.extend([intent] * len(phrases))

# One sentence embedding per phrase, stacked row-wise into the feature matrix.
X = np.array([
    get_sentence_embedding(pretrained_model, example_text)
    for example_text in texts
])

print(X)

[[ 0.26688     0.39632     0.6169     ...  0.35842    -0.48464
   0.30728   ]
 [ 0.1444      0.23979     0.96693    ... -0.72424    -0.22632
  -0.030972  ]
 [ 0.1467225   0.561025    0.72091496 ... -0.324925    0.11294499
   0.42388   ]
 ...
 [-0.00800675  0.48382002  0.43498504 ... -0.5387625   0.057307
   0.52885497]
 [-0.03922867 -0.06117199  0.45376667 ... -0.16548167  0.17534967
   0.13063633]
 [-0.23837     0.278805    0.462135   ... -0.32915503 -0.0046095
   0.41613   ]]


In [None]:
# Sanity check: show the raw GloVe vector stored for the token "hi".
# Single quotes inside the double-quoted f-string keep this valid on Python
# versions before 3.12, and the label now names the key actually looked up.
print(f"pretrained_model['hi'] = {pretrained_model['hi']}")

pretrained_model[Hello] = [ 0.1444    0.23979   0.96693   0.31629  -0.36064  -0.87674   0.098512
  0.31078   0.47929   0.27175   0.30005  -0.23732  -0.31517   0.17925
  0.61773   0.59821   0.49489   0.3423   -0.078034  0.60212   0.18683
  0.5207   -0.12331   0.48313  -0.24117   0.59696   0.61078  -0.84414
  0.27661   0.068767 -1.1388    0.089544  0.89842   0.53788   0.10841
 -0.10038   0.12921   0.11476  -0.474    -0.8049    0.96     -0.36602
 -0.43019  -0.39808  -0.096782 -0.71184  -0.31494   0.82346   0.42179
 -0.69205  -1.4864    0.29498  -0.30875  -0.49995  -0.4649   -0.44524
  0.8106    1.4757    0.53782  -0.28271  -0.045796  0.14454  -0.74485
  0.35495  -0.40961   0.35779   0.40061   0.37339   0.72163   0.40813
  0.26155  -0.14239  -0.020514 -1.1106   -0.4767    0.37832   0.89612
 -0.17323  -0.50137   0.22991   1.5324   -0.82032  -0.10096   0.45202
 -0.88639   0.089056 -0.19347  -0.42253   0.022429  0.29444   0.020747
  0.48935   0.35991   0.092758 -0.22428   0.60038  -0.3185   -

: 