# Intent Recognition & Context Management Integration

*This notebook integrates existing text cleaning and processing pipeline with intent recognition using Word2Vec-style embeddings (from the pre-trained GloVe model) and context management. We use an expanded dataset of common intents to train a classifier. The predicted intent and updated context for each input are printed without generating a response.*

#### Imports and Setup

In [2]:
import numpy as np
import string
import re
import nltk
import yaml
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Load pre-trained GloVe model (using gensim)
pretrained_model = api.load("glove-wiki-gigaword-100")
english_stop_words = list(set(stopwords.words("english")))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asusg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Text Pre-Processing Functions

In [3]:
def to_lowercase(prompt):
    return prompt.lower()

def delete_stopwords(prompt):
    return " ".join([word for word in prompt.split() if word not in english_stop_words])

def text_cleaning(prompt):
    ignore_character = list(string.punctuation)
    pattern = f"[{re.escape(''.join(ignore_character))}]"
    cleaned_prompt = re.sub(pattern, " ", prompt)
    cleaned_prompt = re.sub(r"\b[a-z]\b", "", cleaned_prompt)
    cleaned_prompt = re.sub(r"\s+", " ", cleaned_prompt)
    return cleaned_prompt.strip()

def tokenization(sentence):
    return word_tokenize(sentence)

def lemmatization(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def nettoyage_corpus(corpus):
    """
    Process a list of documents: lowercasing, stopword deletion,
    cleaning, tokenization, and lemmatization.
    """
    cleaned_conversations = [
        lemmatization(tokenization(text_cleaning(delete_stopwords(to_lowercase(doc)))))
        for doc in corpus
    ]
    return cleaned_conversations

#### Sentence Embedding Function

In [4]:
def get_sentence_embedding(model, sentence):
    tokens = word_tokenize(sentence.lower())
    valid_tokens = [token for token in tokens if token in model]
    if not valid_tokens:
        return np.zeros(model.vector_size)
    embeddings = [model[token] for token in valid_tokens]
    return np.mean(embeddings, axis=0)

#### Expanded Intent Dataset and Classifier Training

In [5]:
intent_phrases = {
    "greeting": [
        "Hello", "Hi", "Hey there", "Good morning", "Good afternoon", "Good evening",
        "What's up", "Greetings", "Howdy", "Hi, how are you?", "Hey", "Hello there",
        "Hey, what's going on?", "Yo", "Hiya", "Hello, nice to see you!", "Hey buddy",
        "Good to see you", "Hi, hope you're well", "Hello, how do you do?"
    ],
    "goodbye": [
        "Goodbye", "Bye", "See you later", "Talk to you soon", "Farewell", "Take care",
        "Catch you later", "See ya", "Bye bye", "Adios", "Later", "So long", "Good night",
        "I'm off", "Peace out", "Ciao", "Until next time", "Farewell for now",
        "See you around", "Later alligator"
    ],
    "get_time": [
        "What time is it?", "Tell me the current time", "Could you give me the time?",
        "I need to know the time", "Time please", "Do you know what time it is?",
        "Can you tell me the time?", "What's the time now?", "Please share the time",
        "Current time?", "Time update", "What's the clock saying?", "Show me the time",
        "Time check", "What's the time, please?", "May I know the time?",
        "Could you update me with the time?", "Time now?", "Let me know the time", "Time?"
    ],
    "get_weather": [
        "What's the weather like today?", "Tell me the weather forecast", "How is the weather?",
        "Is it going to rain?", "Weather update please", "What's the temperature outside?",
        "Do I need an umbrella today?", "Weather report", "Current weather conditions?",
        "How's the weather outside?", "Forecast for today?", "Is it sunny or rainy?",
        "Weather status", "What's the climate like today?", "Do I need a jacket today?",
        "How's the weather looking?", "Any rain expected today?", "Weather check",
        "Let me know today's weather", "Weather update"
    ],
    "thanks": [
        "Thank you", "Thanks a lot", "Much appreciated", "Thanks", "Thank you very much",
        "I appreciate it", "Thanks a million", "Thank you so much", "Cheers", "Thanks a bunch",
        "Many thanks", "I'm grateful", "Thank you kindly", "I owe you one", "Appreciate it",
        "Thanks for everything", "Thanks, that was helpful", "Thank you, really appreciate it",
        "Thanks a ton", "Sincere thanks"
    ],
    "apology": [
        "I'm sorry", "My apologies", "Sorry for that", "I apologize", "Please forgive me",
        "Sorry about that", "My bad", "I didn't mean that", "I am really sorry", "Apologies",
        "I regret that", "So sorry", "Excuse me", "Pardon me", "I beg your pardon",
        "I sincerely apologize", "Forgive me, please", "I apologize for any inconvenience",
        "I'm truly sorry", "Sorry, my mistake"
    ],
    "unknown": [
        "I don't know", "Can you repeat that?", "What do you mean?", "I don't understand",
        "Could you say that again?", "Not sure what you mean", "I'm confused", "What?",
        "Huh?", "I have no idea", "Could you clarify?", "I didn't catch that",
        "Sorry, what did you say?", "I am not sure I follow", "Please explain",
        "I don't follow", "Could you rephrase that?", "I don't comprehend", "Unclear to me", "Not sure"
    ]
}

texts = []
labels = []
for intent, phrases in intent_phrases.items():
    for phrase in phrases:
        texts.append(phrase)
        labels.append(intent)

X = np.array([get_sentence_embedding(pretrained_model, text) for text in texts])
y = np.array(labels)

clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)

def ml_intent(sentence):
    embedding = get_sentence_embedding(pretrained_model, sentence).reshape(1, -1)
    return clf.predict(embedding)[0]

#### Context Manager Class

In [6]:
class ContextManager:
    def __init__(self):
        self.user_context = {}
    def update_context(self, user_id, intent):
        self.user_context[user_id] = intent
    def get_context(self, user_id):
        return self.user_context.get(user_id, None)

#### Integration and Testing Without Response Generation

In [7]:

if __name__ == "__main__":
    # Optionally load conversation data from a YAML file
    try:
        with open("conversations.yml", "r", encoding="utf-8") as file:
            data = yaml.safe_load(file)
        conversations = data.get("conversations", {})
        print("Loaded Conversations:", conversations)
    except FileNotFoundError:
        print("conversations.yml not found; skipping YAML load.")
    
    # Simulated user inputs
    user_inputs = [
        "Hi there!",
        "Can you tell me what time it is?",
        "What's the weather like outside?",
        "Thanks for your help!",
        "I'm sorry, I didn't understand that.",
        "Bye!"
    ]
    
    context_manager = ContextManager()
    user_id = "user123"
    
    for user_input in user_inputs:
        # Process the input through the cleaning pipeline
        cleaned_corpus = nettoyage_corpus([user_input])
        cleaned_text = " ".join(cleaned_corpus[0])
        
        # Predict the intent using the ML-based classifier
        predicted_intent = ml_intent(cleaned_text)
        context_manager.update_context(user_id, predicted_intent)
        
        # Print the input, cleaned text, predicted intent, and current context
        print(f"Input: {user_input}")
        print(f"Cleaned Text: {cleaned_text}")
        print(f"Predicted Intent: {predicted_intent}")
        print(f"Current Context for {user_id}: {context_manager.get_context(user_id)}")
        print("-" * 50)


Loaded Conversations: ['Good morning, how are you?', 'I am doing well, how about you?', "I'm also good.", "That's good to hear.", 'Yes it is.', 'Hello', 'Hi', 'How are you doing?', 'I am doing well.', 'That is good to hear', 'Yes it is.', 'Can I help you with anything?', 'Yes, I have a question.', 'What is your question?', 'Could I borrow a cup of sugar?', "I'm sorry, but I don't have any.", 'Thank you anyway', 'No problem', 'How are you doing?', 'I am doing well, how about you?', 'I am also good.', "That's good.", 'Have you heard the news?', 'What good news?', 'What is your favorite book?', "I can't read.", "So what's your favorite color?", 'Blue', 'Who are you?', 'Who? Who is but a form following the function of what', 'What are you then?', 'A man in a mask.', 'I can see that.', "It's not your powers of observation I doubt, but merely the paradoxical nature of asking a masked man who is. But tell me, do you like music?", 'I like seeing movies.', 'What kind of movies do you like?', 'A