# Building the dataset

In [92]:
import pandas as pd
import numpy as np

In [93]:
# Hand-written seed utterances, grouped by intent. The groups appear in the same
# order (and with the same sizes) as the labels assembled in intent_lst below.
# NOTE: the next cell re-initialises text_lst / intent_lst and resamples a
# balanced dataset, so these lists mainly document the phrase inventory.
text_lst = [
    # formal greet (11)
    "hi", "hello", "good day", "good morning", "morning", "good afternoon",
    "afternoon", "good evening", "evening", "good night", "night",
    # informal greeting (5)
    "hola", "hey", "my guy", "ma guy", "yo",
    # compliment (19)
    "thank you", "that was very helpful of you", "thanks a lot", "thanks",
    "i love you", "you are awesome", "i like you", "good job", "i am impressed",
    "excellent job", "impressive job", "outstanding performance", "you are smart",
    "nice", "that's very impressive", "i'm very impressed", "super", "great", "amazing",
    # Inquiry (11) -- "which city are yu from?" was a typo and is corrected here
    "What's the weather like today?", "Can you tell me the time?",
    "Where is the nearest restaurant?", "How was your day?", "how old are you?",
    "how are you?", "how can i help you?", "which city are you from?",
    "what are you doing?", "when did you come?", "are you hungry?",
    # offensive (7)
    "foolish", "you are a fool", "stupid", "screw you", "fuck you", "i hate you", "get away",
    # Farewell (7)
    "Goodbye", "See you later", "Take care", "i'm going", "bye", "see you soon", "See you later",
]
# One label per utterance above; built from per-intent counts so the alignment
# with text_lst is easy to audit.
intent_lst = (
    ["formal greet"] * 11
    + ["informal greeting"] * 5
    + ["compliment"] * 19
    + ["Inquiry"] * 11
    + ["offensive"] * 7
    + ["Farewell"] * 7
)

In [94]:
## Creating 240 balanced data points (40 per intent) by resampling fixed phrase pools
import random

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same dataset.
RANDOM_SEED = 42

# Number of elements for each category
num_elements_per_category = 40 # 240/6

# Phrase pool per intent. A lookup table replaces the former if/elif chain, which
# had no fallback branch and would silently reuse the previous text for an
# unknown category. "See you later" used to be listed twice under "Farewell",
# which doubled its sampling weight; it now appears once.
PHRASES_BY_INTENT = {
    "formal greet": ["hello", "good day", "good morning", "good afternoon", "good evening", "hi"],
    "informal greeting": ["hey", "yo", "hola", "my guy", "ma guy"],
    "compliment": ["thank you", "you are awesome", "good job", "excellent job", "super", "great", "amazing", "nice"],
    "Inquiry": ["What's the weather like today?", "Can you tell me the time?", "Where is the nearest restaurant?", "How was your day?", "how old are you?", "how are you?", "how can i help you?", "which city are you from?", "what are you doing?", "when did you come?", "are you hungry?"],
    "offensive": ["foolish", "you are a fool", "stupid", "screw you", "fuck you", "i hate you", "get away"],
    "Farewell": ["Goodbye", "See you later", "Take care", "i'm going", "bye", "see you soon"],
}

# Categories from the intent list
categories = list(PHRASES_BY_INTENT)


def build_balanced_samples(phrases_by_intent, n_per_intent, seed=None):
    """Draw ``n_per_intent`` phrases (with replacement) for every intent and shuffle them.

    Parameters
    ----------
    phrases_by_intent : dict
        Maps an intent label to the list of phrases it may be sampled from.
    n_per_intent : int
        Number of samples to draw for each intent.
    seed : int or None
        Seed for a private ``random.Random`` instance; the module-level RNG
        state is left untouched. ``None`` gives a non-reproducible draw.

    Returns
    -------
    (list, list)
        Texts and their intent labels, aligned index by index.

    Raises
    ------
    IndexError
        If an intent has an empty phrase list and ``n_per_intent`` > 0.
    """
    rng = random.Random(seed)
    samples = [
        (rng.choice(phrases), intent)
        for intent, phrases in phrases_by_intent.items()
        for _ in range(n_per_intent)
    ]
    # Shuffle the (text, intent) pairs together so the alignment is preserved.
    rng.shuffle(samples)
    if not samples:
        # zip(*[]) cannot be unpacked into two names.
        return [], []
    texts, intents = zip(*samples)
    return list(texts), list(intents)


# Generate texts and assign intents
text_lst, intent_lst = build_balanced_samples(
    PHRASES_BY_INTENT, num_elements_per_category, seed=RANDOM_SEED
)
# Kept for backward compatibility: the shuffled (text, intent) pairs.
combined = list(zip(text_lst, intent_lst))

# Checking the lengths of both lists
print(len(text_lst), len(intent_lst))

240 240


In [95]:
# Assemble the sampled utterances and their labels into a two-column frame
# ('text', 'intent'), then preview the first ten rows.
intent_data = pd.DataFrame(dict(text=text_lst, intent=intent_lst))
intent_data.head(n=10)

Unnamed: 0,text,intent
0,stupid,offensive
1,amazing,compliment
2,foolish,offensive
3,ma guy,informal greeting
4,bye,Farewell
5,hello,formal greet
6,thank you,compliment
7,which city are you from?,Inquiry
8,super,compliment
9,foolish,offensive


In [96]:
# Class-balance check: count the rows per intent label; a balanced dataset
# shows the same count for every intent.
intent_data['intent'].value_counts()

intent
offensive            40
compliment           40
informal greeting    40
Farewell             40
formal greet         40
Inquiry              40
Name: count, dtype: int64

# Training

In [97]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

In [98]:
# Hold out 30% of the rows for evaluation. stratify=y keeps the per-intent
# proportions identical in the training and test parts; random_state pins the split.
# NOTE(review): the dataset is built by sampling each intent 40 times from a pool
# of at most 11 phrases, so identical utterances can land in both parts and the
# test scores are likely optimistic -- confirm whether that is acceptable.
df = intent_data.copy()

X, y = intent_data['text'], intent_data['intent']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [99]:
# Per-intent row counts in the training split (checks the stratification).
y_train.value_counts()

intent
Farewell             28
informal greeting    28
formal greet         28
compliment           28
Inquiry              28
offensive            28
Name: count, dtype: int64

In [100]:
# Per-intent row counts in the test split (checks the stratification).
y_test.value_counts()

intent
Inquiry              12
informal greeting    12
offensive            12
formal greet         12
Farewell             12
compliment           12
Name: count, dtype: int64

In [101]:
## Preparing dense bag-of-words features for the LazyClassifier model comparison
from scipy.sparse import csr_matrix

# Learn the vocabulary (capped at 1000 terms) from the training texts only,
# then encode both splits with it and densify the sparse count matrices.
vectorizer = CountVectorizer(max_features=1000)
train_counts = vectorizer.fit_transform(X_train)
test_counts = vectorizer.transform(X_test)
X_train_new, X_test_new = train_counts.toarray(), test_counts.toarray()

In [102]:
from lazypredict.Supervised import LazyClassifier

# Benchmark LazyClassifier's model collection on the dense count features:
# fit on the training split, score on the held-out split.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
lazy_results = clf.fit(X_train_new, X_test_new, y_train, y_test)
models, predictions = lazy_results

# Print the per-model score table.
print(models)

 86%|██████████████████████████████████████████████████████████████████████▋           | 25/29 [00:01<00:00, 25.58it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 168, number of used features: 3
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 20.67it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
RandomForestClassifier             0.97               0.97    None      0.97   
LabelSpreading                     0.94               0.94    None      0.95   
SGDClassifier                      0.94               0.94    None      0.95   
RidgeClassifier                    0.94               0.94    None      0.95   
QuadraticDiscriminantAnalysis      0.94               0.94    None      0.95   
Perceptron                         0.94               0.94    None      0.95   
PassiveAggressiveClassifier        0.94               0.94    None      0.95   
NearestCentroid                    0.94               0.94    None      0.95   
LogisticRegression                 0.94               0.94    None      0.95   
LinearSVC                          0.94               0.94    None      0.95   
BaggingClassifier                  0.94 




### We choose the random forest classifier (highest accuracy, 0.97, in the comparison above)

In [184]:
# Custom final pipeline step that replaces low-confidence predictions with a fallback message
class ConfidenceBasedFallback(BaseEstimator, TransformerMixin):
    """Wrap a probabilistic classifier and emit a fallback message when it is unsure.

    Intended as the last step of a ``Pipeline`` whose earlier steps turn raw
    text into feature vectors; ``fit`` and ``predict`` therefore receive
    already-vectorised features.

    Parameters
    ----------
    classifier : estimator
        Any classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
    fallback_threshold : float, default=0.7
        A row whose highest class probability is below this value gets the
        fallback message instead of a label.
    fallback_message : str
        Text returned in place of the label for low-confidence rows.
    """

    def __init__(self, classifier, fallback_threshold=0.7, fallback_message="NLU fallback: Intent could not be confidently determined"):
        self.classifier = classifier
        self.fallback_threshold = fallback_threshold
        self.fallback_message = fallback_message

    def fit(self, X, y=None):
        """Fit the wrapped classifier on features ``X`` and labels ``y``; returns ``self``."""
        self.classifier.fit(X, y)
        return self

    def transform(self, X):
        """Identity transform: features are passed through unchanged."""
        return X

    def predict(self, X):
        """Return one entry per row of ``X``: a class label or the fallback message.

        The confidences are computed from ``X`` itself. The previous version
        re-vectorised the notebook-global ``X_test`` with the notebook-global
        ``vectorizer``, so the confidence of row *i* belonged to the *i*-th
        test utterance rather than to the input, and the class failed outside
        the kernel that defined those globals.

        The label is taken as the class with the highest probability, so label
        and confidence always come from the same ``predict_proba`` call (one
        batched call instead of one ``predict`` call per row).

        Returns
        -------
        list
            Labels from ``classifier.classes_`` and/or ``fallback_message``.
        """
        probabilities = self.classifier.predict_proba(X)
        classes = self.classifier.classes_
        predictions = []
        for class_probabilities in probabilities:
            best = class_probabilities.argmax()
            if class_probabilities[best] < self.fallback_threshold:
                predictions.append(self.fallback_message)
            else:
                predictions.append(classes[best])
        return predictions

In [185]:
from sklearn.ensemble import RandomForestClassifier

# Using a pipeline to do the classification: raw text -> word counts -> classifier
# with confidence-based fallback.
pipeline = Pipeline([
    ('vect', CountVectorizer()),  # Convert text to word count vectors
    # random_state is fixed so that retraining on a fresh kernel yields the same
    # forest (and therefore the same predictions and fallback decisions).
    ('clf', ConfidenceBasedFallback(classifier=RandomForestClassifier(random_state=42)))
])

In [186]:
# Train the model: fits every pipeline step on the training utterances and labels

pipeline.fit(X_train, y_train)

In [187]:
# Predictions of the freshly trained (not yet saved/reloaded) pipeline on the test utterances
y_pred = pipeline.predict(X_test)

In [188]:
# Display the predicted labels; low-confidence rows show the fallback message instead of an intent.
y_pred

['Inquiry',
 'informal greeting',
 'Inquiry',
 'offensive',
 'NLU fallback: Intent could not be confidently determined',
 'Farewell',
 'NLU fallback: Intent could not be confidently determined',
 'formal greet',
 'Farewell',
 'Farewell',
 'offensive',
 'compliment',
 'Farewell',
 'compliment',
 'Farewell',
 'compliment',
 'formal greet',
 'formal greet',
 'Farewell',
 'Farewell',
 'Inquiry',
 'informal greeting',
 'compliment',
 'informal greeting',
 'offensive',
 'Inquiry',
 'informal greeting',
 'offensive',
 'Inquiry',
 'formal greet',
 'offensive',
 'Inquiry',
 'formal greet',
 'formal greet',
 'formal greet',
 'NLU fallback: Intent could not be confidently determined',
 'offensive',
 'Farewell',
 'compliment',
 'Inquiry',
 'Farewell',
 'formal greet',
 'offensive',
 'informal greeting',
 'offensive',
 'compliment',
 'compliment',
 'compliment',
 'formal greet',
 'informal greeting',
 'Farewell',
 'Farewell',
 'Farewell',
 'Inquiry',
 'informal greeting',
 'informal greeting',
 'NL

# Saving the model

In [189]:
# joblib is already imported in the training imports cell; repeated here so this
# section can be read on its own.
import joblib

# Persist the fitted pipeline to 'model.pkl' in the current working directory.
# NOTE(review): the pipeline contains ConfidenceBasedFallback, which is defined in
# this notebook; presumably loading model.pkl from another process requires that
# class to be importable there -- confirm before shipping the file.
joblib.dump(pipeline, 'model.pkl')

['model.pkl']

# Loading the model and example

In [190]:
# Restore the pipeline from 'model.pkl'.
# joblib.load unpickles the file, so only load model files from a trusted source.
loaded_model = joblib.load('model.pkl')

In [192]:
# The pipeline restored from disk is already fitted. Calling fit() on it again
# would discard the persisted model and train a new one, which defeats the
# purpose of saving it. Instead, verify the save/load round trip: the reloaded
# pipeline must reproduce the in-memory pipeline's test predictions.
assert list(loaded_model.predict(X_test)) == list(y_pred), \
    "reloaded pipeline disagrees with the in-memory pipeline"

In [196]:
# Run the reloaded pipeline on a few example utterances (raw strings; the
# pipeline's own vectorizer step converts them to features).
loaded_model.predict(['bye', 'hello', 'hi, how are you?'])

['Farewell', 'formal greet', 'Inquiry']