# `1) Task-Oriented NLU`

`TASK` -> Smart Home Controller

### `1.1) Imports`

In [19]:
import pandas as pd

In [20]:
nRowsRead = None

fs_df = pd.read_csv('Data/fluent_speech_commands_dataset/data/train_data.csv', delimiter=',', nrows = nRowsRead)
fs_df.dataframeName = 'train_data.csv'
nRow, nCol = fs_df.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 23132 rows and 7 columns


In [21]:
fs_df.head()

Unnamed: 0.1,Unnamed: 0,path,speakerId,transcription,action,object,location
0,0,wavs/speakers/2BqVo8kVB2Skwgyb/0a3129c0-4474-1...,2BqVo8kVB2Skwgyb,Change language,change language,none,none
1,1,wavs/speakers/2BqVo8kVB2Skwgyb/0ee42a80-4474-1...,2BqVo8kVB2Skwgyb,Resume,activate,music,none
2,2,wavs/speakers/2BqVo8kVB2Skwgyb/144d5be0-4474-1...,2BqVo8kVB2Skwgyb,Turn the lights on,activate,lights,none
3,3,wavs/speakers/2BqVo8kVB2Skwgyb/1811b6e0-4474-1...,2BqVo8kVB2Skwgyb,Switch on the lights,activate,lights,none
4,4,wavs/speakers/2BqVo8kVB2Skwgyb/1d9f3920-4474-1...,2BqVo8kVB2Skwgyb,Switch off the lights,deactivate,lights,none


**NLU:** `transcription` ---> `intent` + `object` + `location`

In [32]:
fs_df[["object", "location"]].value_counts()

object     location
volume     none        4395
heat       washroom    2565
           none        2105
lights     washroom    1397
           kitchen     1326
music      none        1326
heat       bedroom     1303
           kitchen     1282
lights     bedroom     1109
none       none         994
lights     none         962
lamp       none         792
newspaper  none         551
socks      none         538
shoes      none         536
juice      none         466
Chinese    none         449
English    none         349
Korean     none         345
German     none         342
dtype: int64

### `1.2) Data Exploration`

In [23]:
data = fs_df[["transcription", "action", "object", "location"]]

In [24]:
data.head()

Unnamed: 0,transcription,action,object,location
0,Change language,change language,none,none
1,Resume,activate,music,none
2,Turn the lights on,activate,lights,none
3,Switch on the lights,activate,lights,none
4,Switch off the lights,deactivate,lights,none


In [25]:
data["action"].value_counts()

increase           5953
decrease           5697
activate           3822
deactivate         3090
change language    2479
bring              2091
Name: action, dtype: int64

In [26]:
data["object"].value_counts()

heat         7255
lights       4794
volume       4395
music        1326
none          994
lamp          792
newspaper     551
socks         538
shoes         536
juice         466
Chinese       449
English       349
Korean        345
German        342
Name: object, dtype: int64

In [27]:
data["location"].value_counts()

none        14150
washroom     3962
kitchen      2608
bedroom      2412
Name: location, dtype: int64

### `1.3) Preprocessing`

#### 1.3.1- text correction

In [39]:
from textblob import TextBlob
import re

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

In [40]:
txt = "turn on the lights!"
txt = str(TextBlob(txt).correct())
txt

'turn on the lights!'

#### 1.3.2- lowecase letters

In [41]:
txt = "For HoNOR"
txt = txt.lower().strip()
txt

'for honor'

#### 1.3.3- remove stopwords

In [42]:
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

txt = "say hi to the neighbor"
txt = remove_stopwords(txt)
txt

'say hi neighbor'

#### 1.3.4- lemmentization

In [43]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV} 
# Pos tag, used Noun, Verb, Adjective and Adverb

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
#     print(pos_tagged_text)
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

txt = "Ahmad is working on projects"
lemmatize_words(txt)

'Ahmad be work on project'

#### 1.3.5 apply text-cleaning methods


In [44]:
def text_clean(text):
    text = text.lower().strip()
    text = str(TextBlob(text).correct())
    text = lemmatize_words(text)
    text = remove_stopwords(text)
    return text

In [45]:
data["transcription_clean"] = data["transcription"].apply(lambda x: text_clean(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["transcription_clean"] = data["transcription"].apply(lambda x: text_clean(x))


#### `1.3.6) TF-IDF`

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import preprocessing

In [47]:
tfidf_vectorizer = TfidfVectorizer()

In [48]:
values = tfidf_vectorizer.fit_transform(data["transcription_clean"] )
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# tfidf_df = pd.DataFrame(values.toarray(), columns = tfidf_feature_names)
# tfidf_df

In [28]:
# tfidf_vectorizer.get_feature_names_out()

In [29]:
# tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
# tfidf_df = pd.DataFrame(values.toarray(), columns = tfidf_feature_names)
# tfidf_df

#### `1.3.7) encoding categorical features`

In [51]:
le = preprocessing.LabelEncoder()

In [52]:
X = values

le_action = preprocessing.LabelEncoder()
le_object = preprocessing.LabelEncoder()
le_location = preprocessing.LabelEncoder()
y_action, y_object, y_location = le_action.fit_transform(data["action"]),\
                                                  le_object.fit_transform(data["object"]),\
                                                  le_location.fit_transform(data["location"])

# le_action.inverse_transform(y_action)

In [53]:
data["state"] = data["action"] + "-" + data["object"] + "-" + data["location"]
y_state = le.fit_transform(data["state"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["state"] = data["action"] + "-" + data["object"] + "-" + data["location"]


### `1.4) Training`

In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### 1.4.1 choosing best model

In [None]:
# Model definitions
MultinomialNB_model = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
LogReg_model = LogisticRegression()
SGDClassifier_model = SGDClassifier(class_weight='balanced', penalty='l1')

models = [MultinomialNB_model, LogReg_model, SGDClassifier_model]
model_names = ['MultinomialNB', 'LogisticRegression', 'SGDClassifier']

def train_and_evaluate(X, y, label_col='y'):
    def train_models(X_tr, X_te, y_tr, y_te):
        for i, model in enumerate(models):
            print(f"Model: {model_names[i]}")
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_te)
            print('accuracy %s' % accuracy_score(y_te, y_pred))
            print()
    print("----training on data with labels for column: {}----".format(label_col))
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=110 , stratify=y)
    train_models(X_train, X_test, y_train, y_test)
    print("-" * 100)
    print()
    

In [None]:
# training + simple evaluation
train_and_evaluate(X, y_action, "action")
train_and_evaluate(X, y_object, "object")
train_and_evaluate(X, y_location, "location")
train_and_evaluate(X, y_state, "state")

`Evaluation`: Logisitic Regression is the best model in terms of its simplicity and accuracy

In [None]:
# Train Final Model
y_intents = [y_action, y_object, y_location]
y_intents = {"action": y_action, "object": y_object, "location": y_location}
les = {"action": le_action, "object": le_object, "location": le_location}
intent_models = {}
X = values
classes = []

for intent, y_intent in y_intents.items():
    print("column: {}".format(intent))
    LogReg_model = LogisticRegression()
    X_train, X_test, y_train, y_test = train_test_split(X, y_intent, test_size=0.2, random_state=110 , stratify=y_intent)
    LogReg_model.fit(X_train, y_train)
    y_pred = LogReg_model.predict(X_test)
    intent_models[intent] = LogReg_model
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    print()

### `1.5) Applying Model`

the model trained (logistic regression) is going to be applied in transcripted text for implementation and testing purposes 

In [158]:
# apply intent classification
transcript_eg = "Switch on the lights"

def get_intents(transcript):
    transcript = tfidf_vectorizer.transform([transcript])
    action_pred = les["action"].inverse_transform(intent_models["action"].predict(transcript))[0]
    object_pred = les["object"].inverse_transform(intent_models["object"].predict(transcript))[0]
    location_pred = les["location"].inverse_transform(intent_models["location"].predict(transcript))[0]
    
    return {"action": action_pred, "object": object_pred, "location": location_pred}

get_intents(transcript_eg)

{'action': 'deactivate', 'object': 'volume', 'location': 'none'}

# `2) Non-Task Oriented NLU`

In [2]:
from transformers import pipeline, Conversation

### `2.1) Zero-Shot Classification`

In [48]:
query_test = "turn on the lights"
discourse_labels = ["conversational", "command"]

classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

In [53]:
zsc_result = classifier(query_test, Discourse)
zsc_labels = zsc_result["labels"]  
zsc_scores = zsc_result["scores"]
zsc_result

{'sequence': 'turn on the lights',
 'labels': ['command', 'conversational'],
 'scores': [0.9863330125808716, 0.013666972517967224]}

`Note` we use 'zsc_labels' instead of 'classifier' list because the order of classes used might change after using classifier

In [54]:
discourse_prediction = zsc_labels[zsc_scores.index(max(zsc_scores))]
discourse_prediction

'command'

### `2.2) Conversational`

In [55]:
converser = pipeline("conversational",
                      model="facebook/blenderbot-400M-distill")

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/730M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [65]:
query_test = "hi, how are you?"
conversation = Conversation(query_test)

In [66]:
response = converser(conversation).generated_responses[-1]
response

" I'm doing well, thank you. How are you this fine evening? I hope you are as well."

### `2.3) Rephrasing`

In [9]:
from parrot import Parrot
import torch
import warnings
warnings.filterwarnings("ignore")

''' 
uncomment to get reproducable paraphrase generations
def random_state(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

random_state(1234)
'''

#Init models (make sure you init ONLY once if you integrate this to your code)
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=False)

Downloading:   0%|          | 0.00/913 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/686 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

----------------------------------------------------------------------------------------------------
Input_phrase:  Can you recommed some upscale restaurants in Newyork?
----------------------------------------------------------------------------------------------------
('can you recommend some of the finest restaurants in new york?', 26)
('can you recommend some restaurants in new york?', 22)
('can you recommend some good restaurants in new york?', 21)
('can you recommend some upscale restaurants in new york?', 14)
('can you recommend some upscale restaurants in newyork?', 13)
----------------------------------------------------------------------------------------------------
Input_phrase:  What are the famous places we should not miss in Russia?
----------------------------------------------------------------------------------------------------
('recommend some of the famous places to be visited in russia?', 33)
('which are some places we should not miss in russia?', 23)


In [None]:
phrases = ["Can you recommed some upscale restaurants in Newyork?",
           "What are the famous places we should not miss in Russia?"
]

for phrase in phrases:
    print("-"*100)
    print("Input_phrase: ", phrase)
    print("-"*100)
    para_phrases = parrot.augment(input_phrase=phrase)
    for para_phrase in para_phrases:
        print(para_phrase)


In [3]:
rephraser = pipeline("text2text-generation",
                      model="prithivida/parrot_paraphraser_on_T5")

In [8]:
query_test = "what do you think of the recent events that transpired in new yourk?"

rephraser(query_test)

[{'generated_text': 'What do you think of the recent events in new yourk?'}]

### `2.4) story telling`

In [8]:
from transformers import pipeline

In [11]:
story_gen = pipeline("text-generation", "pranavpsv/gpt2-genre-story-generator")
print(story_gen("<BOS> <superhero> Batman"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': '<BOS> <superhero> Batman and Robin, along with Professor Charles Xavier\'s X-Men, raid an underground facility that holds the mutant "Quicksilver". The group includes the superpowered Jean Grey, Iceman, Scarlet Witch, Magik, Sabret'}]


### `2.5) automatic keyword extraction`

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [11]:
# keyword_tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")
keyword_extractor = pipeline("token-classification", model="yanekyuk/bert-uncased-keyword-extractor")

In [17]:
sentence = "do you know new york jets?"
keywords = keyword_extractor(sentence)

In [18]:
keywords

[{'entity': 'B-KEY',
  'score': 0.9975394,
  'index': 4,
  'word': 'new',
  'start': 12,
  'end': 15},
 {'entity': 'I-KEY',
  'score': 0.9922999,
  'index': 5,
  'word': 'york',
  'start': 16,
  'end': 20},
 {'entity': 'I-KEY',
  'score': 0.9709819,
  'index': 6,
  'word': 'jets',
  'start': 21,
  'end': 25}]

---

# `3) using source code (src)`

--> **implementing with scripts written in src folder**

### `3.1) src implementation` - training

In [1]:
import pandas as pd
from src.NLU.preprocess import prepare_data, get_tfidf, encode_label, split
from src.NLU.intention import IntentRecognizer
import pickle

In [2]:
fs_df = pd.read_csv('Data/fluent_speech_commands_dataset/data/train_data.csv', delimiter=',')

In [3]:
fs_df_clean = prepare_data(fs_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["transcription_clean"] = data["transcription"].apply(lambda x: text_clean(x))


In [4]:
X, tfidf_vectorizer = get_tfidf(fs_df_clean["transcription_clean"])

In [5]:
les = {}
ys = {}
ys["action"], les["action"] = encode_label(fs_df_clean["action"])
ys["object"], les["object"] = encode_label(fs_df_clean["object"])
ys["location"], les["location"] = encode_label(fs_df_clean["location"])

print(les)
print(ys)

{'action': LabelEncoder(), 'object': LabelEncoder(), 'location': LabelEncoder()}
{'action': array([2, 0, 0, ..., 2, 2, 3]), 'object': array([10,  8,  7, ...,  3,  1,  7]), 'location': array([2, 2, 2, ..., 2, 2, 3])}


In [6]:
ir = IntentRecognizer()

In [7]:
for intent, y_intent in ys.items():
    print("intent: {}".format(intent))
    X_train, X_test, ys_train, ys_test = split(X, ys[intent], test_size=0.2, stratify=True)
    
    ir.learn_intents(X_train, ys_train, intent)
    ir.evaluate(X_test, ys_test, intent)
    print("-" * 50)
    

intent: action
accuracy 1.0
--------------------------------------------------
intent: object
accuracy 1.0
--------------------------------------------------
intent: location
accuracy 1.0
--------------------------------------------------


In [8]:
ir.intent_models

{'action': LogisticRegression(),
 'object': LogisticRegression(),
 'location': LogisticRegression()}

In [9]:
transcript_eg = "activate lights in kitchen"
ir.get_intents(transcript_eg, les, tfidf_vectorizer)

{'action': 'activate', 'object': 'lights', 'location': 'kitchen'}

In [12]:

# save work
with open('Saved/intention_models.pickle', 'wb') as handle:
    pickle.dump(ir.intent_models, handle, protocol=4)  
with open('Saved/tfidfV.pickle', 'wb') as handle:
    pickle.dump(tfidf_vectorizer, handle, protocol=4)  
with open('Saved/les.pickle', 'wb') as handle:
    pickle.dump(les, handle, protocol=4)  

### `3.2) src implementation` - usage

In [13]:
with open('Saved/intention_models.pickle', 'rb') as handle:
    intention_models_b = pickle.load(handle)
with open('Saved/tfidfV.pickle', 'rb') as handle:
    tfidfv_b = pickle.load(handle)
with open('Saved/les.pickle', 'rb') as handle:
    les_b = pickle.load(handle)

In [14]:
ir_b = IntentRecognizer()

In [15]:
ir_b.intent_models = intention_models_b

In [18]:
transcript_eg = "turn off music"
ir_b.get_intents(transcript_eg, les_b, tfidfv_b)

{'action': 'deactivate', 'object': 'music', 'location': 'none'}

---

## Further Work

- https://medium.com/swlh/using-xlnet-for-sentiment-classification-cfa948e65e85
- https://igorizraylevych.medium.com/how-do-task-oriented-dialogue-systems-work-and-what-benefits-they-bring-for-business-20691bf2e0ae
- https://medium.com/analytics-vidhya/creating-your-own-intent-classifier-b86e000a4926
- https://medium.com/analytics-vidhya/a-guide-to-your-own-a-i-voice-assistant-using-python-17f79c94704

- https://www.kaggle.com/datasets/elvinagammed/chatbots-intent-recognition-dataset
- https://www.kaggle.com/datasets/lorencpetr/chatbot-intent-classification