In [2]:
import pandas as pd

In [3]:
nRowsRead = None

fs_df = pd.read_csv('Data/fluent_speech_commands_dataset/data/train_data.csv', delimiter=',', nrows = nRowsRead)
fs_df.dataframeName = 'train_data.csv'
nRow, nCol = fs_df.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 23132 rows and 7 columns


In [12]:
fs_df[["object", "location"]].value_counts()

object     location
volume     none        4395
heat       washroom    2565
           none        2105
lights     washroom    1397
           kitchen     1326
music      none        1326
heat       bedroom     1303
           kitchen     1282
lights     bedroom     1109
none       none         994
lights     none         962
lamp       none         792
newspaper  none         551
socks      none         538
shoes      none         536
juice      none         466
Chinese    none         449
English    none         349
Korean     none         345
German     none         342
dtype: int64

In [4]:
fs_df

Unnamed: 0.1,Unnamed: 0,path,speakerId,transcription,action,object,location
0,0,wavs/speakers/2BqVo8kVB2Skwgyb/0a3129c0-4474-1...,2BqVo8kVB2Skwgyb,Change language,change language,none,none
1,1,wavs/speakers/2BqVo8kVB2Skwgyb/0ee42a80-4474-1...,2BqVo8kVB2Skwgyb,Resume,activate,music,none
2,2,wavs/speakers/2BqVo8kVB2Skwgyb/144d5be0-4474-1...,2BqVo8kVB2Skwgyb,Turn the lights on,activate,lights,none
3,3,wavs/speakers/2BqVo8kVB2Skwgyb/1811b6e0-4474-1...,2BqVo8kVB2Skwgyb,Switch on the lights,activate,lights,none
4,4,wavs/speakers/2BqVo8kVB2Skwgyb/1d9f3920-4474-1...,2BqVo8kVB2Skwgyb,Switch off the lights,deactivate,lights,none
...,...,...,...,...,...,...,...
23127,23127,wavs/speakers/zZezMeg5XvcbRdg3/b946b340-45e0-1...,zZezMeg5XvcbRdg3,I need to practice my Chinese. Switch the lang...,change language,Chinese,none
23128,23128,wavs/speakers/zZezMeg5XvcbRdg3/beb27cb0-45e0-1...,zZezMeg5XvcbRdg3,I need to practice my German. Switch the language,change language,German,none
23129,23129,wavs/speakers/zZezMeg5XvcbRdg3/c45f94e0-45e0-1...,zZezMeg5XvcbRdg3,I need to practice my Korean. Switch the language,change language,Korean,none
23130,23130,wavs/speakers/zZezMeg5XvcbRdg3/ca60c080-45e0-1...,zZezMeg5XvcbRdg3,I need to practice my English. Switch the lang...,change language,English,none


NLU: `transcription` ---> `intent` + `object` + `location`

In [12]:
data = fs_df[["transcription", "action", "object", "location"]]

In [13]:
data

Unnamed: 0,transcription,action,object,location
0,Change language,change language,none,none
1,Resume,activate,music,none
2,Turn the lights on,activate,lights,none
3,Switch on the lights,activate,lights,none
4,Switch off the lights,deactivate,lights,none
...,...,...,...,...
23127,I need to practice my Chinese. Switch the lang...,change language,Chinese,none
23128,I need to practice my German. Switch the language,change language,German,none
23129,I need to practice my Korean. Switch the language,change language,Korean,none
23130,I need to practice my English. Switch the lang...,change language,English,none


In [14]:
data["action"].value_counts()

increase           5953
decrease           5697
activate           3822
deactivate         3090
change language    2479
bring              2091
Name: action, dtype: int64

In [15]:
data["object"].value_counts()

heat         7255
lights       4794
volume       4395
music        1326
none          994
lamp          792
newspaper     551
socks         538
shoes         536
juice         466
Chinese       449
English       349
Korean        345
German        342
Name: object, dtype: int64

In [16]:
data["location"].value_counts()

none        14150
washroom     3962
kitchen      2608
bedroom      2412
Name: location, dtype: int64

## `Preprocessing`

#### 1- text correction

In [46]:
from textblob import TextBlob
import re

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

In [47]:
txt = "turn on the lights!"
txt = str(TextBlob(txt).correct())
txt

'turn on the lights!'

#### 2- lowecase letters

In [48]:
txt = "For HoNOR"
txt = txt.lower().strip()
txt

'for honor'

#### 3- remove stopwords

In [49]:
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

txt = "say hi to the neighbor"
txt = remove_stopwords(txt)
txt

'say hi neighbor'

#### 4- lemmentization

In [57]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV} 
# Pos tag, used Noun, Verb, Adjective and Adverb

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
#     print(pos_tagged_text)
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

txt = "Ahmad is working on projects"
lemmatize_words(txt)

'Ahmad be work on project'

#### - apply


In [58]:
def text_clean(text):
    text = text.lower().strip()
    text = str(TextBlob(text).correct())
    text = lemmatize_words(text)
    text = remove_stopwords(text)
    return text

In [59]:
data["transcription_clean"] = data["transcription"].apply(lambda x: text_clean(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["transcription_clean"] = data["transcription"].apply(lambda x: text_clean(x))


## `TF-IDF`

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import preprocessing

In [63]:
tfidf_vectorizer = TfidfVectorizer()

In [66]:
values = tfidf_vectorizer.fit_transform(data["transcription_clean"] )
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# tfidf_df = pd.DataFrame(values.toarray(), columns = tfidf_feature_names)
# tfidf_df

In [68]:
tfidf_vectorizer.get_feature_names_out()

array(['allow', 'anything', 'audit', 'bathroom', 'bedroom', 'bring',
       'can', 'change', 'chinese', 'cooper', 'could', 'decrease',
       'device', 'different', 'english', 'far', 'fetch', 'german', 'get',
       'go', 'hear', 'heat', 'heating', 'increase', 'it', 'juice',
       'kitchen', 'lamp', 'language', 'less', 'level', 'light', 'loud',
       'louder', 'low', 'lower', 'main', 'make', 'max', 'mushroom',
       'music', 'mute', 'need', 'newspaper', 'ok', 'open', 'organ',
       'pause', 'phone', 'play', 'please', 'practice', 'put', 'quiet',
       'quieter', 'reduce', 'resume', 'set', 'setting', 'shoe', 'sock',
       'softer', 'sound', 'start', 'stop', 'switch', 'system',
       'temperature', 'that', 'this', 'turn', 'use', 'video', 'volume'],
      dtype=object)

In [69]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(values.toarray(), columns = tfidf_feature_names)
tfidf_df

Unnamed: 0,allow,anything,audit,bathroom,bedroom,bring,can,change,chinese,cooper,...,stop,switch,system,temperature,that,this,turn,use,video,volume
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.855572,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.639286,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.775144,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.775144,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.506352,0.0,...,0.0,0.323679,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
23128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.319092,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
23129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.319240,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
23130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.319435,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [100]:
le = preprocessing.LabelEncoder()

In [127]:
X = values

le_action = preprocessing.LabelEncoder()
le_object = preprocessing.LabelEncoder()
le_location = preprocessing.LabelEncoder()
y_action, y_object, y_location = le_action.fit_transform(data["action"]),\
                                                  le_object.fit_transform(data["object"]),\
                                                  le_location.fit_transform(data["location"])

# le_action.inverse_transform(y_action)

In [125]:
data["state"] = data["action"] + "-" + data["object"] + "-" + data["location"]
y_state = le.fit_transform(data["state"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["state"] = data["action"] + "-" + data["object"] + "-" + data["location"]


## `Training`

In [116]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [117]:
# Model definitions
MultinomialNB_model = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
LogReg_model = LogisticRegression()
SGDClassifier_model = SGDClassifier(class_weight='balanced', penalty='l1')

models = [MultinomialNB_model, LogReg_model, SGDClassifier_model]
model_names = ['MultinomialNB', 'LogisticRegression', 'SGDClassifier']

def train(X, y, label_col='y'):
    def train_models(X_tr, X_te, y_tr, y_te):
        for i, model in enumerate(models):
            print(f"Model: {model_names[i]}")
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_te)
            print('accuracy %s' % accuracy_score(y_te, y_pred))
            print()
    print("----training on data with labels for column: {}----".format(label_col))
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=110 , stratify=y)
    train_models(X_train, X_test, y_train, y_test)
    print("-" * 100)
    print()
    

In [126]:
train(X, y_action, "action")
train(X, y_object, "object")
train(X, y_location, "location")

train(X, y_state, "state")

----training on data with labels for column: action----
Model: MultinomialNB
accuracy 0.7516749513723795

Model: LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy 0.7516749513723795

Model: SGDClassifier
accuracy 0.7618327209855198

----------------------------------------------------------------------------------------------------

----training on data with labels for column: object----
Model: MultinomialNB
accuracy 0.988329371082775

Model: LogisticRegression
accuracy 1.0

Model: SGDClassifier
accuracy 1.0

----------------------------------------------------------------------------------------------------

----training on data with labels for column: location----
Model: MultinomialNB
accuracy 0.9394856278366112

Model: LogisticRegression
accuracy 1.0

Model: SGDClassifier
accuracy 1.0

----------------------------------------------------------------------------------------------------

----training on data with labels for column: state----
Model: MultinomialNB
accuracy 0.7516749513723795

Model: LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy 0.7609682299546142

Model: SGDClassifier
accuracy 0.7611843527123406

----------------------------------------------------------------------------------------------------



In [142]:
# Train Final Model
y_intents = [y_action, y_object, y_location]
y_intents = {"action": y_action, "object": y_object, "location": y_location}
les = {"action": le_action, "object": le_object, "location": le_location}
intent_models = {}
X = values
classes = []

for intent, y_intent in y_intents.items():
    print("column: {}".format(intent))
    SGDClassifier_model = SGDClassifier(class_weight='balanced', penalty='l1')
    X_train, X_test, y_train, y_test = train_test_split(X, y_intent, test_size=0.2, random_state=110 , stratify=y_intent)
    SGDClassifier_model.fit(X_train, y_train)
    y_pred = SGDClassifier_model.predict(X_test)
    intent_models[intent] = SGDClassifier_model
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    print()
    

column: action
accuracy 0.7594553706505295

column: object
accuracy 1.0

column: location
accuracy 1.0



## `Evaluation`

## `Applying Model`

In [166]:
list(data["location"].unique()) 

['none', 'kitchen', 'bedroom', 'washroom']

In [158]:
# apply intent classification
transcript_eg = "Switch on the lights"

def get_intents(transcript):
    transcript = tfidf_vectorizer.transform([transcript])
    action_pred = les["action"].inverse_transform(intent_models["action"].predict(transcript))[0]
    object_pred = les["object"].inverse_transform(intent_models["object"].predict(transcript))[0]
    location_pred = les["location"].inverse_transform(intent_models["location"].predict(transcript))[0]
    
    return {"action": action_pred, "object": object_pred, "location": location_pred}

get_intents(transcript_eg)

{'action': 'deactivate', 'object': 'volume', 'location': 'none'}

In [None]:
# using SGD classifier for best results provided


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=110 , stratify=y)
SGDClassifier_model.fit(X_tr, y_tr)
y_pred = SGDClassifier_model.predict(X_te)
print('accuracy %s' % accuracy_score(y_te, y_pred))

---

## Further Work

- https://medium.com/swlh/using-xlnet-for-sentiment-classification-cfa948e65e85
- https://igorizraylevych.medium.com/how-do-task-oriented-dialogue-systems-work-and-what-benefits-they-bring-for-business-20691bf2e0ae
- https://medium.com/analytics-vidhya/creating-your-own-intent-classifier-b86e000a4926
- https://medium.com/analytics-vidhya/a-guide-to-your-own-a-i-voice-assistant-using-python-17f79c94704