<a href="https://colab.research.google.com/github/Jeevesh8/chat_command_detect/blob/main/notebooks/baseline_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
%%capture
!git clone https://Jeevesh8:4ff802d0f4f472c8dfeeea0edb7168928652f542@github.com/Jeevesh8/chat_command_detect
!pip install spacy nltk xgboost scikit-learn fasttext

In [29]:
import os
import io

from functools import lru_cache
from itertools import chain

import nltk
import spacy
nlp = spacy.load("en_core_web_sm")

import numpy as np
import pandas as pd

from fasttext import load_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

### Load Data into a DataFrame

In [5]:
# Directory inside the cloned repository that holds one CSV file per split.
data_dir = "./chat_command_detect/data/"
data_files = [
    os.path.join(data_dir, filename)
    for filename in os.listdir(data_dir)
    if filename.endswith(".csv")
]

# Read every CSV and tag its rows with the file stem (e.g. "train_data") so the
# splits can be separated again after concatenation.
dfs = []
for filename in data_files:
    df = pd.read_csv(filename)
    split_name = os.path.basename(filename)[:-len(".csv")]
    df["split"] = [split_name] * len(df)
    dfs.append(df)

# Merge the splits, drop the "path" column, remove duplicate rows and replace
# the typographic apostrophe with a plain ASCII one.
df = (
    pd.concat(dfs)
    .drop(columns=["path"])
    .drop_duplicates()
)
df["transcription"] = df["transcription"].map(lambda text: text.replace("’", "'"))

is_train = df["split"] == "train_data"
is_valid = df["split"] == "valid_data"
train_df, valid_df = df[is_train], df[is_valid]

# BoW Models

## TF-IDF Vectorisation

In [3]:
# Lemmatise transcripts, optionally keeping only lemmas from a known vocabulary
def lemmatize_df(df, possible_tokens=None):
    """Lemmatise the ``transcription`` column of ``df``.

    Each transcript is run through the module-level ``nlp`` pipeline and
    rebuilt as a space-joined string of token lemmas. When ``possible_tokens``
    is given, lemmas that are not members of it are dropped.

    Args:
        df: Frame with a ``transcription`` column of strings.
        possible_tokens: Optional collection of allowed lemmas; ``None`` keeps
            every lemma.

    Returns:
        A ``(lemmatized, vocabulary)`` tuple: the lemmatised transcripts as a
        Series, and the set of whitespace-separated words occurring in them.
    """
    def _lemmatize(transcript):
        kept = []
        for token in nlp(transcript):
            if possible_tokens is None or token.lemma_ in possible_tokens:
                kept.append(token.lemma_)
        return ' '.join(kept)

    lemmatized_tscrpt = df["transcription"].map(_lemmatize)

    # Collect the vocabulary actually present in the lemmatised output.
    vocabulary = set()
    for sent in lemmatized_tscrpt:
        vocabulary.update(sent.split())
    return lemmatized_tscrpt, vocabulary

# Lemmatise the training split first; the set of lemmas it yields becomes the
# vocabulary used to filter the validation split below.
lemmatized_tscrpt, possible_tokens = lemmatize_df(train_df)
train_df = train_df.assign(lemmatized_tscrpt=lemmatized_tscrpt)

# Validation transcripts keep only lemmas that were seen in the training split.
lemmatized_tscrpt, _ = lemmatize_df(valid_df, possible_tokens)
valid_df = valid_df.assign(lemmatized_tscrpt=lemmatized_tscrpt)

In [66]:
# Column holding the lemmatised transcripts that are fed to the vectoriser.
train_text = "lemmatized_tscrpt"

# Learn the vocabulary and IDF weights from the training split only, then
# project the validation split into that same feature space.
vectorizer = TfidfVectorizer()
train_X = vectorizer.fit_transform(train_df[train_text])
valid_X = vectorizer.transform(valid_df[train_text])

In [43]:
# Preview the first rows of the validation split.
valid_df.head()

Unnamed: 0,transcription,action,object,location,split
0,Turn on the lights,activate,lights,none,valid_data
1,Turn off the lights,deactivate,lights,none,valid_data
2,Change language,change language,none,none,valid_data
3,Pause the music,deactivate,music,none,valid_data
4,Resume,activate,music,none,valid_data


## Naive Bayes Classifier

In [69]:
def train_and_eval(objective):
    """Fit a Multinomial Naive Bayes model for one label column and print its report.

    Uses the module-level ``train_X`` / ``valid_X`` feature matrices together
    with the label columns of ``train_df`` / ``valid_df``.

    Args:
        objective: Name of the label column (present in both ``train_df`` and
            ``valid_df``) that the classifier should predict.

    Returns:
        None. The validation classification report is printed.
    """
    nb_classifier = MultinomialNB()
    nb_classifier.fit(train_X, train_df[objective])
    valid_preds = nb_classifier.predict(valid_X)
    # Labels that are never predicted have an undefined precision. Reporting
    # them as 0 explicitly keeps the same numbers as the default behaviour but
    # avoids flooding the cell output with undefined-metric warnings.
    print(classification_report(valid_df[objective], valid_preds, zero_division=0))

In [70]:
# Train and evaluate one independent Naive Bayes classifier per label column.
for objective in ["action", "object", "location"]:
    print(f"Evaluation after training for predicting {objective}:")
    train_and_eval(objective)

Evaluation after training for predicting action:
                 precision    recall  f1-score   support

       activate       1.00      1.00      1.00        40
          bring       1.00      1.00      1.00        23
change language       1.00      1.00      1.00        28
     deactivate       1.00      1.00      1.00        33
       decrease       0.98      1.00      0.99        61
       increase       1.00      0.98      0.99        63

       accuracy                           1.00       248
      macro avg       1.00      1.00      1.00       248
   weighted avg       1.00      1.00      1.00       248

Evaluation after training for predicting object:
              precision    recall  f1-score   support

     Chinese       1.00      1.00      1.00         5
     English       1.00      0.25      0.40         4
      German       1.00      0.25      0.40         4
      Korean       1.00      0.25      0.40         4
        heat       0.97      0.97      0.97        78
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Word-Embedding Based Model

In [None]:
%%capture
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip

In [None]:
%%capture
!unzip crawl-300d-2M-subword.zip

In [None]:
# Load the fastText model file extracted from the downloaded archive; the
# similarity helpers below read it through the module-level `subword_model`.
subword_model = load_model("crawl-300d-2M-subword.bin")

In [26]:
def get_cosine_similarity(word1, word2):
    """Return the cosine similarity between the embeddings of two strings.

    Both strings are embedded with the module-level ``subword_model``; the
    result is the dot product of the two vectors divided by the product of
    their Euclidean norms.
    """
    first, second = (subword_model.get_word_vector(text) for text in (word1, word2))
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    return np.dot(first, second) / norm_product

# Sanity-check the embeddings. The first pair compares a string with itself, so
# a value of ~1.0 is expected; the second passes two multi-token phrases to
# get_cosine_similarity as single strings.
# NOTE(review): how the model embeds a string containing a space is not visible
# here, and the first pair may have been meant to compare two different
# spellings — confirm both are intended.
print(get_cosine_similarity("doesnot", "doesnot"),
      get_cosine_similarity("does not", "does n't"))

1.0000001 0.88693386


In [35]:
def eval(objective):
    """Zero-shot evaluation of one label column using embedding similarity.

    Every distinct label found in ``valid_df[objective]`` is embedded with the
    module-level ``subword_model``. For each validation transcript, a label is
    scored by its highest cosine similarity to any whitespace-separated word of
    the transcript, and the best-scoring label (first one on ties) is
    predicted. The resulting classification report is printed.

    Note: within this notebook the name shadows Python's builtin ``eval``.

    Args:
        objective: Name of the label column in ``valid_df`` to predict.

    Raises:
        ValueError: If a transcript contains no words to compare against.
    """
    valid_labels = valid_df[objective]
    possible_labels = list(valid_labels.unique())

    # Label embeddings and their norms do not depend on the transcript, so they
    # are computed once instead of once per (label, word) pair.
    label_vecs = [subword_model.get_word_vector(label) for label in possible_labels]
    label_norms = [np.linalg.norm(vec) for vec in label_vecs]

    valid_preds = []
    for sent in valid_df["transcription"]:
        words = sent.split()
        if not words:
            raise ValueError(
                f"Cannot score an empty transcription while predicting {objective!r}"
            )

        # Embed each word of the transcript once and reuse it for all labels.
        word_vecs = [subword_model.get_word_vector(word) for word in words]
        word_norms = [np.linalg.norm(vec) for vec in word_vecs]

        similarity_scores = [
            max(np.dot(label_vec, word_vec) / (label_norm * word_norm)
                for word_vec, word_norm in zip(word_vecs, word_norms))
            for label_vec, label_norm in zip(label_vecs, label_norms)
        ]

        best_index = similarity_scores.index(max(similarity_scores))
        valid_preds.append(possible_labels[best_index])

    # Labels that are never predicted have an undefined precision; report them
    # as 0 explicitly instead of emitting undefined-metric warnings.
    print(classification_report(valid_labels, valid_preds, zero_division=0))

In [42]:
# Run the embedding-similarity evaluation once per label column; no model is
# trained in this loop.
for objective in ["action", "object", "location"]:
    print("Zero Shot evaluation using cosine similarity")
    print(f"with fastText word embeddings for predicting {objective}:")
    eval(objective)

Zero Shot evaluation using cosine similarity
with fastText word embeddings for predicting action:
                 precision    recall  f1-score   support

       activate       0.83      0.38      0.52        40
          bring       0.14      0.91      0.24        23
change language       0.39      0.46      0.43        28
     deactivate       0.00      0.00      0.00        33
       decrease       0.95      0.30      0.45        61
       increase       0.77      0.27      0.40        63

       accuracy                           0.34       248
      macro avg       0.51      0.39      0.34       248
   weighted avg       0.62      0.34      0.37       248

Zero Shot evaluation using cosine similarity
with fastText word embeddings for predicting object:
              precision    recall  f1-score   support

     Chinese       1.00      1.00      1.00         5
     English       0.24      1.00      0.38         4
      German       1.00      0.75      0.86         4
      Korean  