In [1]:
import numpy as np
import time
import argparse
import json
import random
from nltk.tokenize import regexp_tokenize
import numpy as np
from nltk.corpus import stopwords

In [2]:
# load input data from json file
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('data_full.json', encoding='utf-8') as f:
    data = json.load(f)

# Rows of data['train'] are [utterance, intent label] pairs, so column 1
# holds the labels. The arrays are built after the file has been closed.
train_frame = np.array(data['train'])
train_label = train_frame[:, 1]

In [3]:
# Inspect the loaded training pairs; each row is [utterance, intent label].
train_frame

array([['what expression would i use to say i love you if i were an italian',
        'translate'],
       ["can you tell me how to say 'i do not speak much spanish', in spanish",
        'translate'],
       ["what is the equivalent of, 'life is good' in french",
        'translate'],
       ...,
       ['how come my card was not accepted yesterday', 'card_declined'],
       ['find out what happened to make my card get declined yesterday',
        'card_declined'],
       ['why was my card declined at safeway', 'card_declined']],
      dtype='<U136')

In [5]:
# Inspect the label column (column 1) taken from the training pairs.
train_label

array(['translate', 'translate', 'translate', ..., 'card_declined',
       'card_declined', 'card_declined'], dtype='<U136')

In [7]:
#select 20 random labels
# A dedicated, seeded generator makes the selection identical on every
# Restart & Run All without touching the global `random` state.
NUM_LABELS = 20
LABEL_SEED = 42
unique_labels = np.unique(train_label)
choosen = random.Random(LABEL_SEED).sample(
    list(unique_labels),
    min(NUM_LABELS, len(unique_labels)),  # never ask for more labels than exist
)
choosen

['calendar',
 'insurance',
 'schedule_meeting',
 'tire_pressure',
 'accept_reservations',
 'thank_you',
 'cook_time',
 'w2',
 'change_speed',
 'nutrition_info',
 'credit_limit',
 'international_fees',
 'date',
 'change_language',
 'time',
 'ingredient_substitution',
 'weather',
 'book_hotel',
 'last_maintenance',
 'change_volume']

In [8]:
#filter the input dataset with selected 20 labels
# Both splits are rebuilt from `data` and masked with their own label column,
# so this cell can be re-run safely (the old version indexed the already
# filtered train_frame with a mask built from the unfiltered train_label).
# np.isin replaces np.in1d, which is deprecated in NumPy 2.0.
train_full = np.array(data['train'])
train_frame = train_full[np.isin(train_full[:, 1], choosen)]
test_frame = np.array(data['test'])
# test_label keeps the labels of the *unfiltered* test split, as before.
test_label = test_frame[:, 1]
test_frame = test_frame[np.isin(test_label, choosen)]
train_frame

array([['adjust your language setting to english', 'change_language'],
       ['change and set your language setting to english',
        'change_language'],
       ['set your language setting to english', 'change_language'],
       ...,
       ["what's it doing outside right now", 'weather'],
       ["what's today's weather going to be", 'weather'],
       ['please tell me the weather forecast', 'weather']], dtype='<U136')

In [9]:
# Inspect the test pairs that remain after filtering to the chosen labels.
test_frame

array([['please speak in tagalog', 'change_language'],
       ['speak in german', 'change_language'],
       ['speak to me in dutch', 'change_language'],
       ...,
       ['tell me the weather for today', 'weather'],
       ['what is the current weather like', 'weather'],
       ['las vegas weather today', 'weather']], dtype='<U125')

In [10]:
# NOTE: test_label was taken from the test split *before* filtering, so it
# still lists every intent, not only the chosen ones.
test_label

array(['translate', 'translate', 'translate', ..., 'card_declined',
       'card_declined', 'card_declined'], dtype='<U125')

In [12]:
# Verbose-mode token pattern; alternatives are tried in this order:
#   1. abbreviations made of capital letters and dots (e.g. U.S.A.); this
#      never fires inside tokenize(), which lowercases the text first
#   2. numbers, optionally with a leading $, a decimal part and a trailing %
#   3. words, optionally joined by internal hyphens or apostrophes
#   4. an ellipsis
#   5. single punctuation marks
# The hyphen in the last character class is escaped: unescaped, `:-_` is a
# *range* from ':' to '_' that matches `< = > @ [ \ ] ^` (and capitals) while
# never matching a literal '-'.
default_pattern =  r"""(?x)
                        (?:[A-Z]\.)+
                        |\$?\d+(?:\.\d+)?%?
                        |\w+(?:[-']\w+)*
                        |\.\.\.
                        |(?:[.,;"'?():\-_`])
                    """
def tokenize(text, pattern = default_pattern):
    """Lowercase `text` and split it into tokens.

    Args:
        text: the string to tokenize.
        pattern: regular expression describing one token; defaults to
            `default_pattern`.

    Returns:
        The list of non-overlapping matches of `pattern` in the lowercased
        text, as produced by `regexp_tokenize`.
    """
    return regexp_tokenize(text.lower(), pattern)

In [13]:
# Tokenize text into tokens
# Column 0 of every training row holds the raw utterance.
tokenized_text = [tokenize(row[0]) for row in train_frame]
tokenized_text

[['adjust', 'your', 'language', 'setting', 'to', 'english'],
 ['change', 'and', 'set', 'your', 'language', 'setting', 'to', 'english'],
 ['set', 'your', 'language', 'setting', 'to', 'english'],
 ['make', 'your', 'preferred', 'language', 'english'],
 ['i',
  'would',
  'like',
  'to',
  'have',
  'language',
  'set',
  'to',
  'spanish',
  ',',
  'please'],
 ['please',
  'give',
  'me',
  'your',
  'responses',
  'only',
  'in',
  'french',
  'from',
  'now',
  'on'],
 ['can', 'we', 'speak', 'in', 'english', 'rather', 'than', 'zulu'],
 ['i',
  'would',
  'like',
  'to',
  'change',
  'your',
  'language',
  'from',
  'portuguese',
  'to',
  'italian'],
 ['change', 'the', 'response', 'language', 'to', 'french', 'please'],
 ['what', 'languages', 'can', 'i', 'switch', 'to', 'for', 'your', 'responses'],
 ['can',
  'you',
  'give',
  'me',
  'the',
  'answers',
  'in',
  'italian',
  'instead',
  'of',
  'english'],
 ['i',
  'want',
  'us',
  'to',
  'speak',
  'to',
  'each',
  'other',
  '

In [14]:
class UnigramFeature(object):
    """Bag-of-words featurizer.

    Maps every lowercased token seen during `fit` to a column index and turns
    tokenized sentences into vectors of per-token counts.
    """

    def __init__(self):
        # lowercased token -> column index in the feature vector
        self.unigram = {}

    def fit(self, text_set: list):
        """Build the vocabulary from a list of tokenized sentences.

        Column indices are handed out in order of first appearance. The
        vocabulary is rebuilt from scratch on every call, so fitting a second
        time can no longer assign an index that is already in use.

        Args:
            text_set: list of sentences, each a list of string tokens.
        """
        self.unigram = {}
        for sentence in text_set:
            for token in sentence:
                # len() is evaluated before insertion, i.e. the next free column.
                self.unigram.setdefault(token.lower(), len(self.unigram))

    def transform(self, text: list):
        """Turn one tokenized sentence into a count vector.

        Args:
            text: list of string tokens.

        Returns:
            1-D float array of length len(self.unigram); tokens that are not
            in the vocabulary are ignored.
        """
        feature = np.zeros(len(self.unigram))
        for token in text:
            column = self.unigram.get(token.lower())
            if column is not None:
                feature[column] += 1
        return feature

    def transform_list(self, text_set: list):
        """Turn a list of tokenized sentences into a count matrix.

        Args:
            text_set: list of sentences, each a list of string tokens.

        Returns:
            2-D float array of shape (len(text_set), len(self.unigram)); an
            empty `text_set` yields shape (0, len(self.unigram)).
        """
        features = np.zeros((len(text_set), len(self.unigram)))
        for row, sentence in enumerate(text_set):
            features[row] = self.transform(sentence)
        return features

In [15]:
# Build the vocabulary from the tokenized training utterances.
# NOTE: tokenized_text is rebound to the test tokens by a later cell, so this
# cell has to run before that one.
feat_extractor = UnigramFeature()
feat_extractor.fit(tokenized_text)

In [17]:
# form train set for training
# Tokenize straight from train_frame rather than reading the shared
# `tokenized_text` variable: a later cell rebinds that name to the test
# tokens, and re-running this cell afterwards would silently pair test
# features with training labels.
X_train = feat_extractor.transform_list([tokenize(row[0]) for row in train_frame])
Y_train = train_frame[:,1]
assert len(X_train) == len(Y_train), "features and labels must stay aligned"
X_train

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
Y_train

array(['change_language', 'change_language', 'change_language', ...,
       'weather', 'weather', 'weather'], dtype='<U136')

In [19]:
# form test set for evaluation
# The extractor fitted earlier is reused as-is; transform() skips any token
# that is missing from its vocabulary.
tokenized_text = [tokenize(row[0]) for row in test_frame]
X_test = feat_extractor.transform_list(tokenized_text)
Y_test = test_frame[:, 1]

In [20]:
class NaiveBayesClassifier:
    """Multinomial Naive Bayes over count features with add-one smoothing.

    The classifier remembers its own class labels, so it does not depend on
    any notebook-level variable.
    """

    def __init__(self):
        # All three are aligned: entry i belongs to self.classes[i].
        self.classes = np.array([])
        self.features_prob = []   # per class: smoothed P(feature | class)
        self.label_prob = []      # per class: P(class)

    def fit(self, X, Y, labels=None):
        """Estimate class priors and per-class feature probabilities.

        Args:
            X: 2-D array of non-negative feature counts, one row per sample.
            Y: 1-D array of class labels aligned with the rows of X.
            labels: optional explicit list of classes (and their order);
                defaults to the sorted unique values of Y.

        Every call starts from a clean state, so fitting the same instance
        twice does not accumulate stale classes.
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y)
        self.classes = np.unique(Y) if labels is None else np.asarray(labels)
        self.features_prob = []
        self.label_prob = []
        for label in self.classes:
            members = (Y == label)
            # Add-one (Laplace) smoothing keeps unseen features at non-zero probability.
            counts = X[members].sum(axis=0) + 1
            self.features_prob.append(counts / counts.sum())
            self.label_prob.append(members.sum() / len(Y))

    def predict(self, X):
        """Return the most probable class label for every row of X.

        Scores are computed in log space as
        log P(class) + sum_j count_j * log P(feature_j | class).
        This equals log(prod_j p_j ** count_j) but cannot underflow to
        log(0) = -inf when counts are large.

        Args:
            X: 2-D array of feature counts with the same columns used in fit.

        Returns:
            1-D array of predicted labels, one per row of X.

        Raises:
            ValueError: if called before fit.
        """
        if len(self.classes) == 0:
            raise ValueError("fit must be called before predict")
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return self.classes[:0]
        # A class listed in `labels` but absent from Y has prior 0; its score
        # is -inf and it can never win, so the log(0) warning is silenced.
        with np.errstate(divide='ignore'):
            log_prior = np.log(np.asarray(self.label_prob, dtype=float))
        log_likelihood = np.log(np.asarray(self.features_prob, dtype=float))
        scores = X @ log_likelihood.T + log_prior
        return self.classes[np.argmax(scores, axis=1)]

In [22]:
def accuracy(pred, labels):
    """Print and return the fraction of predictions that equal the labels.

    Args:
        pred: sequence of predicted labels.
        labels: sequence of reference labels, same length as `pred`.

    Returns:
        correct / total as a float; 0.0 when there are no predictions.

    Raises:
        ValueError: if `pred` and `labels` differ in shape.
    """
    pred = np.asarray(pred)
    labels = np.asarray(labels)
    if pred.shape != labels.shape:
        raise ValueError(
            "pred and labels must have the same shape, got %s and %s"
            % (pred.shape, labels.shape)
        )
    total = len(pred)
    correct = int((pred == labels).sum())
    # Guard the empty case instead of dividing 0 by 0.
    score = correct / total if total else 0.0
    print("Accuracy: %i / %i = %.4f " %(correct, total, score))
    return score

In [23]:
# Train on the training split, then report accuracy on both splits.
model = NaiveBayesClassifier()
model.fit(X_train, Y_train)

evaluation_splits = [
    ("===== Train Accuracy =====", X_train, Y_train),
    ("===== Test Accuracy =====", X_test, Y_test),
]
for banner, split_features, split_labels in evaluation_splits:
    print(banner)
    accuracy(model.predict(split_features), split_labels)

===== Train Accuracy =====
Accuracy: 1974 / 2000 = 0.9870 
===== Test Accuracy =====
Accuracy: 579 / 600 = 0.9650 
