## Intent Classification

This notebook is an exercise in model training for language data-sets. The notebook is divided into two parts:

1. Intent Classification using a general Model
2. Intent Classification using a known language model (in this case, BERT).

The code is an exercise of the tools learned in the "Intro to Deep Learning with PyTorch" course by udemy. The data-set used for this is the Kaggle "ATIS" data-set, containing travel requests and intent labeling.

 ### Initializing liberaries

In [167]:
import torch
from torch import nn, optim
import torch.nn.functional as F

from torchvision import datasets, transforms, models

import numpy as np

import transformers
from transformers import AdamW, get_linear_schedule_with_warmup, TrainingArguments

# preprocessing libraries

from string import punctuation
from collections import Counter

### Preprocessing

#### functions

In [168]:
# loading file
def file_loader(DIR):
    '''Loading the file in the directory and returning it in read mode'''
    if type(DIR) != str:
        print ('Please enter file name in string format')
    else:
        with open(DIR, 'r') as f:
            return f.read()
        
# making a list of the words in the text        
def remove_puctuation(text):
    '''Receives a string and returns the same string with the puctuation removed'''
    text = text.lower() # lowercase, standardize
    all_text = ''.join([c for c in text if c not in punctuation])
    return all_text

def split_text(text):
    ''' Receives a string and returns the same string split text by new lines and spaces'''
    text_split = text.split('\n')
    all_text = ' '.join(text_split)
    return all_text

def text_to_word_list(text):
    '''Takes a string and returns a list of all the words used'''
    text = remove_puctuation(text)
    all_text = split_text(text)
    return all_text.split()

# making a word list into a tokenized dictionary
def tokenized_dictionaty(word_list):
    '''Builds a dictionary that maps words to integers'''
    counts = Counter(word_list)
    vocab = sorted(counts, key=counts.get, reverse=True)
    return {word: ii for ii, word in enumerate(vocab, 1)}


def tokenize_list(text, int_dict):
    '''Uses the dict to tokenize each review in split clean data'''
    list_ints = []
    sentence_list = remove_puctuation(text).split('\n')
    for sentence in sentence_list:
        list_ints.append([int_dict[word] for word in sentence.split()])
    return list_ints

def tokenize_label_list(text):
    '''Uses the dict to tokenize each review in train_x_split'''
    list_ints = []
    word_list = text.split('\n')
    int_dict = tokenized_dictionaty(word_list)
    for word in word_list:
        list_ints.append([int_dict[word] for word in text.split()])
    return int_dict, list_ints

## remove any train_x/labels with zero length from the train_x_ints list.

# 
def zero_len_indices(tokenized_list):
    '''get indices of any training data with length 0'''
    return [ii for ii, entry in enumerate(tokenized_list) if len(entry) != 0]


# 
def remove_sentence_zero_entry(data_ints, label_ints):
    '''remove 0-length training data and their labels'''
    non_zero_idx =  zero_len_indices(data_ints)
    data_ints = [data_ints[ii] for ii in non_zero_idx]
    label_ints = np.array([label_ints[ii] for ii in non_zero_idx][0])
    return data_ints, label_ints


#### Preprocessing the training files

##### Loading training files

In [222]:
train_x = file_loader('train/seq.in')
train_y = file_loader('train/label')

In [170]:
# viewing a sample of the data
print(train_x[:494])
print()
print(train_y[:49])

i want to fly from baltimore to dallas round trip
round trip fares from baltimore to philadelphia less than 1000 dollars round trip fares from denver to philadelphia less than 1000 dollars round trip fares from pittsburgh to philadelphia less than 1000 dollars
show me the flights arriving on baltimore on june fourteenth
what are the flights which depart from san francisco fly to washington via indianapolis and arrive by 9 pm
which airlines fly from boston to washington dc via other cities


atis_flight
atis_airfare
atis_flight
atis_flight



##### Creating tokenized dictionary for the words used

In [171]:
train_words = text_to_word_list(train_x)

In [172]:
# sapmling the word-list created
train_words[:10]

['i',
 'want',
 'to',
 'fly',
 'from',
 'baltimore',
 'to',
 'dallas',
 'round',
 'trip']

In [173]:
len(train_words)

50497

##### Building tokenized dictionary

In [174]:
vocab_to_int = tokenized_dictionaty(train_words)
train_x_ints = tokenize_list(train_x, train_x_vocab_to_int)

In [175]:
print('Unique words: ', len((vocab_to_int)))
print()

print('Tokenized request: \n', train_x_ints[:1])

Unique words:  865

Tokenized request: 
 [[18, 68, 1, 36, 2, 22, 1, 21, 50, 46]]


##### encoding the lables

In [176]:
train_y_vocab_to_int, train_y_ints = tokenize_label_list(train_y)

train_y_words = train_y.split('\n')

train_y_vocab_to_int = tokenized_dictionaty(train_y_words)
train_y_ints = tokenize_label_list(train_y, train_y_vocab_to_int)


In [177]:
print('Lable: ', len((train_y_vocab_to_int)))
print()

print('Tokenized lables: \n', train_y_ints[:1])

Lable:  22

Tokenized lables: 
 [[1, 2, 1, 1, 4, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 8, 1, 2, 8, 1, 1, 1, 10, 3, 1, 1, 1, 1, 1, 1, 9, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 5, 1, 4, 3, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1, 2, 1, 1, 5, 1, 1, 1, 1, 1, 2, 1, 2, 4, 5, 1, 1, 1, 11, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 5, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 10, 1, 1, 1, 1, 3, 1, 1, 1, 1, 11, 1, 1, 1, 2, 4, 1, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 5, 3, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 2, 5, 1, 1, 2, 1, 2, 1, 1, 6, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 1, 3, 8, 1, 2, 6, 1, 1, 1, 1, 3, 1, 1, 13, 5, 1, 1, 1, 3, 2, 4, 1, 1, 1, 1, 3, 1, 3, 1, 2, 1, 2, 1, 1, 2, 3, 1, 1, 1, 8, 1, 1, 1, 1, 1, 4, 1, 1, 6, 1, 1, 1, 11, 2, 1, 2, 1, 1, 1, 1, 1, 8, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 14, 1, 3, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 2, 1, 1

##### removing the outliers

In [178]:
# outlier review stats
review_lens = Counter([len(x) for x in train_x_ints])
print("Zero-length train_x: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length train_x: 1
Maximum review length: 46


In [179]:
print('Number of train_x before removing outliers: ', len(train_x_ints))

train_x_ints, train_y_ints =  remove_sentence_zero_entry(train_x_ints, train_y_ints)

print('Number of train_x after removing outliers: ', len(train_x_ints))

Number of train_x before removing outliers:  4479
Number of train_x after removing outliers:  4478


##### padding the requests

In [180]:
def pad_features(train_x_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(train_x_ints), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(train_x_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [181]:
seq_length = 8

features = pad_features(train_x_ints, seq_length=seq_length)

## test statements -  ##
assert len(features)==len(train_x_ints), "Your features should have as many rows as train_x."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

# print first 10 values of the first 10 batches 
print(features[:10,:10])

[[ 18  68   1  36   2  22   1  21]
 [ 50  46  62   2  22   1  24 190]
 [ 10   6   4   3  58   5  22   5]
 [  7  26   4   3  63 191   2  11]
 [ 63  28  36   2   9   1  32  75]
 [171 233  38  13   8   2 105   1]
 [195  16 287   2  19  45  25   1]
 [ 10   6  23   4   3   2  24   1]
 [195  45  25  13   8   5 121 124]
 [  5  82   7   3  72 108   1  86]]


#### Repering the process for validation

In [182]:
val_x = file_loader('dev/seq.in')
val_y = file_loader('dev/label')

In [183]:
print(val_x[:486])
print()
print(val_y[:48])

i want to fly from boston at 838 am and arrive in denver at 1110 in the morning
show me all round trip flights between houston and las vegas
i would like some information on a flight from denver to san francisco on united airlines
what are the coach flights between dallas and baltimore leaving august tenth and returning august twelve
i'm flying from boston to the bay area
okay can you tell me the flight cost between denver and atlanta
from montreal to las vegas
what is the earliest

atis_flight
atis_flight
atis_flight
atis_flight



##### Creating tokenized dictionary for the words used

In [184]:
val_words = text_to_word_list(val_x)

In [185]:
# sampling the word-list created
val_words[:10]

['i', 'want', 'to', 'fly', 'from', 'boston', 'at', '838', 'am', 'and']

In [186]:
len(val_words)

5703

##### Building tokenized dictionary

In [187]:
val_vocab_to_int = tokenized_dictionaty(val_words)
val_x_ints = tokenize_list(val_x, val_vocab_to_int)

In [188]:
print('Unique val_words: ', len((val_vocab_to_int)))
print()

print('Tokenized request: \n', val_x_ints[:1])

Unique val_words:  463

Tokenized request: 
 [[20, 113, 1, 42, 2, 9, 83, 291, 67, 17, 98, 12, 11, 83, 292, 12, 4, 28]]


##### encoding the lables

In [189]:
val_y_vocab_to_int, val_y_ints = tokenize_label_list(val_y)

In [190]:
val_y_ints[:10]

[[1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  1,
  13,
  3,
  5,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  6,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  4,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  7,
  8,
  1,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  8,
  1,
  8,
  9,
  6,
  1,
  12,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  1,
  7,
  8,
  3,
  1,
  1,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  10,
  1,
  8,
  1,
  14,
  1,
  1,
  8,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  6,
  5,
  2,
  1,
  1,
  1,
  1,
  1,
  4,
  1,
  5,
  1,
  2,
  2,
  2,
  1,
  2,
  2,
  1,
  1,
  2,
  1,
  1,
  4,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  4,
  7,
  1,
  1,
  1,
  6,
  1,
  1,
  15,
  7,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  4,
  2,
  1,
  1,
  1,
  7,
  1,
  1,
  1,
  1,
  1,


In [191]:
print('Lable: ', len((val_y_vocab_to_int)))
print()

print('Tokenized lables: \n', val_y_ints[:1])

Lable:  17

Tokenized lables: 
 [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 13, 3, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 7, 8, 1, 1, 1, 1, 3, 1, 1, 1, 8, 1, 8, 9, 6, 1, 12, 1, 1, 2, 1, 2, 1, 1, 1, 1, 7, 8, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 10, 1, 8, 1, 14, 1, 1, 8, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 6, 5, 2, 1, 1, 1, 1, 1, 4, 1, 5, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 7, 1, 1, 1, 6, 1, 1, 15, 7, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 4, 2, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 6, 1, 1, 3, 11, 1, 1, 2, 1, 1, 1, 5, 1, 8, 4, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 5, 1, 8, 1, 1, 5, 6, 1, 1, 1, 1, 2, 4, 1, 1, 6, 1, 3, 1, 1, 1, 1, 1, 8, 1, 1, 3, 3, 2, 9, 1, 1, 3, 2, 3, 1, 1, 1, 1, 1, 4, 1, 1, 1, 3, 5, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

##### removing the outliers

In [192]:
# outlier review stats
review_lens = Counter([len(x) for x in val_x_ints])
print("Zero-length val_x: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length val_x: 1
Maximum review length: 35


In [193]:
print('Number of train_x before removing outliers: ', len(val_x_ints))

val_x_ints, val_y_ints =  remove_sentence_zero_entry(val_x_ints, val_y_ints)

print('Number of train_x after removing outliers: ', len(val_x_ints))

Number of train_x before removing outliers:  501
Number of train_x after removing outliers:  500


#### padding the requests

In [194]:
seq_length = 8

val_features = pad_features(val_x_ints, seq_length=seq_length)

## test statements - do not change - ##
assert len(val_features)==len(val_x_ints), "Your features should have as many rows as val_x."
assert len(val_features[0])==seq_length, "Each feature row should contain seq_length values."

# print first 10 values of the first 10 batches 
print(val_features[:10,:10])

[[ 20 113   1  42   2   9  83 291]
 [  8   7  25  93  68   3  35 140]
 [ 20  36  26 235  84   5  15  10]
 [  6  24   4 180   3  35  23  17]
 [141 181   2   9   1   4 294 295]
 [160  80  43 133   7   4  10 124]
 [  0   0   0   2 115   1  78  69]
 [  6  19   4  64 203  10   2  18]
 [  3   2  22   1  21  35 127  67]
 [  6  19   4 128  10 161   2   9]]


In [195]:
test_x = file_loader('test/seq.in')
test_y = file_loader('test/label')

In [196]:
print(test_x[:494])
print()
print(test_y[:49])

i would like to find a flight from charlotte to las vegas that makes a stop in st. louis
on april first i need a ticket from tacoma to san jose departing before 7 am
on april first i need a flight going from phoenix to san diego
i would like a flight traveling one way from phoenix to san diego on april first
i would like a flight from orlando to salt lake city for april first on delta airlines
i need a flight from toronto to newark one way leaving wednesday evening or thursday morning
mond

atis_flight
atis_airfare
atis_flight
atis_flight



##### Creating tokenized dictionary for the words used

In [197]:
test_words = text_to_word_list(test_x)

In [198]:
# Sampling the word-list created
test_words[:10]

['i', 'would', 'like', 'to', 'find', 'a', 'flight', 'from', 'charlotte', 'to']

In [199]:
len(test_words)

9164

##### Building tokenized dictionary

In [200]:
test_vocab_to_int = tokenized_dictionaty(test_words)
test_x_ints = tokenize_list(test_x, test_vocab_to_int)

In [201]:
print('Unique test_words: ', len((test_vocab_to_int)))
print()

print('Tokenized request: \n', val_x_ints[:1])

Unique test_words:  448

Tokenized request: 
 [[20, 113, 1, 42, 2, 9, 83, 291, 67, 17, 98, 12, 11, 83, 292, 12, 4, 28]]


##### encoding the lables

In [202]:
test_y_vocab_to_int, test_y_ints = tokenize_label_list(test_y)

In [203]:
test_y_ints[:10]

[[1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  8,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  4,
  1,
  16,
  1,
  16,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  13,
  13,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  7,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  3,
  17,
  1,
  1,
  14,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  4,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  4,
  12,
  12,
  1,
  4,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  15,
  15,
  15,
  1,
  1,
  7,
  1,
  1,
  1,
  2,
  2,
  1,
  5,
  13,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  9,
  9,
  9,
  9,
  12,
  12,
  12,


In [204]:
print('Lable: ', len((test_y_vocab_to_int)))
print()

print('Tokenized lables: \n', test_y_ints[:1])

Lable:  21

Tokenized lables: 
 [[1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 16, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 13, 13, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 17, 1, 1, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 4, 12, 12, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 15, 15, 15, 1, 1, 7, 1, 1, 1, 2, 2, 1, 5, 13, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 12, 12, 12, 12, 3, 1, 1, 2, 10, 1, 2, 1, 8, 8, 8, 1, 1, 1, 1, 8, 3, 3, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18, 5, 9, 9, 9, 9, 12, 1, 1, 1, 1, 1, 3, 6, 6, 1, 1, 1, 1, 1, 1, 1, 2, 5, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 2, 2, 2, 4, 1, 4,

##### removing the outliers

In [205]:
# outlier review stats
review_lens = Counter([len(x) for x in test_x_ints])
print("Zero-length test_x: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length test_x: 1
Maximum review length: 30


In [206]:
print('Number of train_x before removing outliers: ', len(test_x_ints))

test_x_ints, test_y_ints =  remove_sentence_zero_entry(test_x_ints, test_y_ints)

print('Number of train_x after removing outliers: ', len(test_x_ints))

Number of train_x before removing outliers:  894
Number of train_x after removing outliers:  893


##### padding the requests

In [207]:
seq_length = 8

test_features = pad_features(test_x_ints, seq_length=seq_length)

## test statements - do not change - ##
assert len(test_features)==len(test_x_ints), "Your features should have as many rows as test_x."
assert len(test_features[0])==seq_length, "Each feature row should contain seq_length testues."

# print first 10 testues of the first 10 batches 
print(test_features[:10,:10])

[[ 11  37  22   1  68   9   6   2]
 [  5 104  82  11  21   9 183   2]
 [  5 104  82  11  21   9   6 198]
 [ 11  37  22   9   6 324  99  97]
 [ 11  37  22   9   6   2  25   1]
 [ 11  21   9   6   2  45   1  83]
 [ 60  23  11  37  22   1  61   2]
 [  5  33 104 151  11  37  22   1]
 [ 66 325  19   5  33 104 151  11]
 [ 26 109 156   3   2 155 165   1]]


#### reshaping the data

In [208]:
train_x= features
train_y = train_y_ints.T

val_x, test_x = val_features, test_features
val_y, test_y = val_y_ints.T, test_y_ints.T

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(4478, 8) 
Validation set: 	(500, 8) 
Test set: 		(893, 8)


In [209]:
from torch.utils.data import TensorDataset, DataLoader

# Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 10

# loading the data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)

In [210]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([10, 8])
Sample input: 
 tensor([[ 10,   6,  23,   3,   2,  17,   1,  11],
        [ 45,  25,   1, 186,   4,  76,   8,   2],
        [ 10,   6,   4,   3,   2, 136, 137,   1],
        [  7,  47,  51,  20,  57,   2,   4,  19],
        [ 18,  54,  13,   8, 135,   2,  17,   1],
        [ 34,  66,   6,   4,   8, 255,  18,  39],
        [  7,  26,   4,  37,   3,   2,  24,   1],
        [ 10,   6,   4,   3,   2,  22,   1,   9],
        [ 13, 295,  29,  23,   3,   2,   9,   1],
        [  0,  63,  28,  36,  31,   9,  16,  24]], dtype=torch.int32)

Sample label size:  torch.Size([10])
Sample label: 
 tensor([ 1,  1,  1, 21,  1,  7,  1,  1,  1,  4], dtype=torch.int32)


### Intent Classification using general Model

#### Building RNN model

In [211]:
import torch.nn as nn

class SentimentRNN(nn.Module):
  
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        lstm_out = lstm_out[:, -1, :] # getting the last time step output
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden
        

#### Initiate the model w/ hyperparams

In [212]:
vocab_size = len(vocab_to_int)+len(val_vocab_to_int)+len(test_vocab_to_int)+1
output_size = 1
embedding_dim = 128
hidden_dim = 32
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(1777, 128)
  (lstm): LSTM(128, 32, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=32, out_features=1, bias=True)
  (sig): Sigmoid()
)


#### Training

In [213]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [214]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

No GPU available, training on CPU.


In [215]:
# training params

epochs = 4 # 

counter = 0
print_every = 100
clip=1 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: -5.451442... Val Loss: -5.681074
Epoch: 1/4... Step: 200... Loss: -14.726797... Val Loss: -9.377963
Epoch: 1/4... Step: 300... Loss: -3.869335... Val Loss: -12.703091
Epoch: 1/4... Step: 400... Loss: -14.509479... Val Loss: -15.866137
Epoch: 2/4... Step: 500... Loss: -4.453131... Val Loss: -109.000000
Epoch: 2/4... Step: 600... Loss: -157.971191... Val Loss: -109.000000
Epoch: 2/4... Step: 700... Loss: -80.000000... Val Loss: -109.000000
Epoch: 2/4... Step: 800... Loss: 0.000000... Val Loss: -109.000000
Epoch: 3/4... Step: 900... Loss: -140.000000... Val Loss: -109.000000
Epoch: 3/4... Step: 1000... Loss: -80.000000... Val Loss: -109.000000
Epoch: 3/4... Step: 1100... Loss: -150.000000... Val Loss: -109.000000
Epoch: 3/4... Step: 1200... Loss: -30.000000... Val Loss: -109.000000
Epoch: 3/4... Step: 1300... Loss: -240.000000... Val Loss: -109.000000
Epoch: 4/4... Step: 1400... Loss: -70.000000... Val Loss: -109.000000
Epoch: 4/4... Step: 1500... Loss: -1

In [216]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: -142.022
Test accuracy: 0.707


In [217]:
def tokenize_req(test_req):
    test_req = test_req.lower() # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_req if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    test_ints = []
    test_ints.append([vocab_to_int.get(word, 0) for word in test_words])

    return test_ints

In [218]:
def predict(net, test_req, sequence_length=200):
    
    net.eval()
    
    # tokenize req
    test_ints = tokenize_req(test_req)
    
    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = net.init_hidden(batch_size)
    
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    
    # get the output from the model
    output, h = net(feature_tensor, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # printing output value, before rounding
    print('Prediction value: {:.6f}'.format(output.item()))
        

In [223]:
test_x_split = split_text(train_x)
test_x_split[np.random.randint(len(test_x_split))]

'a'

In [224]:
# call function
seq_length=8

predict(net, test_x_split[np.random.randint(len(test_x_split))], seq_length)

Prediction value: 1.000000
