# Settyl Data Science And Machine Learning Engineer Task

## Import necessary modules and utility functions

In [1]:
import json

import numpy as np

from utils import (
    dynamically_batch,
    labels2tensor,
    preprocess,
    tokenize,
    train_test_split,
)

# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
# NOTE(review): presumably the shuffling in train_test_split relies on it — confirm in utils.py.
np.random.seed(0)

## Importing and visualizing the dataset

We load the data from the JSON path below. The data is a list of dictionaries, each containing a pair of values: an external status and an internal status. To visualize the data, we print the tail of the dataset (the last 5 rows). We use the last 5 rows instead of the first 5 to check for any redundant rows that may have been loaded due to newlines at the end of the data file.

In [2]:
# Path to the JSON dataset (a relative path, resolved against the working directory)
json_path = "dataset.json"

# Loading the data. The encoding is pinned to UTF-8, the encoding JSON text is
# exchanged in, so decoding does not depend on the platform's locale default.
with open(json_path, encoding="utf-8") as fp:
    data = json.load(fp)

# Tail of the data (the last 5 records)
data[-5:]

[{'externalStatus': 'Import Loaded on Rail',
  'internalStatus': 'Loaded on Vessel'},
 {'externalStatus': 'Full Transshipment Loaded',
  'internalStatus': 'Loaded on Vessel'},
 {'externalStatus': 'Full Transshipment Loaded',
  'internalStatus': 'Loaded on Vessel'},
 {'externalStatus': 'Export Loaded on Vessel',
  'internalStatus': 'Loaded on Vessel'},
 {'externalStatus': 'Empty to Shipper',
  'internalStatus': 'Empty Container Released'}]

## Preprocessing

To preprocess the data, we iterate over the pairs of external and internal statuses. We then strip off any white spaces and convert the texts to lower case. We also remove any non-alphanumeric characters for simplicity. Check the `preprocess` function in [utils.py](utils.py) for more details.

We extract the inputs (external statuses), labels (internal statuses), unique labels, and vocabulary after preprocessing. Note that the vocabulary only contains distinct words from external statuses since internal statuses are treated as classes.

In [3]:
# Run the preprocessing helper from utils.py on the raw records and unpack its four
# results: the input texts, their labels, the unique labels, and the vocabulary.
texts, labels, unique_labels, vocab = preprocess(data)

In [4]:
# Length of our vocabulary (the `vocab` object returned by `preprocess`)

len(vocab)

148

In [5]:
# Unique labels in our dataset (the preprocessed internal statuses, treated as classes)

unique_labels

['arrival',
 'departure',
 'empty container released',
 'empty return',
 'gate in',
 'gate out',
 'inbound terminal',
 'intransit',
 'loaded on vessel',
 'off rail',
 'on rail',
 'outbound terminal',
 'port in',
 'port out',
 'unloaded on vessel']

In [6]:
# Longest text in the dataset, measured in whitespace-separated words

max(len(text.split()) for text in texts)

12

### Splitting the data

We split the data into train and test sets. We will use the train set for training and test set for evaluation.

In [7]:
# Share of the samples that goes to the train set; the remainder forms the test set.
train_ratio = 0.8
# Forwarded to train_test_split; presumably shuffles the samples before splitting — confirm in utils.py.
shuffle = True

# Returns the train texts/labels followed by the test texts/labels.
train_texts, train_labels, test_texts, test_labels = train_test_split(texts, labels, train_ratio, shuffle)

In [8]:
# Lengths of train and test sets (number of texts in each split)

len(train_texts), len(test_texts)

(977, 245)

### Creating tensor dataset

The train data is dynamically batched (sequences of similar length are batched together) and the labels are vectorized. We dynamically batch the train set for efficient computation, since padding is then minimal. The test set is vectorized into a single batch.

In [9]:
# Token budget per sequence; 12 equals the longest text measured earlier in the notebook.
max_tokens = 12

# Vectorize the whole test split as one batch: inputs first, then their class targets.
test_inputs = tokenize(test_texts, vocab, max_tokens)
test_classes = labels2tensor(test_labels, unique_labels)

In [10]:
# Tail of vectorized test set: last 5 rows of the tokenized inputs and their class tensor

test_inputs[-5:], test_classes[-5:]

(tensor([[103, 119,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
         [103, 119,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
         [103, 119,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
         [103, 119,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
         [103, 119,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]]),
 tensor([ 1,  1, 14,  5,  1]))

Now we implement dynamic batching for the train set. To do so, we sort the train texts by their number of tokens and batch them accordingly. Refer to [utils.py](utils.py) for more details.

In [13]:
# Batch size handed to dynamically_batch
batch_size = 32

# Build the dynamically batched train set; the batching logic lives in utils.py.
batched_train = dynamically_batch(train_texts, train_labels, batch_size, vocab, unique_labels, max_tokens)

In [15]:
# Tail of the dynamically batched train set (the last 5 batches)
# NOTE(review): this displays every row of five full batches, which makes for a very
# long output; consider showing only the tensor shapes or a single batch instead.

batched_train[-5:]

[(tensor([[ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
          [ 87, 104,  95, 130,  27],
 