# NLP Architect - NP Semantic segmentation tutorial

Let's import all the relevant classes

In [None]:
import os
# Switch the working directory to the grandparent of the current one, so the
# relative paths and the `examples` import used below resolve from there.
# NOTE: this is relative to the current working directory, so re-running the
# cell moves up another two levels each time — run it once per kernel session.
os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
# Display the resulting working directory as the cell output.
os.getcwd()

In [None]:
import examples

In [None]:
from examples.np_semantic_segmentation.data import NpSemanticSegData
from examples.np_semantic_segmentation.preprocess_tratz2011 import *
from examples.np_semantic_segmentation.data import *
from nlp_architect.models.np_semantic_segmentation import NpSemanticSegClassifier

## Preparing the data

The first step is to download the dataset into a folder. You can download the Tratz et al. 2011 dataset from the following link: [Tratz 2011 Dataset](https://vered1986.github.io/papers/Tratz2011_Dataset.tar.gz). It is also available [here](https://www.isi.edu/publications/licensed-sw/fanseparser/index.html). (The terms and conditions of the dataset license apply. Intel does not grant any rights to the data files or database.)

After downloading and unzipping the dataset, the following method will label a portion of the data and output two `.csv` files that we will use to train and evaluate the model.

In [None]:
# Placeholder: replace with the local folder that holds the unzipped Tratz 2011 dataset.
dataset_path = '<Tratz2011_dataset_local_path>'
# Label the data. The resulting train/val CSV files are later read from
# <dataset_path>/tratz2011_coarse_grained_random/.
preprocess_tratz_2011(dataset_path)

Once the dataset is saved and labeled, we need to vectorize the data:

In [None]:
# Proxy settings handed to prepare_data(); both are left unset (None) here.
http_proxy = https_proxy = None

# Pre-trained word embeddings file (replace the placeholder with a local folder).
word2vec_path = '<local_path_to_word_embeddings>/GoogleNews-vectors-negative300.bin.gz'

# Inputs: the labeled CSV files that preprocess_tratz_2011() produced.
labeled_val_data_path = os.path.join(dataset_path, 'tratz2011_coarse_grained_random/val.csv')
labeled_train_data_path = os.path.join(dataset_path, 'tratz2011_coarse_grained_random/train.csv')

# Outputs: where the vectorized train/validation data will be saved.
val_output_path = 'nlp_architect/data/np_semantic_segmentation/prepared_data_val.csv'
train_output_path = 'nlp_architect/data/np_semantic_segmentation/prepared_data_train.csv'

In [None]:

# Vectorize the labeled CSVs, once for the training split and once for the
# validation split. Each call receives the labeled input path, the path where
# the vectors are saved, the word-embeddings path and the proxy settings.
prepare_data(labeled_train_data_path, train_output_path, word2vec_path, http_proxy, https_proxy)
prepare_data(labeled_val_data_path, val_output_path, word2vec_path, http_proxy, https_proxy)

We now need to load the data into an `NpSemanticSegData` object.

In [None]:
# Load the vectorized training CSV saved at train_output_path.
# NOTE(review): train_to_test_ratio=0.8 presumably keeps 80% of the samples
# for training and the rest for testing — confirm against NpSemanticSegData.
data_set = NpSemanticSegData(train_output_path, train_to_test_ratio=0.8)

## Build the model

In [None]:
    # File name intended for the trained model. This cell only defines it;
    # nothing here writes the file.
    model_file_path = 'np_semantic_segmentation.h5'
    # Number of training epochs; edit this single value to change training length.
    num_epochs = 200
    # Pass the variable (not a repeated literal) so the setting above takes effect.
    model = NpSemanticSegClassifier(num_epochs=num_epochs, callback_args=None)
    # The model input size is the second dimension of the training matrix
    # (presumably the number of features per sample — confirm).
    input_dim = data_set.train_set_x.shape[1]
    model.build(input_dim)
    # Train on the training portion of the loaded data set.
    model.fit(data_set.train_set)


Great! We now have an MLP classifier for collocations. Let's evaluate it on the validation set (`val_dataset`):

In [None]:
# Load the vectorized validation CSV under its own name only. The training
# `data_set` is deliberately left untouched so that re-running the training
# cell still trains on the training data.
# NOTE(review): with train_to_test_ratio=1 all samples presumably end up in
# `train_set`, which is why that attribute is evaluated below — confirm.
val_dataset = NpSemanticSegData(val_output_path, train_to_test_ratio=1)

loss, binary_accuracy, precision, recall, f1 = model.eval(val_dataset.train_set)
# The loss is reported as a plain number: unlike the metrics below it is not
# scaled by 100, so a percent sign would be misleading.
print('loss = %.4f' % loss)
print('Test binary_accuracy rate = %.1f%%' % (binary_accuracy * 100))
print('Test precision rate = %.1f%%' % (precision * 100))
print('Test recall rate = %.1f%%' % (recall * 100))
print('Test f1 rate = %.1f%%' % (f1 * 100))
