In [1]:
import poutyne

import os

from deepparse import download_from_url
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser

In [8]:
# Fetch the pickled training and test samples from the public repository into a local directory.
saving_dir = "/tf/empty_homes_data/deepparse_data"
file_extension = "p"  # extension shared by both dataset files
training_dataset_name = "sample_incomplete_data"
test_dataset_name = "test_sample_data"
# Both splits are downloaded with the same call; later cells read them back from
# <saving_dir>/<dataset name>.<file_extension>.
for dataset_name in (training_dataset_name, test_dataset_name):
    download_from_url(dataset_name, saving_dir, file_extension=file_extension)

In [9]:
# Wrap each downloaded pickle file in a dataset container: one for training, one for testing.
# NOTE: the containers read pickle files; only load pickles that come from a source you trust.
training_data_path = os.path.join(saving_dir, f"{training_dataset_name}.{file_extension}")
test_data_path = os.path.join(saving_dir, f"{test_dataset_name}.{file_extension}")
training_container = PickleDatasetContainer(training_data_path)
test_container = PickleDatasetContainer(test_data_path)

In [4]:
# We will retrain the BPEmb attention version of our pretrained model
# (model_type is "bpemb", not fastText, with the attention mechanism enabled).
model = "bpemb"
# device=0 is forwarded to AddressParser; presumably this selects the first GPU. Confirm for your setup.
address_parser = AddressParser(model_type=model, device=0, attention_mechanism=True)

Loading the embeddings model
downloading https://nlp.h-its.org/bpemb/multi/multi.wiki.bpe.vs100000.model


100%|██████████| 1965223/1965223 [00:00<00:00, 5283216.75B/s]


downloading https://nlp.h-its.org/bpemb/multi/multi.wiki.bpe.vs100000.d300.w2v.bin.tar.gz


100%|██████████| 112202964/112202964 [00:13<00:00, 8608446.26B/s]


Downloading the weights for the network bpemb_attention.


In [12]:
# Print the class documentation to check the data format PickleDatasetContainer expects.
# NOTE(review): exploratory cell; consider removing it from the final notebook.
help(PickleDatasetContainer)

Help on class PickleDatasetContainer in module deepparse.dataset_container.dataset_container:

class PickleDatasetContainer(DatasetContainer)
 |  PickleDatasetContainer(*args, **kwds)
 |  
 |  Pickle dataset container that imports a list of addresses in pickle format and does some validation on it.
 |  
 |  The dataset needs to be a list of tuples where the first element of each tuple is the address (a string),
 |  and the second is a list of the expected tag to predict (e.g. ``[('an address', ['a_tag', 'another_tag']), ...]``).
 |  The len of the tags needs to be the same as the len of the address when whitespace split.
 |  
 |  For a training container, the validation tests applied on the dataset are the following:
 |  
 |      - all addresses are not empty,
 |      - all addresses are not whitespace string,
 |      - all tags are not empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and
 |      - if the addresses (whitespace-split) are the same

In [5]:
# Now, let's retrain for 5 epochs using a batch size of 8 since the data is really small for the example.
# Let's start with the default learning rate of 0.01 and use a learning rate scheduler to lower the learning rate
# as we progress.
# With step_size=1 and gamma=0.1 the rate is multiplied by 0.1 after every epoch, so starting
# from 0.01 it becomes 0.001, 0.0001, ... and the later epochs make only very small updates.
lr_scheduler = poutyne.StepLR(step_size=1, gamma=0.1)  # reduce LR by a factor of 10 each epoch

In [10]:
# Directory where retrain() saves its per-epoch checkpoints. Built with os.path.join
# instead of string concatenation so path separators are handled consistently.
logging_path = os.path.join(saving_dir, "checkpoints21")

# Fine-tune the parser on the training container, holding part of it out for validation.
# train_ratio is passed by keyword on purpose: the meaning of retrain()'s second positional
# parameter differs between deepparse releases, so a bare positional 0.8 could be
# interpreted as something other than the train/validation split ratio.
# The call returns the per-epoch training history, which the cell displays.
address_parser.retrain(
    training_container,
    train_ratio=0.8,
    epochs=5,
    batch_size=8,
    num_workers=2,
    callbacks=[lr_scheduler],
    logging_path=logging_path,
)

Epoch: 1/5 Train steps: 100 Val steps: 25 6.18s loss: 4.841451 accuracy: 85.520933 val_loss: 1.745821 val_accuracy: 91.753353
Epoch 1: val_loss improved from inf to 1.74582, saving file to /tf/empty_homes_data/deepparse_data/checkpoints21/checkpoint_epoch_1.ckpt
Epoch: 2/5 Train steps: 100 Val steps: 25 6.15s loss: 1.477323 accuracy: 94.480761 val_loss: 1.559152 val_accuracy: 92.116948
Epoch 2: val_loss improved from 1.74582 to 1.55915, saving file to /tf/empty_homes_data/deepparse_data/checkpoints21/checkpoint_epoch_2.ckpt
Epoch: 3/5 Train steps: 100 Val steps: 25 6.25s loss: 1.396164 accuracy: 94.783523 val_loss: 1.552267 val_accuracy: 92.205836
Epoch 3: val_loss improved from 1.55915 to 1.55227, saving file to /tf/empty_homes_data/deepparse_data/checkpoints21/checkpoint_epoch_3.ckpt
Epoch: 4/5 Train steps: 100 Val steps: 25 6.30s loss: 1.251943 accuracy: 94.933769 val_loss: 1.551720 val_accuracy: 92.205836
Epoch 4: val_loss improved from 1.55227 to 1.55172, saving file to /tf/empty_

[{'epoch': 1,
  'time': 6.178594456003339,
  'loss': 4.841450645063157,
  'accuracy': 85.52093291700932,
  'val_loss': 1.745821045935154,
  'val_accuracy': 91.75335266113281},
 {'epoch': 2,
  'time': 6.149312863999512,
  'loss': 1.4773228324594951,
  'accuracy': 94.48076105715339,
  'val_loss': 1.5591523513197898,
  'val_accuracy': 92.11694763183594},
 {'epoch': 3,
  'time': 6.2536688369946205,
  'loss': 1.3961643166933442,
  'accuracy': 94.78352298593163,
  'val_loss': 1.552267059981823,
  'val_accuracy': 92.20583648681641},
 {'epoch': 4,
  'time': 6.29782110099768,
  'loss': 1.2519434146713793,
  'accuracy': 94.93376857595037,
  'val_loss': 1.5517196476459503,
  'val_accuracy': 92.20583648681641},
 {'epoch': 5,
  'time': 6.322195367007225,
  'loss': 1.2358837545963757,
  'accuracy': 95.18101277745755,
  'val_loss': 1.5516423231363297,
  'val_accuracy': 92.20583648681641}]

In [11]:
# Now, let's test our fine-tuned model using the best checkpoint (default parameter).
# NOTE(review): no checkpoint argument is passed here; confirm which weights test() evaluates.
# The call returns a dict of metrics (time, test_loss, test_accuracy), which the cell displays.
address_parser.test(test_container, batch_size=256)

Running test
Test steps: 1 0.19s test_loss: 1.189186 test_accuracy: 96.875000                               


{'time': 0.187858019999112,
 'test_loss': 1.1891863346099854,
 'test_accuracy': 96.875}

In [13]:
# Display the container object as a quick check that it exists in the kernel.
# NOTE(review): exploratory leftover; the default repr shows only the class and memory address.
training_container


<deepparse.dataset_container.dataset_container.PickleDatasetContainer at 0x7f49e7b52be0>