In [1]:
# VERSION SUMMARY

# version 6: small bugfix
# version 5: added example for tokenization and prediction
# version 4: added apex install for mixed precision training 

In [2]:
import numpy as np 
import pandas as pd 
import os
import torch

## Install requirements

0 (optional). install apex for mixed presicion support

In [3]:
!cd ../input/apex-master/apex-master/apex-master/ && pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .

  cmdoptions.check_install_build_global(options)
Processing /kaggle/input/apex-master/apex-master/apex-master
Installing collected packages: apex
  Running setup.py install for apex ... [?25l- \ | / - \ | / - \ | / - \ | / - \ done
[?25hSuccessfully installed apex-0.1


1. pip install pytorch-pretrained-bert without internet

In [4]:
os.system('pip install --no-index --find-links="../input/pytorchpretrainedbert/" pytorch_pretrained_bert')

0

## Import Bert

In [5]:
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

In [6]:
BERT_FP = '../input/torch-bert-weights/bert-base-uncased/bert-base-uncased/'

2. create BERT model and put on GPU

In [7]:
bert = BertModel.from_pretrained(BERT_FP).cuda()
bert.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1)
          )
        )
        (interme

## Setup tokenizer

In [8]:
tokenizer = BertTokenizer(vocab_file='../input/torch-bert-weights/bert-base-uncased-vocab.txt')

## Make prediction

In [9]:
# lets tokenize some text (I intentionally mispelled 'plastic' to check berts subword information handling)
text = 'hi my name is Dieter and I like wearing my yellow pglastic hat while coding.'
tokens = tokenizer.tokenize(text)
tokens

['hi',
 'my',
 'name',
 'is',
 'dieter',
 'and',
 'i',
 'like',
 'wearing',
 'my',
 'yellow',
 'pg',
 '##lastic',
 'hat',
 'while',
 'coding',
 '.']

In [10]:
# added start and end token and convert to ids
tokens = ["[CLS]"] + tokens + ["[SEP]"]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

[101,
 7632,
 2026,
 2171,
 2003,
 27976,
 1998,
 1045,
 2066,
 4147,
 2026,
 3756,
 18720,
 28723,
 6045,
 2096,
 16861,
 1012,
 102]

In [11]:
# put input on gpu and make prediction
bert_output = bert(torch.tensor([input_ids]).cuda())
bert_output

([tensor([[[ 0.0687,  0.0265, -0.2058,  ...,  0.2069, -0.1011, -0.0442],
           [ 0.2943, -0.5316,  0.5222,  ..., -0.4604, -0.2933, -0.8875],
           [ 0.2012,  0.5539, -0.7429,  ..., -1.3243,  0.4854, -0.2764],
           ...,
           [ 1.3986,  0.2449,  0.3170,  ...,  0.8570, -0.0756, -0.0035],
           [-0.1987,  0.2619,  0.0311,  ...,  0.4481, -0.1807,  0.3393],
           [-0.1851,  0.0641, -0.0412,  ..., -0.0486,  0.0755, -0.1101]]],
         device='cuda:0', grad_fn=<FusedLayerNormAffineFunction>),
  tensor([[[-0.0126, -0.2542, -0.3644,  ...,  0.3059,  0.0251, -0.0566],
           [ 0.6743, -0.1612,  1.0591,  ..., -0.0583,  0.0757, -1.3226],
           [ 0.2076,  0.2962, -0.4196,  ..., -0.9514,  0.6205, -0.5414],
           ...,
           [ 1.8828, -0.0887,  0.2906,  ...,  1.2699, -0.0794, -0.2816],
           [-0.2364,  0.0968,  0.2366,  ...,  0.0532, -0.2873,  0.3852],
           [-0.2301, -0.1536,  0.1353,  ..., -0.0651,  0.1175, -0.3021]]],
         device='cuda

## (Optional) Convert model to fp16

In [12]:
import apex
bert.half()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1)
          )
        )
        (interme