In [1]:
include("./XLNet.jl")
using PyCall
using DelimitedFiles
using JLD2
using Knet
using .XLNet

│ Some functionality might not work. For a fully-supported set-up, please use an older version of CUDA.jl
└ @ CUDA /home/ec2-user/.julia/packages/CUDA/gKMm0/src/state.jl:251


## Set Hyperparameters

In [2]:
# Sequence length and batch size: tune both to what your GPU memory allows.
BATCH_SIZE = 4
SEQ_LEN = 340

# Start from the base hyperparameters defined inside the XLNet module.
hparams = xlnet_base_hparams

# Number of layers to freeze, counted from the beginning of the network
# (there are 12 layers in total). Adjust it according to your GPU memory.
hparams["n_freeze"] = 6

6

## Prepare Dataset

For this example, we will do sentiment classification on the IMDB dataset (https://ai.stanford.edu/~amaas/data/sentiment/). The dataset can be downloaded as follows:

```$ wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz```  
```$ tar -xf aclImdb_v1.tar.gz```

In [3]:
dataset_path = "/home/ec2-user/aclImdb"
n_classes = 2

#=
Read every file listed in `filenames` (names relative to `dir`) as a String,
appending the text to `texts` and the given `label` to `labels`.
Both `texts` and `labels` are mutated in place; nothing is returned.
=#
function append_reviews!( texts, labels, dir, filenames, label )
    for filename in filenames
        # read(path, String) opens, reads and closes the file in one call
        push!(texts, read(joinpath(dir, filename), String))
        push!(labels, label)
    end
    return nothing
end

#Read Train Data
train_pos_dir = joinpath(dataset_path, "train", "pos")
train_neg_dir = joinpath(dataset_path, "train", "neg")
pos_train_files = readdir(train_pos_dir)
neg_train_files = readdir(train_neg_dir)
n_train = length(pos_train_files) + length(neg_train_files)

# Typed containers instead of `[]` (Vector{Any}); positives first, then negatives.
xtrn, ytrn = String[], String[]
append_reviews!(xtrn, ytrn, train_pos_dir, pos_train_files, "pos")
append_reviews!(xtrn, ytrn, train_neg_dir, neg_train_files, "neg")

#Read Test Data
test_pos_dir = joinpath(dataset_path, "test", "pos")
test_neg_dir = joinpath(dataset_path, "test", "neg")
pos_test_files = readdir(test_pos_dir)
neg_test_files = readdir(test_neg_dir)
# The test-set size gets its own variable; it must not overwrite `n_train`.
n_test = length(pos_test_files) + length(neg_test_files)

xtst, ytst = String[], String[]
append_reviews!(xtst, ytst, test_pos_dir, pos_test_files, "pos")
append_reviews!(xtst, ytst, test_neg_dir, neg_test_files, "neg")

In [4]:
#=
Turn one text sample into a fixed-length model input.

The text is tokenized with `sp.encode_as_ids`, cut down to at most `seq_len - 2`
ids, and the <SEP> and <CLS> special tokens are appended (in that order).
The ids are then right-padded with zeros up to `seq_len`.

Returns the tuple (padded_input_ids, attn_mask), where `attn_mask` holds 0 at
every position occupied by a real or special token and 1 at every padded position.
=#

function prepare_sample( text, seq_len, sp )
  token_ids = sp.encode_as_ids( text )

  # Reserve two slots for the special tokens appended below.
  n_raw = length(token_ids)
  n_keep = n_raw > seq_len - 2 ? seq_len - 2 : n_raw
  token_ids = token_ids[1:n_keep]

  push!(token_ids, specaialTokens["<SEP>"] )
  push!(token_ids, specaialTokens["<CLS>"] )
  n_used = n_keep + 2

  padded_input_ids = zeros(Int32, seq_len)
  padded_input_ids[1:n_used] = token_ids

  # Built directly in its final form: 1 everywhere, 0 over the used prefix.
  # Element type is Int (not Int32) on purpose, matching the promotion that
  # `1 .- <Int32 array>` produces.
  attn_mask = ones(Int, seq_len)
  attn_mask[1:n_used] .= 0

  return (padded_input_ids, attn_mask)
end

prepare_sample (generic function with 1 method)

In [5]:
# The tokenizer is the Python SentencePiece library, accessed through PyCall.
# sentencepiece tokenizer : https://github.com/google/sentencepiece

spm = pyimport("sentencepiece")
sp = spm.SentencePieceProcessor()
sp.load("/home/ec2-user/project/checkpoint/spiece.model")

# Train split: encode every text, and map labels to class ids ("pos" -> 2, otherwise 1).
println("prepearing train data...")
xtrn = [ prepare_sample( text, SEQ_LEN, sp ) for text in progress(xtrn) ]
ytrn = [ label == "pos" ? 2 : 1 for label in progress(ytrn) ]

# Split the (ids, mask) tuples, then stack them as a 2 x N matrix of vectors:
# row 1 holds the token ids, row 2 the attention masks.
trn_token_ids  = first.(xtrn)
trn_attn_masks = last.(xtrn)
xtrn = permutedims( hcat(trn_token_ids, trn_attn_masks) )

# Test split: same processing as the train split.
println("prepearing test data...")
xtst = [ prepare_sample( text, SEQ_LEN, sp ) for text in progress(xtst) ]
ytst = [ label == "pos" ? 2 : 1 for label in progress(ytst) ]

tst_token_ids  = first.(xtst)
tst_attn_masks = last.(xtst)
xtst = permutedims( hcat(tst_token_ids, tst_attn_masks) )

# Shuffled minibatch iterators over both splits.
dtrn = minibatch( xtrn, ytrn, BATCH_SIZE, shuffle=true )
dtst = minibatch( xtst, ytst, BATCH_SIZE, shuffle=true )

prepearing train data...


┣████████████████████┫ [100.00%, 25000/25000, 00:25/00:25, 983.28i/s] 
┣████████████████████┫ [100.00%, 25000/25000, 00:00/00:00, 857608.31i/s] 
┣                    ┫ [0.00%, 1/25000, 00:00/00:00, 56837.56i/s] 

prepearing test data...


┣████████████████████┫ [100.00%, 25000/25000, 00:26/00:26, 964.06i/s] 
┣████████████████████┫ [100.00%, 25000/25000, 00:00/00:00, 838001.59i/s] 


6250-element Knet.Train20.Data{Tuple{Array{Array{Int64,1},N} where N,Array{Int64,N} where N}}

## Define XLNet Model

To run fine-tuning, you first need to download the pretrained weights. I have prepared the pretrained weights in JLD2 format.  
**TODO:** Add the weights and tokenizer files to Drive and give the link here.

In [6]:
# Load the pretrained weights: `@load` reads the variable named `weights`
# from the JLD2 checkpoint file and binds it in the current scope.
@load "/home/ec2-user/project/checkpoint/weights_base.jld2" weights
# Evaluated last so the notebook displays the loaded value.
weights

Dict{Any,Any} with 18 entries:
  "layer_0"  => Dict{Any,Any}("ff"=>Dict{Any,Any}("layer_1"=>Dict{Any,Any}("ker…
  "layer_5"  => Dict{Any,Any}("ff"=>Dict{Any,Any}("layer_1"=>Dict{Any,Any}("ker…
  "layer_1"  => Dict{Any,Any}("ff"=>Dict{Any,Any}("layer_1"=>Dict{Any,Any}("ker…
  "layer_11" => Dict{Any,Any}("ff"=>Dict{Any,Any}("layer_1"=>Dict{Any,Any}("ker…
  "layer_4"  => Dict{Any,Any}("ff"=>Dict{Any,Any}("layer_1"=>Dict{Any,Any}("ker…
  "layer_9"  => Dict{Any,Any}("ff"=>Dict{Any,Any}("layer_1"=>Dict{Any,Any}("ker…
  "mask_emb" => Float32[1.0, 0.0, 1.0]
  "seg_emb"  => Float32[-0.089485 0.00394292; -0.0242453 0.0256528; … ; -0.0594…
  "layer_7"  => Dict{Any,Any}("ff"=>Dict{Any,Any}("layer_1"=>Dict{Any,Any}("ker…
  "r_r_bias" => Float32[-0.0875984 0.117815 … -0.0863528 0.0588675; 0.326463 0.…
  "layer_8"  => Dict{Any,Any}("ff"=>Dict{Any,Any}("layer_1"=>Dict{Any,Any}("ker…
  "layer_6"  => Dict{Any,Any}("ff"=>Dict{Any,Any}("layer_1"=>Dict{Any,Any}("ker…
  "r_w_bias" => Float32[-0.0260591 0.07

In [7]:
# Build the XLNet model from the hyperparameters and the loaded pretrained weights.
model = create_xlnet_model( hparams, weights )
# Wrap the model in a classifier; presumably `hparams["d_model"]` is the input size
# and `n_classes` the output size of the classification head (confirm in XLNet.jl).
classifier = XLNetClassifier( hparams["d_model"], n_classes , model )
println("classifier model created")

classifier model created


In [None]:
# Compute the classifier's accuracy over the test minibatches; `progress` wraps
# the data iterator so a progress bar is shown while it is consumed.
# NOTE(review): no training step is visible before this cell in this view, so this
# presumably measures the classifier before fine-tuning. Confirm.
acc = accuracy( classifier ,  progress( dtst ) )
println("accuracy : " , acc)

└ @ Knet.Ops20 /home/ec2-user/.julia/packages/Knet/C0PoK/src/ops20/loss.jl:205
┣▎                   ┫ [1.86%, 116/6250, 17:06/15:21:43, 8.52s/i] 