In [1]:
include("./XLNet.jl")
using PyCall
using DelimitedFiles
using JLD2
using Knet
using Random
using .XLNet

│ Some functionality might not work. For a fully-supported set-up, please use an older version of CUDA.jl
└ @ CUDA /home/ec2-user/.julia/packages/CUDA/gKMm0/src/state.jl:251


## Set Hyperparameter

In [2]:
hparams = xlnet_base_hparams #xlnet_base_params are defined inside XLNet Module

#Set sequance length and batch size accoring to your GPU
SEQ_LEN = 340
BATCH_SIZE = 1

#this hparam stands for how many layers to freeze from beggining.
#You may play with this hparam according to your GPU memory. (There are tottaly 12 layers)
hparams["n_freeze"] = 6

6

## Prepare Dataset

For this example, we will work on sentiment classification on IMDB dataset (https://ai.stanford.edu/~amaas/data/sentiment/). Dataset can be downloaded as follows:

```$ wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz```  
```$ tar -xf aclImdb_v1.tar.gz```

In [3]:
dataset_path = "/home/ec2-user/aclImdb"

#Read Train Data
pos_train_files = readdir(dataset_path * "/train/pos")
neg_train_files = readdir(dataset_path * "/train/neg")
n_train = length(pos_train_files) + length(neg_train_files)
n_classes = 2 #classes
xtrn,ytrn = [],[]

for filename in pos_train_files
    s = open( dataset_path * "/train/pos/" * filename) do file; read(file, String); end
    push!(xtrn,s)
    push!(ytrn,"pos")
end

for filename in neg_train_files
    s = open( dataset_path * "/train/neg/" * filename) do file; read(file, String); end
    push!(xtrn,s)
    push!(ytrn,"neg")
end

#Read Test Data
pos_test_files = readdir(dataset_path * "/test/pos")
neg_test_files = readdir(dataset_path * "/test/neg")
n_train = length(pos_test_files) + length(neg_test_files)
xtst,ytst = [],[]

for filename in pos_test_files
    s = open( dataset_path * "/test/pos/" * filename) do file; read(file, String); end
    push!(xtst,s)
    push!(ytst,"pos")
end

for filename in neg_test_files
    s = open( dataset_path * "/test/neg/" * filename) do file; read(file, String); end
    push!(xtst,s)
    push!(ytst,"neg")
end


In [4]:
#=
This function converts given text sample to token ids, adjust its length based on the sequance length and
adds special tokens at the end of the sample.
=#

function prepare_sample( text, seq_len, sp )
  input_ids = sp.encode_as_ids( text )
  input_length = length(input_ids)

  if( input_length > seq_len - 2 ); input_length = seq_len - 2; end
  input_ids = input_ids[1:input_length]
  
  push!(input_ids, specaialTokens["<SEP>"] )
  push!(input_ids, specaialTokens["<CLS>"] )

  attn_mask = zeros(Int32, seq_len)
  padded_input_ids = zeros(Int32, seq_len)
  
  attn_mask[1:input_length + 2] .= 1
  attn_mask = 1 .- attn_mask
  padded_input_ids[1:input_length + 2] = input_ids
  (padded_input_ids, attn_mask)
end


prepare_sample (generic function with 1 method)

In [5]:
#We need to use sentencepice tokenizer from pycall
#sentencepiece tokenizer : https://github.com/google/sentencepiece

spm = pyimport("sentencepiece")
sp = spm.SentencePieceProcessor()
sp.load("/home/ec2-user/project/checkpoint/spiece.model")

println("prepearing train data...")
xtrn = [ prepare_sample( x, SEQ_LEN, sp ) for x in progress(xtrn) ]
ytrn = [ y=="pos" ? 2 : 1 for y in progress(ytrn) ]

trn_token_ids = hcat( [x[1] for x in xtrn ]... )
trn_attn_masks= hcat( [x[2] for x in xtrn ]... )
xtrn = cat(trn_token_ids,trn_attn_masks,dims=3)
xtrn = permutedims( xtrn, [1,3,2] )

println("prepearing test data...")
xtst = [ prepare_sample( x, SEQ_LEN, sp ) for x in progress(xtst) ]
ytst = [ y == "pos" ? 2 : 1 for y in progress(ytst) ]

tst_token_ids = hcat( [x[1] for x in xtst ]... )
tst_attn_masks= hcat( [x[2] for x in xtst ]... )
xtst = cat(tst_token_ids,tst_attn_masks,dims=3)
xtst = permutedims( xtst, [1,3,2] )

#Split Validation
order = shuffle( collect(1:n_train) )
xtrn = xtrn[:,:,order]
ytrn = ytrn[order]


nval = 2000
xval = xtrn[:,:,1:nval]
yval = ytrn[1:nval]
xtrn = xtrn[:,:,nval+1:end]
ytrn = ytrn[nval+1:end]

dtrn = minibatch( xtrn, ytrn ,BATCH_SIZE, shuffle=true )
dval = minibatch( xval, yval ,BATCH_SIZE, shuffle=true )
dtst = minibatch( xtst, ytst ,BATCH_SIZE, shuffle=true )

prepearing train data...


┣████████████████████┫ [100.00%, 25000/25000, 00:23/00:23, 1076.48i/s] 
┣████████████████████┫ [100.00%, 25000/25000, 00:00/00:00, 884708.47i/s] 
┣                    ┫ [0.00%, 1/25000, 00:00/00:00, 60830.95i/s] 

prepearing test data...


┣████████████████████┫ [100.00%, 25000/25000, 00:24/00:24, 1053.16i/s] 
┣████████████████████┫ [100.00%, 25000/25000, 00:00/00:00, 866233.82i/s] 


25000-element Knet.Train20.Data{Tuple{Array{Int64,N} where N,Array{Int64,N} where N}}

## Define XLNet Model

To be able to run finetuning, you need to download pretrained weights. I have prepared pretrained weights in JLD2 format.  
## TODO: Add weights and tokenizer files to drive and give link

In [12]:
#Load Weights;
@load "/home/ec2-user/project/checkpoint/weights_base.jld2" weights
model = create_xlnet_model( hparams, weights )
classifier = XLNetClassifier( hparams["d_model"], n_classes , model )
println("classifier model created")

classifier model created


## Training !

Here we will train for 3 epoch, and save the best performing weights on the validation set.

In [None]:
trainer = adam( classifier, dtrn , lr = 1e-5 , eps=1e-8 )
best_acc = 0
for i=1:3
    println("training epoch ",i)
    progress!(trainer)
    acc = acc = accuracy( classifier ,  progress( dval ) )
    if( acc > best_acc )
        best_acc = acc
        save("best.jld2", classifier ) #pretty easy saving :)
    end
end

training epoch 1


┣                    ┫ [0.40%, 92/23000, 08:40/36:05:45, 5.41s/i]  

## Test !

Load the saved weights  and test on test data.

In [10]:
classifier = XLNetClassifier("best.jld2") #Load model from saved path
acc = accuracy( classifier ,  progress( dtst ) )
println("accuracy : " , acc )

┣                    ┫ [0.00%, 1/25000, 00:00/00:12, 2120.52i/s] 

LoadError: InterruptException:

## Playground