# Hugging Face Albert
>HuggingFace 라이브러리를 사용하여 한국어 언어 모델(ALBERT)을 처음부터 직접 훈련하는 방법입니다.

 - Albert Scratch Train
 - Albert는 Sentence Piece tokenizer를 사용합니다.
 - 전처리가 완료된 데이터셋을 불러와 Tokenizer에 적용합니다.

# Load the essential modules & download the files

In [None]:
# Clone the Hugging Face transformers repository and check out a fixed commit;
# later cells install the package from this clone and run its example script.
!git clone https://github.com/huggingface/transformers \
&& cd transformers \
&& git checkout a3085020ed0d81d4903c50967687192e3101e770  

In [None]:
# Install transformers from the local clone, plus tensorboardX and datasets.
!pip install ./transformers
!pip install tensorboardX
!pip install datasets

In [None]:
# Create ./kor and download the KorQuAD v1.0 train and dev JSON files into it.
# Because the steps are chained with `&&`, nothing is downloaded when `mkdir`
# fails (for example when ./kor already exists from an earlier run).
!mkdir kor \
&& cd kor \
&& wget https://korquad.github.io/dataset/KorQuAD_v1.0_train.json \
&& wget https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json

# Preprocessing

## ALBERT needs a SentencePiece tokenizer.

In [None]:
import sentencepiece as spm

# Train a SentencePiece model with a 10,000-entry vocabulary on the corpus
# text file. The output files are named after `model_prefix`
# (spiece.model / spiece.vocab); a later cell moves them into kor_model/.
spm_train_args = {
    'input': '/content/drive/MyDrive/Colab_Notebooks/NLP/datasets/wiki_space_tokenizer.txt',
    'model_prefix': 'spiece',
    'vocab_size': 10000,
}
spm.SentencePieceTrainer.train(**spm_train_args)

In [None]:
import os

# Collect the SentencePiece artefacts in ./kor_model (the tokenizer is loaded
# later from /content/kor_model, which assumes the working directory is
# /content). `exist_ok=True` keeps the cell re-runnable when the directory
# already exists, while genuine failures such as a permission error now
# surface instead of being hidden behind a bare `except:`.
os.makedirs('kor_model', exist_ok=True)

# Move the trained model and vocabulary out of the working directory.
os.rename('spiece.model', 'kor_model/spiece.model')
os.rename('spiece.vocab', 'kor_model/spiece.vocab')

## Load pre-trained tokenizer.

In [None]:
from transformers import *

In [None]:
# Load the files in /content/kor_model through AlbertTokenizer, the
# ALBERT-specific tokenizer class, instead of using the SentencePiece model
# directly. The resulting `tokenizer` is reused by the following cells.
tokenizer = AlbertTokenizer.from_pretrained("/content/kor_model")

In [None]:
# Round-trip check: encode a Korean sentence into token ids, then decode the
# ids back to text; the decoded string is displayed as the cell output.
op = tokenizer.encode("멤버십 만료일은 2021년 입니다.")
tokenizer.decode(op)

In [None]:
# Read the vocabulary size from the tokenizer and display it; `vocab_size` is
# reused below when building the model configuration.
vocab_size=tokenizer.vocab_size
vocab_size

## Export the model & tokenizer configuration settings to JSON.

In [None]:
import json

# Model configuration for the ALBERT network. `vocab_size` is the value read
# from the tokenizer in an earlier cell.
config = dict(
    architectures=["AlbertModel"],
    attention_probs_dropout_prob=0.1,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    layer_norm_eps=1e-05,
    max_position_embeddings=512,
    model_type="albert",
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    vocab_size=vocab_size,
)

# Tokenizer configuration. Note that lower-casing is disabled and accents are
# kept (do_lower_case=False, keep_accents=True).
tokenizer_config = dict(
    max_len=512,
    model_type="albert",
    do_lower_case=False,
    keep_accents=True,
)

# Serialize both configurations as JSON files inside /content/kor_model.
for json_path, payload in (
    ("/content/kor_model/config.json", config),
    ("/content/kor_model/tokenizer_config.json", tokenizer_config),
):
    with open(json_path, 'w') as fp:
        json.dump(payload, fp)

In [None]:
# Import `gc` and `torch` explicitly: the only earlier import that could bring
# them into scope is the wildcard `from transformers import *`, which should
# not be relied on to provide these names.
import gc

import torch

# Release the GPU memory cached by PyTorch and run a garbage-collection pass
# before the training command is launched.
torch.cuda.empty_cache()
gc.collect()

# Train Step

In [None]:
# Run the transformers example script run_squad.py with --model_type albert,
# training and evaluating on the KorQuAD v1.0 files in /content/kor and
# writing results to the Drive output directory. The '#'-prefixed lines at the
# end are optional flags that are currently disabled.
!export SQUAD_DIR=/content/kor \
&& python transformers/examples/run_squad.py \
  --model_type albert \
  --model_name_or_path /content/drive/MyDrive/albert_model \
  --output_dir /content/drive/MyDrive/Colab_Notebooks/NLP/ALBERT/QnA \
  --config_name /content/drive/MyDrive/albert_model \
  --tokenizer_name /content/drive/MyDrive/albert_model \
  --do_train \
  --do_eval \
  --train_file $SQUAD_DIR/KorQuAD_v1.0_train.json \
  --predict_file $SQUAD_DIR/KorQuAD_v1.0_dev.json \
  --learning_rate 3e-5 \
  --num_train_epochs 0.1 \
  --max_seq_length 512 \
  --doc_stride 128 \
#  --overwrite_output_dir \
#  --save_steps 1000 \
#  --max_answer_length 30
#  --per_gpu_train_batch_size 12 \
#  --threads 4 \
#  --version_2_with_negative 

In [None]:
# Import `gc` and `torch` explicitly: the only earlier import that could bring
# them into scope is the wildcard `from transformers import *`, which should
# not be relied on to provide these names.
import gc

import torch

# Release the GPU memory cached by PyTorch and run a garbage-collection pass
# once the training command has finished.
torch.cuda.empty_cache()
gc.collect()

# Loading the pre-trained model

## Load

In [None]:
from transformers import AlbertTokenizer, AlbertModel
# Load the tokenizer from Drive and re-save it into the `my_albert` directory,
# the same directory the model weights are saved to in a later cell.
# NOTE(review): earlier cells write the tokenizer files to /content/kor_model;
# confirm they were copied to /content/drive/MyDrive/kor_model before this runs.
atokenizer = AlbertTokenizer.from_pretrained("/content/drive/MyDrive/kor_model")
atokenizer.save_pretrained("/content/drive/MyDrive/my_albert")

In [None]:
# Round-trip check with the Drive tokenizer: encode a two-sentence Korean
# input into token ids and print the text decoded from those ids.
op = atokenizer.encode("멤버십 만료일은 2021년 입니다. 다음 시간에 이용해주세요.")
print(atokenizer.decode(op))

In [None]:
# Load the weights from an intermediate checkpoint (step 14000, judging by the
# directory name) because the model was not trained for long, then save them
# into `my_albert` alongside the tokenizer files.
model = AlbertModel.from_pretrained("/content/drive/MyDrive/albert_model/checkpoint-14000")
model.save_pretrained("/content/drive/MyDrive/my_albert")

## Test

In [None]:
# Reload the tokenizer from the exported `my_albert` directory; this rebinds
# `tokenizer` for the test cells below.
tokenizer = AlbertTokenizer.from_pretrained("/content/drive/MyDrive/my_albert")

In [None]:
# Encode the test sentence into token ids; `op` is used by the cells below.
txt = "멤버십 만료일은 2021년 입니다."
op = tokenizer.encode(txt)

In [None]:
# Display the token ids to see how the sentence was tokenized.
op

In [None]:
# Decode the first five ids and the remaining ids separately; the resulting
# pair of strings is displayed as the cell output.
tokenizer.decode(op[:5]), tokenizer.decode(op[5:])

In [None]:
# Imported here so the cell does not depend on the wildcard transformers
# import to provide `torch`.
import torch

# The model takes input ids shaped (batch_size, sequence_length), so the batch
# dimension is added at position 0. `unsqueeze(1)` would instead build a
# (sequence_length, 1) tensor, i.e. one single-token sequence per token, and
# the tokens of the sentence would never be processed together.
ps = model(torch.tensor(op).unsqueeze(0))

In [None]:
# Print the shape of the first element of the model output
# (presumably the last hidden state; confirm against the AlbertModel docs).
print(ps[0].shape)