# 0. Import

In [None]:
! pip install datasets
! pip install transformers
! pip install sentencepiece

In [4]:
import pandas as pd
from datasets import *
from transformers import AlbertTokenizerFast, AlbertConfig, AlbertForMaskedLM
import sentencepiece as sp
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
from tokenizers import *
import torch
import os
import json

In [5]:
# Use the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


# 1. Clean Korean Legal Corpus(CKLC)

## 1-1. Law data

In [None]:
# Load the precedent table and keep only rows with no missing value in any column.
# index_col=False: do not use a CSV column as the index
# (index_col='case_number' was the alternative noted by the author).
df = pd.read_csv('./data/cases(20220521).csv', index_col=False).dropna(how='any')

# Sanity check: every per-column null count should now be 0.
print(df.isnull().sum())
print(">> Number of Korea legal precedents : ", len(df))

case_name            0
case_number          0
date                 0
case_code            0
judgment_issue       0
judgment_summary     0
judgment_contents    0
dtype: int64
>> Number of Korea legal precedents :  62919


## 1-2. Data Sum

In [6]:
# Data_Sum: load the four corpus text files as one dataset.
files = [
    'law_summary.txt',
    'law_issue.txt',
    'law_contents.txt',
    'contract.txt',
]
dataset = load_dataset("text", data_files=files, split="train")

# Hold out 8% of the examples for evaluation.
# The seed is fixed so that a restarted kernel reproduces the same split: the
# model cell resumes from a saved checkpoint, and an unseeded re-split would
# move rows the checkpoint was already trained on into the evaluation set.
data = dataset.train_test_split(test_size=0.08, seed=42)

Using custom data configuration default-4979e5acb8be5700


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-4979e5acb8be5700/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-4979e5acb8be5700/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


In [8]:
# Text Data check: print the first 10 training examples, each followed by a separator line.
# Slice the rows first and then select the column, so only 10 rows are
# materialised instead of the whole 'text' column.
for t in data['train'][:10]['text']:
    print(t)
    print("====" * 50)

1 특허무효심판을 청구할 수 있는 이해관계인의 의미와 그 범위 2 명칭이 반도체디바이스 시험장치인 특허발명의 특허청구범위 제2항 제3항 제21항 제23항 제24항의 구성과 같은 형태의 물품을 제조 판매하거나 같은 형태의 방법을 실시하고 있지 않지만 특허발명과 같은 종류의 반도체디바이스 시험장치를 제조 판매하거나 같은 방법의 반도체디바이스 검출방법을 실시하고 있는 이상 위 청구항들에 대하여 특허무효심판을 청구할 수 있는 이해관계를 인정할 수 있다고 한 사례 3 명칭이 반도체디바이스 시험장치인 특허발명의 특허청구범위 제1항은 비교대상발명 1 2를 결합하여 쉽게 발명할 수 있어 진보성이 부정되고 위 제1항의 진보성이 인정됨을 전제로 이를 인용하고 있는 특허청구범위 제4항 내지 제10항 등 또한 진보성이 부정된다고 한 사례
국고수표 발행 직무담당 분임출납관의 지출원인 없는 국고 수표발행 행위도 직무집행 행위로 보아야 한다
"【피 고 인】 【상 고 인】   피고인【변 호 인】   변호사 강장환【원심판결】 육군고등군법회의 1984.2.21. 선고 83고군형항제434호 판결【주    문】  상고를 기각한다.【이    유】  피고인 및 변호인의 각 상고이유를 함께 판단한다.  원심이 유지한 제1심판결 거시의 증거에 의하면,그 판시사실이 적법히 인정되며 거기에 소론과 같이 심리를 다하지 아니하거나 채증법칙에 위배하여 사실을 오인한 위법이 없다.피고인의 원판시 소위가 군내부에서 부하인 방위병들의 훈련중에 그들에게 군인정신을 환기시키기위하여 한 일이라 하더라도 원심이 확정한 바와 같은 감금과 구타행위는 징계권 내지 훈계권의 범위를 넘어선 위법한 감금, 폭행행위가 된다고 보아야 할 것이므로 같은 취지의 원심판결은 정당하고 거기에 소론과 같은 정당한 행위에 관한 법리오해의 위법이 없다.        따라서 상고를 기각하기로 하여 관여 법관의 일치된 의견으로 주문과 같이 판결한다.대법관 정태균(재판장) 윤일영 김덕주 오성환"
피해자의 과실 유무에 대하여 직권으로 증거조사를 하지 않은 

# 2. Tokenizer(Make Vocab)
- Byte-Pair Encoding (BPE): RoBERTa
- WordPiece: BERT, DistilBERT
- SentencePiece: ALBERT, XLNet, T5

In [None]:
def dataset_to_text(dataset, output_filename='data.txt'):
    """Write the ``text`` column of ``dataset`` to a file, one entry per line.

    Args:
        dataset: Object whose ``['text']`` lookup yields an iterable of
            entries to write (here, a split of the loaded text dataset).
        output_filename: Destination path; an existing file is overwritten.
    """
    # The corpus is Korean, so pin UTF-8 rather than relying on the platform's
    # default text encoding, which is not UTF-8 everywhere.
    with open(output_filename, "w", encoding="utf-8") as f:
        for t in dataset['text']:
            print(t, file=f)

# Save the train / test splits as plain-text files (one example per line);
# these are the files later handed to the SentencePiece trainer.
for split_name, out_path in (("train", "train_3.txt"), ("test", "test_3.txt")):
    dataset_to_text(data[split_name], out_path)

In [9]:
# Parameters shared by the tokenizer, preprocessing and model cells.
max_length = 512                       # tokenizer limit, and chunk size when texts are grouped
vocab_size = 30522                     # target vocabulary size (tokenizer training and model config)
truncate_longer_samples = False        # False: keep full texts (no truncation) and chunk them later
files = ["train_3.txt", "test_3.txt"]  # plain-text inputs for tokenizer training

In [None]:
# Train the SentencePiece model on the dumped text files.
# 10 vocabulary slots are held back for special tokens, hence vocab_size - 10.
sp.SentencePieceTrainer.train(
    input=files,
    model_prefix='spiece',
    vocab_size=vocab_size - 10,
)

In [None]:
# Directory that holds both the tokenizer files and the model checkpoints.
model_path = "pretrained-KorLawAlBERT_2"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(model_path, exist_ok=True)

# Move the trained SentencePiece artifacts from the working directory into model_path.
# os.replace overwrites an existing destination on every platform, and the
# destination is derived from model_path instead of repeating the literal.
for artifact in ("spiece.model", "spiece.vocab"):
    os.replace(artifact, os.path.join(model_path, artifact))

In [10]:
# Directory containing the trained spiece.model (redefined here so this cell
# also works when the tokenizer-training cells are skipped).
model_path = "pretrained-KorLawAlBERT_2"

# `model_max_length` is the documented name of this setting; the previously
# used `max_len` is only kept by the library as a legacy alias.
tokenizer = AlbertTokenizerFast.from_pretrained(model_path, model_max_length=max_length)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 3. Tokenizing the Dataset

In [None]:
def encode_with_truncation(examples):
    """Tokenize a batch of texts, truncating and padding each one to `max_length`.

    `examples` is a batch mapping with a "text" entry; the tokenizer output
    (including the special-tokens mask) is returned unchanged.
    """
    tokenizer_kwargs = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_special_tokens_mask=True,
    )
    return tokenizer(examples["text"], **tokenizer_kwargs)

def encode_without_truncation(examples):
    """Tokenize a batch of texts without passing any truncation or padding options.

    `examples` is a batch mapping with a "text" entry; the tokenizer output
    (including the special-tokens mask) is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, return_special_tokens_mask=True)


# 1.Encoding : The encode function will depend on the truncate_longer_samples variable
encode = encode_with_truncation if truncate_longer_samples else encode_without_truncation

# 2. Tokenizing the Train & Test Dataset 
train_dataset = data['train'].map(encode, batched=True)
test_dataset = data['test'].map(encode, batched=True)

if truncate_longer_samples:
  # remove other columns and set input_ids and attention_mask as 
  train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
  test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
else:
  test_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
  train_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

train_dataset, test_dataset

In [None]:
# 3. Main data processing function that will concatenate all texts from our dataset and generate chunks of max_seq_length.

def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-cut it into `max_length`-sized chunks.

    Args:
        examples: Mapping of column name -> list of per-example lists. All
            columns are assumed to have the same total length; the length
            of the first column is used for every column.

    Returns:
        dict with the same keys, each holding a list of chunks. When the
        concatenated batch holds at least `max_length` items, the trailing
        remainder is dropped so every chunk has exactly `max_length` items;
        a shorter batch is returned as one short chunk. An empty mapping
        yields an empty dict.
    """
    # Function-scope import keeps this cell self-contained.
    from itertools import chain

    # 1. Concatenate all texts. chain.from_iterable is linear in the total
    #    length, whereas sum(list_of_lists, []) re-copies the accumulated list
    #    on every addition (quadratic).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    if not concatenated_examples:
        # No columns at all: nothing to chunk.
        return {}
    total_length = len(next(iter(concatenated_examples.values())))

    # 2. Drop the small remainder (only when at least one full chunk exists).
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length

    # 3. Split every column into chunks of max_length.
    return {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }

# With batched=True and batch_size=2000, each call hands group_texts 2,000 tokenized
# texts at once, so the leftover shorter than max_length is THROWN AWAY once per
# group of 2,000 texts.
if not truncate_longer_samples:
    grouping_options = dict(
        batched=True,
        batch_size=2000,
        desc=f"Grouping texts in chunks of {max_length}",
    )
    train_dataset = train_dataset.map(group_texts, **grouping_options)
    test_dataset = test_dataset.map(group_texts, **grouping_options)

# 4. Loading the Model

In [13]:
# 1. Architecture description. embedding_size is not passed, so the AlbertConfig
#    default applies (the original note: embedding_size = 128, vs BERT / RoBERTa = 768).
model_config = AlbertConfig(
    vocab_size=vocab_size,
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
)

# 2. Resume from the saved checkpoint when it exists; otherwise build a fresh,
#    randomly initialised model from model_config. Previously the config was
#    created but never used, and the cell failed outright when the checkpoint
#    directory was missing.
checkpoint_dir = os.path.join(model_path, "checkpoint-16000")
if os.path.isdir(checkpoint_dir):
    model = AlbertForMaskedLM.from_pretrained(checkpoint_dir)
else:
    model = AlbertForMaskedLM(model_config)

# 5. Pre-Training(ALBERT Light MLM Task)

In [14]:
# Collator for the masked-language-modelling objective: mlm_probability=0.2,
# i.e. 20% of the tokens are randomly selected for masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,
)

training_args = TrainingArguments(
    # --- where checkpoints go ---
    output_dir=model_path,            # checkpoints are written under this directory
    overwrite_output_dir=True,
    # --- schedule ---
    num_train_epochs=10,              # number of passes over the training set
    # --- batch sizes ---
    per_device_train_batch_size=10,   # raise as far as GPU memory allows
    gradient_accumulation_steps=8,    # 10 x 8 = 80 samples per weight update
    per_device_eval_batch_size=32,
    # --- evaluate / log / save cadence ---
    evaluation_strategy="steps",      # evaluate every `logging_steps` steps
    logging_steps=1000,
    save_steps=1000,
    # Optional settings left disabled by the author:
    # load_best_model_at_end=True,  # reload the lowest-loss checkpoint when training ends
    # save_total_limit=3,           # keep only 3 checkpoints on disk
)

In [15]:
# Hand the model, training arguments, MLM collator and both dataset splits to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [None]:
# Train the model (MLM pre-training) with the Trainer built in the previous cell.
# NOTE(review): the original note here said "110,000 Steps", but the logged run
# reports "Total optimization steps = 25630" — confirm which figure is intended.
# NOTE(review): the weights are loaded from checkpoint-16000, yet the log shows
# checkpoint-1000 being written, so step counting restarts at 0 on each run —
# confirm this is intended.
trainer.train()

The following columns in the training set don't have a corresponding argument in `AlbertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `AlbertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 205081
  Num Epochs = 10
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 8
  Total optimization steps = 25630


Step,Training Loss,Validation Loss
1000,0.5797,0.568688
2000,0.5728,0.554168
3000,0.5622,0.555608
4000,0.5555,0.546256
5000,0.548,0.539434
6000,0.5408,0.528284
7000,0.5339,0.525289
8000,0.5257,0.512149
9000,0.5188,0.515815
10000,0.5133,0.499341


The following columns in the evaluation set don't have a corresponding argument in `AlbertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `AlbertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17618
  Batch size = 32
Saving model checkpoint to pretrained-KorLawAlBERT_2/checkpoint-1000
Configuration saved in pretrained-KorLawAlBERT_2/checkpoint-1000/config.json
Model weights saved in pretrained-KorLawAlBERT_2/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `AlbertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `AlbertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17618
  Batch size = 32
Saving model checkpoint to pretrained-KorLawAlBERT_2/checkpoint-2000
Configuration saved in pretrai