# Generate SentencePiece models

In [1]:
import sentencepiece as spm
import os


## Define paths

In [2]:
# All corpus files are resolved relative to the directory the notebook runs in.
root = os.getcwd()


def _corpus_file(domain, split, filename):
    """Return <root>/GYAFC_Corpus/<domain>/<split>/<filename>."""
    return os.path.join(root, 'GYAFC_Corpus', domain, split, filename)


# Family_Relationships domain: train files, then the raw and .ref0 test files.
FR_informal_path = _corpus_file('Family_Relationships', 'train', 'informal')
FR_formal_path = _corpus_file('Family_Relationships', 'train', 'formal')
FR_test_raw_informal_path = _corpus_file('Family_Relationships', 'test', 'informal')
FR_test_raw_formal_path = _corpus_file('Family_Relationships', 'test', 'formal')
FR_test_ref0_formal_path = _corpus_file('Family_Relationships', 'test', 'formal.ref0')
FR_test_ref0_informal_path = _corpus_file('Family_Relationships', 'test', 'informal.ref0')

# Entertainment_Music domain: same layout as above.
EM_informal_path = _corpus_file('Entertainment_Music', 'train', 'informal')
EM_formal_path = _corpus_file('Entertainment_Music', 'train', 'formal')
EM_test_raw_informal_path = _corpus_file('Entertainment_Music', 'test', 'informal')
EM_test_raw_formal_path = _corpus_file('Entertainment_Music', 'test', 'formal')
EM_test_ref0_informal_path = _corpus_file('Entertainment_Music', 'test', 'informal.ref0')
EM_test_ref0_formal_path = _corpus_file('Entertainment_Music', 'test', 'formal.ref0')

## Make small model

In [3]:
VOCAB_SIZE_small = 16000
model_name_small = f"FRtraintest{VOCAB_SIZE_small}"

# The small tokenizer is trained on every Family_Relationships file defined in
# the paths cell (train and test splits), passed as one comma-separated list.
input_files_small = ",".join([
    FR_informal_path,
    FR_formal_path,
    FR_test_raw_informal_path,
    FR_test_raw_formal_path,
    FR_test_ref0_informal_path,
    FR_test_ref0_formal_path,
])

# One flag per entry so stray punctuation is easy to spot.
# NOTE: the flags are joined with single spaces, so this assumes none of the
# paths contain whitespace.
sp_train_args_small = " ".join([
    f"--input={input_files_small}",
    f"--model_prefix={model_name_small}",
    f"--vocab_size={VOCAB_SIZE_small}",
    # Fixed ids for the special tokens.
    "--pad_id=0",
    "--unk_id=1",
    "--bos_id=2",
    "--eos_id=3",
    "--pad_piece=[PAD]",
    "--unk_piece=[UNK]",
    "--bos_piece=[BOS]",
    # No trailing comma here: anything after "=" becomes part of the piece
    # name, so "[EOS]," would register the EOS token as "[EOS],".
    "--eos_piece=[EOS]",
    "--normalization_rule_name=nfkc_cf",
])

# Training writes <model_name_small>.model (and .vocab) to the working directory.
spm.SentencePieceTrainer.train(sp_train_args_small)
sp = spm.SentencePieceProcessor()
sp.Load(model_name_small + '.model')

True

In [4]:
# Sanity check: show the ids the loaded model assigned to the special tokens.
special_token_ids = [
    ('bos=', sp.bos_id()),
    ('eos=', sp.eos_id()),
    ('unk=', sp.unk_id()),
    ('pad=', sp.pad_id()),
]
for label, token_id in special_token_ids:
    print(label, token_id)

bos= 2
eos= 3
unk= 1
pad= 0


## Make large model

In [5]:
VOCAB_SIZE_large = 32000
model_name_large = f"fulltraintest{VOCAB_SIZE_large}"

# The large tokenizer is trained on every file of both domains defined in the
# paths cell (Family_Relationships and Entertainment_Music, train and test
# splits), passed as one comma-separated list.
input_files_large = ",".join([
    FR_informal_path,
    FR_formal_path,
    FR_test_raw_informal_path,
    FR_test_raw_formal_path,
    FR_test_ref0_informal_path,
    FR_test_ref0_formal_path,
    EM_informal_path,
    EM_formal_path,
    EM_test_raw_informal_path,
    EM_test_raw_formal_path,
    EM_test_ref0_informal_path,
    EM_test_ref0_formal_path,
])

# One flag per entry so stray punctuation is easy to spot.
# NOTE: the flags are joined with single spaces, so this assumes none of the
# paths contain whitespace.
sp_train_args_large = " ".join([
    f"--input={input_files_large}",
    f"--model_prefix={model_name_large}",
    f"--vocab_size={VOCAB_SIZE_large}",
    # Fixed ids for the special tokens.
    "--pad_id=0",
    "--unk_id=1",
    "--bos_id=2",
    "--eos_id=3",
    "--pad_piece=[PAD]",
    "--unk_piece=[UNK]",
    "--bos_piece=[BOS]",
    # No trailing comma here: anything after "=" becomes part of the piece
    # name, so "[EOS]," would register the EOS token as "[EOS],".
    "--eos_piece=[EOS]",
    "--normalization_rule_name=nfkc_cf",
])

# Training writes <model_name_large>.model (and .vocab) to the working directory.
# `sp` is rebound here, so from this cell on it refers to the large model.
spm.SentencePieceTrainer.train(sp_train_args_large)
sp = spm.SentencePieceProcessor()
sp.Load(model_name_large + '.model')

True

In [6]:
# Sanity check: show the ids the loaded model assigned to the special tokens.
special_token_ids = [
    ('bos=', sp.bos_id()),
    ('eos=', sp.eos_id()),
    ('unk=', sp.unk_id()),
    ('pad=', sp.pad_id()),
]
for label, token_id in special_token_ids:
    print(label, token_id)

bos= 2
eos= 3
unk= 1
pad= 0
