In [2]:
import sentencepiece as spm
import os

In [4]:
# Directory holding the input data, and the curated-names file inside it.
DATA_DIR = '../data'
curated_names_unique_file = os.path.join(DATA_DIR, 'curated_names_unique')

# Report (without failing) whether the input file is present on disk.
print(
    "File exists!"
    if os.path.exists(curated_names_unique_file)
    else "File does not exist."
)



File exists!


In [6]:
# --- Train the SentencePiece model on the curated names file ---
# Training writes its artifacts using the 'curated_names' prefix; the
# '.model' file is the one loaded below.
spm.SentencePieceTrainer.train(
    input=curated_names_unique_file,
    model_prefix='curated_names',
    vocab_size=7000,
    character_coverage=0.9995,
)

# --- Load the trained model ---
sp = spm.SentencePieceProcessor(model_file='curated_names.model')

# --- Build the piece -> id lookup table ---
# Number of pieces the loaded model actually contains.
vocab_size = sp.get_piece_size()

# Map every piece string to its integer id.
vocab = dict((sp.id_to_piece(piece_id), piece_id) for piece_id in range(vocab_size))

# Function to convert a protein name into a target vector
def name_to_target_vector(name: str) -> list:
    """Convert a protein name into its list of SentencePiece ids.

    The name is tokenized with the module-level processor ``sp`` and the
    integer id of each resulting piece is returned in order.

    Args:
        name: The protein name to encode.

    Returns:
        A list of integer piece ids. Pieces the model does not know are
        reported by the processor as its unknown id.
    """
    # Ask the processor for ids directly instead of encoding to strings and
    # re-mapping each piece through a hand-built dict with an '<unk>' fallback;
    # the library already performs that lookup (including unknown handling).
    return sp.encode(name, out_type=int)

# --- Tokenize a handful of sample names ---
# Example protein names used to eyeball the tokenizer's output.
protein_names = [
    'HAD family hydrolase',
    'thrB family protein',
    'PLN02365 family protein',
    'FixB family protein',
    'Phage_GP20 domain-containing protein',
    'PRK08188 family protein',
    'PurU family protein',
    'bifunctional 3,4-dihydroxy-2-butanone-4-phosphate synthase/GTP cyclohydrolase II',
    'MutS family DNA mismatch repair protein',
    'RAN domain-containing protein',
]

# For each name, show the name, its subword pieces, and the matching id vector.
for protein_name in protein_names:
    subwords = sp.encode(protein_name, out_type=str)
    vec = name_to_target_vector(protein_name)
    print(f"{protein_name} {subwords} {vec}")

HAD family hydrolase ['▁HAD', '▁family', '▁hydrolase'] [740, 5, 82]
thrB family protein ['▁thr', 'B', '▁family', '▁protein'] [4530, 35, 5, 3]
PLN02365 family protein ['▁PL', 'N', '02', '365', '▁family', '▁protein'] [3135, 74, 1030, 5555, 5, 3]
FixB family protein ['▁Fix', 'B', '▁family', '▁protein'] [3757, 35, 5, 3]
Phage_GP20 domain-containing protein ['▁P', 'hage', '_', 'GP', '20', '▁domain', '-', 'containing', '▁protein'] [103, 5468, 0, 2472, 158, 7, 4, 6, 3]
PRK08188 family protein ['▁PRK', '08', '188', '▁family', '▁protein'] [2798, 2124, 1741, 5, 3]
PurU family protein ['▁Pur', 'U', '▁family', '▁protein'] [2375, 290, 5, 3]
bifunctional 3,4-dihydroxy-2-butanone-4-phosphate synthase/GTP cyclohydrolase II ['▁bi', 'f', 'unction', 'al', '▁3,4-', 'dihydroxy', '-2-', 'butanone', '-4-', 'phosphate', '▁synthase', '/', 'GTP', '▁cyclohydrolase', '▁II'] [48, 38, 50, 17, 1099, 1295, 348, 2880, 345, 62, 21, 10, 1175, 670, 55]
MutS family DNA mismatch repair protein ['▁MutS', '▁family', '▁DNA', 

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/curated_names_unique
  input_format: 
  model_prefix: curated_names
  model_type: UNIGRAM
  vocab_size: 7000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differenti