In [1]:
import os

from transformers import AutoTokenizer
from collections import Counter
from datasets import load_dataset

# Load the tokenizer for the "meta-llama/Meta-Llama-3-8B" checkpoint; the
# cells below use this object for id/token conversions and for tokenization.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Punctuation or Whitespace

In [None]:
# Vocabulary ids 0 through 255 (inclusive) and their token strings
indices = list(range(256))
tokens = tokenizer.convert_ids_to_tokens(indices)

# Print each id next to its token
for idx, token in zip(indices, tokens):
    print(idx, ":", token)

### Check Special tokens

In [None]:
# Walk the tokenizer's special-token map and show the id of every entry
special_tokens = tokenizer.special_tokens_map

print("Special Tokens:")
for token_name in special_tokens:
    token_value = special_tokens[token_name]
    token_id = tokenizer.convert_tokens_to_ids(token_value)
    print(f"{token_name}: {token_value}, Token ID: {token_id}")


In [None]:
# Build the ids 128000 through 128255 (inclusive)
indices = list(range(128000, 128256))
# Convert the ids to their token strings
tokens = tokenizer.convert_ids_to_tokens(indices)

# Print each id next to its token
for idx, token in zip(indices, tokens):
    print(idx, ":", token)

### Top N

In [2]:
# Load the dataset ("wikitext", configuration "wikitext-2-raw-v1")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Extract the text column of the training split
texts = dataset['train']['text']

# Helper that tallies how often each token occurs
def get_word_frequencies(texts, tokenizer):
    """Count token occurrences across a collection of texts.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Object exposing ``tokenize(text)`` that returns the
            tokens of ``text`` as a sequence of hashable items.

    Returns:
        A ``collections.Counter`` mapping each token to the number of times
        it appeared over all of ``texts``.
    """
    return Counter(
        piece
        for passage in texts
        for piece in tokenizer.tokenize(passage)
    )

# Compute token frequencies over the extracted texts
word_frequencies = get_word_frequencies(texts, tokenizer)



Downloading readme: 100%|██████████| 10.5k/10.5k [00:00<00:00, 40.4MB/s]
Downloading data: 100%|██████████| 733k/733k [00:00<00:00, 1.47MB/s]
Downloading data: 100%|██████████| 6.36M/6.36M [00:01<00:00, 4.27MB/s]
Downloading data: 100%|██████████| 657k/657k [00:00<00:00, 1.68MB/s]
Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 144511.11 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 1402493.91 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 1082549.63 examples/s]


In [3]:
# Extract the TOP_N most frequent tokens and collect their vocabulary ids.
# The count lives in one constant so the label below cannot drift from it.
TOP_N = 3000
top_N_words = word_frequencies.most_common(TOP_N)

top_N_tokens = set()

# Report the number of entries actually returned: most_common() yields fewer
# than TOP_N items when the counter holds fewer distinct tokens.
print(f"Top {len(top_N_words)} most common tokens:")
for token, freq in top_N_words:
    token_id = tokenizer.convert_tokens_to_ids(token)
    top_N_tokens.add(token_id)
    print(f"Token: {token}, Token ID: {token_id}, Frequency: {freq}")

Top 100 most common tokens:
Token: Ġthe, Token ID: 279, Frequency: 113179
Token: Ġ,, Token ID: 1174, Frequency: 99913
Token: Ġ., Token ID: 662, Frequency: 73416
Token: Ġ, Token ID: 220, Frequency: 64876
Token: Ġof, Token ID: 315, Frequency: 56891
Token: Ġand, Token ID: 323, Frequency: 50607
Token: Ġin, Token ID: 304, Frequency: 39686
Token: Ġto, Token ID: 311, Frequency: 39224
Token: Ġa, Token ID: 264, Frequency: 34370
Token: Ġ=, Token ID: 284, Frequency: 29570
Token: Ġ", Token ID: 330, Frequency: 28309
Token: ĠĊ, Token ID: 720, Frequency: 23764
Token: Ġ@, Token ID: 571, Frequency: 22801
Token: Ġwas, Token ID: 574, Frequency: 21006
Token: Ġ', Token ID: 364, Frequency: 18638
Token: ĠThe, Token ID: 578, Frequency: 17676
Token: -, Token ID: 12, Frequency: 17019
Token: @, Token ID: 31, Frequency: 16906
Token: s, Token ID: 82, Frequency: 16668
Token: Ġthat, Token ID: 430, Frequency: 14138
Token: Ġas, Token ID: 439, Frequency: 14074
Token: Ġon, Token ID: 389, Frequency: 13708
Token: Ġfor, To

In [13]:
# Show the collected token ids
print(top_N_tokens)

import pickle

# Serialize the id set to a pickle file (relative path, binary mode).
# NOTE(review): the file name says "1000", but top_N_tokens is built from the
# 3000 most common tokens — confirm the intended count before renaming, since
# other code may load this exact path.
with open("top_1000_tokens.pkl", "wb") as f:
    pickle.dump(top_N_tokens, f)

{2052, 4101, 6149, 2053, 2057, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 4113, 6166, 8220, 31, 8223, 2085, 6186, 6187, 4140, 4147, 6197, 8246, 2103, 4156, 8254, 64, 65, 66, 2115, 68, 67, 70, 71, 72, 6216, 74, 75, 76, 77, 78, 2128, 8272, 82, 83, 2132, 84, 2134, 86, 88, 89, 85, 6237, 14433, 6244, 2148, 6250, 4207, 4208, 2162, 2163, 8308, 14454, 4216, 14458, 6267, 8316, 18561, 14467, 6280, 6287, 2191, 2192, 4245, 4251, 2204, 8351, 2209, 2212, 4261, 59562, 10411, 6319, 2225, 4273, 4279, 2231, 2237, 10434, 6342, 2254, 16591, 4311, 4314, 4315, 220, 4325, 4330, 18671, 2288, 6385, 35061, 2294, 6393, 8448, 2305, 258, 259, 4356, 261, 6406, 263, 264, 265, 266, 4363, 268, 2316, 6411, 267, 269, 273, 274, 272, 276, 2324, 278, 279, 2326, 277, 4376, 2331, 284, 285, 6424, 287, 288, 281, 282, 291, 292, 293, 294, 295, 296, 78118, 299, 300, 301, 10541, 8494, 304, 305, 303, 307, 6445, 309, 2349, 311, 312, 4409, 2360, 315, 2363, 2361, 2362, 4410, 320, 10555, 316, 323, 6460, 6469, 2373, 324, 328, 329, 