In [None]:
import os

from transformers import AutoTokenizer
from collections import Counter
from datasets import load_dataset

# Load the Llama 3 tokenizer from the Hugging Face Hub.
# The access token is read from the HUGGINGFACE_AUTH_TOKEN environment variable
# (the previous name contained a stray "|", so the lookup could never match a
# normally exported variable). os.getenv returns None when the variable is
# unset. `token` is the current name of the deprecated `use_auth_token` kwarg.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    token=os.getenv("HUGGINGFACE_AUTH_TOKEN"),
)

### Punctuation or Whitespace

In [None]:
# Inclusive range of token IDs to inspect.
FIRST_ID, LAST_ID = 0, 255
indices = list(range(FIRST_ID, LAST_ID + 1))

# Map every ID to its token string in a single call.
tokens = tokenizer.convert_ids_to_tokens(indices)

# Print one "<id> : <token>" line per entry.
for token_id, token_text in zip(indices, tokens):
    print(f"{token_id} : {token_text}")

### Check Special tokens

In [None]:
# Mapping of special-token role name -> token value, as reported by the tokenizer.
special_tokens = tokenizer.special_tokens_map

print("Special Tokens:")
for role in special_tokens:
    surface_form = special_tokens[role]
    # Resolve the token value to its vocabulary ID.
    vocab_id = tokenizer.convert_tokens_to_ids(surface_form)
    print(f"{role}: {surface_form}, Token ID: {vocab_id}")


In [None]:
# Inclusive range of token IDs to inspect: 128000 through 128255.
FIRST_ID, LAST_ID = 128000, 128255
indices = list(range(FIRST_ID, LAST_ID + 1))

# Convert the IDs to their token strings.
tokens = tokenizer.convert_ids_to_tokens(indices)

# Print one "<id> : <token>" line per entry.
for token_id, token_text in zip(indices, tokens):
    print(f"{token_id} : {token_text}")

### Top N

In [None]:
# Load the dataset ("wikitext", configuration "wikitext-2-raw-v1").
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Pull the "text" column out of the training split.
train_split = dataset["train"]
texts = train_split["text"]

def get_word_frequencies(texts, tokenizer):
    """Tally how often each token occurs across ``texts``.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Object exposing ``tokenize(text)`` that returns an
            iterable of tokens for one string.

    Returns:
        A ``collections.Counter`` mapping each token to its total number of
        occurrences over all texts.
    """
    return Counter(
        piece
        for passage in texts
        for piece in tokenizer.tokenize(passage)
    )

# Count token occurrences over the extracted texts.
word_frequencies = get_word_frequencies(texts=texts, tokenizer=tokenizer)



In [12]:
# Number of most-frequent tokens to keep. The comment and banner previously
# said 100 while the code requested 1000; one constant keeps them in sync.
TOP_N = 1000

# (token, frequency) pairs, most frequent first.
top_N_words = word_frequencies.most_common(TOP_N)

# IDs of those tokens (a set, so duplicate IDs collapse).
top_N_tokens = set()

# Report the number actually returned, which is smaller than TOP_N when the
# counter holds fewer distinct tokens.
print(f"Top {len(top_N_words)} most common tokens:")
for token, freq in top_N_words:
    token_id = tokenizer.convert_tokens_to_ids(token)
    top_N_tokens.add(token_id)
    print(f"Token: {token}, Token ID: {token_id}, Frequency: {freq}")

Top 100 most common tokens:
Token: Ġthe, Token ID: 279, Frequency: 113179
Token: Ġ,, Token ID: 1174, Frequency: 99913
Token: Ġ., Token ID: 662, Frequency: 73416
Token: Ġ, Token ID: 220, Frequency: 64876
Token: Ġof, Token ID: 315, Frequency: 56891
Token: Ġand, Token ID: 323, Frequency: 50607
Token: Ġin, Token ID: 304, Frequency: 39686
Token: Ġto, Token ID: 311, Frequency: 39224
Token: Ġa, Token ID: 264, Frequency: 34370
Token: Ġ=, Token ID: 284, Frequency: 29570
Token: Ġ", Token ID: 330, Frequency: 28309
Token: ĠĊ, Token ID: 720, Frequency: 23764
Token: Ġ@, Token ID: 571, Frequency: 22801
Token: Ġwas, Token ID: 574, Frequency: 21006
Token: Ġ', Token ID: 364, Frequency: 18638
Token: ĠThe, Token ID: 578, Frequency: 17676
Token: -, Token ID: 12, Frequency: 17019
Token: @, Token ID: 31, Frequency: 16906
Token: s, Token ID: 82, Frequency: 16668
Token: Ġthat, Token ID: 430, Frequency: 14138
Token: Ġas, Token ID: 439, Frequency: 14074
Token: Ġon, Token ID: 389, Frequency: 13708
Token: Ġfor, To

In [10]:
# Show the collected token IDs. top_N_tokens is a set, so the printed order
# does not reflect the frequency ranking.
print(top_N_tokens)

{1027, 520, 12, 13, 15, 16, 527, 17, 18, 20, 19, 21, 1047, 24, 1049, 22, 1051, 539, 23, 1053, 31, 551, 555, 1077, 568, 571, 574, 578, 1101, 1102, 82, 2652, 617, 1139, 662, 1174, 1176, 679, 1193, 59562, 682, 1202, 704, 706, 709, 719, 720, 220, 763, 1283, 264, 279, 1306, 284, 291, 810, 813, 814, 304, 311, 1847, 315, 320, 832, 323, 330, 2380, 339, 1364, 2391, 358, 872, 362, 364, 1389, 369, 883, 374, 889, 1403, 892, 387, 389, 902, 922, 927, 420, 430, 1455, 433, 1461, 439, 449, 459, 32213, 477, 994, 2550, 505, 1023}
