In [9]:
import sys
import os

project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    
input_file = "../data/simple_wiki.train"    
output_file = "../data/simple_wiki_cleaned.train"

with open(input_file, encoding='utf-8') as f:
        content = f.read()
        
lines = content.split('\n')
cleaned_lines = []

for line in lines:
    if not line.strip().startswith('= = ='):
        cleaned_lines.append(line.strip())

# Join all lines without newlines
result = ''.join(cleaned_lines)

with open(output_file, 'w', encoding='utf-8') as f:
    f.write(result)

print(f"Cleaned data saved to: {output_file}")

Cleaned data saved to: ../data/simple_wiki_cleaned.train


In [17]:
# tokenize the data for some analysis
from src.data.tokenizer import get_tokenizer, text_to_token_ids

tokenizer = get_tokenizer()

tokens = text_to_token_ids(result, tokenizer)

print(f"Number of words: {len(result.split())}")
print(f"Number of tokens: {len(tokens[0])}")

Number of words: 1310381
Number of tokens: 1832887


In [None]:
import numpy as np

token_array = tokens[0].numpy()

unique_tokens, counts = np.unique(token_array, return_counts=True)
print(f"Unique tokens: {len(unique_tokens)}")
mft = unique_tokens[np.argmax(counts)]
lft = unique_tokens[np.argmin(counts)]
print(f"Most frequent token: {mft}:{tokenizer.decode([mft])} (appears {counts.max()} times)")
print(f"Least frequent token: {lft}:{tokenizer.decode([lft])} (appears {counts.min()} times)")

Unique tokens: 38793
Most frequent token: 13:. (appears 92567 times)
Least frequent token: 104:� (appears 1 times)


In [25]:
from collections import Counter
from src.data.tokenizer import token_ids_to_text
import torch

# Get top 20 most frequent tokens
token_counter = Counter(token_array)
top_tokens = token_counter.most_common(20)

print("Top 20 most frequent tokens:")
for i, (token_id, count) in enumerate(top_tokens, 1):
    token_text = token_ids_to_text(torch.tensor([[token_id]]), tokenizer)
    percentage = (count / len(token_array)) * 100
    print(f"{i:2d}. Token ID: {token_id:5d} | Text: '{token_text}' | Count: {count:8d} | Percentage: {percentage:5.2f}%")

Top 20 most frequent tokens:
 1. Token ID:    13 | Text: '.' | Count:    92567 | Percentage:  5.05%
 2. Token ID:   262 | Text: ' the' | Count:    78173 | Percentage:  4.27%
 3. Token ID:    11 | Text: ',' | Count:    68567 | Percentage:  3.74%
 4. Token ID:   286 | Text: ' of' | Count:    44572 | Percentage:  2.43%
 5. Token ID:   287 | Text: ' in' | Count:    39220 | Percentage:  2.14%
 6. Token ID:   290 | Text: ' and' | Count:    34393 | Percentage:  1.88%
 7. Token ID:   257 | Text: ' a' | Count:    29133 | Percentage:  1.59%
 8. Token ID:   318 | Text: ' is' | Count:    25858 | Percentage:  1.41%
 9. Token ID:   284 | Text: ' to' | Count:    22592 | Percentage:  1.23%
10. Token ID:   373 | Text: ' was' | Count:    22345 | Percentage:  1.22%
11. Token ID:   366 | Text: ' "' | Count:    14348 | Percentage:  0.78%
12. Token ID:    12 | Text: '-' | Count:    12922 | Percentage:  0.71%
13. Token ID:   357 | Text: ' (' | Count:    12112 | Percentage:  0.66%
14. Token ID:   329 | Text: 