In [12]:
import csv
import os

from transformers import RobertaTokenizer

# Load the pretrained "roberta-base" tokenizer used to split each word below.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Build the input path with os.path.join so the directory separator matches
# the host OS; a hard-coded backslash only works on Windows.
input_csv = os.path.join("NGSL_lists", "All_words_from_NGLS.csv")  # Input file
output_multi = "multi_tokens.csv"    # File with multi-token words
output_single = "single_tokens.csv"  # File with single-token rows

# Read every non-empty row. newline="" lets the csv module do its own newline
# handling, which is required for quoted fields that contain line breaks.
with open(input_csv, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    rows = [row for row in reader if row]  # skip empty rows

# The first row is the header; everything after it is data. Slicing yields an
# empty list for an empty or header-only file, so the header is never
# processed as if it were a word.
header = rows[0] if rows else []
data = rows[1:]

single_rows = []  # full original rows whose word is exactly one token
multi_token = []  # (word, space-joined tokens) for every other word

# Classify each row by how many tokens its first column produces.
for row in data:
    word = row[0].strip()
    # The word is tokenized with a leading space so it is encoded as it would
    # be in the middle of a sentence (RoBERTa treats the preceding space as
    # part of the token). The space is only a tokenization aid, so it is not
    # stored in the reported word.
    tokens = tokenizer.tokenize(" " + word)
    if len(tokens) == 1:
        single_rows.append(row)
    else:
        multi_token.append((word, " ".join(tokens)))

# Summarize how the word list splits between the two categories.
print(f"Total words: {len(data)}")
print(f"Single-token words: {len(single_rows)}")
print(f"Multi-token words: {len(multi_token)}")

# Multi-token report: a fixed two-column header followed by one
# (word, tokenization) pair per line.
with open(output_multi, "w", encoding="utf-8", newline="") as multi_file:
    multi_writer = csv.writer(multi_file)
    multi_writer.writerows([["Word", "Tokenization"], *multi_token])

# Single-token report: the original header (when there is one) followed by
# the untouched input rows, so every original column is preserved.
with open(output_single, "w", encoding="utf-8", newline="") as single_file:
    single_writer = csv.writer(single_file)
    header_rows = [header] if header else []
    single_writer.writerows(header_rows + single_rows)

print(f"Multi-token words saved to {output_multi}")
print(f"Single-token rows saved to {output_single}")

Total words: 5831
Single-token words: 5539
Multi-token words: 292
Multi-token words saved to multi_tokens.csv
Single-token rows saved to single_tokens.csv
