In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
import os
from sklearn.model_selection import KFold
from tqdm import tqdm
from tokenizers import Tokenizer

In [2]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer , XLMRobertaForMaskedLM

# Specify your local model path
hugging_dir = './models/'
model_path = hugging_dir+"xlm-roberta-base"


# Load tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)

# Load model for binary classification (num_labels=2)
model = XLMRobertaForSequenceClassification.from_pretrained(model_path, num_labels=2)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ./models/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# test original tokenizer

In [3]:
model = XLMRobertaForMaskedLM.from_pretrained(model_path)
# Set model to eval mode
model.eval()

# Define a test sentence with a masked word
test_sentence = "Hello, how <mask> you?  你好，你怎么样？"

# Tokenize input
inputs = tokenizer(test_sentence, return_tensors="pt")
print(inputs)
# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits  # (batch_size, seq_len, vocab_size)

# Get the masked token index
mask_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1].item()

# Get top 5 predicted tokens for the masked position
top_5_ids = logits[0, mask_index].topk(5).indices
top_5_tokens = tokenizer.convert_ids_to_tokens(top_5_ids)

print("\nTop 5 Predictions for <mask>:")
for i, token in enumerate(top_5_tokens):
    print(f"{i+1}. {token}")

Some weights of the model checkpoint at ./models/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': tensor([[     0,  35378,      4,   3642, 250001,    398,     32,      6, 124084,
              4,    994,  93985,     32,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

Top 5 Predictions for <mask>:
1. ▁are
2. ▁mate
3. ▁about
4. ▁e
5. is


# Extract all tokens from BRCC

In [4]:
import re

def is_valid_language(text):
    """
    Check if the text contains only English, Malay, or Chinese characters.
    """
    # Regex for English/Malay (Latin letters and common punctuation)
    latin_pattern = re.compile(r'^[\u0000-\u024F\s]+$')
    
    # Regex for Chinese characters
    chinese_pattern = re.compile(r'^[\u4E00-\u9FFF\s]+$')
    
    # Check if the text matches either pattern or a combination of both
    if latin_pattern.match(text) or chinese_pattern.match(text):
        return True
    elif re.search(r'[\u4E00-\u9FFF]', text) and re.search(r'[\u0000-\u024F]', text):
        # If the text has both Chinese and Latin characters, still valid
        return True
    return False

'''def clean_text(text):
    """
    Remove unwanted characters but keep common punctuation, numbers, and symbols.
    """
    # Retain:
    # - Latin (A-Za-z) and Chinese (\u4E00-\u9FFF)
    # - Numbers (0-9)
    # - Common punctuation and symbols: . , ! ? : ; ' " ( ) [ ] { } - _ + = / %
    # - Spaces (\s)
    allowed_chars = r'[^A-Za-z\u4E00-\u9FFF0-9\s.,!?;:\'\"()\[\]{}\-_+=/%$€¥£]@'
    
    # Remove unwanted characters
    text = re.sub(allowed_chars, '', text)
    
    # Normalize multiple spaces to a single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text'''
def clean_text(text):
    """
    Remove unwanted characters but keep common punctuation, numbers, and symbols.
    Exclude Japanese, Korean, and Arabic characters.
    """
    # Allowed characters:
    # - Latin: A-Za-z
    # - Chinese: \u4E00-\u9FFF
    # - Numbers: 0-9
    # - Common punctuation and symbols: . , ! ? : ; ' " ( ) [ ] { } - _ + = / % $ € ¥ £
    # - Spaces: \s
    unwanted_chars = r'[^A-Za-z\u4E00-\u9FFF0-9\s.,!?;:\'\"()\[\]{}\-_+=/%$€¥£]'

    # Exclude the following Unicode ranges:
    # - Japanese (Hiragana, Katakana): \u3040-\u30FF
    # - Korean (Hangul): \uAC00-\uD7AF
    # - Arabic: \u0600-\u06FF, \u0750-\u077F
    excluded_chars = r'[\u3040-\u30FF\uAC00-\uD7AF\u0600-\u06FF\u0750-\u077F]'

    # Remove unwanted characters
    text = re.sub(unwanted_chars, '', text)

    # Remove Japanese, Korean, and Arabic characters
    text = re.sub(excluded_chars, '', text)

    # Normalize multiple spaces to a single space
    text = re.sub(r'\s+', ' ', text).strip()

    return text
# Path to your dataset
dataset_path = './data/BRCC/mix.train'
# Clean and filter the dataset
cleaned_lines = []
with open(dataset_path, 'r', encoding='utf-8') as file:
    for line in file:
        if is_valid_language(line):
            cleaned_line = clean_text(line)
            if cleaned_line:  # Ensure non-empty lines
                cleaned_lines.append(cleaned_line.lower())

# Build new vocab set

In [5]:


# Tokenize dataset and get unique token IDs
all_tokens = []
for text in tqdm(cleaned_lines):
    tokenized = tokenizer(text, add_special_tokens=False)["input_ids"]
    all_tokens.extend(tokenized)
    

unique_token_ids = (set(all_tokens)) # Ensure sorted order


  0%|▏                                                                        | 7766/2601564 [00:01<05:41, 7594.46it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1135 > 512). Running this sequence through the model will result in indexing errors
100%|██████████████████████████████████████████████████████████████████████| 2601564/2601564 [06:58<00:00, 6222.16it/s]


In [6]:
# List of unique token IDs that appear in your dataset
#unique_token_ids = sorted(list(set(all_tokens)))  # Ensure sorted order

unique_token_ids = sorted(set(all_tokens))
print(f"New vocab size: {len(unique_token_ids)}")

New vocab size: 54682


In [7]:
set1 = set()

set2=set([0, 2, 3, 1, 250001])
set1

set()

In [8]:
set1.update(set2)
set1

{0, 1, 2, 3, 250001}

In [9]:
set1.update(unique_token_ids)
250001 in set1

True

In [10]:
unique_token_ids = set1

In [11]:
unique_token_ids

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 27,
 28,
 31,
 32,
 33,
 34,
 36,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 47,
 48,
 51,
 53,
 54,
 55,
 56,
 57,
 58,
 60,
 64,
 66,
 67,
 68,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 78,
 79,
 80,
 81,
 82,
 83,
 85,
 86,
 88,
 90,
 91,
 92,
 93,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 104,
 106,
 107,
 108,
 110,
 111,
 112,
 113,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 126,
 127,
 128,
 131,
 132,
 133,
 136,
 138,
 139,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 156,
 157,
 158,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 177,
 182,
 184,
 185,
 186,
 187,
 188,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 198,
 199,
 200,
 201,
 202,
 203,
 206,
 208,
 209,
 211,
 213,
 214,
 217,
 219,
 220,
 221,
 223,
 224,
 225,
 228,
 229,
 232,
 233,
 234,
 236,
 237,
 238,
 242

In [12]:
print(f"Original vocab size: {model.config.vocab_size}")
print(f"New vocab size: {len(unique_token_ids)}")

Original vocab size: 250002
New vocab size: 54686


In [13]:
unique_tokens = tokenizer.convert_ids_to_tokens(unique_token_ids)

# Print the tokens to check if the conversion worked
unique_tokens

['<s>',
 '<pad>',
 '</s>',
 '<unk>',
 ',',
 '.',
 '▁',
 's',
 '▁de',
 '-',
 '▁a',
 'a',
 ':',
 'e',
 'i',
 '▁(',
 ')',
 '▁i',
 't',
 'n',
 '▁-',
 '▁la',
 '▁en',
 '▁in',
 '▁na',
 "'",
 '...',
 '▁e',
 'o',
 '?',
 'en',
 'u',
 '▁o',
 '!',
 'm',
 '▁se',
 '▁que',
 'r',
 '的',
 '▁"',
 '▁di',
 '▁to',
 '▁da',
 '▁un',
 'y',
 '▁do',
 '▁je',
 'er',
 '▁sa',
 '"',
 '▁og',
 '/',
 'an',
 'te',
 '▁die',
 '▁the',
 'd',
 '▁er',
 'in',
 ';',
 '▁u',
 'na',
 '▁si',
 '▁ja',
 '▁za',
 '▁v',
 '▁et',
 '▁is',
 'da',
 'ne',
 '▁el',
 'es',
 '▁s',
 'k',
 'ni',
 '▁le',
 '▁l',
 '▁z',
 '▁on',
 '▁at',
 '▁for',
 '▁_',
 'ta',
 '▁d',
 '▁1',
 're',
 '▁ne',
 '▁no',
 '▁of',
 'de',
 '▁y',
 '▁du',
 '▁2',
 '▁per',
 'ti',
 '▁yang',
 '▁te',
 '▁para',
 '▁der',
 '▁dan',
 '▁som',
 'et',
 'h',
 '▁med',
 '▁van',
 '(',
 'le',
 '▁and',
 '▁3',
 '▁til',
 'l',
 '▁an',
 'la',
 '▁al',
 'ja',
 '▁del',
 'ar',
 '▁w',
 '▁det',
 'li',
 '▁ya',
 '▁:',
 '▁...',
 '▁ka',
 'no',
 '▁con',
 '▁po',
 'ka',
 'as',
 '▁me',
 'is',
 '▁und',
 '▁su',
 '▁den',
 'z

# Edit the xlmr tokenizer to use our new vocab set

In [14]:
import json
from transformers import AutoTokenizer


# Special tokens to keep (e.g., <s>, </s>, <mask>, <pad>, <unk>)
tokenizer = AutoTokenizer.from_pretrained(model_path)
special_tokens = ["<s>", "</s>", "<mask>", "<pad>", "<unk>"]
special_token_ids = {token: tokenizer.convert_tokens_to_ids(token) for token in special_tokens}
special_token_ids_set = set(special_token_ids.values())

# Get the current vocab state
model_state = json.loads(tokenizer.backend_tokenizer.model.__getstate__())
original_vocab = model_state["vocab"]
print(f"Original vocab size: {len(original_vocab)}")


Original vocab size: 250002


In [15]:
from tokenizers import models
original_vocab = model_state["vocab"]

# Check the original size of the vocabulary
print(f"Original vocab size: {len(original_vocab)}")

# Convert the set of all_used_token_ids to strings if needed (assuming tokens are strings)
all_used_token_ids = set(unique_tokens)  # Make sure your `unique_tokens` is a set of strings (e.g., {'hello', '<mask>', '<pad>'})

# In-place filtering of the vocab
original_vocab[:] = [entry for entry in original_vocab if entry[0] in all_used_token_ids]

# Check the new size of the vocabulary
print(f"Filtered vocab size: {len(original_vocab)}")

Original vocab size: 250002
Filtered vocab size: 54686


In [16]:
from tokenizers import models


original_vocab = model_state["vocab"]
print(f"Original vocab size: {len(original_vocab)}")
all_used_token_ids = set(unique_tokens)  # Ensure it's a set for fast lookup
filtered_vocab = [entry for entry in original_vocab if entry[0] in all_used_token_ids]


model_state["vocab"] 

Original vocab size: 54686


[['<s>', 0.0],
 ['<pad>', 0.0],
 ['</s>', 0.0],
 ['<unk>', 0.0],
 [',', -3.4635426998138428],
 ['.', -3.625642776489258],
 ['▁', -3.9299705028533936],
 ['s', -5.072621822357178],
 ['▁de', -5.306643009185791],
 ['-', -5.404437065124512],
 ['▁a', -5.530364990234375],
 ['a', -5.5477118492126465],
 [':', -5.629745960235596],
 ['e', -5.701941967010498],
 ['i', -5.785372257232666],
 ['▁(', -5.926211357116699],
 [')', -5.9974517822265625],
 ['▁i', -6.0461626052856445],
 ['t', -6.071900844573975],
 ['n', -6.093497276306152],
 ['▁-', -6.1764984130859375],
 ['▁la', -6.233835697174072],
 ['▁en', -6.31805419921875],
 ['▁in', -6.3201680183410645],
 ['▁na', -6.327768802642822],
 ["'", -6.345553398132324],
 ['...', -6.38959264755249],
 ['▁e', -6.3929057121276855],
 ['o', -6.417782306671143],
 ['?', -6.460666179656982],
 ['en', -6.465692520141602],
 ['u', -6.467804431915283],
 ['▁o', -6.582267761230469],
 ['!', -6.61658239364624],
 ['m', -6.647110939025879],
 ['▁se', -6.716159820556641],
 ['▁que', -6.

In [17]:
all_used_token_ids

{'avimo',
 '隊',
 '价',
 '或其他',
 '看上去',
 'pending',
 '▁ervan',
 'sinya',
 '否认',
 '▁material',
 '在此',
 '▁mile',
 '▁kesabaran',
 '这部',
 'kinti',
 '▁palm',
 '▁berdin',
 '▁little',
 '▁clo',
 '▁tuum',
 '历史',
 'portu',
 '▁komisar',
 '544',
 'geist',
 '罩',
 '▁multo',
 '▁verdik',
 '迷人的',
 'matka',
 '▁mire',
 'doni',
 'ndet',
 '▁ingilis',
 '蝶',
 '愈',
 '面前',
 '▁impression',
 '▁territoire',
 'lac',
 '脆弱',
 '81',
 'gemeinde',
 '番茄',
 '妳',
 '▁esas',
 '▁night',
 '新年',
 '▁shape',
 '▁pastinya',
 'ca',
 '质',
 '慣',
 '▁tovar',
 '▁flori',
 '▁uko',
 '院',
 '熱情',
 'ame',
 '取决于',
 'vje',
 '的基本',
 'aynay',
 '逐步',
 '温柔',
 'gangen',
 '▁rydy',
 '▁leksik',
 '▁priorit',
 '▁traduk',
 '▁dibiarkan',
 'unggal',
 '▁nama',
 '▁ternyata',
 'gence',
 'rul',
 '▁demonstrat',
 '有什麼',
 '▁bos',
 'verk',
 '▁baud',
 '▁ove',
 '奶油',
 '闵',
 '的利益',
 'fall',
 '▁tutma',
 'tsi',
 '▁sudu',
 '▁2004,',
 '涸',
 '蕾',
 'o',
 '▁profili',
 '▁nim',
 '在地',
 '适合',
 '▁dissi',
 '▁cao',
 '▁2100',
 '马尔',
 '▁sota',
 '▁sediment',
 '▁iniciativa',
 '▁differen

In [18]:
print(model_state.keys()) 

dict_keys(['type', 'unk_id', 'vocab'])


In [19]:
model_state['vocab'] = tuple(tuple(entry) for entry in model_state['vocab'] )

model_state['vocab']

(('<s>', 0.0),
 ('<pad>', 0.0),
 ('</s>', 0.0),
 ('<unk>', 0.0),
 (',', -3.4635426998138428),
 ('.', -3.625642776489258),
 ('▁', -3.9299705028533936),
 ('s', -5.072621822357178),
 ('▁de', -5.306643009185791),
 ('-', -5.404437065124512),
 ('▁a', -5.530364990234375),
 ('a', -5.5477118492126465),
 (':', -5.629745960235596),
 ('e', -5.701941967010498),
 ('i', -5.785372257232666),
 ('▁(', -5.926211357116699),
 (')', -5.9974517822265625),
 ('▁i', -6.0461626052856445),
 ('t', -6.071900844573975),
 ('n', -6.093497276306152),
 ('▁-', -6.1764984130859375),
 ('▁la', -6.233835697174072),
 ('▁en', -6.31805419921875),
 ('▁in', -6.3201680183410645),
 ('▁na', -6.327768802642822),
 ("'", -6.345553398132324),
 ('...', -6.38959264755249),
 ('▁e', -6.3929057121276855),
 ('o', -6.417782306671143),
 ('?', -6.460666179656982),
 ('en', -6.465692520141602),
 ('u', -6.467804431915283),
 ('▁o', -6.582267761230469),
 ('!', -6.61658239364624),
 ('m', -6.647110939025879),
 ('▁se', -6.716159820556641),
 ('▁que', -6.

In [20]:


from tokenizers import models

model_class = getattr(models, model_state.pop("type"))


# Create a new Unigram model with the filtered vocab
tokenizer.backend_tokenizer.model =  model_class(**model_state)



# Edited tokenizer may seem to not work, save it regardless

In [21]:
# Define a test sentence with a masked word
test_sentence = "Hello, how <mask>  you? 你好，你怎么样？"

# Tokenize input
inputs = tokenizer(test_sentence, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[     0,      6,      3,   6143,      4,   1838, 250001,    269,     29,
              6,  37483,      4,    562,  30782,     29,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [22]:
tokenizer.vocab

{'▁przed': 3523,
 'lea': 6404,
 '1/': 18919,
 'evro': 37655,
 'vii': 15124,
 '▁sot': 5419,
 'ooo': 39270,
 '▁prepar': 21254,
 '▁piccola': 29557,
 '▁numera': 32413,
 '▁sangre': 40640,
 '存储': 44632,
 '▁baixo': 9727,
 '▁ancak': 12191,
 'irea': 8814,
 'trici': 27722,
 '違法': 45990,
 '▁honom': 13056,
 '化': 1074,
 '▁sof': 19188,
 '▁dosis': 32574,
 '▁qara': 13983,
 '▁announced': 45515,
 '▁siste': 7309,
 '▁kiusa': 45531,
 '坠': 48862,
 '▁rigid': 38372,
 'ifa': 12940,
 '▁1859': 48367,
 '煲': 53016,
 '耷': 54518,
 '寞': 54672,
 'oje': 2878,
 '说着': 44319,
 '▁rinj': 27629,
 '▁wij': 3624,
 'poste': 34064,
 'mea': 14768,
 '凍': 39773,
 '▁italiane': 27519,
 'prend': 16210,
 'zun': 28236,
 'bia': 4251,
 '▁viza': 24186,
 '▁olha': 36199,
 '▁bravo': 40641,
 'list': 3129,
 '先生': 4189,
 '▁segundo': 4604,
 '▁komanda': 9083,
 '▁filled': 42618,
 '▁kennen': 16253,
 'tsje': 24728,
 'laim': 20362,
 '▁hub': 10965,
 '▁nere': 11938,
 '▁prostat': 25376,
 'tischen': 31624,
 '▁kereszt': 33269,
 'rupa': 20554,
 '▁economico':

In [23]:
tokenizer.mask_token_id

54685

In [24]:
list(unique_token_ids)[-10:]

[249952,
 249959,
 249962,
 249979,
 249980,
 249982,
 249992,
 249996,
 249998,
 250001]

In [25]:
from transformers import XLMRobertaTokenizer,AutoTokenizer ,XLMRobertaForMaskedLM
#tokenizer = AutoTokenizer.from_pretrained(save_path)
model = XLMRobertaForMaskedLM.from_pretrained(model_path)

# Set model to eval mode
model.eval()

# Define a test sentence with a masked word
test_sentence = "Hello, how <mask> you?  你好，你怎么样？"

# Tokenize input
inputs = tokenizer(test_sentence, return_tensors="pt")
print(inputs)
# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits  # (batch_size, seq_len, vocab_size)
#print(inputs["input_ids"])
# Get the masked token index
mask_index = torch.where(inputs["input_ids"] == 250001)[1].item()

# Get top 5 predicted tokens for the masked position
top_5_ids = logits[0, mask_index].topk(5).indices
top_5_tokens = tokenizer.convert_ids_to_tokens(top_5_ids)

print("\nTop 5 Predictions for <mask>:")
for i, token in enumerate(top_5_tokens):
    print(f"{i+1}. {token}")

Some weights of the model checkpoint at ./models/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': tensor([[     0,      6,      3,   6143,      4,   1838, 250001,    269,     29,
              6,  37483,      4,    562,  30782,     29,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

Top 5 Predictions for <mask>:
1. ,
2. e
3. ▁d
4. :
5. ▁litt


# Trim XLM-R model according to our new tokenizer

In [26]:
import torch.nn as nn

# Create a new embedding matrix for only the selected tokens
new_embedding = nn.Embedding(len(unique_token_ids), model.config.hidden_size)
with torch.no_grad():
    new_embedding.weight.copy_(model.roberta.embeddings.word_embeddings.weight[list(unique_token_ids)])

# Update the model config
model.config.vocab_size = len(unique_token_ids)

# Assign the new embedding layer
model.roberta.embeddings.word_embeddings = new_embedding

# Update the model's output layer (usually called "lm_head" or similar)
if hasattr(model, "lm_head"):
    # Create a new decoder layer with the reduced vocabulary size
    new_decoder = torch.nn.Linear(model.config.hidden_size, len(unique_token_ids))
    
    with torch.no_grad():
        # Copy the relevant weights from the original decoder
        new_decoder.weight.copy_(model.lm_head.decoder.weight[list(unique_token_ids)])
        new_decoder.bias.copy_(model.lm_head.decoder.bias[list(unique_token_ids)])
        #if model.lm_head.decoder.bias is not None:
            
    
    # Replace the decoder in the lm_head with the new one
    model.lm_head.decoder = new_decoder
    #model.config.vocab_size = len(unique_token_ids)
elif hasattr(model, "classifier"):
    model.classifier = nn.Linear(model.config.hidden_size, model.config.vocab_size, bias=False)
else:
    raise ValueError("Model does not have a valid output layer attribute.")

print(f"New vocab size: {model.config.vocab_size}")
#print(f"New output layer size: {model.lm_head.out_features if hasattr(model, 'lm_head') else model.classifier.out_features}")


New vocab size: 54686


In [27]:
model.lm_head.decoder

Linear(in_features=768, out_features=54686, bias=True)

In [28]:
total_params = sum(p.numel() for p in model.parameters())


print(f"Total parameters: {total_params:,}")
model

Total parameters: 170,346,032


XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(54686, 768)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

# Save the trimmed tokenizer and XLM-R model

In [29]:

# Save the modified model and tokenizer
save_path = hugging_dir+"/xlmr_reduced"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('./models//xlmr_reduced\\tokenizer_config.json',
 './models//xlmr_reduced\\special_tokens_map.json',
 './models//xlmr_reduced\\sentencepiece.bpe.model',
 './models//xlmr_reduced\\added_tokens.json',
 './models//xlmr_reduced\\tokenizer.json')

# Restart notebook kernel before loading the trimmed model

In [1]:
from transformers import XLMRobertaTokenizer,AutoTokenizer ,XLMRobertaForMaskedLM
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
import os
from sklearn.model_selection import KFold
from tqdm import tqdm
from tokenizers import Tokenizer


hugging_dir = './models/'
save_path = hugging_dir+"/xlmr_reduced"
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = XLMRobertaForMaskedLM.from_pretrained(save_path, ignore_mismatched_sizes=True)

# Set model to eval mode
model.eval()

# Define a test sentence with a masked word
test_sentence = "Hello, how <mask> you?  你好，你怎么样？"

# Tokenize input
inputs = tokenizer(test_sentence, return_tensors="pt")
print(inputs)
# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits  # (batch_size, seq_len, vocab_size)

# Get the masked token index
mask_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1].item()

# Get top 5 predicted tokens for the masked position
top_5_ids = logits[0, mask_index].topk(5).indices
top_5_tokens = tokenizer.convert_ids_to_tokens(top_5_ids)

print("\nTop 5 Predictions for <mask>:")
for i, token in enumerate(top_5_tokens):
    print(f"{i+1}. {token}")

Some weights of XLMRobertaForMaskedLM were not initialized from the model checkpoint at ./models//xlmr_reduced and are newly initialized because the shapes did not match:
- lm_head.bias: found shape torch.Size([250002]) in the checkpoint and torch.Size([54686]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[    0,     6,     3,  6143,     4,  1838, 54685,   269,    29,     6,
         37483,     4,   562, 30782,    29,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

Top 5 Predictions for <mask>:
1. ▁are
2. ▁mate
3. ▁kabar
4. ▁e
5. ▁about


In [2]:
tokenizer.mask_token_id

54685

In [3]:
logits[0, :].topk(5).indices

tensor([[    0,     2,  1647,   332,  8867],
        [    6,    20,    39,   564,   121],
        [    3,     5,  2944,     4, 37483],
        [ 6143, 37483, 39270,   733, 37509],
        [    4,     5,    12,   184,    29],
        [ 1838,  1807,    62,   346,   169],
        [  384, 12123, 25526,    27,   912],
        [  269,    91, 20000,    60,   120],
        [   29,    33,  2355,   421,     5],
        [    6,     2,    20,   421,   564],
        [37483,   733, 44364, 13404,  5455],
        [    4,   184,     5,    29,   482],
        [  562,   301,     4,   269,    26],
        [30782, 19247,  5427,  2771,   733],
        [   29,    33,  2355,   421,     5],
        [    2,     5,     6,   611,    29]])

In [4]:
total_params = sum(p.numel() for p in model.parameters())


print(f"Total parameters: {total_params:,}")
model

Total parameters: 128,097,182


XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(54686, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
   