## Fine-tuning multilingual DistilBERT (`distilbert-base-multilingual-cased`) on financial news

In [1]:
# Prints a fixed marker string; nothing later in the notebook reads this output.
print('MomoxOkarun')

MomoxOkarun


In [None]:
!pip install tensorboard

In [2]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from transformers import (
    DataCollatorForLanguageModeling,
    DistilBertForMaskedLM,
    DistilBertTokenizerFast,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [3]:
# ========================
# 1. Configuration
# ========================
# Seed every random number generator used in this notebook. Python's `random`
# must be seeded too, otherwise the shuffle of FINANCIAL_TERMS below differs
# between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

MODEL_NAME = "distilbert-base-multilingual-cased"
MAX_SEQ_LENGTH = 256  # maximum number of tokens kept per example
TRAIN_RATIO = 0.85    # fraction of examples used for training

# One term per line. The encoding is explicit so that non-ASCII terms are read
# the same way regardless of the platform default; blank lines are skipped so
# they never reach the tokenizer as empty "terms".
with open('/kaggle/input/finterms/financial_terms.txt', encoding="utf-8") as f:
    FINANCIAL_TERMS = [term for term in f.read().splitlines() if term.strip()]

random.shuffle(FINANCIAL_TERMS)

In [4]:
# ========================
# 2. Helper Functions
# ========================
def check_and_add_tokens(tokenizer, terms):
    """Add terms the tokenizer cannot represent as a single known token.

    A term is registered as a new token when the tokenizer splits it into more
    than one piece, or when its single piece is the unknown token.

    Args:
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``unk_token_id`` and ``add_tokens``.
        terms: Iterable of candidate term strings.

    Returns:
        The same tokenizer object (``add_tokens`` is called on it when at
        least one term qualifies).
    """
    unk_id = tokenizer.unk_token_id
    terms_to_add = []
    for term in terms:
        tokens = tokenizer.tokenize(term)
        if not tokens:
            # Empty or whitespace-only entries yield no tokens; nothing to add.
            continue
        # Compare with the unknown-token id explicitly. A truthiness test on the
        # id is wrong in both directions: the unknown id is a non-zero integer
        # (so unknown terms were never detected) and a valid token may have
        # id 0 (so it was wrongly treated as missing).
        is_unknown = tokenizer.convert_tokens_to_ids(tokens[0]) == unk_id
        if len(tokens) > 1 or is_unknown:
            terms_to_add.append(term)

    if terms_to_add:
        print(f"Adding {len(terms_to_add)} financial terms to tokenizer")
        tokenizer.add_tokens(terms_to_add)

    return tokenizer

def initialize_new_embeddings(model, tokenizer, new_terms):
    """Set the trailing ``len(new_terms)`` embedding rows to the mean of the rest.

    The last ``len(new_terms)`` rows of the model's input embedding matrix are
    overwritten with the mean of all preceding rows. The update is done in
    place on ``model.get_input_embeddings().weight.data``.

    Args:
        model: Object exposing ``get_input_embeddings()`` whose result has a
            ``weight`` tensor with one row per token.
        tokenizer: Not used; kept so existing call sites keep working.
        new_terms: Sized collection whose length is the number of trailing rows
            to initialize. NOTE(review): this must be the number of tokens
            actually appended to the vocabulary; ``check_and_add_tokens`` may
            add only a subset of the candidate list - confirm at the call site.

    Raises:
        ValueError: If ``len(new_terms)`` is not smaller than the number of
            embedding rows (there would be no pre-trained rows to average).
    """
    num_new = len(new_terms)
    if num_new == 0:
        # Guard: with 0 the slices below become `[:-0]` (empty) and `[-0:]`
        # (the whole matrix), which would overwrite every embedding with NaN.
        return

    with torch.no_grad():
        embeddings = model.get_input_embeddings().weight.data
        if num_new >= embeddings.shape[0]:
            raise ValueError(
                f"Cannot initialize {num_new} new rows: embedding matrix has "
                f"only {embeddings.shape[0]} rows"
            )
        pretrained_mean = embeddings[:-num_new].mean(dim=0)
        embeddings[-num_new:] = pretrained_mean


In [5]:
# ========================
# 3. Data Preparation
# ========================
# Load the three raw news corpora from the Kaggle input datasets.

gazetta_df = pd.read_csv("/kaggle/input/gazetta-financial-news-dataset/_--.csv")
rbk_df = pd.read_csv("/kaggle/input/rbk-financial-news-dataset/--.csv")
stockNews_df = pd.read_csv("/kaggle/input/stock-news-dataset/englishFinancialNews.csv")
print(f"Gazetta shape: {gazetta_df.shape}, RBK shape: {rbk_df.shape}, Stock News shape: {stockNews_df.shape}\n")

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly after a label instead of being wrapped in print(), which
# printed the label after the report followed by a stray "None".
for label, frame in (("Gazetta", gazetta_df), ("RBK", rbk_df), ("Stock News", stockNews_df)):
    print(f"{label} info: ")
    frame.info()

Gazetta shape: (10937, 7), RBK shape: (16517, 6), Stock News shape: (11606, 5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10937 entries, 0 to 10936
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  10937 non-null  int64 
 1   text        10937 non-null  object
 2   summary     10937 non-null  object
 3   title       10937 non-null  object
 4   date        10937 non-null  object
 5   url         10937 non-null  object
 6   category    10937 non-null  object
dtypes: int64(1), object(6)
memory usage: 598.2+ KB
Gazetta info: 
 None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16517 entries, 0 to 16516
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  16517 non-null  int64 
 1   url         16517 non-null  object
 2   date        16517 non-null  object
 3   title       16517 non-null  object
 4   category    16517 non-null

In [6]:
# Give the stock-news frame the same "text" column name as the other corpora.
# A new frame is assigned rather than mutating in place, and info() is called
# directly because it prints its own report and returns None.
stockNews_df = stockNews_df.rename(columns={"title": "text"})
stockNews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11606 entries, 0 to 11605
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  11606 non-null  int64  
 1   Unnamed: 0    11606 non-null  float64
 2   text          11606 non-null  object 
 3   date          11605 non-null  object 
 4   stock         11605 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 453.5+ KB
None


In [8]:
# Take a fixed number of leading rows from each corpus (1000 / 5000 / 3000)
# and keep only the text column.
df1_text = gazetta_df[['text']].head(1000)
df2_text = rbk_df[['text']].head(5000)
df3_text = stockNews_df[['text']].head(3000)

# Concatenate the DataFrames along the rows
df = pd.concat([df1_text, df2_text, df3_text], ignore_index=True)
# Shuffle with an explicit random_state so the row order does not depend on how
# much of the global NumPy random stream earlier cells happened to consume.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
print(df.shape)
# info() prints its own report and returns None; do not wrap it in print().
df.info()

(9000, 1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9000 non-null   object
dtypes: object(1)
memory usage: 70.4+ KB
None


In [9]:
# Preview the first rows of the shuffled, combined corpus.
df.head()

Unnamed: 0,text
0,Речь идет о развитии энергетического сотруднич...
1,"По итогам торгов в четверг, 6 ноября, доллар в..."
2,"Live At 4 p.m. EDT, Benzinga Will Be Discussin..."
3,"В четверг, 26 марта, на рынке европейских госу..."
4,"В среду, 1 апреля, с открытием торгов на рынке..."


In [10]:
# Build a `datasets.Dataset` from the text column of the combined frame.
dataset = Dataset.from_pandas(df[["text"]])

In [11]:
# ========================
# 4. Tokenization Setup
# ========================
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
# Vocabulary extension is currently disabled:
#tokenizer = check_and_add_tokens(tokenizer, FINANCIAL_TERMS)

def tokenize_fn(examples):
    """Tokenize a batch of examples, truncating each text to MAX_SEQ_LENGTH tokens.

    Args:
        examples: Batch mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output for the batch, including the special-tokens mask.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_special_tokens_mask=True  # Helps data collator
    )

tokenized_ds = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
# Use the notebook-wide SEED instead of a second hard-coded 42 so that changing
# the seed in the configuration cell also changes the train/test split.
split_ds = tokenized_ds.train_test_split(test_size=1-TRAIN_RATIO, seed=SEED)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]



Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

In [12]:
# Number of entries in the tokenizer's vocabulary (roughly 120k for this checkpoint).
original_vocab_size = tokenizer.vocab_size
#new_vocab_size = original_vocab_size + len(FINANCIAL_TERMS)

# Report the embedding-matrix shape before any vocabulary growth.
shape_report = f"Original: {original_vocab_size} x 768"
print(shape_report)
#print(f"Updated: {new_vocab_size} x 768")

Original: 119547 x 768


In [13]:
# ========================
# 5. Model Initialization
# ========================
model = DistilBertForMaskedLM.from_pretrained(MODEL_NAME)
# Match the embedding matrix to the tokenizer length (relevant once tokens are added).
model.resize_token_embeddings(len(tokenizer))

# Selective fine-tuning: only parameters whose name contains one of the
# substrings below are trained; everything else, including the input
# embeddings, is frozen.
#
# The earlier version first unfroze the embeddings and then ran this same
# name-based loop with an `else: requires_grad = False` branch, which froze them
# again. That dead code is removed; the resulting trainable set is unchanged.
# To really train the embeddings (needed if new tokens are added), add
# "embeddings" to this tuple.
trainable_components = (
    "transformer.layer.5",  # Last transformer layer (layer 5)
    "vocab_transform",       # MLM head components
    "vocab_layer_norm",
    "vocab_projector.bias",  # Only unfreeze the bias (weight may be tied)
)

# Single pass: a parameter is trainable exactly when its name matches a component.
for name, param in model.named_parameters():
    param.requires_grad = any(component in name for component in trainable_components)

for name, param in model.named_parameters():
    print(f"{name} - Trainable: {param.requires_grad}")

# Compact summary so the frozen/trainable split can be checked at a glance.
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {n_trainable:,} / {n_total:,}")

# # Initialize new embeddings (if terms added)
# if len(FINANCIAL_TERMS) > 0:
#     initialize_new_embeddings(model, tokenizer, FINANCIAL_TERMS)

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

distilbert.embeddings.word_embeddings.weight - Trainable: False
distilbert.embeddings.position_embeddings.weight - Trainable: False
distilbert.embeddings.LayerNorm.weight - Trainable: False
distilbert.embeddings.LayerNorm.bias - Trainable: False
distilbert.transformer.layer.0.attention.q_lin.weight - Trainable: False
distilbert.transformer.layer.0.attention.q_lin.bias - Trainable: False
distilbert.transformer.layer.0.attention.k_lin.weight - Trainable: False
distilbert.transformer.layer.0.attention.k_lin.bias - Trainable: False
distilbert.transformer.layer.0.attention.v_lin.weight - Trainable: False
distilbert.transformer.layer.0.attention.v_lin.bias - Trainable: False
distilbert.transformer.layer.0.attention.out_lin.weight - Trainable: False
distilbert.transformer.layer.0.attention.out_lin.bias - Trainable: False
distilbert.transformer.layer.0.sa_layer_norm.weight - Trainable: False
distilbert.transformer.layer.0.sa_layer_norm.bias - Trainable: False
distilbert.transformer.layer.0.ffn

In [19]:
# ========================
# 6. Training Setup
# ========================
# Masked-language-modeling collator: masks tokens with probability 0.15 and
# pads each batch to a multiple of 8.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
    pad_to_multiple_of=8  # Optimizes GPU utilization
)

training_args = TrainingArguments(
    output_dir="./finbert-mlm",
    logging_dir="./logs",
    seed=SEED,  # tie the Trainer's seed to the notebook-wide constant
    num_train_epochs=5,
    per_device_train_batch_size=16,  # Reduced from 64
    per_device_eval_batch_size=16,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; confirm against the installed version before renaming.
    evaluation_strategy="steps",
    eval_steps=100,
    # Save a checkpoint at every evaluation step. load_best_model_at_end can
    # only restore a step that was actually checkpointed; with the default save
    # interval a short run is only checkpointed at its end, so the "best" model
    # was always the last one.
    save_strategy="steps",
    save_steps=100,
    logging_steps=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,  # Better than fixed steps
    lr_scheduler_type="inverse_sqrt",
    gradient_accumulation_steps=3,
    fp16=torch.cuda.is_available(),  # mixed precision needs a CUDA device
    report_to="tensorboard",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_ds["train"],
    eval_dataset=split_ds["test"],
    data_collator=data_collator,
    # Stop when eval_loss has not improved for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [15]:
# ========================
# 7. Baseline Model Evaluation
# ========================

# Load an untouched copy of the pre-trained checkpoint (no layer freezing) to
# serve as the reference point for the fine-tuned model.
baseline_model = DistilBertForMaskedLM.from_pretrained(MODEL_NAME)
#baseline_model.resize_token_embeddings(len(tokenizer))  # Match tokenizer changes

# Evaluation-only Trainer: scratch output directory, no experiment reporting.
baseline_args = TrainingArguments(output_dir="./tmp", report_to="none")
trainer_baseline = Trainer(
    model=baseline_model,
    args=baseline_args,
    data_collator=data_collator,
)

# Perplexity is exp(eval_loss) on the held-out split.
baseline_results = trainer_baseline.evaluate(eval_dataset=split_ds["test"])
baseline_loss = torch.tensor(baseline_results["eval_loss"])
baseline_perplexity = torch.exp(baseline_loss).item()
print(f"Baseline Perplexity (Pre-Fine-Tuning): {baseline_perplexity:.2f}")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Baseline Perplexity (Pre-Fine-Tuning): 12.11


In [None]:
# from transformers import pipeline
# fill_mask = pipeline("fill-mask", model=baseline_model, tokenizer=tokenizer)
# print(fill_mask("Revenue grew by [MASK]% this quarter."))

In [None]:
# fill_mask = pipeline("fill-mask", model=baseline_model, tokenizer=tokenizer)
# print(fill_mask("\"Юнипро\" [MASK] выработку электроэнергии в 1 полугодии на 17,1%"))

In [20]:
# ========================
# 8. Training & Evaluation
# ========================
# Run fine-tuning with the Trainer configured in the training-setup cell; the
# returned object's `.metrics` dict is inspected in later cells.
train_results = trainer.train()

  self.pid = os.fork()


Step,Training Loss,Validation Loss
100,2.0683,1.936039
200,2.041,1.842458
300,1.9759,1.833567
400,1.9517,1.801491


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


In [21]:
# Load the TensorBoard notebook extension and open it on ./logs, the same
# directory passed as logging_dir to TrainingArguments.
%load_ext tensorboard
%tensorboard --logdir ./logs --port 6006

<IPython.core.display.Javascript object>

In [26]:
# Value of torch.cuda.memory_allocated() scaled by 1e-6
# (the call reports bytes, so this is megabytes).
torch.cuda.memory_allocated() * (1e-6)

5513.914368

In [23]:
# Ask PyTorch to release cached GPU memory it is no longer using.
torch.cuda.empty_cache()

In [25]:
# Request expandable segments from PyTorch's CUDA allocator via its environment variable.
# NOTE(review): this cell sits after training, i.e. after CUDA has already been
# used. The setting presumably has to be in place before the allocator is first
# initialized to have any effect - confirm, and if so move it to the
# configuration cell at the top of the notebook.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [27]:
# Evaluate the fine-tuned model on the Trainer's eval dataset and report
# perplexity, computed as exp(eval_loss).
eval_results = trainer.evaluate()
final_eval_loss = torch.tensor(eval_results["eval_loss"])
perplexity = torch.exp(final_eval_loss).item()
print(f"Final Perplexity: {perplexity:.2f}")

  self.pid = os.fork()


Final Perplexity: 6.04


In [28]:
# Metrics dict returned by trainer.train() (runtime, throughput, train_loss, epoch).
train_results.metrics

{'train_runtime': 1263.8219,
 'train_samples_per_second': 30.261,
 'train_steps_per_second': 0.317,
 'total_flos': 2540034945635616.0,
 'train_loss': 2.0513448667526246,
 'epoch': 5.0}

In [29]:
# The "train_loss" entry of the same metrics dict.
train_results.metrics["train_loss"]

2.0513448667526246

In [30]:
# First two entries of the Trainer's log history (one dict per logging event).
trainer.state.log_history[:2]

[{'loss': 2.3378,
  'grad_norm': 147344.265625,
  'learning_rate': 7.5e-06,
  'epoch': 0.125,
  'step': 10},
 {'loss': 2.279,
  'grad_norm': 114982.6640625,
  'learning_rate': 1.5e-05,
  'epoch': 0.25,
  'step': 20}]

In [31]:
# ========================
# 9. Monitoring (Optional)
# ========================
# Plot training loss against the global step at which each value was logged.
# (Plotting the bare list put the log-entry index on an axis labelled "Steps".)
train_logs = [log for log in trainer.state.log_history if "loss" in log]
if train_logs:
    # savefig does not create missing directories.
    os.makedirs("/kaggle/working/imgs", exist_ok=True)
    fig, ax = plt.subplots()
    ax.plot([log["step"] for log in train_logs], [log["loss"] for log in train_logs])
    ax.set(title="Training Loss Curve", xlabel="Steps", ylabel="Loss")
    # Save the plot to a file
    fig.savefig("/kaggle/working/imgs/training_loss_curve.png")
    plt.close(fig)  # Close the figure to free up memory
else:
    print("No training loss data to plot!")

In [32]:
# Plot validation loss against the global step of each evaluation.
# The guard tests the filtered entries, so an empty figure is never saved when
# the history contains only training logs.
eval_logs = [log for log in trainer.state.log_history if "eval_loss" in log]
if eval_logs:
    # savefig does not create missing directories.
    os.makedirs("/kaggle/working/imgs", exist_ok=True)
    fig, ax = plt.subplots()
    ax.plot([log["step"] for log in eval_logs], [log["eval_loss"] for log in eval_logs])
    ax.set(title="Evaluation Loss Curve", xlabel="Steps", ylabel="Loss")
    # The file name keeps its original spelling so existing references to it still resolve.
    fig.savefig("/kaggle/working/imgs/evalutation_loss_curve.png")
    plt.close(fig)  # Close the figure to free up memory
else:
    print("No evaluation loss data to plot!")

In [33]:
# Plot the logged gradient norm against the global step of each log entry.
# The guard tests the filtered entries, so an empty figure is never saved.
grad_logs = [log for log in trainer.state.log_history if "grad_norm" in log]
if grad_logs:
    # savefig does not create missing directories.
    os.makedirs("/kaggle/working/imgs", exist_ok=True)
    fig, ax = plt.subplots()
    ax.plot([log["step"] for log in grad_logs], [log["grad_norm"] for log in grad_logs])
    ax.set(title="Gradient", xlabel="Steps", ylabel="Gradient")
    fig.savefig("/kaggle/working/imgs/gradient_per_step_curve.png")
    plt.close(fig)  # Close the figure to free up memory
else:
    print("No gradient norm data to plot!")

In [37]:
# Compare fill-mask predictions of the fine-tuned model and the untouched
# baseline on the same sentences. The device is chosen at runtime so the cell
# also works without a GPU (-1 selects the CPU for a pipeline).
pipeline_device = 0 if torch.cuda.is_available() else -1
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer, device=pipeline_device)
fill_mask_base = pipeline("fill-mask", model=baseline_model, tokenizer=tokenizer, device=pipeline_device)

probe_sentences = [
    "Revenue grew by [MASK]% this quarter.",
    "\"Юнипро\" [MASK] выработку электроэнергии в 1 полугодии на 17,1%",
    "Halliburton to [MASK] Several Contracts in Russia by Mid-May",
]

# For each sentence: fine-tuned predictions first, then the baseline's.
for sentence in probe_sentences:
    print(fill_mask(sentence), '\n')
    print(fill_mask_base(sentence), '\n')

[{'score': 0.04261771962046623, 'token': 10150, 'token_str': '10', 'sequence': 'Revenue grew by 10 % this quarter.'}, {'score': 0.0355437733232975, 'token': 10197, 'token_str': '20', 'sequence': 'Revenue grew by 20 % this quarter.'}, {'score': 0.03483904153108597, 'token': 10709, 'token_str': '60', 'sequence': 'Revenue grew by 60 % this quarter.'}, {'score': 0.03136197105050087, 'token': 10208, 'token_str': '15', 'sequence': 'Revenue grew by 15 % this quarter.'}, {'score': 0.027442846447229385, 'token': 10244, 'token_str': '30', 'sequence': 'Revenue grew by 30 % this quarter.'}] 

[{'score': 0.03788019344210625, 'token': 10832, 'token_str': '80', 'sequence': 'Revenue grew by 80 % this quarter.'}, {'score': 0.03353461995720863, 'token': 11417, 'token_str': '75', 'sequence': 'Revenue grew by 75 % this quarter.'}, {'score': 0.03207072243094444, 'token': 10709, 'token_str': '60', 'sequence': 'Revenue grew by 60 % this quarter.'}, {'score': 0.03135935589671135, 'token': 11978, 'token_str': 

In [38]:
# Show how the tokenizer splits two Russian financial sentences into word pieces.
sample_sentences = (
    "Юнипро увеличила выработку электроэнергии в 1 полугодии на 17,1%",
    "«Газпром» сообщил о росте объема поставок в Китай на 60%",
)
for sentence in sample_sentences:
    print(tokenizer.tokenize(sentence))

['Ю', '##ни', '##про', 'у', '##вели', '##чила', 'вы', '##работку', 'электр', '##о', '##эн', '##ер', '##гии', 'в', '1', 'полу', '##годи', '##и', 'на', '17', ',', '1', '%']
['«', 'Г', '##аз', '##про', '##м', '»', 'со', '##об', '##щил', 'о', 'рост', '##е', 'об', '##ъ', '##ема', 'поста', '##вок', 'в', 'Китай', 'на', '60', '%']


In [39]:
# Write the fine-tuned model and the tokenizer into the same directory.
finetuned_dir = "./finbert-mlm/finetuned_v2"
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

('./finbert-mlm/finetuned_v2/tokenizer_config.json',
 './finbert-mlm/finetuned_v2/special_tokens_map.json',
 './finbert-mlm/finetuned_v2/vocab.txt',
 './finbert-mlm/finetuned_v2/added_tokens.json',
 './finbert-mlm/finetuned_v2/tokenizer.json')