In [None]:
!pip install transformers datasets accelerate optuna



In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import os
import pandas as pd
import optuna
import numpy as np

In [None]:
# Define the directory where your files are located
directory = '/content/'  # Update this path to your specific directory if needed

# Read every .xlsx workbook found directly in `directory`.
# sorted() fixes the row order, because os.listdir returns names in arbitrary order.
excel_frames = [
    pd.read_excel(os.path.join(directory, filename))
    for filename in sorted(os.listdir(directory))
    if filename.endswith(".xlsx")
]

# Concatenate once instead of growing the frame inside the loop, which re-copied
# every previously loaded row on each iteration. When no workbook is found,
# fall back to an empty frame, matching what the loop-based version produced.
mdf = pd.concat(excel_frames, axis=0) if excel_frames else pd.DataFrame()

In [None]:
# Keep only the two columns used for training, renamed to "text" / "labels".
# Labels are coerced to numbers; anything non-numeric becomes NaN.
df = pd.DataFrame(
    {
        "text": mdf["פעולות"],
        "labels": pd.to_numeric(mdf["ניקוד"], errors='coerce'),
    }
)
# Keep rows that have text and a label in the 0..10 range (between() is False for NaN,
# so unparseable labels are dropped here too). Rows without text are removed because
# they would reach the tokenizer as missing values.
# .copy() makes the filtered frame independent, so the assignment below does not
# operate on a slice of the unfiltered frame (SettingWithCopyWarning).
valid_rows = df["text"].notna() & df["labels"].notna() & df["labels"].between(0, 10)
df = df[valid_rows].copy()
df["labels"] = df["labels"].astype(int)
df.to_csv("hebrew_sentiment.csv", index=False)

In [None]:
# Preprocessing
# Name of the pretrained checkpoint; the same name is used when the
# classification model is instantiated inside `objective`.
model_name = "distilbert-base-multilingual-cased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# NOTE: the next cell repeats these steps with test_size=0.3 and overwrites every
# name defined here, so running both cells leaves the 70/30 split in effect.
dataset = load_dataset("csv", data_files="hebrew_sentiment.csv")
# A fixed seed keeps the train/validation partition identical across kernel restarts;
# without it every run evaluates on a different subset.
dataset = dataset["train"].train_test_split(test_size=0.2, shuffle=True, seed=42)  # 80% train, 20% validation
train_dataset = dataset["train"]
eval_dataset = dataset["test"]


def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)


encoded_dataset_train = train_dataset.map(preprocess_function, batched=True)
encoded_dataset_eval = eval_dataset.map(preprocess_function, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

In [None]:
dataset = load_dataset("csv", data_files="hebrew_sentiment.csv")
# A fixed seed keeps the train/validation partition identical across kernel restarts;
# without it every run (and every hyperparameter comparison) uses a different subset.
dataset = dataset["train"].train_test_split(test_size=0.3, shuffle=True, seed=42)  # 70% train, 30% validation
train_dataset = dataset["train"]
eval_dataset = dataset["test"]


def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)


encoded_dataset_train = train_dataset.map(preprocess_function, batched=True)
encoded_dataset_eval = eval_dataset.map(preprocess_function, batched=True)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Map:   0%|          | 0/211 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

In [None]:
# Restrict Optuna's logger to WARNING level and above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
def func(a, b):
  """Combine two values into one score: a + 0.5*b + 0.3*|a - b|.

  The last term adds a penalty proportional to the absolute gap between
  `a` and `b`. Works element-wise on NumPy arrays as well as on scalars.
  """
  gap = np.abs(a - b)
  weighted_sum = a + 0.5 * b
  return weighted_sum + 0.3 * gap

In [None]:
def _best_combined_loss(log_history, fallback=10):
  """Return the smallest func(train_loss, eval_loss) found in a Trainer log history.

  Training and evaluation metrics are stored in separate log entries: an entry
  carries either 'loss' or 'eval_loss'. Each evaluation entry is therefore
  paired with the most recent training loss logged before it. Evaluations that
  happen before the first training loss is logged are skipped.

  Args:
    log_history: iterable of dicts, as found in `trainer.state.log_history`.
    fallback: value returned when no pair scores below it (also the result
      when no train/eval pair exists at all).

  Returns:
    The minimum combined score as a float, capped from above by `fallback`.
  """
  best = fallback
  latest_train_loss = None
  for log in log_history:
    if 'loss' in log:  # training-loss entry
      latest_train_loss = log['loss']
    if 'eval_loss' in log and latest_train_loss is not None:  # evaluation entry
      best = min(best, func(latest_train_loss, log['eval_loss']))
  return float(best)


def objective(trial):
  """Optuna objective: fine-tune DistilBERT with sampled hyperparameters and score the run.

  Samples a learning rate and a weight decay (both log-uniform), trains a fresh
  sequence-classification model on `encoded_dataset_train`, evaluates on
  `encoded_dataset_eval` after every epoch, and returns the best (lowest)
  combined train/validation score computed by `func`.

  Args:
    trial: the Optuna trial used to sample hyperparameters.

  Returns:
    The lowest func(train_loss, eval_loss) observed during training, or 10 when
    no score below 10 was observed.
  """
  lr = trial.suggest_float('learning_rate', 1e-6, 1e-3, log=True)
  wd = trial.suggest_float('weight_decay', 5e-5, 5e-4, log=True)

  # 11 classes: the labels are the integers 0..10.
  model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=11).to(device)
  training_args = TrainingArguments(
      # One output directory per hyperparameter combination.
      output_dir=f"wd{wd},lr{lr}",
      overwrite_output_dir=True,
      evaluation_strategy="epoch",
      per_device_train_batch_size=128,
      per_device_eval_batch_size=128,
      num_train_epochs=50,
      learning_rate=lr,
      weight_decay=wd,
      logging_dir='./logs',
      logging_steps=10,
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=encoded_dataset_train,
      eval_dataset=encoded_dataset_eval,
  )
  trainer.train()

  # The previous implementation looked for 'loss' and 'eval_loss' in the SAME
  # log entry, which never happens, so every trial returned the sentinel 10.
  return _best_combined_loss(trainer.state.log_history)

In [None]:
# Create the Optuna study; lower objective values are treated as better.
study = optuna.create_study(direction='minimize')

In [None]:
# Run 15 trials of the hyperparameter search; `objective` is already a
# one-argument callable, so it is handed to Optuna directly.
study.optimize(objective, n_trials=15)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.387444
2,No log,2.386835
3,No log,2.386236
4,No log,2.385597
5,2.391000,2.385003
6,2.391000,2.384403
7,2.391000,2.383866
8,2.391000,2.383294
9,2.391000,2.382772
10,2.389800,2.382273


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.382701
2,No log,2.375618
3,No log,2.369274
4,No log,2.363829
5,2.364900,2.360544
6,2.364900,2.357855
7,2.364900,2.354988
8,2.364900,2.350634
9,2.364900,2.345909
10,2.301100,2.340788


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.390621
2,No log,2.389714
3,No log,2.38883
4,No log,2.387979
5,2.392200,2.38715
6,2.392200,2.386338
7,2.392200,2.385599
8,2.392200,2.384861
9,2.392200,2.384166
10,2.385900,2.383486


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.365231
2,No log,2.353908
3,No log,2.342461
4,No log,2.312117
5,2.300200,2.278534
6,2.300200,2.251741
7,2.300200,2.23326
8,2.300200,2.232935
9,2.300200,2.222828
10,1.952700,2.210631


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.380753
2,No log,2.372086
3,No log,2.365131
4,No log,2.360754
5,2.358600,2.356978
6,2.358600,2.352823
7,2.358600,2.348428
8,2.358600,2.34237
9,2.358600,2.334788
10,2.277000,2.325328


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.381666
2,No log,2.373728
3,No log,2.366806
4,No log,2.361444
5,2.361300,2.358801
6,2.361300,2.355596
7,2.361300,2.35196
8,2.361300,2.346633
9,2.361300,2.340683
10,2.288600,2.333693


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.390875
2,No log,2.390201
3,No log,2.38954
4,No log,2.388904
5,2.393100,2.388279
6,2.393100,2.387665
7,2.393100,2.387101
8,2.393100,2.38653
9,2.393100,2.385996
10,2.388500,2.385475


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.3904
2,No log,2.389294
3,No log,2.388216
4,No log,2.387187
5,2.391500,2.386186
6,2.391500,2.385223
7,2.391500,2.38434
8,2.391500,2.383456
9,2.391500,2.38262
10,2.383600,2.381814


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.389398
2,No log,2.387407
3,No log,2.385511
4,No log,2.383734
5,2.388100,2.382025
6,2.388100,2.38046
7,2.388100,2.37911
8,2.388100,2.37775
9,2.388100,2.376507
10,2.372700,2.375286


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.367102
2,No log,2.355661
3,No log,2.345407
4,No log,2.31991
5,2.308500,2.29085
6,2.308500,2.263891
7,2.308500,2.241534
8,2.308500,2.229729
9,2.308500,2.218815
10,1.995500,2.209554


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.335629
2,No log,2.369539
3,No log,2.323927
4,No log,2.399687
5,2.344900,2.384058
6,2.344900,2.472107
7,2.344900,2.238734
8,2.344900,2.319177
9,2.344900,2.223312
10,2.130400,2.482785


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.387114
2,No log,2.383265
3,No log,2.3797
4,No log,2.376589
5,2.380200,2.373562
6,2.380200,2.370469
7,2.380200,2.367684
8,2.380200,2.365079
9,2.380200,2.364228
10,2.346700,2.36388


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.372028
2,No log,2.350387
3,No log,2.278782
4,No log,2.372604
5,2.207100,2.266813
6,2.207100,2.282587
7,2.207100,2.294528
8,2.207100,2.43104
9,2.207100,2.449614
10,1.545400,2.479812


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.388323
2,No log,2.385437
3,No log,2.382763
4,No log,2.380265
5,2.384400,2.378059
6,2.384400,2.375956
7,2.384400,2.374057
8,2.384400,2.371917
9,2.384400,2.369807
10,2.360400,2.367845


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.361722
2,No log,2.349473
3,No log,2.32761
4,No log,2.271891
5,2.264700,2.229826
6,2.264700,2.218854
7,2.264700,2.227448
8,2.264700,2.244953
9,2.264700,2.240465
10,1.799900,2.246759


In [None]:
# Display the stored record of the trial at index 4 of the study's trial list.
study.trials[4]

FrozenTrial(number=4, state=TrialState.COMPLETE, values=[10.0], datetime_start=datetime.datetime(2024, 11, 1, 18, 58, 47, 104582), datetime_complete=datetime.datetime(2024, 11, 1, 19, 0, 57, 586006), params={'learning_rate': 2.0576759372081784e-05, 'weight_decay': 5.4738198513553445e-05}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.001, log=True, low=1e-06, step=None), 'weight_decay': FloatDistribution(high=0.0005, log=True, low=5e-05, step=None)}, trial_id=4, value=None)

In [None]:
# prompt: zip and download /content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100
# Recursively archive the checkpoint-100 directory of one run; the directory name
# carries the run's weight decay and learning rate.
!zip -r /content/checkpoint-100.zip /content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100
# Colab-only helper module; files.download sends the archive to the browser.
from google.colab import files
files.download('/content/checkpoint-100.zip')

  adding: content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100/ (stored 0%)
  adding: content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100/optimizer.pt (deflated 71%)
  adding: content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100/training_args.bin (deflated 51%)
  adding: content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100/scheduler.pt (deflated 56%)
  adding: content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100/rng_state.pth (deflated 25%)
  adding: content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100/trainer_state.json (deflated 82%)
  adding: content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100/model.safetensors (deflated 7%)
  adding: content/wd5.4738198513553445e-05,lr2.0576759372081784e-05/checkpoint-100/config.json (deflated 56%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>