<a href="https://colab.research.google.com/github/Hillascher5/nlp-tweets-sentiment-analysis/blob/main/DeBerta_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Try another model for the task

In [1]:
# # Needed for Google Colab
# !pip install --quiet evaluate transformers optuna datasets nltk scikit-learn
# !pip install numpy==1.26.4

In [2]:
%env CUDA_LAUNCH_BLOCKING=1

from wordcloud import WordCloud, STOPWORDS
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset
from torch.utils.data import DataLoader, TensorDataset
from optuna.pruners import MedianPruner
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F
import os
import re
import string
import time
import random
import glob
import nltk
import evaluate
import transformers
import torch
import optuna
import wandb
wandb.login()
# API key - 0cbd7fe3cffd71df993b30edb4fa0db94f114413

num_train_samples = 5000
os.environ["WANDB_PROJECT"] = f"tweet-sentiment-classification_split_to_test_maxl_128_deberta_{num_train_samples}_samples_optuna"
os.environ["WANDB_INIT_TIMEOUT"] = "180"

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

env: CUDA_LAUNCH_BLOCKING=1
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[34m[1mwandb[0m: Currently logged in as: [33mhillas[0m ([33mhillas-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
def set_seed_all(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed_all(42)

In [4]:
# Load data
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_train.csv', encoding='latin1')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_test.csv', encoding='latin1')

In [5]:
# Merge and shuffle for better stratified splits
df_full = pd.concat([df_train, df_test], ignore_index=True)
df_full = df_full.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [6]:
# Try without pre-processing
is_preprocessed = "no_preprocess"
df_full["clean_text"] = df_full["OriginalTweet"]

In [7]:
# Mapping sentiments to unique numeric IDs
unique_labels = sorted(df_full["Sentiment"].unique())
label2id = {label: idx for idx, label in enumerate(unique_labels)}
df_full["label"] = df_full["Sentiment"].map(label2id)

In [8]:
# Stratified split: 70% train, 15% val, 15% test
train_val_df, test_df = train_test_split(df_full, test_size=0.15, stratify=df_full["label"], random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1765, stratify=train_val_df["label"], random_state=42)

# Confirm sizes
print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))

Train size: 31466
Val size: 6745
Test size: 6744


In [9]:
train_subset_df, _ = train_test_split(
    train_df[["clean_text", "label"]],
    train_size=num_train_samples,
    stratify=train_df["label"],
    random_state=42
)

val_subset_df, _ = train_test_split(
    val_df[["clean_text", "label"]],
    train_size=500,
    stratify=val_df["label"],
    random_state=42
)

In [10]:
# Choose pretrained models
deberta_model_name = "microsoft/deberta-v3-base"

sentiment_labels = df_full['Sentiment'].unique()
n_labels = len(sentiment_labels)

# Load BERT tokenizer and model
deberta_tokenizer = AutoTokenizer.from_pretrained(deberta_model_name)
deberta_model = AutoModelForSequenceClassification.from_pretrained(deberta_model_name, num_labels=n_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Tokenize function
def tokenize_function_deberta(examples):
    return deberta_tokenizer(examples["clean_text"], truncation=True, padding='max_length', max_length=128)

In [12]:
# Convert DataFrame to Hugging Face Dataset
hf_subset_train = Dataset.from_pandas(train_subset_df)
hf_subset_val = Dataset.from_pandas(val_subset_df)

hf_train = Dataset.from_pandas(train_df[["clean_text", "label"]])
hf_val = Dataset.from_pandas(val_df[["clean_text", "label"]])
hf_test = Dataset.from_pandas(test_df[["clean_text", "label"]])

In [13]:
# Tokenize subsets
# Tokenize for DeBERTa
tokenized_deberta_train_sub = hf_subset_train.map(tokenize_function_deberta, batched=True)
tokenized_deberta_train_sub.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_deberta_val_sub = hf_subset_val.map(tokenize_function_deberta, batched=True)
tokenized_deberta_val_sub.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [14]:
# Tokenize full dataset
# Tokenize for BERT
tokenized_deberta_train = hf_train.map(tokenize_function_deberta, batched=True)
tokenized_deberta_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_deberta_val = hf_val.map(tokenize_function_deberta, batched=True)
tokenized_deberta_val.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_deberta_test = hf_test.map(tokenize_function_deberta, batched=True)
tokenized_deberta_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/31466 [00:00<?, ? examples/s]

Map:   0%|          | 0/6745 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

In [15]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
        "f1_macro": f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"],
        "precision_macro": precision_metric.compute(predictions=predictions, references=labels, average="macro")["precision"],
        "recall_macro":    recall_metric.compute(predictions=predictions, references=labels, average="macro")["recall"],
    }

In [16]:
def build_trainer(model_checkpoint, trial, run_prefix, train_dataset, val_dataset):
    # Sample hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    n_samples = len(train_dataset)

    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=5)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    run_name = f"{run_prefix}-ep{num_epochs}-lr{learning_rate}-bs{batch_size}-samples{n_samples}-run{int(time.time())}-{is_preprocessed}"

    args = TrainingArguments(
        output_dir=f"./results/{run_prefix}/{run_name}",
        disable_tqdm=True,
        fp16=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        weight_decay=0.01,
        label_smoothing_factor=0.1,
        load_best_model_at_end=True,
        save_total_limit=1,
        logging_strategy="epoch",
        logging_dir=f"./logs/{run_prefix}/{run_name}",
        run_name=run_name,
        report_to="wandb",
        metric_for_best_model="f1_macro",
        greater_is_better=True
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    return trainer

In [17]:
def objective_deberta(trial):
    trainer = build_trainer(
        model_checkpoint="microsoft/deberta-v3-base",
        trial=trial,
        run_prefix="deberta",
        train_dataset=tokenized_deberta_train_sub,
        val_dataset=tokenized_deberta_val_sub
    )
    trainer.train()
    eval_result = trainer.evaluate()
    wandb.finish()
    return eval_result["eval_f1_macro"]

In [18]:
study_deberta = optuna.create_study(direction="maximize",
                                 pruner=MedianPruner(n_startup_trials=2, n_warmup_steps=1),
                                 study_name=f"deberta_study_stratify_{is_preprocessed}",
                                 storage=f"sqlite:////content/drive/MyDrive/Colab Notebooks/nlp_project/optuna/deberta_study_stratify_maxl_128_{is_preprocessed}_{num_train_samples}_samples_optuna.db",
                                 load_if_exists=True)
study_deberta.optimize(objective_deberta, n_trials=5)
wandb.finish()

[I 2025-08-17 23:41:55,542] A new study created in RDB with name: deberta_study_stratify_no_preprocess
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.3875, 'grad_norm': 9.497136116027832, 'learning_rate': 9.53852034017814e-06, 'epoch': 1.0}
{'eval_loss': 1.1626235246658325, 'eval_accuracy': 0.568, 'eval_f1_macro': 0.5749133408001545, 'eval_precision_macro': 0.5964848456318588, 'eval_recall_macro': 0.5754166573725537, 'eval_runtime': 3.1739, 'eval_samples_per_second': 157.535, 'eval_steps_per_second': 19.849, 'epoch': 1.0}
{'loss': 1.0413, 'grad_norm': 11.81839370727539, 'learning_rate': 7.156612435595985e-06, 'epoch': 2.0}
{'eval_loss': 1.0584924221038818, 'eval_accuracy': 0.656, 'eval_f1_macro': 0.6669337480217985, 'eval_precision_macro': 0.6688928628555484, 'eval_recall_macro': 0.6909328369647864, 'eval_runtime': 2.9785, 'eval_samples_per_second': 167.868, 'eval_steps_per_second': 21.151, 'epoch': 2.0}
{'loss': 0.8752, 'grad_norm': 6.349592685699463, 'learning_rate': 4.777426711476209e-06, 'epoch': 3.0}
{'eval_loss': 0.9782894253730774, 'eval_accuracy': 0.718, 'eval_f1_macro': 0.7221448587701296, 'eval_precision_macro':

0,1
eval/accuracy,▁▅████
eval/f1_macro,▁▅████
eval/loss,█▄▁▄▅▅
eval/precision_macro,▁▅█▇██
eval/recall_macro,▁▆▇▇██
eval/runtime,█▂▁▁▂▂
eval/samples_per_second,▁▆██▇▇
eval/steps_per_second,▁▆██▇▇
train/epoch,▁▁▃▃▅▅▆▆████
train/global_step,▁▁▃▃▅▅▆▆████

0,1
eval/accuracy,0.718
eval/f1_macro,0.72658
eval/loss,1.08972
eval/precision_macro,0.72248
eval/recall_macro,0.73254
eval/runtime,2.9559
eval/samples_per_second,169.155
eval/steps_per_second,21.313
total_flos,2302325034240000.0
train/epoch,5.0


[I 2025-08-17 23:52:46,227] Trial 0 finished with value: 0.7265770369546816 and parameters: {'learning_rate': 1.1909539522910777e-05, 'batch_size': 8, 'num_train_epochs': 5}. Best is trial 0 with value: 0.7265770369546816.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.496, 'grad_norm': 8.448129653930664, 'learning_rate': 9.968491480576731e-06, 'epoch': 1.0}
{'eval_loss': 1.3796602487564087, 'eval_accuracy': 0.404, 'eval_f1_macro': 0.40541275191354914, 'eval_precision_macro': 0.4425529145444256, 'eval_recall_macro': 0.43595496152923563, 'eval_runtime': 0.9957, 'eval_samples_per_second': 502.181, 'eval_steps_per_second': 16.07, 'epoch': 1.0}
{'loss': 1.2046, 'grad_norm': 8.468463897705078, 'learning_rate': 6.696159930721756e-06, 'epoch': 2.0}
{'eval_loss': 1.1703674793243408, 'eval_accuracy': 0.538, 'eval_f1_macro': 0.5474609725805605, 'eval_precision_macro': 0.5443220080456173, 'eval_recall_macro': 0.5775005516413517, 'eval_runtime': 0.9796, 'eval_samples_per_second': 510.416, 'eval_steps_per_second': 16.333, 'epoch': 2.0}
{'loss': 1.0374, 'grad_norm': 6.587419033050537, 'learning_rate': 3.3783793315632386e-06, 'epoch': 3.0}
{'eval_loss': 1.1053261756896973, 'eval_accuracy': 0.602, 'eval_f1_macro': 0.6130777989142591, 'eval_precision_macro

0,1
eval/accuracy,▁▅███
eval/f1_macro,▁▆███
eval/loss,█▃▁▁▁
eval/precision_macro,▁▅███
eval/recall_macro,▁▆███
eval/runtime,▄▂▂▁█
eval/samples_per_second,▅▇▇█▁
eval/steps_per_second,▅▇▇█▁
train/epoch,▁▁▃▃▆▆████
train/global_step,▁▁▃▃▆▆████

0,1
eval/accuracy,0.614
eval/f1_macro,0.62527
eval/loss,1.10453
eval/precision_macro,0.61581
eval/recall_macro,0.64179
eval/runtime,1.0222
eval/samples_per_second,489.126
eval/steps_per_second,15.652
total_flos,1841860027392000.0
train/epoch,4.0


[I 2025-08-17 23:55:42,148] Trial 1 finished with value: 0.6252701838781832 and parameters: {'learning_rate': 1.3271122396634068e-05, 'batch_size': 32, 'num_train_epochs': 4}. Best is trial 0 with value: 0.7265770369546816.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.3957, 'grad_norm': 9.429511070251465, 'learning_rate': 2.0164014257421895e-05, 'epoch': 1.0}
{'eval_loss': 1.160599946975708, 'eval_accuracy': 0.566, 'eval_f1_macro': 0.5790444435943002, 'eval_precision_macro': 0.5825081298485554, 'eval_recall_macro': 0.5943735015745084, 'eval_runtime': 1.6767, 'eval_samples_per_second': 298.204, 'eval_steps_per_second': 19.085, 'epoch': 1.0}
{'loss': 1.0008, 'grad_norm': 11.242101669311523, 'learning_rate': 1.5131627793176432e-05, 'epoch': 2.0}
{'eval_loss': 0.9868803024291992, 'eval_accuracy': 0.688, 'eval_f1_macro': 0.6959411957055813, 'eval_precision_macro': 0.693741518948275, 'eval_recall_macro': 0.7178653038236954, 'eval_runtime': 1.6781, 'eval_samples_per_second': 297.959, 'eval_steps_per_second': 19.069, 'epoch': 2.0}
{'loss': 0.817, 'grad_norm': 3.981914758682251, 'learning_rate': 1.0099241328930968e-05, 'epoch': 3.0}
{'eval_loss': 0.9320616126060486, 'eval_accuracy': 0.74, 'eval_f1_macro': 0.7436865026791278, 'eval_precision_macro'

0,1
eval/accuracy,▁▆████
eval/f1_macro,▁▆████
eval/loss,█▃▁▃▄▁
eval/precision_macro,▁▆█▇██
eval/recall_macro,▁▇████
eval/runtime,▂▂▃▁█▅
eval/samples_per_second,▇▇▆█▁▄
eval/steps_per_second,▇▇▆█▁▄
train/epoch,▁▁▃▃▅▅▆▆████
train/global_step,▁▁▃▃▅▅▆▆████

0,1
eval/accuracy,0.74
eval/f1_macro,0.74369
eval/loss,0.93206
eval/precision_macro,0.7477
eval/recall_macro,0.74049
eval/runtime,1.6883
eval/samples_per_second,296.158
eval/steps_per_second,18.954
total_flos,2302325034240000.0
train/epoch,5.0


[I 2025-08-18 00:01:54,948] Trial 2 finished with value: 0.7436865026791278 and parameters: {'learning_rate': 2.5161932321227323e-05, 'batch_size': 16, 'num_train_epochs': 5}. Best is trial 2 with value: 0.7436865026791278.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.3846, 'grad_norm': 11.367465019226074, 'learning_rate': 7.760374929387692e-06, 'epoch': 1.0}
{'eval_loss': 1.1807622909545898, 'eval_accuracy': 0.57, 'eval_f1_macro': 0.5779813478111506, 'eval_precision_macro': 0.6039227782637386, 'eval_recall_macro': 0.5760620711767765, 'eval_runtime': 2.9514, 'eval_samples_per_second': 169.41, 'eval_steps_per_second': 21.346, 'epoch': 1.0}
{'loss': 1.0547, 'grad_norm': 11.552091598510742, 'learning_rate': 3.891254742117388e-06, 'epoch': 2.0}
{'eval_loss': 1.0789765119552612, 'eval_accuracy': 0.634, 'eval_f1_macro': 0.6452631137199162, 'eval_precision_macro': 0.6411766987469577, 'eval_recall_macro': 0.6685638768084485, 'eval_runtime': 2.9687, 'eval_samples_per_second': 168.422, 'eval_steps_per_second': 21.221, 'epoch': 2.0}
{'loss': 0.9037, 'grad_norm': 13.721375465393066, 'learning_rate': 2.6561465816500945e-08, 'epoch': 3.0}
{'eval_loss': 1.0491530895233154, 'eval_accuracy': 0.666, 'eval_f1_macro': 0.6722245916914883, 'eval_precision_macr

0,1
eval/accuracy,▁▆██
eval/f1_macro,▁▆██
eval/loss,█▃▁▁
eval/precision_macro,▁▅██
eval/recall_macro,▁███
eval/runtime,▁▄▂█
eval/samples_per_second,█▅▇▁
eval/steps_per_second,█▅▇▁
train/epoch,▁▁▅▅████
train/global_step,▁▁▅▅████

0,1
eval/accuracy,0.666
eval/f1_macro,0.67222
eval/loss,1.04915
eval/precision_macro,0.67721
eval/recall_macro,0.66956
eval/runtime,2.9954
eval/samples_per_second,166.922
eval/steps_per_second,21.032
total_flos,1381395020544000.0
train/epoch,3.0


[I 2025-08-18 00:08:31,561] Trial 3 finished with value: 0.6722245916914883 and parameters: {'learning_rate': 1.1620641294719162e-05, 'batch_size': 8, 'num_train_epochs': 3}. Best is trial 2 with value: 0.7436865026791278.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.4128, 'grad_norm': 8.009012222290039, 'learning_rate': 1.4518409740464786e-05, 'epoch': 1.0}
{'eval_loss': 1.1597110033035278, 'eval_accuracy': 0.546, 'eval_f1_macro': 0.5623986457641796, 'eval_precision_macro': 0.5602283961704959, 'eval_recall_macro': 0.5742378735847353, 'eval_runtime': 1.7146, 'eval_samples_per_second': 291.621, 'eval_steps_per_second': 18.664, 'epoch': 1.0}
{'loss': 1.0439, 'grad_norm': 8.939412117004395, 'learning_rate': 1.3138832344312022e-07, 'epoch': 2.0}
{'eval_loss': 1.0700995922088623, 'eval_accuracy': 0.628, 'eval_f1_macro': 0.6379757934709596, 'eval_precision_macro': 0.6289435096260542, 'eval_recall_macro': 0.6555255383350418, 'eval_runtime': 1.6945, 'eval_samples_per_second': 295.064, 'eval_steps_per_second': 18.884, 'epoch': 2.0}
{'train_runtime': 147.9392, 'train_samples_per_second': 94.633, 'train_steps_per_second': 5.921, 'train_loss': 1.2283840440723994, 'epoch': 2.0}
{'eval_loss': 1.0700995922088623, 'eval_accuracy': 0.628, 'eval_f1_macro'

0,1
eval/accuracy,▁██
eval/f1_macro,▁██
eval/loss,█▁▁
eval/precision_macro,▁██
eval/recall_macro,▁██
eval/runtime,▁▁█
eval/samples_per_second,██▁
eval/steps_per_second,██▁
train/epoch,▁▁████
train/global_step,▁▁████

0,1
eval/accuracy,0.628
eval/f1_macro,0.63798
eval/loss,1.0701
eval/precision_macro,0.62894
eval/recall_macro,0.65553
eval/runtime,3.6297
eval/samples_per_second,137.753
eval/steps_per_second,8.816
total_flos,920930013696000.0
train/epoch,2.0


[I 2025-08-18 00:11:06,023] Trial 4 finished with value: 0.6379757934709596 and parameters: {'learning_rate': 2.877404283404333e-05, 'batch_size': 16, 'num_train_epochs': 2}. Best is trial 2 with value: 0.7436865026791278.


In [19]:
best_trial_deberta = study_deberta.best_trial
print('DeBerta best trial on subset:')
print(best_trial_deberta.params)

DeBerta best trial on subset:
{'learning_rate': 2.5161932321227323e-05, 'batch_size': 16, 'num_train_epochs': 5}


In [20]:
best_params_deberta = best_trial_deberta.params
run_name_deberta = f"deberta_final_stratify_{is_preprocessed}-ep{best_params_deberta['num_train_epochs']}-lr{best_params_deberta['learning_rate']:.1e}-bs{best_params_deberta['batch_size']}"
wandb.init(project=f"tweet-sentiment-classification_split_to_test_maxl_128_deberta_{num_train_samples}_samples_optuna", name=run_name_deberta, reinit=True)

final_trainer_deberta = build_trainer(
    model_checkpoint="microsoft/deberta-v3-base",
    trial=best_trial_deberta,
    run_prefix=f"deberta_final_stratify_{is_preprocessed}",
    train_dataset=tokenized_deberta_train,
    val_dataset=tokenized_deberta_val
)
final_trainer_deberta.train()
final_trainer_deberta.evaluate(tokenized_deberta_test)
wandb.finish()



Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.01, 'grad_norm': 12.703093528747559, 'learning_rate': 2.0139779484768834e-05, 'epoch': 1.0}
{'eval_loss': 0.8661996722221375, 'eval_accuracy': 0.767531504818384, 'eval_f1_macro': 0.7731453538494677, 'eval_precision_macro': 0.7702190922709934, 'eval_recall_macro': 0.7987127089316035, 'eval_runtime': 21.869, 'eval_samples_per_second': 308.428, 'eval_steps_per_second': 19.297, 'epoch': 1.0}
{'loss': 0.723, 'grad_norm': 8.643494606018066, 'learning_rate': 1.510739302052337e-05, 'epoch': 2.0}
{'eval_loss': 0.7313589453697205, 'eval_accuracy': 0.8397331356560415, 'eval_f1_macro': 0.8455987270165057, 'eval_precision_macro': 0.8433732302363852, 'eval_recall_macro': 0.8521262590818788, 'eval_runtime': 21.9711, 'eval_samples_per_second': 306.994, 'eval_steps_per_second': 19.207, 'epoch': 2.0}
{'loss': 0.6241, 'grad_norm': 37.53459167480469, 'learning_rate': 1.007756496322465e-05, 'epoch': 3.0}
{'eval_loss': 0.7792472839355469, 'eval_accuracy': 0.8434395848776872, 'eval_f1_macro': 0.84

0,1
eval/accuracy,▁▆▆███
eval/f1_macro,▁▆▆███
eval/loss,█▂▄▁▁▂
eval/precision_macro,▁▆▆███
eval/recall_macro,▁▅▆███
eval/runtime,▂█▂▂▁▁
eval/samples_per_second,▇▁▇▇██
eval/steps_per_second,▇▁▇▇██
train/epoch,▁▁▃▃▅▅▆▆████
train/global_step,▁▁▃▃▅▅▆▆████

0,1
eval/accuracy,0.86595
eval/f1_macro,0.86969
eval/loss,0.74744
eval/precision_macro,0.86407
eval/recall_macro,0.87736
eval/runtime,21.8588
eval/samples_per_second,308.525
eval/steps_per_second,19.306
total_flos,1.034927993248512e+16
train/epoch,5.0


In [21]:
final_trainer_deberta.save_model(f"models/w_test_split/deberta_final_stratify_{is_preprocessed}_{num_train_samples}_samples_optuna")
deberta_tokenizer.save_pretrained(f"models/w_test_split/deberta_final_stratify_{is_preprocessed}_{num_train_samples}_samples_optuna")
!cp -r models/w_test_split/deberta_final_stratify_{is_preprocessed}_{num_train_samples}_samples_optuna "/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/deberta_best_model_stratify_maxl_128_{is_preprocessed}_{num_train_samples}_samples_optuna"