In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell runs and edits to local code are picked up live.
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
# Put the parent directory on the import path; presumably this is where the
# project-local `data_modules` and `models` packages imported below live.
sys.path.append("..")

from data_modules.lightning import HuggingFaceTokenizedDataModule
from transformers import AutoModelForSequenceClassification, DistilBertForSequenceClassification, AutoTokenizer
import pytorch_lightning as pl
from models.lightning import LitHuggingfaceClassifier
from datasets import DatasetDict

In [3]:
# Read the saved DatasetDict back from disk and display its splits and columns.
dataset_dir = "../../data/huggingface/my_datasets/comments_with_color_200k_5k"
ds = DatasetDict.load_from_disk(dataset_dir)
ds

DatasetDict({
    train: Dataset({
        features: ['fen', 'move', 'comment', 'sentiment', 'color_comment'],
        num_rows: 400000
    })
    validation: Dataset({
        features: ['fen', 'move', 'comment', 'sentiment', 'color_comment'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['fen', 'move', 'comment', 'sentiment', 'color_comment'],
        num_rows: 10000
    })
})

In [4]:
# Hugging Face checkpoint id reused by the model, tokenizer, Lightning module
# and data module cells. The commented-out lines are other candidate checkpoints.
# checkpoint = "google/flan-t5-small"
checkpoint = "distilbert-base-multilingual-cased"
# checkpoint = "bert-base-multilingual-uncased"


In [5]:
# Stand-alone Hugging Face sequence classifier with a two-label head.
# The Auto class selects the architecture that matches `checkpoint`, so switching
# `checkpoint` to a non-DistilBERT model does not require editing this cell; for the
# current DistilBERT checkpoint it resolves to DistilBertForSequenceClassification.
# NOTE(review): the Lightning module built later from `checkpoint` logs the same
# weight-initialisation warning, so it appears to load its own copy of the weights;
# this `model` is then a separate object from the one inside that module.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# To start from previously saved weights instead:
# model = AutoModelForSequenceClassification.from_pretrained("../../models/distillbert-12-10/distilbert-base-multilingual-cased/")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenizer belonging to `checkpoint`; used below for a quick tokenization check.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
# Encode a sample whose text contains a literal "[SEP]" and print the resulting
# ids and attention mask to see how the tokenizer treats it.
sample_text = "white [SEP] Good move for white!"
print(tokenizer(sample_text))

{'input_ids': [101, 15263, 102, 13073, 18577, 10142, 15263, 106, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [9]:
# Build the Lightning classifier from the checkpoint id and the data module
# that serves the tokenized dataset in batches of `batch_size`.
batch_size = 8

pl_model = LitHuggingfaceClassifier(
    checkpoint,
    learning_rate=1e-5,
    save_dir="../../models/comments_sentiment/distillbert-5-23/v1",
)
data_module = HuggingFaceTokenizedDataModule(
    checkpoint=checkpoint,
    batch_size=batch_size,
    dataset_path="../../data/huggingface/my_datasets/comments_with_color_200k_5k/",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Disabled alternative: restore the Lightning module from a saved training
# checkpoint (passing the bare `model`) instead of using a freshly built one.
# pl_model = LitHuggingfaceClassifier.load_from_checkpoint("../../lightning_logs/comments_sentiment/tensorboard/DistilBertForSequenceClassification/version_5/checkpoints/epoch=4-step=31975.ckpt", model=model)

In [10]:
from pytorch_lightning.callbacks import RichProgressBar
from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger

# Both loggers file their runs under the class name of the wrapped model.
run_name = f"{type(pl_model.model).__name__}"
tensorboard_logger = TensorBoardLogger(
    save_dir="../../lightning_logs/comments_sentiment/tensorboard/",
    name=run_name,
)
csv_logger = CSVLogger(
    save_dir="../../lightning_logs/comments_sentiment/csv/",
    name=run_name,
)

In [11]:
# GPU trainer capped at five epochs, reporting progress through a Rich progress bar.
# Options currently switched off (kept for reference):
#   logger=[tensorboard_logger, csv_logger]
#   fast_dev_run=True
progress_bar = RichProgressBar()
trainer = pl.Trainer(accelerator="gpu", max_epochs=5, callbacks=[progress_bar])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train `pl_model` on the data served by `data_module`.
#
# Disabled alternative: wrap previously saved weights before fitting.
#   model = AutoModelForSequenceClassification.from_pretrained(
#       "../../models/distillbert-12-10/distilbert-base-multilingual-cased/"
#   )
#   pl_model = LitHuggingfaceClassifier(model, learning_rate=1e-5)
#
# Disabled option: continue from a saved Lightning checkpoint by adding
#   ckpt_path="../../lightning_logs/comments_sentiment/tensorboard/DistilBertForSequenceClassification/version_11/checkpoints/epoch=2-step=34365.ckpt"
#
# Disabled follow-up: trainer.validate(pl_model, dataloaders=eval_dataloader)
trainer.fit(pl_model, datamodule=data_module)

In [None]:
# Rebuild the data module that the validation cells below pass to the trainer.
# NOTE(review): unlike the data module built for training, no `dataset_path` is
# passed here, so whatever default HuggingFaceTokenizedDataModule uses applies —
# confirm that this is the dataset the validation is meant to run on.
data_module = HuggingFaceTokenizedDataModule(checkpoint=checkpoint, batch_size=batch_size)

In [None]:
# Validate the weights stored under distillbert-12-18 by handing the directory
# path straight to the Lightning module.
finetuned_dir = "../../models/distillbert-12-18/distilbert-base-multilingual-cased/"
pl_model = LitHuggingfaceClassifier(finetuned_dir)
trainer.validate(pl_model, datamodule=data_module)

In [None]:
# Validate the distillbert-12-10 weights: load them with transformers first,
# then wrap the instantiated model in the Lightning module.
model = AutoModelForSequenceClassification.from_pretrained(
    "../../models/distillbert-12-10/distilbert-base-multilingual-cased/"
)
pl_model = LitHuggingfaceClassifier(model, learning_rate=1e-5)
trainer.validate(model=pl_model, datamodule=data_module)

In [13]:
# Persist the network held by the Lightning module, since that is the object the
# trainer operates on. The bare `model` variable can be a different, untouched
# instance whenever `pl_model` was built from a checkpoint name or path rather
# than from `model`, in which case saving `model` would write untrained weights.
pl_model.model.save_pretrained("../../models/distillbert-12-18/distilbert-base-multilingual-cased")