In [1]:
# (Re)load IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to project modules are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [115]:
import sys
# Put the parent directory on the import path before the project-local imports
# below. The path is relative, so it depends on the kernel's working directory.
# NOTE(review): presumably this is where the `utils` and `models` packages live - confirm.
sys.path.append("..")

from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from utils.utils import load_sql_to_df
import pytorch_lightning as pl
from models.lightning import LitHuggingfaceClassifier


In [3]:
# Hugging Face checkpoint name; it is used further down to load the tokenizer.
# Alternative checkpoints are left commented out.
# checkpoint = "google/flan-t5-small"
checkpoint = "distilbert-base-multilingual-cased"
# checkpoint = "bert-base-multilingual-uncased"


In [5]:
# Build the classifier from a local model directory using the project's
# `LitHuggingfaceClassifier` (presumably a LightningModule; it is passed to
# `trainer.predict` later in this notebook).
# NOTE(review): the tokenizer is loaded from the `checkpoint` name rather than from
# this directory - confirm that both refer to the same vocabulary.
# The commented-out line is a plain-transformers alternative; its class is not imported here.
# model = AutoModelForSequenceClassification.from_pretrained("../../models/distillbert-12-18/distilbert-base-multilingual-cased/")
pl_model = LitHuggingfaceClassifier("../../models/distillbert-12-18/distilbert-base-multilingual-cased/")

In [6]:
# Columns retained from the `unlabeled_moves` table.
important_columns = ["fen", "move", "comment", "sentiment"]

# Database file, given relative to the notebook's working directory.
chess_database_file = "../../data/chess_moves_comments_nags.db"

# Read the whole table through the project helper, then keep only the columns above.
unlabeled_moves = load_sql_to_df(
    "SELECT * FROM unlabeled_moves",
    chess_database_file,
)[important_columns]

In [49]:
# Only the free-text comment is needed for prediction: build a single-column
# mapping {"comment": [...]} and wrap it in a Hugging Face Dataset.
unlabeled_data_dict = {"comment": unlabeled_moves["comment"].tolist()}
unlabeled_dataset = Dataset.from_dict(unlabeled_data_dict)

# Show the resulting schema and row count.
unlabeled_dataset

Dataset({
    features: ['comment'],
    num_rows: 3308225
})

In [74]:
# Tokenizer for the checkpoint named in `checkpoint`; it is used by
# `tokenize_function` below and by the padding collator created later.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the ``comment`` field of ``example`` with the module-level ``tokenizer``.

    Truncation is enabled and no padding is requested here. Returns whatever
    the tokenizer call produces for the given comment(s).
    """
    comment_text = example["comment"]
    encoded = tokenizer(comment_text, truncation=True)
    return encoded

def sample_length_function(example):
    """Record the token count of one example under the ``"length"`` key.

    Works whether ``input_ids`` is an array-like exposing ``.shape`` (for
    example after ``set_format("torch")``) or a plain Python sequence, so the
    result no longer depends on the dataset's output format having been
    switched beforehand.

    Args:
        example: Mapping with an ``"input_ids"`` entry.

    Returns:
        The same ``example`` object, mutated in place with ``"length"`` set to
        the size of the last dimension of ``input_ids``.
    """
    input_ids = example["input_ids"]
    shape = getattr(input_ids, "shape", None)
    # Array-likes: keep the original "last dimension" semantics.
    # Plain lists have no `.shape`, so fall back to `len()`.
    example["length"] = shape[-1] if shape is not None else len(input_ids)
    return example


In [None]:

# Tokenize all comments in batches, then have the resulting dataset return
# torch tensors when indexed.
unlabeled_tokenized_dataset = unlabeled_dataset.map(
    tokenize_function,
    batched=True,
).with_format("torch")

In [75]:
# Annotate each row with its token count, then order rows by that count so that
# consecutive rows (and therefore batches) have similar lengths.
unlabeled_tokenized_dataset = (
    unlabeled_tokenized_dataset
    .map(sample_length_function, batched=False)
    .sort("length")
)

# Collator built on the same tokenizer; it pads the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/3308225 [00:00<?, ? examples/s]

In [77]:
# Write the tokenized, length-annotated dataset to disk.
unlabeled_tokenized_dataset.save_to_disk("../../data/datasets/unlabeled_tokenized_dataset")

Saving the dataset (0/2 shards):   0%|          | 0/3308225 [00:00<?, ? examples/s]

In [61]:
# NOTE(review): the dataset is already sorted by "length" in the cell that computes
# that column, so in a top-to-bottom run this second sort looks redundant. It is
# probably a leftover from out-of-order execution - confirm and drop this cell.
unlabeled_tokenized_dataset = unlabeled_tokenized_dataset.sort("length")

In [86]:
# Display the dataset's features and row count.
unlabeled_tokenized_dataset

Dataset({
    features: ['comment', 'input_ids', 'attention_mask', 'length'],
    num_rows: 3308225
})

In [87]:
# Drop the raw text and the helper length column so that only `input_ids` and
# `attention_mask` reach the collator. The result is bound to a new name;
# `unlabeled_tokenized_dataset` is used again later with its full set of columns.
predict_dataset = unlabeled_tokenized_dataset.remove_columns(["comment", "length"])

In [96]:
# Batches are drawn in dataset order (shuffle=False): the predictions are later
# attached back to the dataset by position, so the order must be preserved.
predict_dataloader = DataLoader(
    predict_dataset,
    collate_fn=data_collator,
    batch_size=8,
    shuffle=False,
)

# Pin prediction to a single GPU. With only `accelerator="gpu"` the device count
# is left to Lightning, and on a multi-GPU host a distributed run may hand each
# process only a shard of the data, so the returned list would no longer line up
# row-for-row with `predict_dataset`.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
)

# One entry per batch, in dataloader order.
prediction = trainer.predict(pl_model, dataloaders=predict_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/kamil/miniconda3/envs/thesis/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

In [102]:
# Unroll the per-batch outputs into one flat list of Python scalars, keeping
# batch order and within-batch order.
flatten_predictions = [
    prediction_value.item()
    for batch_predictions in prediction
    for prediction_value in batch_predictions
]

In [105]:
# Attach the predictions as a new "prediction" column. This relies on the
# predictions being in the same row order as `unlabeled_tokenized_dataset`:
# `predict_dataset` was derived from it and the DataLoader used shuffle=False.
predictions_dataset = unlabeled_tokenized_dataset.add_column("prediction", flatten_predictions)

Flattening the indices:   0%|          | 0/3308225 [00:00<?, ? examples/s]

In [114]:
# Write the dataset, including the new "prediction" column, to disk.
predictions_dataset.save_to_disk("../../data/datasets/predictions_dataset")

Saving the dataset (0/2 shards):   0%|          | 0/3308225 [00:00<?, ? examples/s]