In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")

import numpy as np
import pandas as pd
import utils.data_process as dp
from random import shuffle
from torch.utils.data.sampler import Sampler
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from custom_datasets.samplers import BySequenceLengthBatchSampler, BySequenceLengthBatchSampler2
from utils.utils import load_sql_to_df, save_to_sql, plot_history
from torch.optim import AdamW
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig
from transformers import get_scheduler
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from models.lightning import LitHuggingfaceClassifier

In [3]:
chess_database_file = "../../data/chess.db"

In [4]:
important_columns = ["position", "move", "comment", "sentiment"]
gameknot_moves_df = load_sql_to_df("SELECT * FROM english_annotated_moves", chess_database_file)[important_columns]
angelfire_moves = load_sql_to_df("SELECT * FROM angelfire_moves", chess_database_file)[['position', 'move', 'comment']]
angelfire_moves['sentiment'] = -1

chessbase_moves = load_sql_to_df("SELECT * FROM chessbase_moves_with_comments_2", chess_database_file)
chessbase_moves = chessbase_moves[important_columns]

moves_df = pd.concat((gameknot_moves_df, chessbase_moves, angelfire_moves), axis=0)
moves_df

Unnamed: 0,position,move,comment,sentiment
0,rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w ...,e2e4,This is my first gameknot game against someone...,-1
1,rnbqkbnr/pppp1ppp/8/4p3/4P3/8/PPPP1PPP/RNBQKBN...,b1c3,"I've been playing the Vienna Gambit as white, ...",-1
2,rnbqkbnr/pppp1ppp/8/4p3/4P3/2N5/PPPP1PPP/R1BQK...,f8c5,Minor disappointment.,-1
3,r1bqk2r/pppp1ppp/2n2n2/2b1p3/2B1P3/2NP4/PPP2PP...,f2f4,"My idea here is to expand on the kingside, dri...",-1
4,r1bqk2r/ppp2ppp/3p1n2/n1b1pP2/2B1P3/2NP4/PPP3P...,d1f3,"Maybe this isn't the greatest plan, since with...",0
...,...,...,...,...
27500,2Q2b1k/1p3q2/p6p/5n2/3p1r2/5NR1/PP3K2/7R w - -...,g3h3,"White defends very\nactively, with his major p...",-1
27501,2Q2b1k/1p3q2/p6p/5n2/3p1r2/5N1R/PP3K2/7R b - -...,d4d3,"A decoy, to tempt the White Queen\naway from t...",-1
27502,5b1k/1p6/p6p/5n2/5r2/3Q1N1R/P3K3/q6R b - - 3 43,a1a2,And this brings about a very unusual middlegam...,-1
27503,5b1k/1p6/p6p/5n2/5r2/3Q3R/q2NK3/7R b - - 1 44,f4d4,Consistent chess. Black hammers away at the\nw...,-1


In [5]:
data_df = moves_df[moves_df.sentiment.isin([0,1])]

In [6]:
data_df

Unnamed: 0,position,move,comment,sentiment
4,r1bqk2r/ppp2ppp/3p1n2/n1b1pP2/2B1P3/2NP4/PPP3P...,d1f3,"Maybe this isn't the greatest plan, since with...",0
7,r1bqk2r/pp3p1p/2pp2p1/2b1pPPn/2P1P3/2N2Q2/PPP1...,a7a6,"? Too slow, maybe.",0
18,2kr4/1bq2p1Q/p1p3p1/3ppPP1/2PbP1P1/2N5/P1PB4/1...,g6f5,"? This allows a combination by white. However,...",0
59,r7/pp1bkprp/2n1pN2/4P3/3P4/P7/5KPP/3R1B1R w - ...,f1d3,!! brilliant move it gives Black the d4 pawn b...,1
61,r7/pp1bkpr1/2n1pN1p/4P3/3P4/P2B4/5KPP/3R3R w -...,h2h4,!! this move is really fantastic. main job of ...,1
...,...,...,...,...
1633546,r5k1/5ppp/2q5/p1p1p3/2P1n3/P3P3/1Q3PPP/4NRK1 w...,b2e5,on account\nof wegen,0
1633547,r1r3k1/5ppp/1Rb1qn2/p1p1p3/B1P1P3/P3P3/3Q1PPP/...,a8b8,was a very\nimportant resource to avoid worse....,1
1633552,r4rk1/5ppp/2bpqnn1/p1p5/2P1pP2/P1BPP3/3Q2PP/1R...,g6e7,"appears to prevent f5, but in\nreality does no...",0
1633559,r1r3k1/4qppp/1Rbp1nn1/p1p5/2P1p3/P1BPPP2/3Q2PP...,f6d7,An important\nintermediate move. Wichtiger Zwi...,1


In [7]:
data_dict = data_df[['comment', 'sentiment']].to_dict(orient='list')

In [8]:
dataset = Dataset.from_dict(data_dict)
raw_datasets = dataset.train_test_split(test_size=0.1, shuffle=True)

In [9]:
checkpoint = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["comment"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/195200 [00:00<?, ? examples/s]

Map:   0%|          | 0/21689 [00:00<?, ? examples/s]

In [10]:
tokenized_datasets = tokenized_datasets.remove_columns(["comment"])
tokenized_datasets = tokenized_datasets.rename_column("sentiment", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'attention_mask']

In [11]:
tokenize_function(raw_datasets["train"][0])

{'input_ids': [101, 18740, 38356, 25743, 61285, 14010, 119, 34961, 10114, 29597, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
sampler = BySequenceLengthBatchSampler2(tokenized_datasets["train"]["input_ids"])

In [13]:
len(tokenized_datasets["train"]["input_ids"])

195200

In [14]:
batch_size = 8

# sampler = BySequenceLengthBatchSampler2(tokenized_datasets["train"]["input_ids"])

train_dataloader = DataLoader(
    tokenized_datasets["train"], batch_sampler=sampler, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator
)

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("../../models/distillbert-12-10/distilbert-base-multilingual-cased/")
config = AutoConfig.from_pretrained(checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
pl_model = LitHuggingfaceClassifier(model, learning_rate=1e-5)

In [17]:
# pl_model = LitHuggingfaceClassifier.load_from_checkpoint("../../lightning_logs/comments_sentiment/tensorboard/DistilBertForSequenceClassification/version_5/checkpoints/epoch=4-step=31975.ckpt", model=model)

In [18]:
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
from pytorch_lightning.callbacks import RichProgressBar

tensorboard_logger = TensorBoardLogger(save_dir="../../lightning_logs/comments_sentiment/tensorboard/", name=f"{type(model).__name__}")
csv_logger = CSVLogger(save_dir="../../lightning_logs/comments_sentiment/csv/", name=f"{type(model).__name__}")


trainer = pl.Trainer(
    accelerator="gpu",
    max_epochs=6,
    callbacks=[RichProgressBar()],
    logger=[tensorboard_logger, csv_logger]
)

trainer.fit(model=pl_model, train_dataloaders=train_dataloader, val_dataloaders=eval_dataloader)
# trainer.validate(pl_model, dataloaders=eval_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


`Trainer.fit` stopped: `max_epochs=6` reached.


In [20]:
model = AutoModelForSequenceClassification.from_pretrained("../../models/distillbert-12-10/distilbert-base-multilingual-cased/")
pl_model = LitHuggingfaceClassifier(model, learning_rate=1e-5)

In [21]:
correct_predictions = 0
items = 0

pl_model.cuda()

for batch in eval_dataloader:
    with torch.no_grad():
        batch = {k: v.cuda() for k, v in batch.items()}
        outputs = pl_model(batch)
        predicts = torch.argmax(outputs.logits, dim=-1)
        labels = batch["labels"]
        correct_predictions += torch.sum(predicts == labels).item()
        items += len(labels)

print(f"Accuracy: {correct_predictions} / {items} = {correct_predictions / items}")

Accuracy: 20908 / 21689 = 0.9639909631610494


In [22]:
trainer.validate(pl_model, dataloaders=eval_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'valid_loss': 0.10363312065601349,
  'valid_acc_micro': 0.9639909863471985,
  'valid_acc_macro': 0.9432557821273804}]

In [19]:

trainer.fit(model=pl_model, train_dataloaders=train_dataloader, val_dataloaders=eval_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


`Trainer.fit` stopped: `max_epochs=5` reached.


In [20]:
# torch.save(model.state_dict(), "../../models/distillbert/distilbert-base-multilingual-cased.pt")

In [22]:
model.save_pretrained("../../models/distillbert-12-10/distilbert-base-multilingual-cased")