# 
RusCola data is accessible  [here](https://github.com/RussianNLP/RuCoLA/tree/main/data)

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('RuCoLA-main/data/in_domain_train.csv')
df

Unnamed: 0,id,sentence,acceptable,error_type,detailed_source
0,0,"Вдруг решетка беззвучно поехала в сторону, и н...",1,0,Paducheva2004
1,1,Этим летом не никуда ездили.,0,Syntax,Rusgram
2,2,Только Иван выразил какую бы то ни было готовн...,1,0,Paducheva2013
3,3,"Теперь ты видишь собственными глазами, как тут...",1,0,Paducheva2010
4,4,На поверку вся теория оказалась полной чепухой.,1,0,Paducheva2010
...,...,...,...,...,...
7864,7864,Установки не было введено в действие.,0,Semantics,Paducheva2004
7865,7865,"Конечно, против такой системы ценностей решите...",0,Semantics,Paducheva2013
7866,7866,Симптомов болезни не исчезло.,0,Semantics,Paducheva2013
7867,7867,Послезавтра температура у больного снижается д...,0,Semantics,Rusgram


In [3]:
import transformers

from datasets import load_dataset, load_metric, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
ds = df.rename(columns={'acceptable': 'label', 'sentence': 'text'})[['label', 'text']]
ds = Dataset.from_pandas(ds)
ds = ds.train_test_split(test_size=0.1)
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 7082
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 787
    })
})

In [5]:
model_name = 'DeepPavlov/rubert-base-cased'

In [6]:
from transformers import AutoModel, AutoTokenizer
# Load the BERT tokenizer.
print(f'Loading {model_name} tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading DeepPavlov/rubert-base-cased tokenizer...


In [7]:
def tokenize_function(examples):

    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)


In [8]:
tokenized_datasets = ds.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/7082 [00:00<?, ? examples/s]

Map: 100%|██████████| 7082/7082 [00:00<00:00, 22358.93 examples/s]
Map: 100%|██████████| 787/787 [00:00<00:00, 34877.20 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7082
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 787
    })
})

In [9]:
tokenized_datasets['train'][0]

{'label': 1,
 'text': 'Они селились вблизи больших рек.',
 'input_ids': [101,
  9621,
  81409,
  19601,
  17174,
  3587,
  132,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [10]:
from transformers import BertForSequenceClassification, AdamW, BertConfig


# just using automodel will not work
# This is because MT5EncoderModel or the T5EncoderModel is just a base model
#  and does not have any head. So it does not accept the labels argument.
# model = AutoModel.from_pretrained(model_name)

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    # "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    model_name, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)



# Tell pytorch to run this model on the GPU.
model.cuda()

2023-11-12 14:21:23.186484: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model f

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir=model_name)

In [14]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [15]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
from transformers import TrainingArguments, Trainer

batch_size = 4

training_args = TrainingArguments(
    output_dir=model_name,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    )

training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ign

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

In [19]:
train_results = trainer.train()
# rest is optional but nice to have
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myukos[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
0,0.5622,0.573114,0.740788
1,0.5399,0.599694,0.740788
2,0.5287,0.586119,0.740788
4,0.5896,0.580182,0.740788
4,0.5018,0.577941,0.740788
5,0.5828,0.574552,0.740788
6,0.5667,0.575231,0.740788
8,0.5491,0.572558,0.740788
8,0.5709,0.572344,0.740788
9,0.5242,0.573097,0.740788


***** train metrics *****
  epoch                    =       9.98
  total_flos               =  2165613GF
  train_loss               =     0.5712
  train_runtime            = 0:11:09.81
  train_samples_per_second =     105.73
  train_steps_per_second   =      6.599


In [None]:
model.eval()

In [20]:
df_val = pd.read_csv('RuCoLA-main/data/in_domain_dev.csv')
ds_val = df_val.rename(columns={'acceptable': 'label', 'sentence': 'text'})[['label', 'text']]
ds_val = Dataset.from_pandas(ds_val)
ds_val

Dataset({
    features: ['label', 'text'],
    num_rows: 983
})

In [22]:
tokenized_ds_val = ds_val.map(tokenize_function, batched=True)

Map: 100%|██████████| 983/983 [00:00<00:00, 30639.93 examples/s]


In [23]:
trainer.evaluate(tokenized_ds_val)

{'eval_loss': 0.5675084590911865,
 'eval_accuracy': 0.745676500508647,
 'eval_runtime': 2.4985,
 'eval_samples_per_second': 393.437,
 'eval_steps_per_second': 98.459,
 'epoch': 9.98}

In [24]:
from transformers import TextClassificationPipeline

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=0)
# outputs a list of dicts like [[{'label': 'NEGATIVE', 'score': 0.0001223755971295759},  {'label': 'POSITIVE', 'score': 0.9998776316642761}]]
preds = pipe(tokenized_ds_val['text'])
preds = [np.argmax([p0['score'], p1['score']]) for p0, p1 in preds]



In [26]:
import torch

In [28]:
acc = torch.tensor(tokenized_ds_val['label']) == torch.tensor(preds)
acc.sum().item() / acc.numel()

0.745676500508647

In [34]:
len(tokenized_ds_val['label'])

983

In [35]:
len(preds)

983

In [36]:
matthews_metric = evaluate.load("matthews_correlation")
results = matthews_metric.compute(references=tokenized_ds_val['label'], predictions=preds)
results

{'matthews_correlation': 0.0}

In [39]:
# PATH = 'models/cased_L-12_H-768_A-12/'
# tokenizer = BertTokenizer.from_pretrained(PATH, local_files_only=True)

# model = BertForSequenceClassification.from_pretrained(
#     # "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
#     model_name, # Use the 12-layer BERT model, with an uncased vocab.
#     num_labels = 2, # The number of output labels--2 for binary classification.
#                     # You can increase this for multi-class tasks.
#     output_attentions = False, # Whether the model returns attentions weights.
#     output_hidden_states = False, # Whether the model returns all hidden-states.
# )

In [45]:
model2 = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased/', local_files_only=True)

In [47]:
model2.num_labels

2