In [None]:
#@title Установка среды { vertical-output: true }

!pip install transformers datasets

In [1]:
#@title Загрузка данных для обучения { vertical-output: true }

!git clone https://github.com/Nehc/BertMobile.git

Cloning into 'BertMobile'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 11 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (11/11), done.


In [1]:
#@title Формирование датасетов { vertical-output: true }
import json
from datasets import Dataset

with open('BertMobile/data.json', 'r', encoding='UTF-8') as fr:
    # читаем из файла
    ls_d = json.load(fr)

ds = Dataset.from_dict(ls_d)

with open('BertMobile/eval.json', 'r', encoding='UTF-8') as fr:
    # читаем из файла
    ls_e = json.load(fr)

es = Dataset.from_dict(ls_e)

ds, es 

(Dataset({
     features: ['label', 'text'],
     num_rows: 8500
 }), Dataset({
     features: ['label', 'text'],
     num_rows: 10
 }))

In [2]:
#@title Загрузка исходной модели и токенизация { vertical-output: true }

from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = 'bert-base-multilingual-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=221)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

tokenized_dataset = ds.map(tokenize_function, batched=True)
train_dataset = tokenized_dataset.shuffle(seed=42)

tokenized_ev_dataset = es.map(tokenize_function, batched=True)
eval_dataset = tokenized_ev_dataset.shuffle(seed=42)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [3]:
#@title Метрика качества (accuracy) { vertical-output: true } 
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [4]:
#@title Параметры обучения { vertical-output: true } 

from transformers import TrainingArguments, Trainer 

training_args = TrainingArguments(
    output_dir="test_trainer", #The output directory
    evaluation_strategy="epoch",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=100, # batch size for training
    per_device_eval_batch_size=10,  # batch size for evaluation
    eval_steps = 5, # Number of update steps between two evaluations.
    logging_steps = 1,  # Number of update steps between two evaluations.
    save_steps = 85, # after # steps model is saved 
    warmup_steps = 100, # number of warmup steps for learning rate scheduler
    #prediction_loss_only=True,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    #data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [5]:
#@title Обучение { vertical-output: true } 

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8500
  Num Epochs = 5
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 425


Epoch,Training Loss,Validation Loss,Accuracy
1,4.4649,5.040106,0.0
2,2.2441,4.362595,0.3
3,1.4328,4.154456,0.3
4,0.9383,4.0838,0.3
5,0.7605,4.063141,0.3


Saving model checkpoint to test_trainer/checkpoint-85
Configuration saved in test_trainer/checkpoint-85/config.json
Model weights saved in test_trainer/checkpoint-85/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 10
Saving model checkpoint to test_trainer/checkpoint-170
Configuration saved in test_trainer/checkpoint-170/config.json
Model weights saved in test_trainer/checkpoint-170/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num example

TrainOutput(global_step=425, training_loss=2.383674660570481, metrics={'train_runtime': 461.9567, 'train_samples_per_second': 92.0, 'train_steps_per_second': 0.92, 'total_flos': 1400525949120000.0, 'train_loss': 2.383674660570481, 'epoch': 5.0})

In [6]:
#@title Подгружаем ответы { vertical-output: true } 
with open('BertMobile/ans.json', 'r', encoding='UTF-8') as fr:
    # читаем из файла
    ans = json.load(fr)
    
ans[100]



'На тарифе Сижу в интернете Абонентская плата - 8.90 руб.'

In [9]:
#@title Инференс (тестирование) { vertical-output: true } 
import torch as pt

#@markdown Вопрос: *на тарифе позвони маме сколько бесплатных минут?* 
text = "[CLS] на тарифе позвони маме сколько бесплатных минут? [SEP]"
inpt = tokenizer.encode(text, return_tensors="pt")
model.cuda()
model.eval()
inpt= inpt.cuda()
out = model(inpt)
it = pt.argmax(out[0], dim=1).item()
m = pt.nn.Softmax(dim=1)
prc = pt.max(m(out[0])).item()*100
s = ans[it]
print(f'{s} (Вероятность:{prc:.2f}%)')

На тарифе Позвони маме Бесплатные минуты внутри сети - безлимит. (Вероятность:59.23%)


In [10]:
#@title Сохраняем модель и токенайзер { vertical-output: true } 

#@markdown в таком виде уже можно закинуть на https://huggingface.co/
#@markdown ,но это уже отдельная тема и не в блокноте 

model.save_pretrained('FakeMobile')
tokenizer.save_pretrained('FakeMobile')

Configuration saved in FakeMobile/config.json
Model weights saved in FakeMobile/pytorch_model.bin
tokenizer config file saved in FakeMobile/tokenizer_config.json
Special tokens file saved in FakeMobile/special_tokens_map.json


('FakeMobile/tokenizer_config.json',
 'FakeMobile/special_tokens_map.json',
 'FakeMobile/vocab.txt',
 'FakeMobile/added_tokens.json',
 'FakeMobile/tokenizer.json')

In [13]:
#@title А вот так потом с huggingface достаем и используем { vertical-output: true } 

model_name = "Nehc/FakeMobile" 

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

https://huggingface.co/Nehc/FakeMobile/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpjtgniff9


Downloading:   0%|          | 0.00/334 [00:00<?, ?B/s]

storing https://huggingface.co/Nehc/FakeMobile/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/1d6d2b814e76bac03632ef3aa5450ac16671d17887058c14fac8b1ae97bb5df2.425d9f608249dd9c0ba7685d692ab4dad376e788e88c6294505926231d548fee
creating metadata file for /root/.cache/huggingface/transformers/1d6d2b814e76bac03632ef3aa5450ac16671d17887058c14fac8b1ae97bb5df2.425d9f608249dd9c0ba7685d692ab4dad376e788e88c6294505926231d548fee
https://huggingface.co/Nehc/FakeMobile/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpxirubueg


Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

storing https://huggingface.co/Nehc/FakeMobile/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/525034786dabc3d1a5faa160cf15fc7a20c280ea246132814064d17a7e107497.92022aa29ab6663b0b4254744f28ab43e6adf4deebe0f26651e6c61f28f69d8b
creating metadata file for /root/.cache/huggingface/transformers/525034786dabc3d1a5faa160cf15fc7a20c280ea246132814064d17a7e107497.92022aa29ab6663b0b4254744f28ab43e6adf4deebe0f26651e6c61f28f69d8b
https://huggingface.co/Nehc/FakeMobile/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpr9isozug


Downloading:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

storing https://huggingface.co/Nehc/FakeMobile/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/182a21e4d3dbb3e2b47f94fb0e292695f1f0a8efdc5dd6d8a34328d111fbab87.edd23e8890ed17ee47c28c187a77d3b7958b18fd9a44753112e13d2c61df2ab2
creating metadata file for /root/.cache/huggingface/transformers/182a21e4d3dbb3e2b47f94fb0e292695f1f0a8efdc5dd6d8a34328d111fbab87.edd23e8890ed17ee47c28c187a77d3b7958b18fd9a44753112e13d2c61df2ab2
https://huggingface.co/Nehc/FakeMobile/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpipoercu1


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

storing https://huggingface.co/Nehc/FakeMobile/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/3a4ba1ed048a23801a6e93d371da680f200e6aeb2d55202c2c3a8022b4470a2b.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
creating metadata file for /root/.cache/huggingface/transformers/3a4ba1ed048a23801a6e93d371da680f200e6aeb2d55202c2c3a8022b4470a2b.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/Nehc/FakeMobile/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/525034786dabc3d1a5faa160cf15fc7a20c280ea246132814064d17a7e107497.92022aa29ab6663b0b4254744f28ab43e6adf4deebe0f26651e6c61f28f69d8b
loading file https://huggingface.co/Nehc/FakeMobile/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/182a21e4d3dbb3e2b47f94fb0e292695f1f0a8efdc5dd6d8a34328d111fbab87.edd23e8890ed17ee47c28c187a77d3b7958b18fd9a44753112e13d2c61df2ab2
loading file https://h

Downloading:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

storing https://huggingface.co/Nehc/FakeMobile/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/d4364adf91830bbf8d0c723802e4b6586af621219265ef1336a2373c0053479f.75744fc80b077915b3ea1aff6afd2515696509b2ce94a6a1cfb74637020c38e3
creating metadata file for /root/.cache/huggingface/transformers/d4364adf91830bbf8d0c723802e4b6586af621219265ef1336a2373c0053479f.75744fc80b077915b3ea1aff6afd2515696509b2ce94a6a1cfb74637020c38e3
loading configuration file https://huggingface.co/Nehc/FakeMobile/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d4364adf91830bbf8d0c723802e4b6586af621219265ef1336a2373c0053479f.75744fc80b077915b3ea1aff6afd2515696509b2ce94a6a1cfb74637020c38e3
Model config BertConfig {
  "_name_or_path": "Nehc/FakeMobile",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "

Downloading:   0%|          | 0.00/639M [00:00<?, ?B/s]

storing https://huggingface.co/Nehc/FakeMobile/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/84887cf7e1454006741bfb9c78d60bacda98cc57c586ccd655022950b91f7312.7a5cede4c58340bf5cf981a69c394287dffc8229cd6069925b558e5400dbd28e
creating metadata file for /root/.cache/huggingface/transformers/84887cf7e1454006741bfb9c78d60bacda98cc57c586ccd655022950b91f7312.7a5cede4c58340bf5cf981a69c394287dffc8229cd6069925b558e5400dbd28e
loading weights file https://huggingface.co/Nehc/FakeMobile/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/84887cf7e1454006741bfb9c78d60bacda98cc57c586ccd655022950b91f7312.7a5cede4c58340bf5cf981a69c394287dffc8229cd6069925b558e5400dbd28e
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at Nehc/FakeMobile.
If your task is similar to the task the model of the checkpoint was train