In [16]:
!pip install accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate
  Downloading accelerate-1.1.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.1.0-py3-none-any.whl (333 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m333.2/333.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-1.1.0
[0m

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import logging
import torch
from transformers import RobertaForQuestionAnswering, RobertaTokenizer
from transformers import Trainer, TrainingArguments
import pandas as pd
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
import numpy as np

logging.basicConfig(level=logging.DEBUG)


In [2]:

logging.basicConfig(level=logging.DEBUG)
tokenizer = AutoTokenizer.from_pretrained("nur-dev/roberta-kaz-large")
model = AutoModelForMaskedLM.from_pretrained("nur-dev/roberta-kaz-large")

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /nur-dev/roberta-kaz-large/resolve/main/tokenizer_config.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /nur-dev/roberta-kaz-large/resolve/main/config.json HTTP/11" 200 0


In [3]:

class CustomQuestionAnsweringModel(RobertaForQuestionAnswering):
    def __init__(self, masked_model):
        super().__init__(masked_model.config)
        self.roberta.load_state_dict(masked_model.roberta.state_dict())

qa_model = CustomQuestionAnsweringModel(model)


In [4]:
print("CUDA available:", torch.cuda.is_available())


CUDA available: True


In [5]:
def encode_data(questions, contexts, start_positions, end_positions):
    encodings = tokenizer(questions, contexts, truncation=True, padding=True, return_tensors="pt")
    encodings["start_positions"] = torch.tensor(start_positions)
    encodings["end_positions"] = torch.tensor(end_positions)
    return encodings


In [6]:
data = pd.read_csv('final.csv')
data.head()

Unnamed: 0,context,question,answer_start,answer_text,answer_end
0,Belcalis Marlenis Almánzar ( /ˈbɛlkəliːz ɑːlˈm...,Карди Би музыканың қай жанрында ән айтады?,164,рэп,167
1,Карди Би дискографиясы негізінен трап пен R&am...,Карди Би музыканың қай жанрында ән айтады?,33,трап пен R&amp;B элементтерін қамтитын хип-хоп,79
2,Тікбұрышты горизонтальды үлендігі бірдей бірін...,Қызыл жолақты американдық жалау нені білдіреді?,537,шыдамдылық пен батырлықты,562
3,Мо́на Ли́за (Mona Lisa) — бұл шамамен 1503 жыл...,Әйгілі 'Мона Лиза' картинасы қайда қойылған?,217,"Франция, Париж)",232
4,Ливерпуль футбол клубы (Liverpool Football Clu...,Ливерпуль Премьер-Лига титулын қанша рет жеңіп...,208,19 рет,214


In [7]:
train_encodings = encode_data(
    data['question'].tolist(),
    data['context'].tolist(),
    data['answer_start'].tolist(),
    data['answer_end'].tolist()
)


In [8]:
class QADataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = QADataset(train_encodings)


In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Уменьшите значение
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,  # Увеличьте количество эпох
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=qa_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset
)

# Начинаем обучение
trainer.train()




Epoch,Training Loss,Validation Loss
1,5.7633,5.611176
2,5.3872,5.053584
3,4.9681,4.270538
4,4.232,3.340251
5,3.7194,2.775655
6,3.1195,2.54693
7,2.891,2.473369
8,2.9708,2.427505
9,2.5436,2.403644
10,2.3909,2.386472


TrainOutput(global_step=2080, training_loss=3.762348262163309, metrics={'train_runtime': 3364.8562, 'train_samples_per_second': 19.733, 'train_steps_per_second': 0.618, 'total_flos': 6.16661442772992e+16, 'train_loss': 3.762348262163309, 'epoch': 10.0})

In [17]:
context = """
Қазақстан Орталық Азиядағы ең ірі мемлекеттердің бірі болып табылады.
"""
question = "Орталық Азиядағы ең ірі мемлекеттердің бірі?"
qa_model.to("cpu")
# Tokenize the input
inputs = tokenizer.encode_plus(
    question, 
    context, 
    add_special_tokens=True, 
    return_tensors="pt"
)

input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Perform inference
with torch.no_grad():
    outputs = qa_model(input_ids=input_ids, attention_mask=attention_mask)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

# Find the answer's start and end position
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Decode the answer from the context
answer = tokenizer.decode(input_ids[0][start_index:end_index + 1])

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: Орталық Азиядағы ең ірі мемлекеттердің бірі?
Answer:  ең ірі мемлекеттердің бірі?</s></s>
Қазақстан Орталық Азиядағы ең ірі мемлекеттердің бірі болып табылады


In [12]:
qa_model

CustomQuestionAnsweringModel(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [6]:
!nvidia-smi


Sun Nov  3 07:42:37 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L40S                    On  | 00000000:04:00.0 Off |                    0 |
| N/A   64C    P0              99W / 350W |      0MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import torch
torch.cuda.empty_cache()


In [4]:
!du -sh ./results/checkpoint-4000

18M	./results/checkpoint-3000
