In [23]:
# !pip install transformers
# !pip install huggingface_hub
# !pip install --upgrade --force-reinstall --no-deps transformers
# !pip install --upgrade --force-reinstall --no-deps huggingface_hub

In [24]:
# !pip install transformers --upgrade

In [25]:
import csv

import torch
from torch import nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import AdamW

In [26]:
class QuestionAnsweringClassifier(nn.Module):
    def __init__(self, bert_model='bert-base-uncased', num_labels=2):
        super(QuestionAnsweringClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        # Получаем скрытое состояние [CLS] токена
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs[1]  # outputs.pooler_output in newer versions
        # Пропускаем через классификатор
        logits = self.classifier(cls_output)
        return logits

In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [28]:
tokenizer = BertTokenizer.from_pretrained('/kaggle/input/bert-base-uncased')

In [29]:
tokenized_inputs = []
labels = []

with open('/kaggle/input/virtassessordata/dataset.csv', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for i, row in enumerate(reader):
        labels.append(int(row['label']))

        input_sequence = f"[CLS] {row['text']} [SEP] {row['question']} [SEP] {row['answer']}"

        tokenized = tokenizer(input_sequence, padding='max_length', truncation=True, max_length=512)
        tokenized_inputs.append(tokenized)

input_ids = [x['input_ids'] for x in tokenized_inputs]
attention_masks = [x['attention_mask'] for x in tokenized_inputs]


In [30]:
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)

In [31]:
input_ids_train, input_ids_val, attention_masks_train, attention_masks_val, labels_train, labels_val = train_test_split(
    input_ids, attention_masks, labels, test_size=0.1, random_state=42)


train_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)
val_dataset = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [32]:
batch_size = 16

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [33]:
class F1Loss(nn.Module):
    def __init__(self, epsilon=1e-7):
        super(F1Loss, self).__init__()
        self.epsilon = epsilon

    def forward(self, logits, labels):
        probs = torch.sigmoid(logits)
        
        labels = F.one_hot(labels, num_classes=probs.shape[-1]).float()
        
        true_pos = torch.sum(probs * labels, dim=0)
        pred_pos = torch.sum(probs, dim=0)
        actual_pos = torch.sum(labels, dim=0)
        
        precision = true_pos / (pred_pos + self.epsilon)
        recall = true_pos / (actual_pos + self.epsilon)
        
        f1 = 2 * precision * recall / (precision + recall + self.epsilon)
        
        f1_score = torch.mean(f1)
        
        return 1 - f1_score

In [34]:
model = QuestionAnsweringClassifier('/kaggle/input/bert-base-uncased')
loss_fn = F1Loss()
optimizer = AdamW(model.parameters(), lr=5e-5)

In [35]:
from tqdm import tqdm

def train(model, dataloader, loss_fn, optimizer):
    input_ids, masks, labels = batch

    optimizer.zero_grad()

    outputs = model(input_ids, masks)
    preds = torch.argmax(torch.softmax(outputs, dim=1), dim=1)

    loss = loss_fn(outputs, labels)
    loss_item = loss.item()

    loss.backward()
    optimizer.step()

    return loss_item


In [36]:
def validate(model, dataloader, loss_fn):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            input_ids, masks, labels = batch
            outputs = model(input_ids, masks)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    return average_loss


In [37]:
num_epochs = 5
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}")
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, total=len(train_dataloader), desc="Training"):
        total_loss += train(model, train_dataloader, loss_fn, optimizer)
    train_loss = total_loss / len(train_dataloader)

    print(f"Train Loss: {train_loss}")
    val_loss = validate(model, val_dataloader, loss_fn)
    print(f"Validation Loss: {val_loss}")


Epoch 1


Training: 100%|██████████| 4/4 [02:51<00:00, 42.78s/it]


Train Loss: 0.5391850620508194


Validation: 100%|██████████| 1/1 [00:05<00:00,  5.51s/it]


Validation Loss: 0.47899889945983887
Epoch 2


Training: 100%|██████████| 4/4 [02:43<00:00, 40.98s/it]


Train Loss: 0.46893227100372314


Validation: 100%|██████████| 1/1 [00:05<00:00,  5.50s/it]


Validation Loss: 0.4591088891029358
Epoch 3


Training: 100%|██████████| 4/4 [02:55<00:00, 43.93s/it]


Train Loss: 0.4591663032770157


Validation: 100%|██████████| 1/1 [00:05<00:00,  5.51s/it]


Validation Loss: 0.44518375396728516
Epoch 4


Training: 100%|██████████| 4/4 [02:42<00:00, 40.53s/it]


Train Loss: 0.4389566630125046


Validation: 100%|██████████| 1/1 [00:05<00:00,  5.52s/it]


Validation Loss: 0.43427520990371704
Epoch 5


Training: 100%|██████████| 4/4 [02:43<00:00, 40.78s/it]


Train Loss: 0.4279625862836838


Validation: 100%|██████████| 1/1 [00:06<00:00,  6.07s/it]

Validation Loss: 0.4269086718559265





In [41]:
torch.save(model.state_dict(), 'model5.pth')
