In [60]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
data = pd.read_excel('for_learning.xlsx')
model_name = "cointegrated/rubert-tiny2"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=21)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
X = list(data["Текст сообщения"])
y = list(data["Разметка"])

In [18]:
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [49]:
X[:10],y[:10]

(['первая - не сработала',
  'Я вообще не понимаю блок-схемы, никогда с ними не сталкивался, а тут их с первого занятия во всю показывают, домашку через gpt 3.5 сделал, но толку 0, я понмаю что написано в условии, но не могу реализовать это в блок-схеме)))',
  'теперь я сделаю из этого динамитную установку',
  'а радиокнопки это тег?',
  'мультяшный; 2024-03-10T12:17:03.419Z',
  'в первом шаге sum оставить = 1?',
  'передвигаться',
  'вагон смотря у кого какой',
  'меня*',
  'payday2; 2024-03-16T15:12:52.091Z'],
 [10, 5, 16, 5, 12, 5, 15, 12, 12, 15])

In [19]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [61]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average = 'macro')
    precision = precision_score(y_true=labels, y_pred=pred, average = 'macro')
    f1 = f1_score(y_true=labels, y_pred=pred, average = 'macro')
    report = classification_report(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1, 'report':report}

# Define Trainer
args = TrainingArguments(
    output_dir="/output",
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    weight_decay=0.1,
    learning_rate = 4e-5,
    warmup_ratio=0.2,
    lr_scheduler_type = 'cosine',
    seed=20222022,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,    
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "              precision    recall  f1-score   support

           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         8
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00         4
           9       0.00      0.00      0.00        10
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00         4
          12       0.42      1.00      0.60        68
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         5
          15       0.00      0.00   

{'eval_loss': 2.1735663414001465, 'eval_accuracy': 0.425, 'eval_precision': 0.0265625, 'eval_recall': 0.0625, 'eval_f1': 0.03728070175438596, 'eval_report': '              precision    recall  f1-score   support\n\n           1       0.00      0.00      0.00         4\n           2       0.00      0.00      0.00         2\n           3       0.00      0.00      0.00         1\n           4       0.00      0.00      0.00         8\n           5       0.00      0.00      0.00        13\n           6       0.00      0.00      0.00         4\n           9       0.00      0.00      0.00        10\n          10       0.00      0.00      0.00         8\n          11       0.00      0.00      0.00         4\n          12       0.42      1.00      0.60        68\n          13       0.00      0.00      0.00         3\n          14       0.00      0.00      0.00         5\n          15       0.00      0.00      0.00        23\n          16       0.00      0.00      0.00         3\n          18   

TrainOutput(global_step=120, training_loss=2.475537872314453, metrics={'train_runtime': 681.9457, 'train_samples_per_second': 2.815, 'train_steps_per_second': 0.176, 'train_loss': 2.475537872314453, 'epoch': 3.0})

In [53]:
import pandas as pd
data = pd.read_excel("for_learning(3).xlsx")
data.head()

Unnamed: 0,ID урока,Дата старта урока,Роль пользователя,Текст сообщения,Дата сообщения,Разметка
0,307752,2024-03-13 09:51:49,user,https://diresnode.com/; 2024-03-13T10:12:22.638Z,2024-03-13 10:12:22,18
1,352174,2024-03-02 12:50:24,user,z dfc yt cksie,2024-03-02 13:02:49,15
2,352174,2024-03-02 12:50:24,user,gjxtve nj,2024-03-02 13:02:50,15
3,352174,2024-03-02 12:50:24,user,окей; 2024-03-02T13:03:16.429Z,2024-03-02 13:03:16,12
4,352174,2024-03-02 12:50:24,user,если не с хрома или яндекса сидишь то может по...,2024-03-02 13:05:17,10


In [38]:
data['14'][4]

'щас родители придут | Группа: Группа: [Родительское сообщение]. Описание группы: Родительское сообщение.'

In [59]:
data = pd.read_excel("for_learning(3).xlsx")

In [60]:
data['Разметка'].value_counts()

12    1473
15     333
4      226
11     188
9      160
6      148
10     132
14      93
5       89
13      77
1       65
20      57
16      52
18      41
7       29
2       18
3       16
17       6
19       3
8        1
Name: Разметка, dtype: int64

In [58]:
usefull = [2,3,4,5,6,7,8,9,10,11,18]
useless = [1,12,13,14,15,16,17,19,20]

In [49]:
usefull_dataset = pd.DataFrame(columns=data.columns)
useless_dataset = pd.DataFrame(columns=data.columns)

In [56]:
for i in range(len(data)):
    if data['Разметка'][i] in usefull:
        data['Разметка'][i]=1
    else:
        data['Разметка'][i]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Разметка'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Разметка'][i]=0


In [61]:
for i in range(len(data)):
    if data['Разметка'][i] in usefull:
        usefull_dataset = usefull_dataset.append(data.iloc[i])
    else:
        useless_dataset = useless_dataset.append(data.iloc[i])

  usefull_dataset = usefull_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  usefull_dataset = usefull_dataset.append(data.iloc[i])
  usefull_dataset = usefull_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dataset.append(data.iloc[i])
  useless_dataset = useless_dat

In [62]:
useless_dataset.to_csv('useless_bigger.csv',index=True)
usefull_dataset.to_csv('usefull_bigger.csv',index=True)

In [None]:
useless_dataset