In [30]:
import wandb
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AdamW, AutoModel, get_cosine_schedule_with_warmup
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report, f1_score
from nn import eval_model, train_epoch, get_predictions, TgClassifier, create_data_loader, TextDataset
from utils import save, calc_mae, cleanup, calc_accuracy
from config import cfg
import gc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
wandb.login()
wandb.init(project="tg-markettwits-clf-tickers", config=cfg, entity="mikezz1")



True

In [34]:
np.random.seed(cfg['SEED'])
torch.manual_seed(cfg['SEED'])
random.seed(cfg['SEED'])

In [35]:
data = pd.read_csv('../input/markettwitslabeleddataset/mt-cleaned-labeled-tickers4.csv')
labelled_data = pd.read_csv('../input/markettwitslabeleddataset/final_2.csv')

In [36]:
data['label'] = data['label'].map({-1:2,0:0,1:1})
data.date = data.date.apply(pd.to_datetime)

In [38]:
labelled_train, labelled_test = train_test_split(labelled_data, test_size = 0.2, random_state=cfg['SEED'])
data_train, data_test = train_test_split(data, test_size = 0.2, random_state=cfg['SEED'])
data_train.shape, labelled_train.shape

((12520, 7), (1996, 11))

In [39]:
labelled_test = labelled_test.reset_index()
data_test = data_test.reset_index()

In [40]:
data_train = pd.concat([data_train, labelled_train.loc[:, data_train.columns.values]])

In [41]:
X_train, y_train = data_train['message'],data_train['label']
X_val, y_val = data_test['message'],data_test['label']

In [42]:
X_train = X_train.reset_index().fillna(' ')
X_val = X_val.reset_index().fillna(' ')
y_train=y_train.reset_index()
y_val = y_val.reset_index()

In [43]:
MODEL_NAME = cfg['model_name']
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

data_tokenized = [tokenizer.encode(str(x), add_special_tokens=True) for x in data.message]
np.quantile(list(map(len, data_tokenized)),0.99)

308.0

In [46]:
train_dl = create_data_loader(
    text=X_train.message,
    labels=y_train.label,
    tokenizer=tokenizer,
    max_len=cfg['max_len'],
    batch_size=16,
    shuffle=True
    )
val_dl = create_data_loader(
    text=X_val.message,
    labels=y_val.label,
    tokenizer=tokenizer,
    max_len=cfg['max_len'],
    batch_size=24,
    shuffle=False
    )

test_dl = create_data_loader(
    text=labelled_test.message,
    labels=labelled_test.label,
    tokenizer=tokenizer,
    max_len=cfg['max_len'],
    batch_size=24,
    shuffle=False
    )

In [47]:
def get_optimizer_grouped_parameters(
    model, model_type, 
    learning_rate, weight_decay, 
    layerwise_learning_rate_decay
):
    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if "classifier" in n or "pooler" in n],
            "weight_decay": 0.0,
            "lr": learning_rate,
        },
    ]
    layers = [getattr(model, model_type).embeddings] + list(getattr(model, model_type).encoder.layer)
    layers.reverse()
    lr = learning_rate
    for layer in layers:
        lr *= layerwise_learning_rate_decay
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": weight_decay,
                "lr": lr,
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
    return optimizer_grouped_parameters

## Тюним модель

In [48]:
EPOCHS=cfg['epochs']
device = 'cuda'
model = TgClassifier(3)
if cfg['freeze']:
    for param in model.bert.parameters():
        param.requires_grad = False
model.to(device)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=cfg['lr'], weight_decay= cfg['weight_decay'],
                  correct_bias = cfg['correct_bias'])


if cfg['layerwise_lr']:
    grouped_optimizer_params = get_optimizer_grouped_parameters(
        model,
        'bert', 
        learning_rate=cfg['lr'],
        weight_decay = cfg['weight_decay'], 
        layerwise_learning_rate_decay= cfg['layerwise_learning_rate_decay']
    )
    optimizer = AdamW(
        grouped_optimizer_params,
        lr=cfg['lr'],
        correct_bias=cfg['correct_bias']
    )

        
eval_steps = len(train_dl) // 3
max_train_steps= len(train_dl) * EPOCHS
warmup_steps= int(max_train_steps / 40)
scheduler =  get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=max_train_steps
            )


In [51]:
cleanup()
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1:2d}/{EPOCHS:2d}')
    print('-' * 10)
    train_epoch(model, loss_fn, optimizer, train_dl, val_dl, scheduler, device, eval_steps=eval_steps, test_data_loader=test_dl)
    save(model, f'/kaggle/working/bert_model/model_epoch_{epoch}.bin')

Epoch  1/ 3
----------


TRAIN:   0%|          | 0/908 [00:00<?, ?it/s]

=== Validation ===


VALIDATION:   0%|          | 0/131 [00:00<?, ?it/s]

Validation loss: 0.7476789475852297


TESTING:   0%|          | 0/21 [00:00<?, ?it/s]

Test loss: 0.7909988505499703
=== Validation ===


VALIDATION:   0%|          | 0/131 [00:00<?, ?it/s]

Validation loss: 0.7499045779686848


TESTING:   0%|          | 0/21 [00:00<?, ?it/s]

Test loss: 0.7352837707315173
=== Validation ===


VALIDATION:   0%|          | 0/131 [00:00<?, ?it/s]

Validation loss: 0.6457858019657717


TESTING:   0%|          | 0/21 [00:00<?, ?it/s]

Test loss: 0.7053616784867787
Epoch  2/ 3
----------


TRAIN:   0%|          | 0/908 [00:00<?, ?it/s]

=== Validation ===


VALIDATION:   0%|          | 0/131 [00:00<?, ?it/s]

Validation loss: 0.7375248013106921


TESTING:   0%|          | 0/21 [00:00<?, ?it/s]

Test loss: 0.81131340776171
=== Validation ===


VALIDATION:   0%|          | 0/131 [00:00<?, ?it/s]

Validation loss: 0.6135035276640463


TESTING:   0%|          | 0/21 [00:00<?, ?it/s]

Test loss: 0.7741583415440151
=== Validation ===


VALIDATION:   0%|          | 0/131 [00:00<?, ?it/s]

Validation loss: 0.6224048098990025


TESTING:   0%|          | 0/21 [00:00<?, ?it/s]

Test loss: 0.8046778596582866
Epoch  3/ 3
----------


TRAIN:   0%|          | 0/908 [00:00<?, ?it/s]

=== Validation ===


VALIDATION:   0%|          | 0/131 [00:00<?, ?it/s]

Validation loss: 0.763954046687097


TESTING:   0%|          | 0/21 [00:00<?, ?it/s]

Test loss: 0.9259515575000218
=== Validation ===


VALIDATION:   0%|          | 0/131 [00:00<?, ?it/s]

Validation loss: 0.7603125898665144


TESTING:   0%|          | 0/21 [00:00<?, ?it/s]

Test loss: 0.96738163346336
=== Validation ===


VALIDATION:   0%|          | 0/131 [00:00<?, ?it/s]

Validation loss: 0.7494991208988292


TESTING:   0%|          | 0/21 [00:00<?, ?it/s]

Test loss: 0.9640765814554124


### Метрики:

### Валидация - MarketTwits

In [52]:
prediction, true = get_predictions(model, val_dl)

  0%|          | 0/131 [00:00<?, ?it/s]

In [53]:
labels_predicted = pd.DataFrame(F.softmax(prediction,dim=1) \
                                .argmax(dim=1))

In [54]:
f1_macro = f1_score(labels_predicted, y_val.label, average='macro')
accuracy = accuracy_score(labels_predicted, y_val.label)
wandb.log({'f1_macro_mt': f1_macro})
wandb.log({'accuracy_mt': accuracy})
print(classification_report(labels_predicted, y_val.label))

              precision    recall  f1-score   support

           0       0.79      0.80      0.80      1269
           1       0.74      0.73      0.74       702
           2       0.76      0.76      0.76      1159

    accuracy                           0.77      3130
   macro avg       0.76      0.76      0.76      3130
weighted avg       0.77      0.77      0.77      3130



### Тест  - прочие каналы (Толока)

In [55]:
test_prediction, test_true = get_predictions(model, test_dl)

  0%|          | 0/21 [00:00<?, ?it/s]

In [56]:
test_labels_predicted = pd.DataFrame(F.softmax(test_prediction,dim=1) \
                                .argmax(dim=1))
labelled_test['pred'] = test_labels_predicted

In [57]:
f1_macro = f1_score(test_labels_predicted, labelled_test.label, average='macro')
accuracy = accuracy_score(test_labels_predicted, labelled_test.label)
wandb.log({'f1_macro_tg': f1_macro})
wandb.log({'accuracy_tg': accuracy})
print(classification_report(test_labels_predicted, labelled_test.label))

              precision    recall  f1-score   support

           0       0.71      0.69      0.70       200
           1       0.72      0.76      0.74       182
           2       0.66      0.62      0.64       118

    accuracy                           0.70       500
   macro avg       0.70      0.69      0.69       500
weighted avg       0.70      0.70      0.70       500

