In [None]:
import wandb
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.notebook import tqdm

from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer

from nn import eval_model, train_epoch, get_predictions, TgClassifier, create_data_loader, TextDataset
from utils import save, calc_mae, cleanup, calc_accuracy, get_labels
from sklearn.metrics import accuracy_score
from config import cfg
import gc


[K     |████████████████████████████████| 4.0 MB 11.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 26.2 MB/s 
[K     |████████████████████████████████| 895 kB 42.3 MB/s 
[K     |████████████████████████████████| 596 kB 50.2 MB/s 
[K     |████████████████████████████████| 77 kB 7.1 MB/s 
[K     |████████████████████████████████| 1.8 MB 11.3 MB/s 
[K     |████████████████████████████████| 181 kB 47.0 MB/s 
[K     |████████████████████████████████| 144 kB 52.8 MB/s 
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [None]:
# Seed every random number generator this notebook touches (PyTorch, NumPy,
# Python's stdlib) from the single value stored in the project config.
# `random.seed` stays last: it returns None, so the cell displays nothing.
torch.manual_seed(cfg['SEED'])
np.random.seed(cfg['SEED'])
random.seed(cfg['SEED'])

In [None]:
# Load the three pre-processed corpora from the sibling `data` directory.
# The paths previously read '..data/...' (no separator after '..'), which
# points at a folder literally named '..data' rather than at '../data'.
#
# For each frame:
#   * reset_index() moves the index produced by read_csv into an 'index' column
#     and installs a fresh 0..n-1 index, which the later column-wise
#     pd.concat with the prediction frames aligns on;
#   * fillna(' ') replaces every missing value with a single space
#     (presumably so the tokenizer never receives NaN - TODO confirm).
mt = pd.read_csv('../data/mt_processed.csv').reset_index().fillna(' ')
tg_other = pd.read_csv('../data/tg_other_processed.csv').reset_index().fillna(' ')
tg_toloka = pd.read_csv('../data/tg_toloka_processed.csv').reset_index().fillna(' ')

In [None]:
# Re-encode the Market Twits labels: -1 is folded into class 2, while 0, 1
# and 2 are kept as they are. Series.map yields NaN for any value that is not
# a key of the dict, so unexpected labels surface as NaN instead of passing
# through silently.
mt['label'] = mt['label'].map({
    -1: 2,
    0: 0,
    1: 1,
    2: 2,
})

In [None]:
# Identifier of the pretrained model, read from the shared project config.
MODEL_NAME = cfg['model_name']
# Tokenizer belonging to that model; the tokenizer files are fetched on first
# use (see the download progress bars in this cell's output).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# All three corpora go through `create_data_loader` with identical settings,
# so the shared arguments are collected once. shuffle=False matters: the
# inference cells below glue the predictions back onto the source frames
# column-wise, which presumably relies on batches coming out in row order.
loader_kwargs = dict(
    tokenizer=tokenizer,
    max_len=384,
    batch_size=32,
    shuffle=False,
)

# Market Twits: labelled data.
dl_mt = create_data_loader(text=mt.message, labels=mt.label, **loader_kwargs)

# Telegram (other): the loader is given a constant 0 label column here; that
# column is dropped again before the predictions for this corpus are exported.
tg_other['label'] = 0
dl_other = create_data_loader(text=tg_other.message, labels=tg_other.label, **loader_kwargs)

# Telegram (Toloka): labelled data.
dl_toloka = create_data_loader(text=tg_toloka.message, labels=tg_toloka.label, **loader_kwargs)

In [None]:
# Use the GPU when one is present, otherwise fall back to the CPU instead of
# failing on `model.to('cuda')`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Classifier with 3 output classes (argument passed to TgClassifier).
model = TgClassifier(3)
model.to(device)
# This notebook only runs inference, so switch off dropout / other
# train-time behaviour explicitly rather than relying on the helper to do it.
model.eval()

# map_location remaps the stored tensors onto `device`; without it torch.load
# tries to restore them on the device they were saved from, which breaks on
# CPU-only hosts or when the GPU index differs.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
checkpoint = torch.load('./ruBERT-new-model/model_F.bin', map_location=device)

# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(checkpoint['model_state_dict'])

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

### Inference




#### Market Twits

In [None]:
# Run the model over the Market Twits loader; only the first element of the
# returned pair is used here.
prediction, _ = get_predictions(model, dl_mt)
temp = get_labels(prediction)

# Put the output of get_labels next to the source rows (column-wise concat),
# rename the original `label` column to `true_label`, and write the result to
# disk without the DataFrame index.
labelled_mt = pd.concat([mt, temp], axis=1)
labelled_mt = labelled_mt.rename(columns={'label': 'true_label'})
labelled_mt.to_csv('labelled_mt_F.csv', index=False)

  0%|          | 0/395 [00:00<?, ?it/s]

#### Telegram (toloka)

In [None]:
# Run the model over the Toloka-labelled Telegram loader; only the first
# element of the returned pair is used here.
test_prediction, _ = get_predictions(model, dl_toloka)
temp = get_labels(test_prediction)

# Put the output of get_labels next to the source rows (column-wise concat),
# rename the original `label` column to `true_label`, and write the result to
# disk without the DataFrame index.
labelled_tg = pd.concat([tg_toloka, temp], axis=1)
labelled_tg = labelled_tg.rename(columns={'label': 'true_label'})
labelled_tg.to_csv('labelled_tg_F.csv', index=False)

  0%|          | 0/78 [00:00<?, ?it/s]

#### Telegram (other)

In [None]:
# Project helper from `utils`, called before the largest inference run
# (presumably frees cached memory - TODO confirm against utils.cleanup).
cleanup()

# Predict on the data loader built for the "other" Telegram corpus. The call
# previously received the raw `tg_other` DataFrame, unlike the two inference
# cells above, which pass their loaders; `dl_other` was built but never used.
new_prediction, _ = get_predictions(model, dl_other)
temp = get_labels(new_prediction)

# Attach the output of get_labels to the source rows, drop the constant `label`
# column that was set to 0 when the loader was built, and export without the
# DataFrame index.
pd.concat([tg_other, temp], axis=1).drop('label', axis=1).to_csv('labelled_new_data_F.csv', index=False)

  0%|          | 0/533 [00:00<?, ?it/s]