# Imports

In [1]:
!pip install transformers nlpaug



In [2]:
import json
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from transformers import Trainer
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm_notebook as tqdm

from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from transformers import AdamW
from transformers import AutoModel
from transformers import AutoTokenizer


In [3]:
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

# Loading data

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Base model


In [5]:
X_train, X_val, y_train, y_val = train_test_split(pd.DataFrame(train['text']), train[train.columns[2:]], test_size=0.2)

y_train = y_train.reset_index().drop('index',axis = 1)
y_val = y_val.reset_index().drop('index',axis = 1)

X_train = X_train.reset_index().drop('index',axis = 1)
X_val = X_val.reset_index().drop('index',axis = 1)

X_test = pd.DataFrame(test['text'])
y_test = pd.DataFrame(np.zeros((X_test.shape[0],11)))

In [6]:
import nlpaug.augmenter.word as naw
aug = naw.RandomWordAug(aug_p = 0.3)

augmented_random = np.array(aug.augment(list(X_train['text'].values)))

b = pd.DataFrame(columns = X_train.columns)
b['text'] = augmented_random

X_train = X_train.append(b, ignore_index=True)
y_train = y_train.append(y_train, ignore_index=True)

In [7]:
X_train

Unnamed: 0,text
0,Как производить профилактику расстройства пище...
1,Появились пролысины на шее у коровы.\nПодскажи...
2,Подскажите советом!!! Теленку 3 недели взяли д...
3,"У лейкозной коровы язвочки на сосках, что это?..."
4,"Тёлка месячная отказывается есть и поносит, чт..."
...,...
465,"Здравствуйте форумчане, хочу корову, но столкн..."
466,"всем привет, утром обнаружил слабость у телки,..."
467,"У у телки на шее шишка, и язвы начало еще боль..."
468,Телята дорастают до определённого возраста и п...


**Обучаем модель**

In [8]:
MODEL_NAME = "sberbank-ai/ruRoberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [9]:
tokenized = list()
[tokenized.append(tokenizer.tokenize(t)) for t in X_train.text]
tokenized = np.array(tokenized)
length = [len(i) for i in tokenized]
l = np.percentile(length,95)
MAX_LEN = int(l)

  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:

class CowDataset(Dataset):
    def __init__(self, ids, text, targets, tokenizer, max_len):
        self.ids = ids
        self.text = text
        self.targets = targets.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        target = self.targets[item]
        ids = self.ids[item]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        return {
            'id': ids,
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(target, dtype=torch.long)
        }


def create_data_loader(ids, text, targets, tokenizer, batch_size, max_len):
    dataset = CowDataset(
        ids = ids,
        text= text,
        targets=targets,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
    )


class CowDoctorlassifier(nn.Module):
    def __init__(self, n_classes, not_use_pool=False):
        super(CowDoctorlassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 512)
        self.fc2 = nn.Linear(512, n_classes)
        self.relu = nn.ReLU()
        self.not_use_pool = not_use_pool

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)

        last_hidden_state = outputs[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        last_hidden_state[input_mask_expanded == 0] = -1e9 
        max_emb = torch.max(input_mask_expanded * last_hidden_state, 1)[0]
        emb = max_emb

        if self.not_use_pool:
          last_hidden_state_cls = outputs[0][:, 0, :]
          emb = last_hidden_state_cls

        out =  self.fc2(
                self.relu(
                    self.drop(
                        self.fc1(emb)
                        )
                    )
                )

        return {
            'logits' : out
        }

In [11]:
train_ds = CowDataset(
    ids=X_train.index.to_numpy(),
    text=X_train.text,
    targets=y_train,
    tokenizer=tokenizer,
    max_len=MAX_LEN
    )
eval_ds = CowDataset(
    ids=X_val.index.to_numpy(),
    text=X_val.text,
    targets=y_val,
    tokenizer=tokenizer,
    max_len=MAX_LEN
    )
test_ds = CowDataset(
    ids=X_test.index.to_numpy(),
    text=X_test.text,
    targets=y_test,
    tokenizer=tokenizer,
    max_len=MAX_LEN
    )


In [12]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(inputs.get('input_ids'), inputs.get('attention_mask'))
        logits = outputs.get('logits')
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, 11),
                        labels.float().view(-1, 11))
        return (loss, outputs) if return_outputs else loss


def log_loss_score(gt, pr):
    
    log_loss_ = 0
    
    gt = np.array(gt)
    
    for i in range(10):
        log_loss_ += log_loss(gt[:, i], pr[:, i])
        
    return log_loss_ / 10

In [13]:
import gc 
gc.collect() 
torch.cuda.empty_cache()

In [14]:
from transformers import TrainingArguments

model = CowDoctorlassifier(n_classes=11, not_use_pool = True)

training_args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    lr_scheduler_type = "cosine",
    learning_rate = 2e-5,
    eval_steps=25,
    logging_steps=25,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    seed=SEED,
    load_best_model_at_end=True,
)

trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset = train_ds,
    eval_dataset= eval_ds,
)

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to b

In [15]:
trainer.train()

***** Running training *****
  Num examples = 470
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 354


Step,Training Loss,Validation Loss
25,0.5018,0.428977
50,0.4372,0.420348
75,0.3942,0.370766
100,0.3546,0.318445
125,0.2854,0.284371
150,0.2305,0.277428
175,0.2063,0.255368
200,0.1765,0.253878
225,0.1667,0.248366
250,0.1457,0.24838


***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8
***** Running Evaluation *****
  Num examples = 59
  Batch size = 8


Training completed. Do not forget to share you

TrainOutput(global_step=354, training_loss=0.2432732750466988, metrics={'train_runtime': 261.9125, 'train_samples_per_second': 10.767, 'train_steps_per_second': 1.352, 'total_flos': 0.0, 'train_loss': 0.2432732750466988, 'epoch': 6.0})

In [16]:
out,_,metrics = trainer.predict(eval_ds)

***** Running Prediction *****
  Num examples = 59
  Batch size = 8


In [17]:
logits = torch.sigmoid(torch.tensor(out))
logits = logits.numpy()
(1-log_loss_score(y_val, logits))*0.8

0.6104464908676633

# Submisson


In [18]:
submit, _, _ = trainer.predict(test_ds)
submit = torch.sigmoid(torch.tensor(submit)).numpy()


***** Running Prediction *****
  Num examples = 99
  Batch size = 8


In [19]:
submission_columns = ['text_id'] + list(train.columns[2:-1])
submission = pd.concat([test['text_id'], pd.DataFrame(submit[:,:10])], axis=1)
submission.columns = submission_columns

In [20]:
submission_json = {str(k): {"span": list(), "label": list(v.values())} \
                   for k,v in submission.set_index('text_id').to_dict('index').items()}

submission_json['294']

{'label': [0.9175109267234802,
  0.043323397636413574,
  0.8142229914665222,
  0.8065071105957031,
  0.05114462599158287,
  0.0673268660902977,
  0.046984218060970306,
  0.05588258430361748,
  0.04735879600048065,
  0.04344747215509415],
 'span': []}

In [21]:
with open('sample_submission.json', 'w') as final_submit:
    json.dump(submission_json, final_submit, indent=4)