In [246]:
#!g1.1
import numpy as np
import pandas as pd
import os

from IPython.display import clear_output

from pathlib import Path

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm

import random

import transformers
from transformers import BertModel
from transformers.models.bert.tokenization_bert import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

from torch.utils.tensorboard import SummaryWriter

In [247]:
#!g1.1
MAX_LENGTH = 50
EPOCHS = 500

In [248]:
#!g1.1
def fix_seed(seed: int = 42) -> None:
    """
    Set seeds for reproducibility
    :param seed: seed value
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [249]:
#!g1.1
fix_seed(42)

model_cache_path = Path('./model_cache')
model_cache_path.mkdir(parents=True, exist_ok=True)

base_model_name = "cointegrated/rubert-tiny"

trained_model_path = Path('./trained_model_artifacts')
trained_model_path.mkdir(parents=True, exist_ok=True)


def plot_losses(train_losses, test_losses, train_accuracies, test_accuracies):
    clear_output()
    fig, axs = plt.subplots(1, 2, figsize=(13, 4))
    axs[0].plot(range(1, len(train_losses) + 1), train_losses, label='train')
    axs[0].plot(range(1, len(test_losses) + 1), test_losses, label='test')
    axs[0].set_ylabel('loss')

    axs[1].plot(range(1, len(train_accuracies) + 1), train_accuracies, label='train')
    axs[1].plot(range(1, len(test_accuracies) + 1), test_accuracies, label='test')
    axs[1].set_ylabel('metric')

    for ax in axs:
        ax.set_xlabel('epoch')
        ax.legend()

    plt.show()
    
os.listdir('.')
os.listdir('../../mnt/s3/hsedatafitpredict1392')
train_df = pd.read_parquet('../../mnt/s3/hsedatafitpredict1392/mega_train.parquet')

train_use_df = train_df[['sentence', 'communication', 'price', 'quality', 'safety']]
train_use_df[['communication', 'price', 'quality', 'safety']]

def only_char_left(text):
    text_new = ''
    for i in text:
        if i.isalpha() or i == ' ':
            text_new += i
    return text_new
        
    
train_use_df['sentence'] = train_use_df['sentence'].apply(lambda x: only_char_left(x))
train_use_df['sentence'] = train_use_df['sentence'].apply(lambda x: ' '.join(x.lower().split(' ')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_use_df['sentence'] = train_use_df['sentence'].apply(lambda x: only_char_left(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_use_df['sentence'] = train_use_df['sentence'].apply(lambda x: ' '.join(x.lower().split(' ')))


In [13]:
#!g1.1
train_use_df['temp'] = train_use_df['communication'] + train_use_df['price'] + train_use_df['quality'] + train_use_df['safety']
train_use_df = train_use_df[train_use_df.temp != 0]
train_use_df = train_use_df[['sentence', 'communication', 'price', 'quality', 'safety']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_use_df['temp'] = train_use_df['communication'] + train_use_df['price'] + train_use_df['quality'] + train_use_df['safety']


In [19]:
#!g1.1
train_use_df.loc[:, 'communication'] = (~(train_use_df.loc[:, 'communication'] == 0.0)).astype(int)
train_use_df.loc[:, 'price'] = (~(train_use_df.loc[:, 'price'] == 0.0)).astype(int)
train_use_df.loc[:, 'quality'] = (~(train_use_df.loc[:, 'quality'] == 0.0)).astype(int)
train_use_df.loc[:, 'safety'] = (~(train_use_df.loc[:, 'safety'] == 0.0)).astype(int)

In [20]:
#!g1.1
train_use_df

Unnamed: 0,sentence,communication,price,quality,safety
0,хотелось бы выразить огромную благодарность...,0,0,1,0
1,на что сотрудник банка ответила мне что данну...,1,0,0,0
2,тем самым оставив меня без средств к существо...,0,0,1,0
4,заблокировали счет якобы изза просроченой зад...,1,0,0,0
5,в итоге даже не извинилисьдолго искали картуот...,1,0,1,0
...,...,...,...,...,...
7159,я просто в шоке ничего о том что сосчет можно ...,1,0,0,1
7160,я просто не в состоянии заплатить такую сумму ...,0,1,1,0
7162,я у сотрудника спросила получила ответ но сотр...,1,0,0,1
7163,восхищаюсь я проходит еще неделя ну я думаю и...,1,0,0,0


In [234]:
#!g1.1
tr, valid = train_test_split(train_use_df, random_state=101, test_size=0.1)

In [235]:
#!g1.1
tr.head()

Unnamed: 0,sentence,communication,price,quality,safety
1478,даже не хочется думать какие препоны нас ждут ...,2.0,0.0,1.0,0.0
1922,звонить бесполезно хоть на час хоть на часов,3.0,0.0,0.0,0.0
2601,менеджер бойко отвечает в банкоматах райффайзе...,1.0,0.0,0.0,0.0
972,видно что есть определенные стандарты работы и...,1.0,0.0,1.0,0.0
4512,последнее обращение составлено октября до си...,2.0,0.0,0.0,0.0


In [236]:
#!g1.1
tr.sum(axis=0)

sentence         даже не хочется думать какие препоны нас ждут ...
communication                                                 6906
price                                                          435
quality                                                       5350
safety                                                         259
dtype: object

In [237]:
#!g1.1
tr_price = tr[(tr.price == 1) ] # & (tr.communication == 0) & (tr.quality == 0)
tr_safety = tr[(tr.safety == 1) ]  # & (tr.communication == 0) & (tr.quality == 0)

In [238]:
#!g1.1
tr_ups = tr.copy()
for i in range(10):
    tr_ups = pd.concat([tr_ups, tr_price, tr_safety], axis=0)

In [239]:
#!g1.1
for i in range(4):
    tr_ups = pd.concat([tr_ups, tr_safety], axis=0)

In [240]:
#!g1.1
for i in range(2):
    tr_ups = pd.concat([tr_ups, tr_safety], axis=0)

In [241]:
#!g1.1
# print(tr.shape)
tr_ups.sum(axis=0)

sentence         даже не хочется думать какие препоны нас ждут ...
communication                                                10020
price                                                         2261
quality                                                       8444
safety                                                        2403
dtype: object

In [242]:
#!g1.1
labels = train_use_df.columns[1:].tolist()
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['communication', 'price', 'quality', 'safety']

In [243]:
#!g1.1
id2label

{0: 'communication', 1: 'price', 2: 'quality', 3: 'safety'}

In [244]:
#!g1.1
# tr.loc[:, labels]

In [109]:
#!g1.1
class BERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.targets = df.loc[:, labels].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index, 0]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }


In [198]:
#!g1.1
base_rubert = BertModel.from_pretrained(base_model_name, cache_dir = model_cache_path)
tokenizer = transformers.BertTokenizer.from_pretrained(base_model_name, cache_dir = model_cache_path)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [250]:
#!g1.1
train_dataset = BERTDataset(tr_ups, tokenizer, MAX_LENGTH)
valid_dataset = BERTDataset(valid, tokenizer, MAX_LENGTH)

NameError: name 'BERTDataset' is not defined

In [195]:
#!g1.1
train_loader = DataLoader(train_dataset, batch_size=256, 
                          shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, 
                          shuffle=False)

NameError: name 'train_dataset' is not defined

In [196]:
#!g1.1
class CategoriesBERT(nn.Module):
    def __init__(self, 
                 base_model: BertModel,
                 dropout: float = 0.3,
                 last_embedding_dim: int = 312, 
                 classification_dim: int = 4):
        
        super(CategoriesBERT, self).__init__()
        
        self.bert_model = base_model
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(last_embedding_dim, classification_dim)
        self.relu = nn.ReLU()
        
    def forward(self, ids, mask, token_type_ids):        
        _, pooled_output = self.bert_model(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        
        return final_layer

In [197]:
#!g1.1
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [177]:
#!g1.1
device = torch.cuda.current_device()
model = CategoriesBERT(base_model=base_rubert)
model.to(device)
optimizer = AdamW(params = model.parameters(), lr=0.00001, weight_decay=1e-7)

In [178]:
#!g1.1
best_metric = -1000

In [179]:
#!g1.1
def train(epoch):
    model.train()
    for data in train_loader:
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
    if epoch % 20 == 0:
        print('________________')
        print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        outputs, targets = validation()
        ans = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, ans)
        f1_score_micro = metrics.f1_score(targets, ans, average='micro')
        f1_score_macro = metrics.f1_score(targets, ans, average='macro')
        roc_auc = metrics.roc_auc_score(targets, outputs, average='weighted', multi_class='ovr')
        print(f"Accuracy Score = {accuracy}")
        print(f"F1 Score (Micro) = {f1_score_micro}")
        print(f"F1 Score (Macro) = {f1_score_macro}")
        print(f"Roc AUC (Macro) = {roc_auc}")

        if roc_auc > best_metric:
            torch.save(model.state_dict(), 'multi_label_model.bin')
    else:
        print('________________')
        print(f'Epoch: {epoch}, Loss:  {loss.item()}')

In [180]:
#!g1.1
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [181]:
#!g1.1
for epoch in range(EPOCHS):
    train(epoch)

________________
Epoch: 0, Loss:  0.7342463135719299
Accuracy Score = 0.0
F1 Score (Micro) = 0.5090370370370371
F1 Score (Macro) = 0.4293079418728047
Roc AUC (Macro) = 0.5426747860197787


KeyboardInterrupt: 

# Predict

In [190]:
#!g1.1
test_df = pd.read_csv('../../mnt/s3/hsedatafitpredict1392/new_test.csv', index_col = 0)

In [199]:
#!g1.1
model = CategoriesBERT(base_model=base_rubert)
model.load_state_dict(torch.load('multi_label_model.bin'))

<All keys matched successfully>

In [201]:
#!g1.1
for lab in labels:
    test_df.loc[:, lab] = np.nan

In [202]:
#!g1.1
test_df.shape

(1000, 5)

In [203]:
#!g1.1
for i in range(len(test_df)):
    model.eval().cpu()
    text = test_df.iloc[i, 0]
    inputs = tokenizer.encode_plus(
        text,
        truncation=True,
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        return_token_type_ids=True
    )
    with torch.no_grad():
        ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0)
        token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long).unsqueeze(0)
        outputs = model(ids, mask, token_type_ids)
        outputs = 1/(1 + np.exp(-outputs.numpy()[0]))
    
    test_df.iloc[i, 1:] = outputs

In [204]:
#!g1.1
id2label

{0: 'communication', 1: 'price', 2: 'quality', 3: 'safety'}

In [205]:
#!g1.1
outputs.numpy()[0]

AttributeError: 'numpy.ndarray' object has no attribute 'numpy'

In [207]:
#!g1.1
test_sent_df = pd.read_csv('../../mnt/s3/hsedatafitpredict1392/test_answer/fit_predict_bert_sent.csv', index_col = 0)

In [208]:
#!g1.1
test_sent_df

Unnamed: 0,texts,?,-,+
4036,15.03.2022 обратился на горячую линию для закр...,0.568540,0.321740,0.109720
5804,"Уже который год в ТКБ не решается ""глобальная ...",0.192277,0.475214,0.332509
2752,Добрый день. Хочу оставить отзыв о пользовании...,0.537145,0.203312,0.259543
1921,"Добрый день Сегодня, зайдя в свой личный кабин...",0.373189,0.205419,0.421392
7374,"Обслуживаюсь в Тинькофф пару лет, возникла жес...",0.420800,0.176457,0.402744
...,...,...,...,...
146,Отвратительный сервис и отношение к клиентам! ...,0.147801,0.170057,0.682142
2677,28.04.2022 обратилась в банк о возможности пер...,0.762713,0.097972,0.139315
4481,В начале 2021 года была акция по выплате 8% ке...,0.614543,0.185364,0.200093
4112,Бездействие банка и некомпетентность сотрудник...,0.340911,0.241325,0.417763


In [219]:
#!g1.1
total_answer = pd.merge(left=test_sent_df, right=test_df, right_index=True, left_index=True, how='inner')

In [220]:
#!g1.1
total_answer = total_answer[['texts_x', '?', '-', '+', 'communication', 'price', 'quality','safety']]
total_answer.columns = ['texts', '?', '-', '+', 'communication', 'price', 'quality','safety']

In [223]:
#!g1.1
idx = 555
total_answer.iloc[idx].texts

'Здравствуйте !!! Сегодня, мне позвонили, из Московского Коммерческого Банка, с телефона, +79*****1124, с предложением, потребительского кредита и не дослушав, мой ответ, БРОСИЛИ ТРУБКУ !!! Я конечно, всё понимаю, но с таким отношением, к...'

In [225]:
#!g1.1
total_answer.loc[:, 'second_category'] = 0

In [229]:
#!g1.1
total_answer.loc[:, 'second_category'] = np.where((total_answer[['communication', 'price', 
                                                                'quality','safety']] > 0.5).sum(axis=1) >= 2, 1, 0)

In [231]:
#!g1.1
total_answer.loc[3925]

texts              После введенных ЦБ ограничений выдача наличных...
?                                                           0.395454
-                                                              0.484
+                                                           0.120546
communication                                               0.504164
price                                                       0.601446
quality                                                     0.637835
safety                                                           0.5
second_category                                                    1
Name: 3925, dtype: object

In [228]:
#!g1.1
total_answer

Unnamed: 0,texts,?,-,+,communication,price,quality,safety,second_category
3925,После введенных ЦБ ограничений выдача наличных...,0.395454,0.484000,0.120546,0.504164,0.601446,0.637835,0.500000,0
7492,02.08.2021 я передал в ПАО Сбербанк заявление ...,0.611788,0.263567,0.124645,0.500000,0.658181,0.623527,0.538851,0
6856,"Взял потребительский кредит в Совкомбанке, за ...",0.392900,0.476742,0.130359,0.500000,0.606375,0.577957,0.581178,0
4506,14.02.2022г на счет моей матери 408179********...,0.283775,0.485782,0.230443,0.500000,0.701118,0.563654,0.559280,0
6511,Пытаюсь продлить страховой полис. На сайте нет...,0.091968,0.848435,0.059596,0.507730,0.500000,0.563039,0.500000,0
...,...,...,...,...,...,...,...,...,...
4196,11 .03 обратилась в коллцентр банка с заявкой ...,0.440657,0.492002,0.067342,0.500000,0.562620,0.500000,0.522822,0
115,"9 октября 2022г., я снимала деньги с карты Газ...",0.155579,0.738466,0.105955,0.564595,0.614716,0.664643,0.500000,0
5122,Как развидеть и раслышать банк? Более двух лет...,0.066375,0.867250,0.066375,0.516455,0.500000,0.538429,0.500000,0
3922,Хочу поблагодарить службу поддержки Газпромбан...,0.171670,0.162058,0.666273,0.500000,0.560088,0.500000,0.503592,0


In [227]:
#!g1.1
total_answer.loc[4036]

texts              15.03.2022 обратился на горячую линию для закр...
?                                                            0.56854
-                                                            0.32174
+                                                            0.10972
communication                                                    0.5
price                                                            0.5
quality                                                     0.501729
safety                                                           0.5
second_category                                                    0
Name: 4036, dtype: object

In [232]:
#!g1.1
total_answer.to_csv('../../mnt/s3/hsedatafitpredict1392/test_answer/fit_predict_total_answer.csv')

In [None]:
#!g1.1
