# Подготовка данных

In [1]:
%pip install transformers==3.5

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import torch
from torch import nn

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel, AdamW
from torch.optim import SGD
from sklearn.utils.class_weight import compute_class_weight
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def clean_data(data):
    stop = stopwords.words('russian')
    data['message'] = data['message'].replace('/n', ' ')
    data['message'] = data['message'].map(lambda x: x.lower())
    data['message'] = data['message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    return data

In [4]:
# Загрузка обучающих и валидационных данных
dataset = pd.read_csv("pulse_stage1_patch.csv")
dataset["label"] = dataset["label"].astype(int)

train_dataset = dataset.iloc[np.where(dataset.split == 'valid')[0], :].drop(['split'], axis=1)
val_dataset = dataset.iloc[np.where(dataset.split == 'train')[0], :].drop(['split'], axis=1)

# Загрузка тестовых данных
test_dataset = pd.read_csv('test.csv')

In [5]:
train_clean = clean_data(train_dataset)
val_clean = clean_data(val_dataset)

train_clean.label = train_clean.label.astype(int)
val_clean.label = val_clean.label.astype(int)

train_text = train_clean.message
val_text = val_clean.message

train_labels = train_clean.label
val_labels = val_clean.label

In [6]:
# Предзагрузка tokenizer ruBert с https://huggingface.co/
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")

In [7]:
max_token_len = 40
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length=max_token_len,
    padding=True,
    truncation=True,
    return_token_type_ids=False
)

tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(), 
    max_length=max_token_len,
    padding=True,
    truncation=True,
    return_token_type_ids=False
)

In [8]:
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

In [9]:
batch_size = 64

train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

# Построение модели

In [10]:
# Предзагрузка готовой модели ruBert с https://huggingface.co/
bert = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence")

In [11]:
class BERT_Arch(nn.Module):
    def __init__(self, bert):
      
        super(BERT_Arch, self).__init__()
        self.bert = bert 
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # relu activation function
        self.relu =  nn.LeakyReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768, 256)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(256, 2)
        
        #softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):

        #pass the inputs to the model 
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)

        x = self.relu(x)

        x = self.dropout(x)
        
        x = self.fc2(x)

        # apply softmax activation
        x = self.softmax(x)
        
        return x

In [12]:
for name, child in bert.named_children():
    print(name)

embeddings
encoder
pooler


In [13]:
#!g1.1
# freeze all the parameters
for param in bert.parameters():
    param.requires_grad = False
    
model = BERT_Arch(bert)

In [14]:
#!g1.1
device = torch.device("cuda")

model = model.to(device)

In [15]:
#!g1.1
# Observe that all parameters are being optimized
optimizer = AdamW(model.parameters(), lr = 5e-4)


In [16]:
#!g1.1
#compute the class weights
class_wts = compute_class_weight('balanced', np.unique(train_labels), train_labels)
class_wts = [0.25, 0.75]
print(f'class weights are: {class_wts}')
weights= torch.tensor(class_wts,dtype=torch.float)
weights = weights.to(device)

# loss function
cross_entropy  = nn.NLLLoss(weight=weights)

class weights are: [0.25, 0.75]


In [17]:
#!g1.1
def train():
  
    model.train()
    total_loss, total_accuracy = 0, 0
  
    # empty list to save model predictions
    total_preds=[]
  
    # iterate over batches
    for step,batch in enumerate(train_dataloader):
    
    # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # push the batch to gpu
        batch = [r.to(device) for r in batch]
 
        sent_id, mask, labels = batch

        # clear previously calculated gradients 
        model.zero_grad()        

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions are stored on GPU. So, push it to CPU
        preds=preds.detach().cpu().numpy()

        # append the model predictions
        total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)
    #returns the loss and predictions
    return avg_loss, total_preds

In [18]:
#!g1.1
def evaluate():
  
    print("\nEvaluating...")
  
    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0
  
    # empty list to save the model predictions
    total_preds = []
    total_labels = []

    # iterate over batches
    for step,batch in enumerate(val_dataloader):
    
        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to gpu
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch
        
        # deactivate autograd
        with torch.no_grad():
      
            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()
    
            total_labels.append(labels)
            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)
    total_preds = np.argmax(total_preds, axis = 1)
    
    total_labels  = np.concatenate(total_labels, axis=0)
    print(classification_report(total_labels, total_preds))

    return avg_loss, total_preds

In [19]:
#!g1.1
epochs = 20
best_valid_loss = float('inf')

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, _ = train()
    
    #evaluate model
    valid_loss, _ = evaluate()
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'toxic_bert_wts.pt')
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 20
  Batch    50  of    108.
  Batch   100  of    108.

Evaluating...
  Batch    50  of    433.
  Batch   100  of    433.
  Batch   150  of    433.
  Batch   200  of    433.
  Batch   250  of    433.
  Batch   300  of    433.
  Batch   350  of    433.
  Batch   400  of    433.
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     26788
           1       0.36      0.29      0.32       913

    accuracy                           0.96     27701
   macro avg       0.67      0.64      0.65     27701
weighted avg       0.96      0.96      0.96     27701


Training Loss: 0.246
Validation Loss: 0.122

 Epoch 2 / 20
  Batch    50  of    108.
  Batch   100  of    108.

Evaluating...
  Batch    50  of    433.
  Batch   100  of    433.
  Batch   150  of    433.
  Batch   200  of    433.
  Batch   250  of    433.
  Batch   300  of    433.
  Batch   350  of    433.
  Batch   400  of    433.
              precision    recall  f1-score   su

In [20]:
#!g1.1
test_text = test_dataset.text
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(), 
    max_length=40,
    pad_to_max_length=40,
    truncation=True,
    return_token_type_ids=False
)
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])

with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()
preds = np.argmax(preds, axis = 1)


dataframe_pred = pd.DataFrame(data = test_dataset.text)
dataframe_pred.columns = ["text"]
dataframe_pred["toxic"] = preds
dataframe_pred.to_csv("submittion.csv", index=False)




In [21]:
#!g1.1
