In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, RobertaTokenizer, DistilBertTokenizer, DistilBertForSequenceClassification

## Loading dataset

In [4]:
# Locations of the competition CSV files, relative to the notebook.
PATH_TO_TRAIN_DATA = '../input/hseds-texts-2020/train.csv'
PATH_TO_TEST_DATA = '../input/hseds-texts-2020/test.csv'

# Read both splits into DataFrames in one go.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_TO_TRAIN_DATA, PATH_TO_TEST_DATA)
)

## Preprocessing

In [5]:
# Hold out part of the training data for validation. The fixed seed makes the
# split -- and therefore the reported validation loss -- reproducible.
# NOTE: `train` is rebound to the training function further down the notebook,
# so re-running this cell after that point requires reloading the data first.
df_train, df_val = train_test_split(train, random_state=42)

# Model input is the positive and negative parts joined by a single space.
# Missing parts are replaced with '' so they do not turn the whole text into NaN.
train_text = df_train['positive'].fillna('') + ' ' + df_train['negative'].fillna('')
val_text = df_val['positive'].fillna('') + ' ' + df_val['negative'].fillna('')

# Regression targets.
train_labels = df_train['score']
val_labels = df_val['score']

## Loading pretrained model from Hugging Face library

We could also use DistilBERT, a more lightweight variant of the BERT model.

In [6]:
# Single source of truth for the checkpoint, so the encoder and its tokenizer
# cannot drift apart when the pretrained model is swapped.
MODEL_NAME = 'bert-base-uncased'

bert = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




## Tokenize sequences with Bert tokenizer and create dataloaders

In [7]:
# Token length passed to the tokenizer as `max_length`; with padding and
# truncation enabled below, every encoded sequence ends up this long.
seq_len = 200

In [8]:

# Tokenizer settings shared by both splits so they are encoded identically.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`:
# every sequence is padded (or truncated) to exactly `seq_len` tokens.
encode_kwargs = dict(
    max_length=seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
)

tokens_train = tokenizer.batch_encode_plus(train_text.tolist(), **encode_kwargs)
tokens_val = tokenizer.batch_encode_plus(val_text.tolist(), **encode_kwargs)



In [9]:
# Build tensors from the tokenizer output (token ids and attention masks) and
# from the labels. The targets are cast to float32 explicitly so they match the
# model's float output under L1Loss regardless of how the score column was parsed.
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist(), dtype=torch.float32)

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist(), dtype=torch.float32)

In [10]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Bundle token ids, attention masks and targets into datasets.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# Training batches are drawn in random order, validation batches sequentially.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

## Specify model 

We could also freeze all of the downloaded BERT parameters and train only the head on top; however, this results in a much higher MAE.

In [11]:
# freeze parameters
#for param in bert.parameters():
#    param.requires_grad = False

In [12]:
class bert_clf(nn.Module):
    """BERT encoder followed by a 3-layer fully connected head.

    Despite the name, the head emits a single value per input sequence,
    i.e. the model is used as a regressor.
    """

    def __init__(self, bert):
        """
        Args:
            bert: encoder module. Its forward output must expose the pooled
                sequence representation (768 features) at index 1.
        """
        super(bert_clf, self).__init__()

        self.bert = bert

        # 3-layer fully connected head. The author notes it performs much
        # better than a single linear layer.
        self.fc1 = nn.Linear(768,384)
        self.relu =  nn.ReLU()
        self.fc2 = nn.Linear(384,128)
        # Output dim is 1 because it is more convenient to treat the task as
        # regression than as classification over the many discrete score
        # values (the original note mentions 100 classes ranging from 1 to 10).
        self.fc3 = nn.Linear(128,1)
        self.dropout = torch.nn.Dropout(0.1)

    def forward(self, sent_id, mask):
        """Return one prediction per sequence, shaped (batch, 1).

        Args:
            sent_id: token ids, passed to the encoder as its first argument.
            mask: attention mask, passed to the encoder as `attention_mask`.
        """
        # Index the encoder output instead of tuple-unpacking it. Newer
        # transformers releases return a dict-like ModelOutput, and unpacking
        # that yields field names (strings) rather than tensors. Integer
        # indexing works for both the tuple and the ModelOutput form.
        cls_hs = self.bert(sent_id, attention_mask=mask)[1]

        x = self.fc1(cls_hs)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)

        return x

## Training and val loop

In [13]:
def train():
    """Run one optimisation pass over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``device`` and ``train_dataloader``.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses and the
        predictions of all batches concatenated along axis 0.
    """
    model.train()

    n_batches = len(train_dataloader)
    running_loss = 0
    batch_outputs = []

    for step, batch in enumerate(train_dataloader):
        # Move the batch to the target device and unpack it.
        sent_id, mask, labels = [tensor.to(device) for tensor in batch]

        model.zero_grad()

        preds = model(sent_id, mask)
        loss = criterion(preds.flatten(), labels)
        running_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_outputs.append(preds.detach().cpu().numpy())

        # Progress report every 200 batches (batch 0 is skipped).
        if step != 0 and step % 200 == 0:
            print('  Batch {:>5,}  of  {:>5,}. Batch loss {}'.format(step, n_batches, loss))

    avg_loss = running_loss / n_batches

    # Stack the per-batch prediction arrays into one array.
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds




def evaluate():
    """Score the model on ``val_dataloader`` without updating any weights.

    Uses the notebook-level ``model``, ``criterion``, ``device`` and
    ``val_dataloader``.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses and the
        predictions of all batches concatenated along axis 0.
    """
    print("\nEvaluating...")
    model.eval()

    running_loss = 0
    batch_outputs = []

    for batch in val_dataloader:
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        # Gradients are not needed for validation.
        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = criterion(preds.flatten(), labels)

        running_loss += loss.item()
        batch_outputs.append(preds.detach().cpu().numpy())

    avg_loss = running_loss / len(val_dataloader)

    # Stack the per-batch prediction arrays into one array.
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds

## Proceed to training and validating

In [14]:
# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = bert_clf(bert)
model = model.to(device)

# The whole model (encoder + head) is optimised; L1Loss is the mean absolute error.
optimizer = torch.optim.Adam(model.parameters(), lr = 5e-5)
criterion = torch.nn.L1Loss()
epochs = 3

In [None]:
# Lowest validation loss seen so far; only the best checkpoint is kept on disk.
best_valid_loss = float('inf')

# Loss history per epoch (validation is only recorded on epochs where it runs).
train_losses = []
valid_losses = []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss, _ = train()
    train_losses.append(train_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')

    # Validate on even-indexed epochs and always on the final epoch.
    if epoch % 2 != 0 and epoch != epochs - 1:
        continue

    valid_loss, _ = evaluate()

    # Save the weights whenever validation improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert.pt')

    valid_losses.append(valid_loss)
    print(f'Validation Loss: {valid_loss:.3f}')
    
    


 Epoch 1 / 3
  Batch   200  of  2,344. Batch loss 0.8100377321243286
  Batch   400  of  2,344. Batch loss 0.6973710060119629
  Batch   600  of  2,344. Batch loss 0.8882416486740112


In [None]:
# Load the weights of the best model saved during training.
# `map_location` keeps this working when the checkpoint was written on a
# different device than the current one (e.g. saved on GPU, reloaded on CPU).
path = 'bert.pt'
model.load_state_dict(torch.load(path, map_location=device))