In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, RobertaTokenizer, DistilBertTokenizer, DistilBertForSequenceClassification

In [2]:
# Locations of the train / test CSV files, relative to the notebook.
PATH_TO_TRAIN_DATA = '../input/hseds-texts-2020/train.csv'
PATH_TO_TEST_DATA = '../input/hseds-texts-2020/test.csv'

# Read both files into DataFrames in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_TO_TRAIN_DATA, PATH_TO_TEST_DATA)
)

In [3]:
# Hold out a validation split. The fixed seed keeps the split (and therefore
# the reported validation loss) identical across kernel restarts.
df_train, df_val = train_test_split(train, random_state=42)


def _join_review_text(frame):
    """Concatenate the 'positive' and 'negative' columns into one string per row.

    Missing values are replaced by empty strings first: in pandas
    ``NaN + ' ' + 'text'`` evaluates to ``NaN``, which would reach the
    tokenizer as a float instead of a string.

    Args:
        frame: DataFrame with 'positive' and 'negative' text columns.

    Returns:
        A Series of strings aligned with ``frame``'s index.
    """
    return frame['positive'].fillna('') + ' ' + frame['negative'].fillna('')


train_text = _join_review_text(df_train)
val_text = _join_review_text(df_val)
train_labels = df_train['score']
val_labels = df_val['score']

In [27]:
# The encoder and its tokenizer are loaded from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)
bert = AutoModel.from_pretrained(PRETRAINED_CHECKPOINT)

In [5]:
# Fixed token length of every encoded text: the tokenizer calls in this notebook
# pass it as max_length with truncation and padding enabled.
seq_len = 200

In [6]:
def _encode_texts(texts):
    """Tokenize a pandas Series of strings into fixed-length model inputs.

    Every text is truncated or padded to exactly ``seq_len`` tokens.
    ``padding='max_length'`` is used instead of ``pad_to_max_length=True``,
    which is deprecated in the tokenizer API.

    Args:
        texts: Series of raw strings.

    Returns:
        The tokenizer's batch encoding; the notebook reads its 'input_ids'
        and 'attention_mask' entries. Token type ids are not requested.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=seq_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
    )


# tokenize sequences
tokens_train = _encode_texts(train_text)
tokens_val = _encode_texts(val_text)



In [7]:
# get labels, seqs and attentions masks from bert tokenizer
#
# Targets are created as float32 explicitly. Without a dtype, torch.tensor
# infers it from the Python values, so a score column parsed as integers would
# produce int64 targets while the model's final nn.Linear emits float32.
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist(), dtype=torch.float32)

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist(), dtype=torch.float32)

In [8]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Bundle (input ids, attention mask, target) triples for each split.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

# create dataloaders
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

In [19]:
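# Optional: freeze the pretrained BERT parameters so that only the layers added
# on top of it are trained. Left commented out, so the whole network is fine-tuned.
#for param in bert.parameters():
#    param.requires_grad = False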

In [20]:
class bert_clf(nn.Module):
    """Regression head on top of a pretrained encoder.

    The encoder's second output (768 features per sequence) is passed through
    768 -> 384 -> 128 -> 1 fully connected layers with ReLU between them,
    producing one value per input sequence.

    Args:
        bert: encoder module invoked as ``bert(sent_id, attention_mask=mask)``.
            Its output must be indexable, with the per-sequence representation
            of width 768 at position 1.
    """

    def __init__(self, bert):

        super().__init__()

        self.bert = bert
        self.fc1 = nn.Linear(768, 384)

        # we may add dropout and multiple fc layers, but would rather stick to simple model
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(384, 128)
        self.fc3 = nn.Linear(128, 1)
        # Not used in forward(); kept so the module's state_dict keys stay unchanged
        # and previously saved weights still load.
        self.bn = nn.BatchNorm1d(128)

    def forward(self, sent_id, mask):
        """Return a tensor with one prediction per row of ``sent_id``.

        Args:
            sent_id: token id tensor forwarded to the encoder.
            mask: attention mask forwarded to the encoder as ``attention_mask``.

        Returns:
            Output of the final ``nn.Linear(128, 1)`` layer.
        """
        outputs = self.bert(sent_id, attention_mask=mask)
        # Index rather than tuple-unpack: when the encoder returns a dict-style
        # output object, unpacking iterates over its *keys* and would hand a
        # string to fc1. Integer indexing works for both tuples and such objects.
        cls_hs = outputs[1]

        x = self.relu(self.fc1(cls_hs))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)

        return x

In [24]:


# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = bert_clf(bert)
model = model.to(device)

# Adam over all model parameters, optimised against mean absolute error (L1Loss).
optimizer = torch.optim.Adam(model.parameters(), lr = 5e-5)
criterion = torch.nn.L1Loss()
epochs = 2

In [25]:
def train():
    """Run one optimisation pass over ``train_dataloader``.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. Gradients are clipped to a total norm of 1.0 before each
    optimizer step, and progress is printed every 10th batch (except batch 0).

    NOTE(review): defining this function rebinds the name ``train``, which
    earlier in the notebook refers to the DataFrame read from
    ``PATH_TO_TRAIN_DATA``. Re-running the split cell after this one will fail.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses, and the
        model outputs of every batch concatenated along axis 0 as a numpy
        array.
    """
    model.train()

    running_loss = 0
    batch_outputs = []

    for batch_idx, batch in enumerate(train_dataloader):
        # Move the (ids, mask, target) tensors of this batch to the model's device.
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        model.zero_grad()
        output = model(sent_id, mask)
        loss = criterion(output.flatten(), labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        running_loss += loss.item()
        batch_outputs.append(output.detach().cpu().numpy())

        if batch_idx != 0 and batch_idx % 10 == 0:
            print('  Batch {:>5,}  of  {:>5,}. Batch loss {}'.format(batch_idx, len(train_dataloader), loss))

    avg_loss = running_loss / len(train_dataloader)

    # list of per-batch arrays --> one array with a row per sample
    return avg_loss, np.concatenate(batch_outputs, axis=0)




def evaluate():
    """Compute the loss and predictions over ``val_dataloader`` without gradients.

    Uses the module-level ``model``, ``criterion`` and ``device``; the model is
    switched to eval mode first.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses, and the
        model outputs of every batch concatenated along axis 0 as a numpy
        array.
    """
    print("\nEvaluating...")
    model.eval()

    loss_sum = 0
    collected = []

    with torch.no_grad():
        for batch in val_dataloader:
            sent_id, mask, labels = (tensor.to(device) for tensor in batch)

            output = model(sent_id, mask)
            loss_sum += criterion(output.flatten(), labels).item()
            collected.append(output.detach().cpu().numpy())

    mean_loss = loss_sum / len(val_dataloader)

    # list of per-batch arrays --> one array with a row per sample
    return mean_loss, np.concatenate(collected, axis=0)

In [26]:
# Lowest validation loss seen so far; the weights are saved whenever it improves.
best_valid_loss = float('inf')

# Training loss for every epoch; validation loss only for the epochs that were validated.
train_losses, valid_losses = [], []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss, _ = train()
    train_losses.append(train_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')

    # Validate on every third epoch (0, 3, 6, ...) and always on the final one.
    is_validation_epoch = epoch % 3 == 0 or epoch == epochs - 1
    if not is_validation_epoch:
        continue

    valid_loss, _ = evaluate()

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    valid_losses.append(valid_loss)
    print(f'Validation Loss: {valid_loss:.3f}')
    
    


 Epoch 1 / 2
  Batch    10  of  2,344. Batch loss 4.314742088317871
  Batch    20  of  2,344. Batch loss 1.7904291152954102
  Batch    30  of  2,344. Batch loss 2.8685097694396973
  Batch    40  of  2,344. Batch loss 1.3670973777770996
  Batch    50  of  2,344. Batch loss 2.346759796142578
  Batch    60  of  2,344. Batch loss 3.6744494438171387
  Batch    70  of  2,344. Batch loss 1.47781240940094
  Batch    80  of  2,344. Batch loss 1.1791908740997314
  Batch    90  of  2,344. Batch loss 2.0503082275390625
  Batch   100  of  2,344. Batch loss 1.0611646175384521
  Batch   110  of  2,344. Batch loss 1.415330171585083
  Batch   120  of  2,344. Batch loss 1.116589069366455
  Batch   130  of  2,344. Batch loss 1.9697917699813843
  Batch   140  of  2,344. Batch loss 3.1416361331939697
  Batch   150  of  2,344. Batch loss 3.5290260314941406
  Batch   160  of  2,344. Batch loss 1.7954175472259521


KeyboardInterrupt: 

In [None]:
#load weights of best model
path = 'saved_weights.pt'
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU run can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(path, map_location=device))