In [None]:
import torch
import torch.nn as nn

# Central experiment configuration shared by every cell below.
config = {
    # CSV inputs; read_data() looks these up as f'{mode}_file_path'.
    'train_file_path': '/content/drive/MyDrive/data/train.csv',
    'test_file_path': '/content/drive/MyDrive/data/test.csv',
    # Fraction of the training CSV (its leading rows) held out for validation.
    'train_val_ratio': 0.1,
    # 'vocab_size': 30000,
    # Directory holding the pretrained BERT tokenizer/config/weights.
    'model_path': '/content/drive/MyDrive/BERT_model',
    'batch_size': 16,
    'num_epoches': 2,
    # Full BERT fine-tuning is normally done with a learning rate in the
    # 2e-5..5e-5 range; the previous value of 1e-3 is large enough to wreck
    # the pretrained weights and keep the classifier from converging.
    'learning_rate': 2e-5,
    # Run validation every `logging_step` optimizer steps.
    'logging_step': 300,
    'seed': 2021
}

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

import random
import numpy as np

def seed_everything(seed):
  """Seed every random number generator this notebook relies on.

  The same `seed` is applied, in order, to Python's `random` module, NumPy's
  global generator, PyTorch's CPU generator and all PyTorch CUDA generators.
  """
  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for apply_seed in seeders:
    apply_seed(seed)

# Seed all RNGs up front, using the seed stored in the shared config.
seed_everything(config['seed'])


In [None]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
def read_data(config, tokenizer, mode='train'):
  """Read one CSV split and tokenize every sentence in it.

  For each row the sentence is taken from the last column and, in train mode,
  the label from the second column. In train mode the first
  `int(train_val_ratio * n_rows)` rows become the validation split and the
  remaining rows the training split.

  Args:
    config: dict providing `f'{mode}_file_path'` and, in train mode,
      `'train_val_ratio'`.
    tokenizer: object whose `encode_plus(...)` returns a mapping containing
      'input_ids', 'token_type_ids' and 'attention_mask'.
    mode: 'train' for labelled data; any other value is treated as unlabelled
      data whose labels are all set to 0.

  Returns:
    In train mode: `(X_train, y_train, X_val, y_val, label2id, id2label)`.
    Otherwise: `(X_test, y_test)`.
    Each `X_*` is a defaultdict mapping the three feature names to lists of
    per-example token lists; each `y_*` is a `torch.long` tensor.
  """
  data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')
  is_train = mode == 'train'
  feature_keys = ('input_ids', 'token_type_ids', 'attention_mask')

  # Number of leading rows reserved for validation (train mode only).
  num_val = int(config['train_val_ratio'] * len(data_df)) if is_train else 0

  X_train, y_train = defaultdict(list), []
  X_val, y_val = defaultdict(list), []
  X_test, y_test = defaultdict(list), []

  rows = tqdm(data_df.iterrows(), desc=f'Preprocesing {mode} data', total=len(data_df))
  # Split on the row *position* rather than the index label so the split does
  # not depend on the DataFrame carrying a default 0..n-1 index.
  for position, (_, row) in enumerate(rows):
    # `.iloc` makes the positional access explicit; plain `row[1]` / `row[-1]`
    # on a label-indexed Series is deprecated in pandas.
    label = row.iloc[1] if is_train else 0
    sentence = row.iloc[-1]

    inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True, return_attention_mask=True)

    # Pick the destination split once, then append in a single place.
    if not is_train:
      features, targets = X_test, y_test
    elif position < num_val:
      features, targets = X_val, y_val
    else:
      features, targets = X_train, y_train

    for key in feature_keys:
      features[key].append(inputs[key])
    targets.append(label)

  if is_train:
    # Build the label vocabulary from train AND validation labels so a class
    # that only occurs in the validation rows no longer raises KeyError.
    label2id = {label: i for i, label in enumerate(np.unique(y_train + y_val))}
    id2label = {i: label for label, i in label2id.items()}

    y_train = torch.tensor([label2id[label] for label in y_train], dtype=torch.long)
    y_val = torch.tensor([label2id[label] for label in y_val], dtype=torch.long)

    return X_train, y_train, X_val, y_val, label2id, id2label

  y_test = torch.tensor(y_test, dtype=torch.long)
  return X_test, y_test

In [None]:
from torch.utils.data import Dataset
class TNEWSDataset(Dataset):
  """Map-style dataset pairing tokenized features with their labels.

  `X` is a mapping holding per-example lists under 'input_ids',
  'token_type_ids' and 'attention_mask'; `y` holds the labels and must
  support `.size(0)` and integer indexing.
  """

  def __init__(self, X, y):
    self.x, self.y = X, y

  def __len__(self):
    # The number of examples is defined by the label container.
    return self.y.size(0)

  def __getitem__(self, idx):
    """Return the un-padded features and label of example `idx` as a dict."""
    features = self.x
    example = {'input_ids': features['input_ids'][idx]}
    example['label'] = self.y[idx]
    example['token_type_ids'] = features['token_type_ids'][idx]
    example['attention_mask'] = features['attention_mask'][idx]
    return example


In [None]:
def collate_fn(examples):
  """Collate dataset examples into one batch of equal-width long tensors.

  Each example is a dict with 'input_ids', 'label', 'token_type_ids' and
  'attention_mask'. The three sequence fields are right-padded with zeros up
  to the longest `input_ids` in the batch; labels are stacked into a 1-D
  tensor returned under the key 'labels'.
  """
  longest = max(len(example['input_ids']) for example in examples)
  batch_size = len(examples)

  # One zero-initialised (batch, longest) matrix per sequence field.
  padded = {
      name: torch.zeros((batch_size, longest), dtype=torch.long)
      for name in ('input_ids', 'token_type_ids', 'attention_mask')
  }

  for row, example in enumerate(examples):
    # The slice width always follows the length of `input_ids`.
    width = len(example['input_ids'])
    for name, matrix in padded.items():
      matrix[row, :width] = torch.tensor(example[name], dtype=torch.long)

  label_tensor = torch.tensor([example['label'] for example in examples], dtype=torch.long)

  return {
      'input_ids': padded['input_ids'],
      'labels': label_tensor,
      'token_type_ids': padded['token_type_ids'],
      'attention_mask': padded['attention_mask'],
  }


In [None]:
import collections
from torch.utils.data import DataLoader
from transformers import BertTokenizer
def build_dataloader(config):
  """Tokenize the train/test CSV files and wrap them in DataLoaders.

  Args:
    config: dict providing 'model_path' (tokenizer location), 'batch_size',
      the keys required by `read_data`, and optionally 'num_workers'
      (defaults to 4).

  Returns:
    `(id2label, train_dataloader, val_dataloader, test_dataloader)`. Only the
    training loader shuffles; all three pad batches with `collate_fn`.
  """
  import os  # local import: only needed here to size the worker pool

  tokenizer = BertTokenizer.from_pretrained(config['model_path'])

  X_train, y_train, X_val, y_val, _label2id, id2label = read_data(config, tokenizer, mode='train')
  X_test, y_test = read_data(config, tokenizer, mode='test')

  # Never ask for more worker processes than the host has CPUs: requesting
  # more makes PyTorch emit a worker-count warning on every loader and can
  # slow data loading down instead of speeding it up.
  num_workers = min(config.get('num_workers', 4), os.cpu_count() or 1)

  def make_loader(X, y, shuffle):
    # Shared construction so the three loaders cannot drift apart.
    return DataLoader(
        dataset=TNEWSDataset(X, y),
        batch_size=config['batch_size'],
        num_workers=num_workers,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )

  train_dataloader = make_loader(X_train, y_train, shuffle=True)
  val_dataloader = make_loader(X_val, y_val, shuffle=False)
  test_dataloader = make_loader(X_test, y_test, shuffle=False)

  return id2label, train_dataloader, val_dataloader, test_dataloader

In [None]:
# Tokenize both CSV files once and build the train/validation/test loaders;
# id2label maps class indices back to the original label values.
id2label, train_dataloader, val_dataloader, test_dataloader = build_dataloader(config)

Preprocesing train data: 100%|██████████| 63360/63360 [00:49<00:00, 1280.30it/s]
Preprocesing test data: 100%|██████████| 10000/10000 [00:05<00:00, 1757.35it/s]
  cpuset_checked))


In [None]:
# Sanity check: print the size and contents of the first collated test batch
# only (the loop exits after one iteration).
for batch in test_dataloader:
  print(len(batch['input_ids']))
  print(batch)
  break

  cpuset_checked))


16
{'input_ids': tensor([[  101,   143,  5500,  8038,   123,   702,  5301,  1146,  7566,  1818,
          7987,  1928,   702,  5500,   966,  2533,  5500,  3696,  1068,  3800,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,   743,  1947,  2791,   679,  7676,  1408,  8043,   711,   784,
           720,   833,  3300,   782,  2703,  5709,  8298,   674,   743,  6956,
          2797,  3322,  8043,   102,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  8212,  1914,  2157,  2791,  1765,   772,  1062,  1385,  6760,
          6121,  1075,  4343,  8043,   872,  4692,  1168,  4638,   788,   788,
          3221,  6134,  7481,   102,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,   523,  2900,  3144,  4764,  5296,  4788,   855,  1400,  4638,
          2590,  6662,  3463,  4415,   524,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0

In [None]:
from sklearn.metrics import f1_score
def evaluation(model, config, val_dataloader):
  """Evaluate `model` on `val_dataloader`.

  The model is switched to eval mode and left there; callers that keep
  training must call `model.train()` again themselves.

  Args:
    model: callable taking the batch as keyword arguments and returning a
      sequence whose first two items are `(loss, logits)`.
    config: dict providing 'device'.
    val_dataloader: sized iterable of batches (dicts of tensors) that include
      a 'labels' entry.

  Returns:
    `(avg_val_loss, f1)`: the mean per-batch loss and the macro-averaged F1
    score over all examples.
  """
  model.eval()
  device = config['device']
  total_loss = 0.
  gold_chunks, pred_chunks = [], []

  progress = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))

  with torch.no_grad():
    for cpu_batch in progress:
      # Keep the reference labels from the batch as yielded, before the copy
      # to the target device.
      gold_chunks.append(cpu_batch['labels'])

      inputs = {}
      for name, tensor in cpu_batch.items():
        inputs[name] = tensor.to(device)

      outputs = model(**inputs)
      loss, logits = outputs[:2]
      total_loss += loss.item()

      pred_chunks.append(logits.argmax(dim=-1).detach().cpu())

  mean_loss = total_loss/len(val_dataloader)
  gold = torch.cat(gold_chunks, dim=0).numpy()
  predicted = torch.cat(pred_chunks, dim=0).numpy()

  return mean_loss, f1_score(gold, predicted, average='macro')


In [None]:
from transformers import BertConfig, BertForSequenceClassification
from torch.optim import AdamW
from tqdm import trange
def train(id2label, config, train_dataloader, val_dataloader):
  """Fine-tune a BERT sequence classifier and evaluate it periodically.

  Args:
    id2label: mapping from class index to label; its length sets the number
      of output classes.
    config: dict providing 'model_path', 'learning_rate', 'device',
      'num_epoches' and 'logging_step'.
    train_dataloader: batches (dicts of tensors, including 'labels') used for
      optimisation.
    val_dataloader: batches passed to `evaluation` every
      `config['logging_step']` optimizer steps.

  Returns:
    The model as it stands after the final optimizer step (no best-checkpoint
    selection is performed).
  """
  bert_config = BertConfig.from_pretrained(config['model_path'])
  bert_config.num_labels = len(id2label)
  model = BertForSequenceClassification.from_pretrained(config['model_path'], config=bert_config)

  # Move the model first so the optimizer is built over the on-device params.
  model.to(config['device'])
  optimizer = AdamW(model.parameters(), lr=config['learning_rate'])
  epoches_iterator = trange(config['num_epoches'])

  global_steps = 0
  train_loss = 0.    # running sum of per-step losses (plain Python float)
  logging_loss = 0.  # value of `train_loss` at the previous logging point

  for epoch in epoches_iterator:
    train_iterator = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
    model.train()

    for batch in train_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}

      loss = model(**batch)[0]

      model.zero_grad()
      loss.backward()
      optimizer.step()

      # `.item()` detaches the value: summing the loss tensor itself would
      # keep autograd history alive across steps and grow memory steadily.
      train_loss += loss.item()
      global_steps += 1

      if global_steps % config['logging_step'] == 0:
        # Mean training loss over the steps since the last logging point.
        print_train_loss = (train_loss - logging_loss) / config['logging_step']
        logging_loss = train_loss

        avg_val_loss, f1 = evaluation(model, config, val_dataloader)
        print(f'step {global_steps}: train_loss={print_train_loss:.4f} '
              f'val_loss={avg_val_loss:.4f} val_f1={f1:.4f}')
        # evaluation() leaves the model in eval mode; switch back.
        model.train()

  return model

In [None]:
# NOTE(review): despite the name, this is the model returned at the end of
# training — train() does not keep or select a best checkpoint.
best_model = train(id2label, config, train_dataloader, val_dataloader)

In [None]:
def predict(config, id2label, model, test_dataloader):
  """Predict a label for every test example and save the result as CSV.

  Args:
    config: dict providing 'device' and 'test_file_path'; the optional
      'result_path' overrides where the CSV is written (defaults to
      '/content/drive/MyDrive/BERT_result.csv').
    id2label: mapping from predicted class index to the original label.
    model: callable taking the batch as keyword arguments and returning a
      sequence whose second item holds the logits.
    test_dataloader: sized iterable of batches (dicts of tensors) in the same
      row order as the test CSV.

  Returns:
    The test DataFrame with an added 'preds' column (also written to disk).
  """
  model.eval()
  test_iterator = tqdm(test_dataloader, desc='Predicting', total=len(test_dataloader))
  test_preds = []

  with torch.no_grad():
    for batch in test_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}
      # The whole batch, including its 'labels' entry, is forwarded, so the
      # output starts with a loss and the logits sit at index 1.
      logits = model(**batch)[1]
      test_preds.append(logits.argmax(dim=-1).detach().cpu())

  test_preds = torch.cat(test_preds, dim=0).numpy()
  test_preds = [id2label[idx] for idx in test_preds]

  test_df = pd.read_csv(config['test_file_path'], sep=',')
  test_df['preds'] = test_preds

  # Output location is configurable; the default keeps the original path.
  result_path = config.get('result_path', '/content/drive/MyDrive/BERT_result.csv')
  test_df.to_csv(result_path, index=False, encoding='utf8')
  return test_df

In [None]:
# Run inference on the test set; predict() also writes the predictions to a
# CSV file and returns the augmented DataFrame.
test_df = predict(config, id2label, best_model, test_dataloader)

  cpuset_checked))
Predicting: 100%|██████████| 625/625 [00:50<00:00, 12.40it/s]
