In [1]:
import random
import os
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, get_scheduler

## Config

In [2]:
config = {
    'fold_num': 5,
    'seed': 1234,
    #'model': 'roberta-base',
    #'model': '../input/robertalarge',
    #'model': 'allenai/longformer-base-4096',
    #'model': 'allenai/longformer-large-4096',
    'model': 'google/bigbird-roberta-base',
    'max_len': 1024,
    'epochs': 5,
    'train_bs': 3,
    'valid_bs': 6,
    'lr': 1e-5/4,
    'num_workers': 0,
    'weight_decay': 1e-6,
    'num_warmup_steps': 500,
    'lr_scheduler_type': 'linear',
    'gradient_accumulation_steps': 8,
}


In [3]:
labels = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim',
          'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']
labels2index = {
    'Lead': 1, 'Position': 3, 'Claim': 5, 'Counterclaim': 7, 'Rebuttal': 9, 'Evidence': 11, 'Concluding Statement': 13
}

## Set Seed

In [4]:
def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(config['seed'])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Load Train Data

In [5]:
train_df = pd.read_csv('./data/train.csv')
train_df.head(2)

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59


In [6]:
train_names, train_texts = [], []
for f in tqdm(list(os.listdir('./data/train'))):
    train_names.append(f.replace('.txt', ''))
    with open('./data/train/' + f, 'r', encoding='utf-8') as f:
        text = ''
        for line in f.readlines():
            #text += line.replace('\n', '').replace('\xa0', '')
            text += line.replace('\n', ' ')
        train_texts.append(text)
train_texts = pd.DataFrame({'id': train_names, 'text': train_texts})
train_texts['text'] = train_texts['text'].apply(lambda x: x.split())

100%|██████████| 15594/15594 [00:00<00:00, 17985.61it/s]


In [7]:
train_texts = train_texts.sort_values(by='id').reset_index(drop=True)
train_df = train_df.sort_values(by=['id', 'discourse_start']).reset_index(drop=True)

In [8]:
text_index = dict.fromkeys(train_texts['id'].values.tolist())
for i in range(len(train_df)):
    id = train_df.iloc[i]['id']
    if not text_index[id]:
        text_index[id] = [i]
    else:
        text_index[id].append(i)
    if (i+1) % 20000 == 0:
        print("Processed {0} discourses.".format(i+1))

Processed 20000 discourses.
Processed 40000 discourses.
Processed 60000 discourses.
Processed 80000 discourses.
Processed 100000 discourses.
Processed 120000 discourses.
Processed 140000 discourses.


In [9]:
taggings = []
essays = 0
for i in range(len(train_texts)):
    text_id = train_texts.iloc[i]['id']
    text = train_texts.iloc[i]['text']
    tagging = [0] * config['max_len']
    for k in text_index[text_id]:
        if train_df.iloc[k]['id'] != train_texts.iloc[i]['id']:
            break

        discourse_type = train_df.iloc[k]['discourse_type']
        predictionstring = train_df.iloc[k]['predictionstring'].split(' ')
        label = labels2index[discourse_type]
        if int(predictionstring[0]) > config['max_len'] - 2:
            break
        else:
            tagging[int(predictionstring[0]) + 1] = label
        for m in range(int(predictionstring[0]) + 2, int(predictionstring[-1]) + 2):
            if m > config['max_len'] - 2:
                break
            else:
                tagging[m] = label + 1
    tagging[-1] = 0
    taggings.append(tagging)
    essays += 1
    if essays % 2000 == 0:
        print("Processed {0} essays.".format(essays))

Processed 2000 essays.
Processed 4000 essays.
Processed 6000 essays.
Processed 8000 essays.
Processed 10000 essays.
Processed 12000 essays.
Processed 14000 essays.


In [10]:
train_texts['tagging'] = taggings

In [11]:
tokenizer = AutoTokenizer.from_pretrained(config['model'], add_prefix_space=True)

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [12]:
class MyDataset(Dataset):
    def __init__(self, df, phase='Train'):
        self.df = df
        self.phase = phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.text.values[idx]
        if self.phase == 'Train':
            label = self.df.tagging.values[idx]
            return {'text': text, 'label': label}
        else:
            return {'text': text}


def collate_fn(data):
    input_ids, attention_mask = [], []
    text = [item['text'] for item in data]
    tokenized_inputs = tokenizer(
        text,
        max_length=config['max_len'],
        padding='max_length',
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )

    words = []
    for i in range(len(data)):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        words.append(word_ids)

    tokenized_inputs["word_ids"] = words
    if 'label' in data[0].keys():
        label = [item['label'] for item in data]
        tokenized_inputs['labels'] = torch.LongTensor(label)

    return tokenized_inputs

In [13]:
train_dataset = MyDataset(train_texts, phase='Train')
train_iter = DataLoader(train_dataset, batch_size=config['train_bs'], collate_fn=collate_fn, shuffle=False,
                        num_workers=config['num_workers'])

## Load Model and Prepare Optimizer and LR Scheduler

In [None]:
model = AutoModelForTokenClassification.from_pretrained(config['model'], num_labels=15).to(device)

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": config['weight_decay'],
     },
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0,
     },
]
optimizer = AdamW(optimizer_grouped_parameters,
                  #lr=config['lr'],
                  lr=config['lr'] * config['gradient_accumulation_steps'],
                  betas=(0.9, 0.999),
                  eps=1e-6
                  )
lr_scheduler = get_scheduler(
    name=config['lr_scheduler_type'],
    optimizer=optimizer,
    num_warmup_steps=config['num_warmup_steps'],
    num_training_steps=config['epochs'] * len(train_iter) /  config['gradient_accumulation_steps'], )

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForTokenClassification were no

## Start Training!

In [None]:
tk = tqdm(train_iter, total=len(train_iter), position=0, leave=True)
model.train()
step = 0
model.zero_grad()
for epoch in range(config['epochs']):
    losses = []
    print("Epoch {}/{}".format(epoch+1, config['epochs']))
    for batch in tk:
        batch = {k: v.to(device) for k, v in batch.items() if k != 'word_ids'}
        loss = model(input_ids=batch['input_ids'],
                     attention_mask=batch['attention_mask'],
                     labels=batch['labels']).loss
        loss /= config['gradient_accumulation_steps']
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1, norm_type=2)
        if (step + 1) % config['gradient_accumulation_steps'] == 0 or (step + 1) == len(train_iter) * config['epochs']:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        step += 1
        losses.append(loss.item() * config['gradient_accumulation_steps'])
        if (step + 1) % 100 == 0:
            print("Epoch {}/{} Step {}/{}  Average Training Loss:{:6f}".format(
                epoch+1,
                config['epochs'],
                step + 1,
                len(train_iter),
                np.mean(losses)))
    # print average loss
    print("Epoch {}/{}  Average Training Loss:{:6f}".format(
        epoch+1,
        config['epochs'],
        np.mean(losses)))



  0%|          | 0/5198 [00:00<?, ?it/s]

Epoch 1/5


  2%|▏         | 99/5198 [01:22<1:09:51,  1.22it/s]

Epoch 1/5 Step 100/5198  Average Training Loss:2.781197


  4%|▍         | 199/5198 [02:44<1:08:16,  1.22it/s]

Epoch 1/5 Step 200/5198  Average Training Loss:2.769809


  6%|▌         | 299/5198 [04:06<1:06:28,  1.23it/s]

Epoch 1/5 Step 300/5198  Average Training Loss:2.747930


  8%|▊         | 399/5198 [05:28<1:05:02,  1.23it/s]

Epoch 1/5 Step 400/5198  Average Training Loss:2.716120


 10%|▉         | 499/5198 [06:50<1:03:28,  1.23it/s]

Epoch 1/5 Step 500/5198  Average Training Loss:2.672472


 12%|█▏        | 599/5198 [08:12<1:02:14,  1.23it/s]

Epoch 1/5 Step 600/5198  Average Training Loss:2.604754


 13%|█▎        | 699/5198 [09:35<1:00:14,  1.24it/s]

Epoch 1/5 Step 700/5198  Average Training Loss:2.502904


 15%|█▌        | 799/5198 [10:57<59:43,  1.23it/s]  

Epoch 1/5 Step 800/5198  Average Training Loss:2.410652


 17%|█▋        | 899/5198 [12:19<58:51,  1.22it/s]  

Epoch 1/5 Step 900/5198  Average Training Loss:2.332454


 19%|█▉        | 999/5198 [13:41<58:06,  1.20it/s]  

Epoch 1/5 Step 1000/5198  Average Training Loss:2.261404


 21%|██        | 1099/5198 [15:03<55:43,  1.23it/s]

Epoch 1/5 Step 1100/5198  Average Training Loss:2.194715


 23%|██▎       | 1199/5198 [16:24<54:55,  1.21it/s]

Epoch 1/5 Step 1200/5198  Average Training Loss:2.129232


 25%|██▍       | 1299/5198 [17:46<53:36,  1.21it/s]

Epoch 1/5 Step 1300/5198  Average Training Loss:2.067647


 27%|██▋       | 1399/5198 [19:09<52:01,  1.22it/s]

Epoch 1/5 Step 1400/5198  Average Training Loss:2.011991


 29%|██▉       | 1499/5198 [20:31<50:52,  1.21it/s]

Epoch 1/5 Step 1500/5198  Average Training Loss:1.957682


 31%|███       | 1599/5198 [21:53<48:58,  1.22it/s]

Epoch 1/5 Step 1600/5198  Average Training Loss:1.911867


 33%|███▎      | 1699/5198 [23:15<48:37,  1.20it/s]

Epoch 1/5 Step 1700/5198  Average Training Loss:1.867177


 35%|███▍      | 1799/5198 [24:37<46:36,  1.22it/s]

Epoch 1/5 Step 1800/5198  Average Training Loss:1.826661


 37%|███▋      | 1899/5198 [25:59<43:53,  1.25it/s]

Epoch 1/5 Step 1900/5198  Average Training Loss:1.786865


 38%|███▊      | 1999/5198 [27:18<42:28,  1.26it/s]

Epoch 1/5 Step 2000/5198  Average Training Loss:1.750513


 40%|████      | 2099/5198 [28:38<40:35,  1.27it/s]

Epoch 1/5 Step 2100/5198  Average Training Loss:1.718320


 42%|████▏     | 2199/5198 [29:59<39:55,  1.25it/s]

Epoch 1/5 Step 2200/5198  Average Training Loss:1.688921


 44%|████▍     | 2299/5198 [31:18<37:50,  1.28it/s]

Epoch 1/5 Step 2300/5198  Average Training Loss:1.660923


 46%|████▌     | 2399/5198 [32:38<37:56,  1.23it/s]

Epoch 1/5 Step 2400/5198  Average Training Loss:1.638128


 48%|████▊     | 2499/5198 [33:57<35:08,  1.28it/s]

Epoch 1/5 Step 2500/5198  Average Training Loss:1.614818


 50%|█████     | 2599/5198 [35:16<33:26,  1.30it/s]

Epoch 1/5 Step 2600/5198  Average Training Loss:1.593653


 52%|█████▏    | 2699/5198 [36:36<33:02,  1.26it/s]

Epoch 1/5 Step 2700/5198  Average Training Loss:1.572342


 54%|█████▍    | 2799/5198 [37:56<31:51,  1.26it/s]

Epoch 1/5 Step 2800/5198  Average Training Loss:1.552976


 56%|█████▌    | 2899/5198 [39:15<30:25,  1.26it/s]

Epoch 1/5 Step 2900/5198  Average Training Loss:1.534945


 58%|█████▊    | 2999/5198 [40:35<28:54,  1.27it/s]

Epoch 1/5 Step 3000/5198  Average Training Loss:1.515842


 60%|█████▉    | 3099/5198 [41:55<27:10,  1.29it/s]

Epoch 1/5 Step 3100/5198  Average Training Loss:1.498769


 62%|██████▏   | 3199/5198 [43:14<26:38,  1.25it/s]

Epoch 1/5 Step 3200/5198  Average Training Loss:1.482969


 63%|██████▎   | 3299/5198 [44:33<24:59,  1.27it/s]

Epoch 1/5 Step 3300/5198  Average Training Loss:1.468479


 65%|██████▌   | 3399/5198 [45:52<23:48,  1.26it/s]

Epoch 1/5 Step 3400/5198  Average Training Loss:1.455250


 67%|██████▋   | 3499/5198 [47:13<23:23,  1.21it/s]

Epoch 1/5 Step 3500/5198  Average Training Loss:1.441690


 69%|██████▉   | 3599/5198 [48:32<21:18,  1.25it/s]

Epoch 1/5 Step 3600/5198  Average Training Loss:1.427863


 71%|███████   | 3699/5198 [49:52<20:18,  1.23it/s]

Epoch 1/5 Step 3700/5198  Average Training Loss:1.416275


 73%|███████▎  | 3799/5198 [51:13<18:44,  1.24it/s]

Epoch 1/5 Step 3800/5198  Average Training Loss:1.405741


 75%|███████▌  | 3899/5198 [52:32<16:48,  1.29it/s]

Epoch 1/5 Step 3900/5198  Average Training Loss:1.395064


 77%|███████▋  | 3999/5198 [53:52<15:49,  1.26it/s]

Epoch 1/5 Step 4000/5198  Average Training Loss:1.385499


 79%|███████▉  | 4099/5198 [55:12<14:20,  1.28it/s]

Epoch 1/5 Step 4100/5198  Average Training Loss:1.374857


 81%|████████  | 4199/5198 [56:31<12:54,  1.29it/s]

Epoch 1/5 Step 4200/5198  Average Training Loss:1.364579


 83%|████████▎ | 4299/5198 [57:50<11:41,  1.28it/s]

Epoch 1/5 Step 4300/5198  Average Training Loss:1.354088


 85%|████████▍ | 4399/5198 [59:09<10:42,  1.24it/s]

Epoch 1/5 Step 4400/5198  Average Training Loss:1.345101


 87%|████████▋ | 4499/5198 [1:00:28<09:18,  1.25it/s]

Epoch 1/5 Step 4500/5198  Average Training Loss:1.336652


 88%|████████▊ | 4599/5198 [1:01:47<07:50,  1.27it/s]

Epoch 1/5 Step 4600/5198  Average Training Loss:1.328459


 90%|█████████ | 4699/5198 [1:03:07<06:35,  1.26it/s]

Epoch 1/5 Step 4700/5198  Average Training Loss:1.320562


 92%|█████████▏| 4799/5198 [1:04:26<05:09,  1.29it/s]

Epoch 1/5 Step 4800/5198  Average Training Loss:1.311298


 94%|█████████▍| 4899/5198 [1:05:46<03:58,  1.26it/s]

Epoch 1/5 Step 4900/5198  Average Training Loss:1.302986


 96%|█████████▌| 4999/5198 [1:07:05<02:34,  1.29it/s]

Epoch 1/5 Step 5000/5198  Average Training Loss:1.295234


 98%|█████████▊| 5099/5198 [1:08:24<01:17,  1.28it/s]

Epoch 1/5 Step 5100/5198  Average Training Loss:1.287996


100%|██████████| 5198/5198 [1:09:43<00:00,  1.24it/s]


Epoch 1/5  Average Training Loss:1.281206
Epoch 2/5
Epoch 2/5 Step 5200/5198  Average Training Loss:1.016840
Epoch 2/5 Step 5300/5198  Average Training Loss:0.923704
Epoch 2/5 Step 5400/5198  Average Training Loss:0.925322
Epoch 2/5 Step 5500/5198  Average Training Loss:0.908107
Epoch 2/5 Step 5600/5198  Average Training Loss:0.897733
Epoch 2/5 Step 5700/5198  Average Training Loss:0.900524
Epoch 2/5 Step 5800/5198  Average Training Loss:0.893949
Epoch 2/5 Step 5900/5198  Average Training Loss:0.887190
Epoch 2/5 Step 6000/5198  Average Training Loss:0.889781
Epoch 2/5 Step 6100/5198  Average Training Loss:0.888018
Epoch 2/5 Step 6200/5198  Average Training Loss:0.883516
Epoch 2/5 Step 6300/5198  Average Training Loss:0.878048
Epoch 2/5 Step 6400/5198  Average Training Loss:0.872679
Epoch 2/5 Step 6500/5198  Average Training Loss:0.869100
Epoch 2/5 Step 6600/5198  Average Training Loss:0.867774
Epoch 2/5 Step 6700/5198  Average Training Loss:0.864904
Epoch 2/5 Step 6800/5198  Average Tr

## Save Model

In [None]:
tokenizer.save_pretrained(f'./model_trained')
model.save_pretrained(f'./model_trained')

In [None]:
model = AutoModelForTokenClassification.from_pretrained(config['model'], num_labels=15).to(device)
model.load_state_dict(torch.load('./model_trained/pytorch_model.bin'))

## Load Test Data

In [None]:
test_df = pd.read_csv('./data/sample_submission.csv')
test_df.head(5)

In [None]:
test_names, test_texts = [], []
for f in tqdm(list(os.listdir('./data/test'))):
    test_names.append(f.replace('.txt', ''))
    with open('./data/test/' + f, 'r', encoding='utf-8') as f:
        text = ''
        for line in f.readlines():
            #text += line.replace('\n', '').replace('\xa0', '')
            text += line.replace('\n', ' ')
        test_texts.append(text)
test_texts = pd.DataFrame({'id': test_names, 'text': test_texts})
test_texts['text'] = test_texts['text'].apply(lambda x: x.split())
test_texts

In [None]:
test_dataset = MyDataset(test_texts, phase='Test')
test_iter = DataLoader(test_dataset, batch_size=config['valid_bs'], collate_fn=collate_fn, shuffle=False,
                        num_workers=config['num_workers'])

## Predict

In [None]:
y_pred = []
words = []

with torch.no_grad():
    model.eval()
    tk = tqdm(test_iter, total=len(test_iter), position=0, leave=True)
    for step, batch in enumerate(tk):
        word_ids = batch['word_ids']
        words.extend(word_ids)
        batch = {k: v.to(device) for k, v in batch.items() if k != 'word_ids'}

        output = model(input_ids=batch['input_ids'],
                       attention_mask=batch['attention_mask']).logits

        y_pred.extend(output.argmax(-1).cpu().numpy())
        
y_pred = np.array(y_pred)

In [None]:
y_pred[0][:200]

In [None]:
final_preds = []

for i in tqdm(range(len(test_texts))):
    idx = test_texts.id.values[i]
    pred = ['']*len(y_pred[i]-2)

    for j in range(1, len(y_pred[i])):
        pred[j-1] = labels[y_pred[i][j]]

    pred = [x.replace('B-','').replace('I-','') for x in pred]
    
    j = 0
    while j < len(pred):
        cls = pred[j]
        if cls == 'O':
            j += 1
        end = j + 1
        while end < len(pred) and pred[end] == cls:
            end += 1
            
        if cls != 'O' and cls != '' and end - j > 10:
            final_preds.append((idx, cls, ' '.join(map(str, list(range(j, end))))))
        
        j = end
        
final_preds[0]

In [None]:
sub = pd.DataFrame(final_preds)
sub.columns = test_df.columns
sub

In [None]:
sub.to_csv('submission.csv', index=False)