In [1]:
import random
import os
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, get_scheduler

## Config

In [2]:
config = {
    'fold_num': 5,
    'seed': 1234,
    #'model': 'roberta-base',
    'model': 'allenai/longformer-base-4096',
    #'model': 'allenai/longformer-large-4096',
    #'model': 'google/bigbird-roberta-base',
    'max_len': 1024,
    'epochs': 5,
    'train_bs': 3,
    'valid_bs': 6,
    'lr': 8e-5,
    'num_workers': 0,
    'weight_decay': 1e-2,
    'num_warmup_steps': 100,
    'lr_scheduler_type': 'linear',
    'gradient_accumulation_steps': 8,
}


In [3]:
labels = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim',
          'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']
labels2index = {
    'Lead': 1, 'Position': 3, 'Claim': 5, 'Counterclaim': 7, 'Rebuttal': 9, 'Evidence': 11, 'Concluding Statement': 13
}

## Set Seed

In [4]:
def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(config['seed'])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Load Train Data

In [5]:
train_df = pd.read_csv('./data/train.csv')
train_df.head(2)

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59


In [6]:
train_names, train_texts = [], []
for f in tqdm(list(os.listdir('./data/train'))):
    train_names.append(f.replace('.txt', ''))
    with open('./data/train/' + f, 'r', encoding='utf-8') as f:
        text = ''
        for line in f.readlines():
            #text += line.replace('\n', '').replace('\xa0', '')
            text += line.replace('\n', ' ')
        train_texts.append(text)
train_texts = pd.DataFrame({'id': train_names, 'text': train_texts})
train_texts['text'] = train_texts['text'].apply(lambda x: x.split())

100%|██████████| 15594/15594 [00:00<00:00, 18557.41it/s]


In [7]:
train_texts = train_texts.sort_values(by='id').reset_index(drop=True)
train_df = train_df.sort_values(by=['id', 'discourse_start']).reset_index(drop=True)

In [8]:
text_index = dict.fromkeys(train_texts['id'].values.tolist())
for i in range(len(train_df)):
    id = train_df.iloc[i]['id']
    if not text_index[id]:
        text_index[id] = [i]
    else:
        text_index[id].append(i)
    if (i+1) % 20000 == 0:
        print("Processed {0} discourses.".format(i+1))

Processed 20000 discourses.
Processed 40000 discourses.
Processed 60000 discourses.
Processed 80000 discourses.
Processed 100000 discourses.
Processed 120000 discourses.
Processed 140000 discourses.


In [9]:
taggings = []
essays = 0
for i in range(len(train_texts)):
    text_id = train_texts.iloc[i]['id']
    text = train_texts.iloc[i]['text']
    tagging = [0] * config['max_len']
    for k in text_index[text_id]:
        if train_df.iloc[k]['id'] != train_texts.iloc[i]['id']:
            break

        discourse_type = train_df.iloc[k]['discourse_type']
        predictionstring = train_df.iloc[k]['predictionstring'].split(' ')
        label = labels2index[discourse_type]
        if int(predictionstring[0]) > config['max_len'] - 2:
            break
        else:
            tagging[int(predictionstring[0]) + 1] = label
        for m in range(int(predictionstring[0]) + 2, int(predictionstring[-1]) + 2):
            if m > config['max_len'] - 2:
                break
            else:
                tagging[m] = label + 1
    tagging[-1] = 0
    taggings.append(tagging)
    essays += 1
    if essays % 2000 == 0:
        print("Processed {0} essays.".format(essays))

Processed 2000 essays.
Processed 4000 essays.
Processed 6000 essays.
Processed 8000 essays.
Processed 10000 essays.
Processed 12000 essays.
Processed 14000 essays.


In [10]:
train_texts['tagging'] = taggings

In [11]:
tokenizer = AutoTokenizer.from_pretrained(config['model'], add_prefix_space=True)

Downloading:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [12]:
class MyDataset(Dataset):
    def __init__(self, df, phase='Train'):
        self.df = df
        self.phase = phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.text.values[idx]
        if self.phase == 'Train':
            label = self.df.tagging.values[idx]
            return {'text': text, 'label': label}
        else:
            return {'text': text}


def collate_fn(data):
    input_ids, attention_mask = [], []
    text = [item['text'] for item in data]
    tokenized_inputs = tokenizer(
        text,
        max_length=config['max_len'],
        padding='max_length',
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )

    words = []
    for i in range(len(data)):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        words.append(word_ids)

    tokenized_inputs["word_ids"] = words
    if 'label' in data[0].keys():
        label = [item['label'] for item in data]
        tokenized_inputs['labels'] = torch.LongTensor(label)

    return tokenized_inputs

In [13]:
train_dataset = MyDataset(train_texts, phase='Train')
train_iter = DataLoader(train_dataset, batch_size=config['train_bs'], collate_fn=collate_fn, shuffle=False,
                        num_workers=config['num_workers'])

## Load Model and Prepare Optimizer and LR Scheduler

In [14]:
model = AutoModelForTokenClassification.from_pretrained(config['model'], num_labels=15).to(device)

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": config['weight_decay'],
     },
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0,
     },
]
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=config['lr'],
                  betas=(0.9, 0.999),
                  eps=1e-6
                  )
lr_scheduler = get_scheduler(
    name=config['lr_scheduler_type'],
    optimizer=optimizer,
    num_warmup_steps=config['num_warmup_steps'],
    num_training_steps=config['epochs'] * len(train_iter) /  config['gradient_accumulation_steps'], )

Downloading:   0%|          | 0.00/570M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN

## Start Training!

In [None]:
model.train()
step = 0
model.zero_grad()
for epoch in range(config['epochs']):
    losses = []
    print("Epoch {}/{}".format(epoch+1, config['epochs']))
    for epoch_step, batch in enumerate(train_iter):
        batch = {k: v.to(device) for k, v in batch.items() if k != 'word_ids'}
        loss = model(input_ids=batch['input_ids'],
                     attention_mask=batch['attention_mask'],
                     labels=batch['labels']).loss
        loss /= config['gradient_accumulation_steps']
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5, norm_type=2)
        if (step + 1) % config['gradient_accumulation_steps'] == 0 or (step + 1) == len(train_iter) * config['epochs']:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        step += 1
        losses.append(loss.item() * config['gradient_accumulation_steps'])
        if (epoch_step + 1) % 100 == 0:
            print("Epoch {}/{} Step {}/{}  Average Training Loss:{:6f}".format(
                epoch+1,
                config['epochs'],
                epoch_step + 1,
                len(train_iter),
                np.mean(losses)))
    # print average loss
    print("Epoch {}/{}  Average Training Loss:{:6f}".format(
        epoch+1,
        config['epochs'],
        np.mean(losses)))



Epoch 1/5
Epoch 1/5 Step 100/5198  Average Training Loss:2.955353
Epoch 1/5 Step 200/5198  Average Training Loss:2.608121
Epoch 1/5 Step 300/5198  Average Training Loss:2.322504
Epoch 1/5 Step 400/5198  Average Training Loss:2.163729
Epoch 1/5 Step 500/5198  Average Training Loss:2.028495
Epoch 1/5 Step 600/5198  Average Training Loss:1.898943
Epoch 1/5 Step 700/5198  Average Training Loss:1.782778
Epoch 1/5 Step 800/5198  Average Training Loss:1.700088
Epoch 1/5 Step 900/5198  Average Training Loss:1.628187
Epoch 1/5 Step 1000/5198  Average Training Loss:1.566201
Epoch 1/5 Step 1100/5198  Average Training Loss:1.513214
Epoch 1/5 Step 1200/5198  Average Training Loss:1.469263
Epoch 1/5 Step 1300/5198  Average Training Loss:1.430204
Epoch 1/5 Step 1400/5198  Average Training Loss:1.397842
Epoch 1/5 Step 1500/5198  Average Training Loss:1.367921
Epoch 1/5 Step 1600/5198  Average Training Loss:1.344093
Epoch 1/5 Step 1700/5198  Average Training Loss:1.320535
Epoch 1/5 Step 1800/5198  Aver

## Save Model

In [None]:
tokenizer.save_pretrained(f'./model_trained')
model.save_pretrained(f'./model_trained')

In [None]:
model = AutoModelForTokenClassification.from_pretrained(config['model'], num_labels=15).to(device)
model.load_state_dict(torch.load('./model_trained/pytorch_model.bin'))

## Load Test Data

In [None]:
test_df = pd.read_csv('./data/sample_submission.csv')
test_df.head(5)

In [None]:
test_names, test_texts = [], []
for f in tqdm(list(os.listdir('./data/test'))):
    test_names.append(f.replace('.txt', ''))
    with open('./data/test/' + f, 'r', encoding='utf-8') as f:
        text = ''
        for line in f.readlines():
            #text += line.replace('\n', '').replace('\xa0', '')
            text += line.replace('\n', ' ')
        test_texts.append(text)
test_texts = pd.DataFrame({'id': test_names, 'text': test_texts})
test_texts['text'] = test_texts['text'].apply(lambda x: x.split())
test_texts

In [None]:
test_dataset = MyDataset(test_texts, phase='Test')
test_iter = DataLoader(test_dataset, batch_size=config['valid_bs'], collate_fn=collate_fn, shuffle=False,
                        num_workers=config['num_workers'])

## Predict

In [None]:
y_pred = []
words = []

with torch.no_grad():
    model.eval()
    tk = tqdm(test_iter, total=len(test_iter), position=0, leave=True)
    for step, batch in enumerate(tk):
        word_ids = batch['word_ids']
        words.extend(word_ids)
        batch = {k: v.to(device) for k, v in batch.items() if k != 'word_ids'}

        output = model(input_ids=batch['input_ids'],
                       attention_mask=batch['attention_mask']).logits

        y_pred.extend(output.argmax(-1).cpu().numpy())
        
y_pred = np.array(y_pred)

In [None]:
y_pred[0][:200]

In [None]:
final_preds = []

for i in tqdm(range(len(test_texts))):
    idx = test_texts.id.values[i]
    pred = ['']*len(y_pred[i]-2)

    for j in range(1, len(y_pred[i])):
        pred[j-1] = labels[y_pred[i][j]]

    pred = [x.replace('B-','').replace('I-','') for x in pred]
    
    j = 0
    while j < len(pred):
        cls = pred[j]
        if cls == 'O':
            j += 1
        end = j + 1
        while end < len(pred) and pred[end] == cls:
            end += 1
            
        if cls != 'O' and cls != '' and end - j > 10:
            final_preds.append((idx, cls, ' '.join(map(str, list(range(j, end))))))
        
        j = end
        
final_preds[0]

In [None]:
sub = pd.DataFrame(final_preds)
sub.columns = test_df.columns
sub

In [None]:
sub.to_csv('submission.csv', index=False)

In [None]:
!shutdown