In [1]:
import random
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, get_scheduler
from transformers.trainer import Trainer
from transformers.models.roberta import RobertaTokenizer

In [2]:
config = {
    'fold_num': 5,
    'seed': 42,
    'model': 'roberta-base',
    'max_len': 512,
    'epochs': 1,
    'train_bs': 8,
    'valid_bs': 8,
    'lr': 3e-5,
    'num_workers': 0,
    'weight_decay': 1e-2,
    'num_warmup_steps': 0,
    'lr_scheduler_type': 'linear',
    'gradient_accumulation_steps': 1,
}

labels = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim',
          'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']
labels2index = {
    'Lead': 1, 'Position': 3, 'Claim': 5, 'Counterclaim': 7, 'Rebuttal': 9, 'Evidence': 11, 'Concluding Statement': 13
}


In [3]:
def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(config['seed'])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
train_df = pd.read_csv('./data/train.csv')
train_df.head(2)

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59


In [5]:
train_names, train_texts = [], []
for f in tqdm(list(os.listdir('./data/train'))):
    train_names.append(f.replace('.txt', ''))
    with open('./data/train/' + f, 'r', encoding='utf-8') as f:
        text = ''
        for line in f.readlines():
            #text += line.replace('\n', '').replace('\xa0', '')
            text += line.replace('\n', ' ')
        train_texts.append(text)
train_texts = pd.DataFrame({'id': train_names, 'text': train_texts})
train_texts['text'] = train_texts['text'].apply(lambda x: x.split())

100%|██████████| 15594/15594 [00:40<00:00, 383.28it/s] 


In [6]:
train_texts = train_texts.sort_values(by='id').reset_index(drop=True)
train_df = train_df.sort_values(by=['id', 'discourse_start']).reset_index(drop=True)

In [7]:
text_index = dict.fromkeys(train_texts['id'].values.tolist())
for i in range(len(train_df)):
    id = train_df.iloc[i]['id']
    if not text_index[id]:
        text_index[id] = [i]
    else:
        text_index[id].append(i)
    if (i + 1) % 20000 == 0:
        print("Processed {0} discourses.".format(i + 1))

Processed 20000 discourses.
Processed 40000 discourses.
Processed 60000 discourses.
Processed 80000 discourses.
Processed 100000 discourses.
Processed 120000 discourses.
Processed 140000 discourses.


In [8]:
taggings = []
essays = 0
for i in range(len(train_texts)):
    text_id = train_texts.iloc[i]['id']
    text = train_texts.iloc[i]['text']
    tagging = [0] * config['max_len']
    for k in text_index[text_id]:
        if train_df.iloc[k]['id'] != train_texts.iloc[i]['id']:
            break

        discourse_type = train_df.iloc[k]['discourse_type']
        predictionstring = train_df.iloc[k]['predictionstring'].split(' ')
        label = labels2index[discourse_type]
        if int(predictionstring[0]) > config['max_len'] - 2:
            break
        else:
            tagging[int(predictionstring[0]) + 1] = label
        for m in range(int(predictionstring[0]) + 2, int(predictionstring[-1]) + 2):
            if m > config['max_len'] - 2:
                break
            else:
                tagging[m] = label + 1
    tagging[-1] = 0
    taggings.append(tagging)
    essays += 1
    if essays % 2000 == 0:
        print("Processed {0} essays.".format(essays))

Processed 2000 essays.
Processed 4000 essays.
Processed 6000 essays.
Processed 8000 essays.
Processed 10000 essays.
Processed 12000 essays.
Processed 14000 essays.


In [9]:
train_texts['tagging'] = taggings

In [10]:
tokenizer = AutoTokenizer.from_pretrained(config['model'], add_prefix_space=True)

In [11]:
class MyDataset(Dataset):
    def __init__(self, df, phase='Train'):
        self.df = df
        self.phase = phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.text.values[idx]
        if self.phase == 'Train':
            label = self.df.tagging.values[idx]
            return {'text': text, 'label': label}
        else:
            return {'text': text}


def collate_fn(data):
    input_ids, attention_mask = [], []
    text = [item['text'] for item in data]
    tokenized_inputs = tokenizer(
        text,
        max_length=config['max_len'],
        padding='max_length',
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )

    words = []
    for i in range(len(data)):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        words.append(word_ids)

    tokenized_inputs["word_ids"] = words
    if 'label' in data[0].keys():
        label = [item['label'] for item in data]
        tokenized_inputs['labels'] = torch.LongTensor(label)

    return tokenized_inputs

In [12]:
train_dataset = MyDataset(train_texts, phase='Train')
train_iter = DataLoader(train_dataset, batch_size=config['train_bs'], collate_fn=collate_fn, shuffle=False,
                        num_workers=config['num_workers'])

In [13]:
for sample in train_iter:
    print(sample['input_ids'][0])
    print(sample['word_ids'][0])
    break

tensor([    0,   993,    82, 12138,  2088,    14,     5,    98,   373,    22,
         9021,   113,    15, 35899,    21,  1412,    30,   301,    15, 35899,
            4,   152,    16,    45,     5,   403,     4,    20,   652,    15,
         6507,    16,    10,  8366, 37627,  5206,  1212,  1026,   373,    10,
        10969,   102,     4,    85,    21,    45,  1412,    30, 20739,     6,
            8,    89,    16,   117,  7407,   853,  5073,     7,  7433, 13058,
          301, 33334,    15, 35899,     4,   345,    16,   117,  1283,    14,
         6109,    34,   303,    14,   190,  3649,    14,    42,   652,    21,
         1412,    30, 20739,     4,    83, 10969,   102,    16,    10,  8366,
        37627,  5206,  3152,  9285,     6,    14,    16,   303,    15,  6507,
            8,  3875,     4,   152,    22,  9021,   113,    15, 35899,   129,
         1326,   101,    10,   652,   142,  5868,  3805,     7,   192,  2419,
        11263,    52,   356,     6,  5868,    32,  3334,  2778, 

In [16]:
word_ids = sample['word_ids'][1]
train_dataset[1]['text']

['Driverless',
 'cars',
 'are',
 'exaclty',
 'what',
 'you',
 'would',
 'expect',
 'them',
 'to',
 'be.',
 'Cars',
 'that',
 'will',
 'drive',
 'without',
 'a',
 'person',
 'actually',
 'behind',
 'the',
 'wheel',
 'controlling',
 'the',
 'actions',
 'of',
 'the',
 'vehicle.',
 'The',
 'idea',
 'of',
 'driverless',
 'cars',
 'going',
 'in',
 'to',
 'developement',
 'shows',
 'the',
 'amount',
 'of',
 'technological',
 'increase',
 'that',
 'the',
 'wolrd',
 'has',
 'made.',
 'The',
 'leader',
 'of',
 'this',
 'idea',
 'of',
 'driverless',
 'cars',
 'are',
 'the',
 'automobiles',
 'they',
 'call',
 'Google',
 'cars.',
 'The',
 'arduous',
 'task',
 'of',
 'creating',
 'safe',
 'driverless',
 'cars',
 'has',
 'not',
 'been',
 'fully',
 'mastered',
 'yet.',
 'The',
 'developement',
 'of',
 'these',
 'cars',
 'should',
 'be',
 'stopped',
 'immediately',
 'because',
 'there',
 'are',
 'too',
 'many',
 'hazardous',
 'and',
 'dangerous',
 'events',
 'that',
 'could',
 'occur.',
 'One',
 'thing

In [13]:
model = AutoModelForTokenClassification.from_pretrained(config['model'], num_labels=15).to(device)

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": config['weight_decay'],
     },
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0,
     },
]
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=config['lr'],
                  betas=(0.9, 0.999),
                  eps=1e-6
                  )
lr_scheduler = get_scheduler(
    name=config['lr_scheduler_type'],
    optimizer=optimizer,
    num_warmup_steps=config['num_warmup_steps'],
    num_training_steps=config['epochs'] * len(train_iter) /  config['gradient_accumulation_steps'], )

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [14]:
words, losses = [], []
tk = tqdm(train_iter, total=len(train_iter), position=0, leave=True)
model.train()
step = 0
for epoch in range(config['epochs']):
    print("Epoch {}/{}".format(epoch, config['epochs']))
    for batch in tk:
        word_ids = batch['word_ids']
        words.extend(word_ids)
        batch = {k: v.to(device) for k, v in batch.items() if k != 'word_ids'}
        loss = model(input_ids=batch['input_ids'],
                     attention_mask=batch['attention_mask'],
                     labels=batch['labels']).loss
        loss /= config['gradient_accumulation_steps']
        loss.backward()
        if (step + 1) % config['gradient_accumulation_steps'] == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        step += 1
        losses.append(loss.item() * config['gradient_accumulation_steps'])
    # print average loss
    print("Epoch {}/{}  Average Training Loss:{:6f}".format(
        epoch,
        config['epochs'],
        np.mean(losses)))


100%|██████████| 1950/1950 [13:29<00:00,  2.41it/s]

Epoch 0/1  Average Training Loss:0.914627





In [15]:
test_df = pd.read_csv('./data/sample_submission.csv')
test_df.head(5)

Unnamed: 0,id,class,predictionstring
0,18409261F5C2,,
1,D46BCB48440A,,
2,0FB0700DAF44,,
3,D72CB1C11673,,
4,DF920E0A7337,,


In [37]:
test_names, test_texts = [], []
for f in tqdm(list(os.listdir('./data/test'))):
    test_names.append(f.replace('.txt', ''))
    with open('./data/test/' + f, 'r', encoding='utf-8') as f:
        text = ''
        for line in f.readlines():
            #text += line.replace('\n', '').replace('\xa0', '')
            text += line.replace('\n', ' ')
        test_texts.append(text)
test_texts = pd.DataFrame({'id': test_names, 'text': test_texts})
test_texts['text'] = test_texts['text'].apply(lambda x: x.split())
test_texts

100%|██████████| 5/5 [00:00<00:00, 4356.36it/s]


Unnamed: 0,id,text
0,0FB0700DAF44,"[During, a, group, project,, have, you, ever, ..."
1,18409261F5C2,"[80%, of, Americans, believe, seeking, multipl..."
2,D46BCB48440A,"[When, people, ask, for, advice,they, sometime..."
3,D72CB1C11673,"[Making, choices, in, life, can, be, very, dif..."
4,DF920E0A7337,"[Have, you, ever, asked, more, than, one, pers..."


In [61]:
test_dataset = MyDataset(test_texts, phase='Test')
test_iter = DataLoader(test_dataset, batch_size=config['valid_bs'], collate_fn=collate_fn, shuffle=False,
                       num_workers=config['num_workers'])

In [63]:
y_pred = []
words = []

with torch.no_grad():
    tk = tqdm(test_iter, total=len(test_iter), position=0, leave=True)
    for step, batch in enumerate(tk):
        word_ids = batch['word_ids']
        words.extend(word_ids)
        batch = {k: v.to(device) for k, v in batch.items() if k != 'word_ids'}

        output = model(input_ids=batch['input_ids'],
                       attention_mask=batch['attention_mask']).logits

        y_pred.extend(output.argmax(-1).cpu().numpy())

y_pred = np.array(y_pred)

100%|██████████| 1/1 [00:00<00:00,  7.72it/s]


In [66]:
for i in tqdm(range(len(test_texts))):
    idx = test_texts.id.values[i]
    pred = [''] * len(test_texts.text.values[i])

    for j in range(len(y_pred[i])):
        if words[i][j] != None:
            pred[words[i][j]] = labels[y_pred[i][j]]

    pred = [x.replace('B-', '').replace('I-', '') for x in pred]

    break

  0%|          | 0/5 [00:00<?, ?it/s]


In [68]:
pred

['Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Lead',
 'Position',
 'Position',
 'Position',
 'Position',
 'Position',
 'Position',
 'Position',
 'Position',
 'Position',
 'Position',
 'Position',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Position',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Evidence',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim',
 'Claim'

In [None]:
final_preds = []

for i in tqdm(range(len(test_texts))):
    idx = test_texts.id.values[i]
    pred = [''] * len(test_texts.text.values[i])

    for j in range(len(y_pred[i])):
        if words[i][j] != None:
            pred[words[i][j]] = labels[y_pred[i][j]]

    pred = [x.replace('B-', '').replace('I-', '') for x in pred]

    preds = []
    j = 0
    while j < len(pred):
        cls = pred[j]
        if cls == 'O':
            j += 1
        end = j + 1
        while end < len(pred) and pred[end] == cls:
            end += 1

        if cls != 'O' and cls != '' and end - j > 10:
            final_preds.append((idx, cls, ' '.join(map(str, list(range(j, end))))))
        
        j = end

final_preds[0]

In [None]:
sub = pd.DataFrame(final_preds)
sub.columns = test_df.columns
sub

In [None]:
sub.to_csv('submission.csv', index=False)