In [1]:
# !pip install transformers
import torch
import pickle
from torch import nn
import numpy as np
import pandas as pd
from transformers import *
from sklearn.metrics import roc_curve, auc
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, DataLoader, random_split

In [2]:
class InputExample(object):
    def __init__(self, id, text, labels=None):
        self.id = id
        self.text = text
        self.labels = labels

class InputFeatures(object):
    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids

In [3]:
def get_train_examples(train_file):
    train_df = pd.read_csv(train_file)
    ids = train_df['id'].values
    text = train_df['comment_text'].values
    labels = train_df[train_df.columns[2:]].values
    examples = []
    for i in range(len(train_df)):
        examples.append(InputExample(ids[i], text[i], labels=labels[i]))
    return examples

In [4]:
def get_features_from_examples(examples, max_seq_len, tokenizer):
    features = []
    for i,example in enumerate(examples):
        tokens = tokenizer.tokenize(example.text)
        if len(tokens) > max_seq_len - 2:
            tokens = tokens[:(max_seq_len - 2)]
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = [0] * len(tokens)
        padding = [0] * (max_seq_len - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding
        assert len(input_ids) == max_seq_len
        assert len(input_mask) == max_seq_len
        assert len(segment_ids) == max_seq_len
        label_ids = [float(label) for label in example.labels]
        features.append(InputFeatures(input_ids=input_ids,
                                      input_mask=input_mask,
                                      segment_ids=segment_ids,
                                      label_ids=label_ids))
    return features

In [5]:
def get_dataset_from_features(features):
    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.float)
    dataset = TensorDataset(input_ids,
                            input_mask,
                            segment_ids,
                            label_ids)
    return dataset

In [6]:
class KimCNN(nn.Module):
    def __init__(self, embed_num, embed_dim, dropout=0.1, kernel_num=3, kernel_sizes=[2,3,4], num_labels=2):
        super().__init__()
        self.num_labels = num_labels
        self.embed_num = embed_num
        self.embed_dim = embed_dim
        self.dropout = dropout
        self.kernel_num = kernel_num
        self.kernel_sizes = kernel_sizes
        self.embed = nn.Embedding(self.embed_num, self.embed_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, self.kernel_num, (k, self.embed_dim)) for k in self.kernel_sizes])
        self.dropout = nn.Dropout(self.dropout)
        self.classifier = nn.Linear(len(self.kernel_sizes)*self.kernel_num, self.num_labels)
        
    def forward(self, inputs, labels=None):
        output = inputs.unsqueeze(1)
        output = [nn.functional.relu(conv(output)).squeeze(3) for conv in self.convs]
        output = [nn.functional.max_pool1d(i, i.size(2)).squeeze(2) for i in output]
        output = torch.cat(output, 1)
        output = self.dropout(output)
        logits = self.classifier(output)
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
            return loss
        else:
            return logits

In [7]:
device = torch.device(type='cuda')
pretrained_weights = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
basemodel = BertModel.from_pretrained(pretrained_weights)
basemodel.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [8]:
seq_len = 256
train_file = 'train.csv'
train_examples = get_train_examples(train_file)
train_features = get_features_from_examples(train_examples, seq_len, tokenizer)
train_dataset = get_dataset_from_features(train_features)


train_val_split = 0.1
train_size = int(len(train_dataset)*(1-train_val_split))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

batch = 8
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch)

In [9]:
embed_num = seq_len 
embed_dim = basemodel.config.hidden_size 
dropout = basemodel.config.hidden_dropout_prob
kernel_num = 3
kernel_sizes = [2,3,4]
num_labels = 6

model = KimCNN(embed_num, embed_dim, dropout=dropout, kernel_num=kernel_num, kernel_sizes=kernel_sizes, num_labels=num_labels)
model.to(device)

KimCNN(
  (embed): Embedding(256, 768)
  (convs): ModuleList(
    (0): Conv2d(1, 3, kernel_size=(2, 768), stride=(1, 1))
    (1): Conv2d(1, 3, kernel_size=(3, 768), stride=(1, 1))
    (2): Conv2d(1, 3, kernel_size=(4, 768), stride=(1, 1))
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=9, out_features=6, bias=True)
)

In [10]:
lr = 3e-5
epochs = 1
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for i in range(epochs):
    print('-----------EPOCH #{}-----------'.format(i+1))
    print('training...')
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        with torch.no_grad():
            inputs,_ = basemodel(input_ids, segment_ids, input_mask)
        loss = model(inputs, label_ids)
        loss = loss.mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()        
    
    y_true = []
    y_pred = []

    model.eval()
    print('evaluating...')
    for step, batch in enumerate(val_dataloader):
        batch = tuple(t.to(device) for t in batch)
        val_input_ids, val_input_mask, val_segment_ids, val_label_ids = batch
        with torch.no_grad():
            val_inputs,_ = basemodel(val_input_ids, val_segment_ids, val_input_mask)
            logits = model(val_inputs)
        y_true.append(val_label_ids)
        y_pred.append(logits)

    y_true = torch.cat(y_true, dim=0).float().cpu().detach().numpy()
    y_pred = torch.cat(y_pred, dim=0).float().cpu().detach().numpy()

    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i,label in enumerate(labels):
        fpr[label], tpr[label], _ = roc_curve(y_true[:, i], y_pred[:, i])
        roc_auc[label] = auc(fpr[label], tpr[label])

    print('ROC AUC per label:')
    for label in labels:
        print(label, ': ', roc_auc[label])

-----------EPOCH #1-----------
training...
evaluating...
ROC AUC per label:
toxic :  0.9605486064463264
severe_toxic :  0.9841929394445961
obscene :  0.9621109584292289
threat :  0.7886769265503675
insult :  0.9427310938607746
identity_hate :  0.905177301817393


In [11]:
def sigmoid(z):
    s = 1.0 / (1.0 + np.exp(-1.0 * z))
    return s

for i in np.random.randint(0, len(y_pred), size=10):
    string = tokenizer.decode(val_dataset[i][0], skip_special_tokens=True)
    print('---------------------------------')
    print('Comment:')
    print(string)
    preds = dict(zip(labels, sigmoid(y_pred[i])))
    print('Prediction:')
    for label in preds:
        print(label, ': ', preds[label])

---------------------------------
Comment:
Mainly to JTD ( i think ) - I've just scanned through this discussion stuff having read the article. My initial impression was that this was going very kindly on the British indeed. It is admitted at some point in the discussion that there is a move among Irish historians away from previous over - simplifications of the matter - but isn't this move itself perhpas reactionary? I mean, Irish historians of this or any generation speak for their times, and, for lay readers like myself, the CONTEXT of this estimation of the role of the British imperial adventure in the famine is lost to those who have not studied so carefully the history of the history of the famine. This is, afterall, a general encyclopedia. So please twist the screws a bit. I won't go editing the article, but some feedback would be appreciated in this regard, Basically, you're all better historians than myself, but maybe some of your nuances are therefore misplaced here, ( a bit 