In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"  # specify which GPU(s) to be used

In [2]:
from transformers import BertTokenizerFast, BertModel, BertConfig, BertForSequenceClassification, TextDataset, BertForNextSentencePrediction
from transformers import AdamW, PreTrainedTokenizer
from transformers import get_linear_schedule_with_warmup
import numpy as np
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, SequentialSampler
import torch
from tqdm.notebook import tqdm
import os
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle
from sklearn.metrics import f1_score

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
files = os.listdir("conll_scenes_speech")

In [5]:
def transform_label(x):
    
    if x == "scene_start":
        return 1
    else:
        return 0

In [6]:
# create training data
X = []
Y = []
context = 200
for fname in files[1:]:
    
    data = pd.read_csv("conll_scenes_speech/"+fname, sep="\t", index_col=0)
    sent_indexes = data[data.sentstart == "yes"].index
    for index in sent_indexes[2:]:
        try:
            string = " ".join(data.iloc[index-context:index-1,0])+" [SEP] "+" ".join(data.iloc[index:index+context,0])
            label = transform_label(data.iloc[index,3])
            X.append(string)
            Y.append(label)
        except IndexError:
            pass

In [7]:
# create eval data
X_val = []
Y_val = []
for fname in files[:1]:
    
    data = pd.read_csv("conll_scenes_speech/"+fname, sep="\t", index_col=0)
    sent_indexes = data[data.sentstart == "yes"].index
    for index in sent_indexes[2:]:
        try:
            string = " ".join(data.iloc[index-context:index-1,0])+" [SEP] "+" ".join(data.iloc[index:index+context,0])
            label = transform_label(data.iloc[index,3])
            X_val.append(string)
            Y_val.append(label)
        except IndexError:
            pass

In [8]:
# balance borders in training examples
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_sample(np.array(X).reshape(-1,1), np.array(Y).reshape(-1,1))
X_rus, y_rus = shuffle(X_rus,y_rus)

In [9]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-german-cased")

In [10]:
# tokenizer + padding
def create_example(x, maxlen):
    input_ids = []
    mask = []
    for exp in x:
        encoded = tokenizer.encode_plus(exp[0], padding=True)
        padding_need = maxlen-len(encoded["input_ids"])
        if padding_need > 0:
            input_ids.append((encoded["input_ids"]+[0]*padding_need))
            mask.append((encoded["attention_mask"]+[0]*padding_need))
        else:
            input_ids.append((encoded["input_ids"][:512]))
            mask.append((encoded["attention_mask"][:512]))
    mask = torch.tensor(mask)
    input_ids = torch.tensor(input_ids)
    return input_ids, mask

In [11]:
# execute tokenizer + padding
input_ids, masks = create_example(X_rus, 512)
labels = torch.tensor(y_rus)
input_ids_val, masks_val = create_example(X_val, 512)
labels_val = torch.tensor(Y_val)

In [15]:
# Training batches
batches = []
bsize = 8
for b in list(range(0,len(labels)-bsize,bsize)):
    
    if len(input_ids[b:b+bsize]) != bsize:
        break
    batches.append([input_ids[b:b+bsize],masks[b:b+bsize],labels[b:b+bsize]])

In [16]:
# eval batches
val_batches = []
for b in list(range(0,len(labels_val)-bsize,bsize)):
    
    if len(input_ids_val[b:b+bsize]) != bsize:
        break
    val_batches.append([input_ids_val[b:b+bsize],masks_val[b:b+bsize],labels_val[b:b+bsize]])

In [17]:
model = BertForNextSentencePrediction.from_pretrained("bert-base-german-cased")

In [18]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(pred_flat,labels_flat, average="macro")

In [19]:
optimizer = AdamW(model.parameters())

In [20]:
device = "cuda"
model.to(device)

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [21]:
epochs = 10
total_steps = len(batches) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [None]:
# training
for epoch_i in range(0, epochs):
    
    total_loss = 0 
    model.train() 
    i = 0
    for step, batch in enumerate(batches):
        i+=1

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
   
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, next_sentence_label=b_labels)


        loss = outputs[0] 
        total_loss += loss.item()


        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step() 
        scheduler.step()
        avg_train_loss = total_loss / i 
        
        if i % 10 == 0:
            print("Batch Loss: "+str(i)+" "+str(avg_train_loss))
        
        
        
    nb_eval_steps = 0
    eval_accuracy = 0
    for batch in val_batches:


        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0] 
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()


            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy 
            nb_eval_steps += 1 
    print("validation acc: "+str(eval_accuracy/nb_eval_steps))

Batch Loss: 10 1.2292439699172975
Batch Loss: 20 0.9781380504369735
Batch Loss: 30 0.8999764561653137
Batch Loss: 40 0.8780328065156937
Batch Loss: 50 0.8661887979507447
Batch Loss: 60 0.8665582756201426
Batch Loss: 70 0.8448717389787946
Batch Loss: 80 0.8263776428997517
Batch Loss: 90 0.8204327344894409
Batch Loss: 100 0.8215398174524308
Batch Loss: 110 0.8220973898064007
Batch Loss: 120 0.8152921517690023
Batch Loss: 130 0.8101320550991938
Batch Loss: 140 0.8055408477783204
Batch Loss: 150 0.8098158299922943
Batch Loss: 160 0.8037900768220425
Batch Loss: 170 0.7971783921999089
validation acc: 0.8875127733496835
Batch Loss: 10 0.7249630749225616
Batch Loss: 20 0.7136924237012863
Batch Loss: 30 0.7310001869996389
Batch Loss: 40 0.7617949590086937
Batch Loss: 50 0.7546090054512024
Batch Loss: 60 0.7574657211701076
Batch Loss: 70 0.7552310909543719
Batch Loss: 80 0.7518838889896869
Batch Loss: 90 0.7567178686459859
Batch Loss: 100 0.7567897325754166
Batch Loss: 110 0.7562584925781597
Bat