In [1]:
# Make Google Drive visible inside the Colab runtime; the following cells work
# with paths under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [7]:
import os

# Switch the working directory to the project's scripts folder on the mounted
# drive (presumably where the project-local modules imported further down
# live -- verify against the repository layout).
os.chdir("/content/drive/MyDrive/MasterThesis/scripts/")

In [9]:
# Export the repository root as an environment variable for this session.
# NOTE(review): presumably this is what getBaseDir() reads -- confirm in tools_data.
%env REPODIR=/content/drive/MyDrive/MasterThesis/

env: REPODIR=/content/drive/MyDrive/MasterThesis/


In [10]:
!pip install transformers datasets



In [11]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from tools_data import getBaseDir
from model_finetuned import FintunedModel
import os
import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, DataLoader, TensorDataset, random_split
from torch import Tensor
import math
from tools_dataset import getDataSplitSizes, BertDataset
from tqdm import tqdm
from datasets import Dataset
import gc

In [12]:
# --- Runtime housekeeping ----------------------------------------------------
# Free Python garbage and release PyTorch's cached GPU memory before loading data.
gc.collect()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Debug aid: print(torch.cuda.memory_summary(device=device, abbreviated=False))
torch.cuda.empty_cache()

# --- Dataset variant ---------------------------------------------------------
# `small` switches every file name below to its "_small" counterpart.
small = False
postfix = "_small" if small else ""

# --- Directory layout below the repository root ------------------------------
repoDir = getBaseDir()
baseDir = os.path.join(repoDir, "data")
embeddingsDir = os.path.join(baseDir, "embeddings")
modelsDir = os.path.join(baseDir, "models")

# --- Pre-computed inputs -----------------------------------------------------
# NOTE: torch.load unpickles its input; only point it at files you produced yourself.
tokens_path = os.path.join(embeddingsDir, f"tokens{postfix}")
labels_path = os.path.join(embeddingsDir, f"labels{postfix}")
tokens = torch.load(tokens_path)
labels = torch.load(labels_path)

# --- Reproducibility ---------------------------------------------------------
# Seed the CPU generator and all CUDA generators with the same value.
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)



# Wrap the loaded objects in the project's dataset class; the raw references
# are dropped right away because only `dataset` is used from here on.
dataset = BertDataset(tokens, labels)
del tokens, labels

batchSize = 8

# Split sizes come from the project helper. The unpacking below assumes the
# order train, test, validation -- TODO confirm against getDataSplitSizes.
lengths = getDataSplitSizes(dataset)
trainData, testData, valData = random_split(
    dataset,
    lengths,
    generator=torch.Generator().manual_seed(seed),
)
del dataset

# Only the training loader is built here; RandomSampler draws samples in random order.
trainSampler = RandomSampler(trainData)
trainDataloader = DataLoader(
    trainData,
    sampler=trainSampler,
    batch_size=batchSize,
)

# Loaders for the held-out splits are currently disabled:
#testSampler = RandomSampler(testData)
#testDataloader = DataLoader(testData, sampler=testSampler, batch_size=batchSize)

#valSampler = RandomSampler(valData)
#valDataloader = DataLoader(valData, sampler=valSampler, batch_size=batchSize)

# Load the pretrained 'bert-base-uncased' weights into a sequence classifier
# with 9 output labels. Attention maps and hidden states are not requested in
# the model outputs.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=9,
    output_attentions=False,
    output_hidden_states=False,
)

Dataset got split: 30988 Training, 5810 Testing & 1937 Validation Sub-dataset of total 38735


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
# Print a short overview of the model's named parameters: the first five
# entries, the next sixteen, and the last four, each under its own heading.
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))

sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)
for heading, group in sections:
    print(heading)
    for name, tensor in group:
        # Left-align the parameter name, right-align its shape tuple.
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [None]:
# Fine-tune the classifier on the training split and write a checkpoint of the
# model weights after every epoch.
numEpochs = 3
learningRate = 1e-5

# Make sure the checkpoint directory exists up front so a finished epoch is
# never lost to a failing torch.save.
os.makedirs(modelsDir, exist_ok=True)

model.train().to(device)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=learningRate)
for epoch in range(numEpochs):
    for i, batch in enumerate(tqdm(trainDataloader)):
        input_ids = batch["input_ids"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # The attention mask and segment ids must be passed explicitly:
        # without the mask BERT attends to every position, including any
        # padding, which distorts the sequence representation.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        # With `labels` supplied, the first element of the output is the loss.
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 10 == 0:
            # .item() prints the plain scalar instead of the tensor repr.
            print(f"loss: {loss.item()}")
    torch.save(model.state_dict(), os.path.join(modelsDir,"model_epoch"+str(epoch)+postfix))