# 1. Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from transformers import BertPreTrainedModel, BertModel, BertConfig
from tqdm.autonotebook import tqdm
from torch.utils.data.sampler import WeightedRandomSampler
from torch.optim import Adam
from transformers import AutoTokenizer

my_device = torch.device("cuda")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MAX_LEN = 92
BATCH_SIZE = 32
EPOCHS = 8
LR = 0.001
PRETRAINED_MODEL = 'allegro/herbert-base-cased'

# 2. Data Loading

In [3]:
labels = pd.read_csv('training_set_clean_only_tags.txt', header=None)
labels.columns = ["y"]
text = pd.read_fwf('training_set_clean_only_text.txt', header=None)
text.columns = ["X"]

data_df = text.join(labels)

data_df.head(10)

Unnamed: 0,X,y
0,Dla mnie faworytem do tytułu będzie Cracovia. ...,0
1,@anonymized_account @anonymized_account Brawo ...,0
2,"@anonymized_account @anonymized_account Super,...",0
3,@anonymized_account @anonymized_account Musi. ...,0
4,"Odrzut natychmiastowy, kwaśna mina, mam problem",0
5,"Jaki on był fajny xdd pamiętam, że spóźniłam s...",0
6,@anonymized_account No nie ma u nas szczęścia 😉,0
7,@anonymized_account Dawno kogoś tak wrednego n...,0
8,@anonymized_account @anonymized_account Zaległ...,0
9,@anonymized_account @anonymized_account @anony...,2


In [4]:
data_df.groupby(["y"]).count()

Unnamed: 0_level_0,X
y,Unnamed: 1_level_1
0,9190
1,253
2,598


# 3. Data processing

In [5]:
df = data_df
df.label = df.y.astype(int)
df = df.dropna().reset_index(drop=True)

  df.label = df.y.astype(int)


In [6]:
df['X'] = df['X'].apply(lambda r: r.replace("@anonymized_account", "@osoba"))

# 4. Dataset preparation

In [7]:
class MyDataset:
    def __init__(self, text, label):
        self.text = text
        self.label = label
        self.tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
    
    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = ' '.join(self.text[item].split())
        label = self.label[item]
        enc = self.tokenizer(text, max_length=MAX_LEN, truncation=True, padding='max_length', return_tensors='pt')

        return {
            'ids': enc.input_ids[0],
            'mask': enc.attention_mask[0],
            'token_type_ids': enc.token_type_ids[0],
            'targets': torch.tensor(label, dtype=torch.long)
        } 

In [8]:
X_idx, y_idx = train_test_split(df.index, test_size=0.2)
df_train = df.iloc[X_idx].reset_index(drop=True)
df_valid = df.iloc[y_idx].reset_index(drop=True)

### Samples weights computing

In [9]:
target = df_train.y.values
classes_count = np.bincount(target)
weights = 1. / classes_count
samples_weight = weights[target]
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

In [10]:
train_dataset = MyDataset(df_train.X.values, df_train.y.values)
valid_dataset = MyDataset(df_valid.X.values, df_valid.y.values)

valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler)

# 5. MODEL

In [11]:
class MyHerBertaModel(BertPreTrainedModel):
    def __init__(self, conf):
        super(MyHerBertaModel, self).__init__(conf)
        self.bert = BertModel.from_pretrained(PRETRAINED_MODEL, config=conf)
        for param in self.bert.parameters():
            param.requires_grad = False
        self.mx = nn.MaxPool1d(MAX_LEN - 1)
        
        self.l0 = nn.Conv1d(768, 100, 2)
        self.drop_out1 = nn.Dropout(0.3)
        self.l1 = nn.Linear(100, 1536)
        self.drop_out2 = nn.Dropout(0.3)
        self.l2 = nn.Linear(1536, 768)
        self.drop_out3 = nn.Dropout(0.2)
        self.l3 = nn.Linear(768, 3)
        
        torch.nn.init.normal_(self.l0.weight, std=0.02)
        torch.nn.init.normal_(self.l1.weight, std=0.02)
        torch.nn.init.normal_(self.l2.weight, std=0.02)
        torch.nn.init.normal_(self.l3.weight, std=0.02)
        
        self.activation0 = torch.nn.LeakyReLU(negative_slope=0.05, inplace=False)
        self.activation1 = torch.nn.LeakyReLU(negative_slope=0.05, inplace=False)
        self.activation2 = torch.nn.LeakyReLU(negative_slope=0.05, inplace=False)
        self.activation3 = torch.nn.Softmax(dim=1)
    
    def forward(self, ids, mask, token_type_ids):
        out = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        out = out[0]
        out = out.permute(0,2,1)

        out = self.activation0(self.l0(out))

        out = torch.squeeze(self.mx(out))

        out = self.drop_out1(out)
        out = self.activation1(self.l1(out))
        
        out = self.drop_out2(out)
        out = self.activation2(self.l2(out))

        out = self.drop_out3(out)
        out = self.activation3(self.l3(out))
        return out

In [12]:
model_config = BertConfig.from_pretrained(PRETRAINED_MODEL)
model = MyHerBertaModel(conf=model_config)
model.to(my_device)

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


MyHerBertaModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [13]:
loss_f = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LR)  

# 6. Train and test

In [14]:
def train_fn(data_loader, model, optimizer, device):
    model.train()
    all_outputs = []
    all_targets = []
    for d in tqdm(data_loader, total=len(data_loader)):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets = d["targets"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.long)

        model.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

        loss = loss_f(outputs, targets)
        loss.backward()
        optimizer.step()
        outputs = torch.argmax(outputs, dim=1).cpu().detach().numpy().tolist()
        targets = targets.cpu().detach().numpy().tolist()
        
        all_outputs.extend(outputs)
        all_targets.extend(targets)
        
    f1_macro = metrics.f1_score(all_targets, all_outputs, average='macro')
    print(f"F1 macro: {f1_macro}")
    
    return all_outputs, all_targets

In [15]:
def eval_fn(data_loader, model, device):
    model.eval()
    all_targets = []
    all_outputs = []
    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            
            outputs = torch.argmax(outputs, dim=1).cpu().detach().numpy().tolist()
            targets = targets.cpu().detach().numpy().tolist()
            
            all_targets.extend(targets)
            all_outputs.extend(outputs)

    print(f"F1 micro: {metrics.f1_score(all_targets,all_outputs, average='micro')}")
    print(f"F1 macro: {metrics.f1_score(all_targets,all_outputs, average='macro')}")
    print(f"confusion matrix:\n {metrics.confusion_matrix(all_targets,all_outputs)}")
    
    return all_outputs, all_targets

In [16]:
top_f1 = 0.0
model_path=f"model.bin"
for epoch in range(EPOCHS):
    print(f"Epoch {epoch}:")
    print("TRAINING:")
    print("="*20)
    train_fn(train_dataloader, model, optimizer, my_device)
    print("TESTING:")
    print("="*20)
    o, p = eval_fn(valid_dataloader, model, my_device)
    
    if (f1 :=metrics.f1_score(p,o, average='macro')) > top_f1:
        top_f1 = f1
        print(f"Saved model after epoch {epoch}, top f1: {top_f1}")
        torch.save(model.state_dict(), model_path)
        
    

Epoch 0:
TRAINING:


100%|██████████| 251/251 [04:49<00:00,  1.15s/it]


F1 macro: 0.6332049703130839
TESTING:
F1 micro: 0.8232951717272274
F1 macro: 0.4817347607547646
confusion matrix:
 [[1575  124  148]
 [  13   20    4]
 [  31   35   59]]
Saved model after epoch 0, top f1: 0.4817347607547646
Epoch 1:
TRAINING:


100%|██████████| 251/251 [04:46<00:00,  1.14s/it]


F1 macro: 0.7477860347627656
TESTING:
F1 micro: 0.5584868093578895
F1 macro: 0.33472847209821205
confusion matrix:
 [[1018  176  653]
 [   2   11   24]
 [   3   29   93]]
Epoch 2:
TRAINING:


100%|██████████| 251/251 [04:54<00:00,  1.17s/it]


F1 macro: 0.7482777729232563
TESTING:
F1 micro: 0.7386759581881533
F1 macro: 0.42889462173506593
confusion matrix:
 [[1390  206  251]
 [   9   13   15]
 [  10   34   81]]
Epoch 3:
TRAINING:


100%|██████████| 251/251 [04:45<00:00,  1.14s/it]


F1 macro: 0.7678397653526826
TESTING:
F1 micro: 0.7834743653558984
F1 macro: 0.4292379366156431
confusion matrix:
 [[1473   35  339]
 [  16    3   18]
 [  18    9   98]]
Epoch 4:
TRAINING:


100%|██████████| 251/251 [04:46<00:00,  1.14s/it]


F1 macro: 0.7984935149655007
TESTING:
F1 micro: 0.7526132404181185
F1 macro: 0.44357976765654006
confusion matrix:
 [[1406  126  315]
 [   6   13   18]
 [  14   18   93]]
Epoch 5:
TRAINING:


100%|██████████| 251/251 [04:45<00:00,  1.14s/it]


F1 macro: 0.7902390769206443
TESTING:
F1 micro: 0.7207565953210554
F1 macro: 0.43354996661787326
confusion matrix:
 [[1347  256  244]
 [   5   18   14]
 [   9   33   83]]
Epoch 6:
TRAINING:


100%|██████████| 251/251 [04:45<00:00,  1.14s/it]


F1 macro: 0.7473303232052388
TESTING:
F1 micro: 0.5281234444997511
F1 macro: 0.334628031480146
confusion matrix:
 [[948 364 535]
 [  0  18  19]
 [  2  28  95]]
Epoch 7:
TRAINING:


100%|██████████| 251/251 [04:45<00:00,  1.14s/it]


F1 macro: 0.7440352294987478
TESTING:
F1 micro: 0.8860129417620707
F1 macro: 0.46583277871301987
confusion matrix:
 [[1723   35   89]
 [  22    3   12]
 [  57   14   54]]


### SAVE FINAL

In [24]:
torch.save(model.state_dict(), "final_model.bin")

# TEST

In [22]:
labels = pd.read_csv('./test_samples/test_set_only_tags.txt', header=None)
labels.columns = ["y"]
text = pd.read_fwf('./test_samples/test_set_only_text.txt', header=None)
text = text[[0]]
text.columns = ["X"]

test_df = text.join(labels)
test_df.label = test_df.y.astype(int)
test_df = test_df.dropna().reset_index(drop=True)
test_df['X'] = test_df['X'].apply(lambda r: r.replace("@anonymized_account", "@osoba"))

test_dataset = MyDataset(test_df.X.values, test_df.y.values)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

  test_df.label = test_df.y.astype(int)


final model test

In [34]:
model = MyHerBertaModel(conf=model_config)
model.load_state_dict(torch.load('./final_model.bin', map_location=my_device))
model.to(my_device)

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


MyHerBertaModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [35]:
o,p = eval_fn(test_dataloader, model, my_device)

F1 micro: 0.866
F1 macro: 0.4842411511725182
confusion matrix:
 [[826  13  27]
 [ 13   3   9]
 [ 58  14  37]]


In [38]:
with open('results.txt', 'w') as f:
    for res in o:
        f.write(f"{str(res)}\n")

model test


In [32]:
model = MyHerBertaModel(conf=model_config)
model.load_state_dict(torch.load('./model.bin', map_location=my_device))
model.to(my_device)

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


MyHerBertaModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [33]:
o,p = eval_fn(test_dataloader, model, my_device)

F1 micro: 0.828
F1 macro: 0.4998330450572878
confusion matrix:
 [[777  40  49]
 [  8  10   7]
 [ 30  38  41]]
