In [1]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import PIL
import torchvision
import numpy
import pandas
import torch 
import torch.optim as optim
import gc
from torch.optim.lr_scheduler import StepLR
import cv2
import os
import json
import numpy as np
from transformers import BertModel, BertTokenizer
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from transformers import T5EncoderModel
from transformers import GPT2Tokenizer, GPT2Model
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
import requests
from tqdm import tqdm
import re 
import string 
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler




In [2]:
torch.cuda.is_available()

True

In [3]:
PATH_DATASETS = "."
PATH_JSON_TRAIN = os.path.join(PATH_DATASETS, "annotations/data/subtask1/train.json") 
PATH_JSON_VAL = os.path.join(PATH_DATASETS, "annotations/data/subtask1/validation.json") 
PATH_JSON_DEV = os.path.join(PATH_DATASETS, "annotations/data/subtask1/dev_subtask1_en.json") 


PATH_JSON_TEST = "./test_data/bulgarian/bg_subtask1_test_unlabeled.json"

PATH_SAVE_MODEL = "subtask1_models"
PATH_SAVE_SUBMISSION = "subtask1_submissions"

os.makedirs(PATH_SAVE_MODEL, exist_ok=True)
os.makedirs(PATH_SAVE_SUBMISSION, exist_ok=True)

BATCH_SIZE = 16

EPOCHS_FULL = 0
LR_FULL = 1e-5

EPOCHS_FC = 3
LR_FC = 3e-6

TRAIN_ALL = True

In [4]:
data = json.load(open(PATH_JSON_TRAIN,"r",encoding='utf-8'))

print(data[0])

{'id': '65635', 'text': 'THIS IS WHY YOU NEED\\n\\nA SHARPIE WITH YOU AT ALL TIMES', 'labels': ['Black-and-white Fallacy/Dictatorship'], 'link': 'https://www.facebook.com/photo/?fbid=4023552137722493&set=g.633131750534436'}


In [5]:
def preprocess(text):
    return text

In [6]:
model_name = 'usmiva/bert-web-bg'  
tokenizer = AutoTokenizer.from_pretrained(model_name)
text_model = AutoModel.from_pretrained(model_name)

In [7]:
class MyDataset(Dataset):
    
    def __init__(self, paths_json, bin_classes):
        self.texts = []
        self.ids = []
        self.labels = []
        
        for path_json in paths_json:
            data = json.load(open(path_json,"r",encoding='utf-8'))

            for x in tqdm(data):
                self.ids.append(x['id'])

                if 'labels' in x:
                    curr_labels = []
                    for bin_class in bin_classes:
                        if bin_class in x['labels']:
                            curr_labels.append(1)
                        else:
                            curr_labels.append(0)
                    self.labels.append(curr_labels)
                else:
                    self.labels.append([])

                text = preprocess(x['text'])
                if text is None:
                    text = ""
                self.texts.append(tokenizer(text,return_tensors='pt',padding='max_length',max_length=128,truncation=True))

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self,idx):
        text_tensors = {}
        for key, value in self.texts[idx].items():
            text_tensors[key] = value.cuda() if isinstance(value, torch.Tensor) else value
        
        return (text_tensors,torch.tensor(self.labels[idx]))

In [8]:

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

        self.text_encoder = AutoModel.from_pretrained('usmiva/bert-web-bg')
        
        self.fc = nn.Linear(98304, 20)  
    def forward(self,text_input):

        
        text_outputs = []

        for i in range(text_input['input_ids'].shape[0]):
            x = dict()
            x['input_ids'] = text_input['input_ids'][i]
            #x['token_type_ids'] = text_input['token_type_ids'][i]
            x['attention_mask'] = text_input['attention_mask'][i]
            text_outputs.append(self.text_encoder(**x).last_hidden_state)
            
            
        text_outputs = torch.stack(text_outputs)

        text_outputs = text_outputs.view(text_outputs.size(0), -1)
        

        output = nn.Sigmoid()(self.fc(nn.Tanh()(text_outputs)))
        return output

In [9]:
data = json.load(open(PATH_JSON_TRAIN,"r",encoding='utf-8'))

bin_classes = []

for x in data:
    for label in x['labels']:
        if label not in bin_classes:
            bin_classes.append(label)

print(len(bin_classes))
print(bin_classes)

20
['Black-and-white Fallacy/Dictatorship', 'Loaded Language', 'Glittering generalities (Virtue)', 'Thought-terminating cliché', 'Whataboutism', 'Slogans', 'Causal Oversimplification', 'Smears', 'Name calling/Labeling', 'Appeal to authority', 'Exaggeration/Minimisation', 'Repetition', 'Flag-waving', 'Appeal to fear/prejudice', 'Reductio ad hitlerum', 'Doubt', "Misrepresentation of Someone's Position (Straw Man)", 'Obfuscation, Intentional vagueness, Confusion', 'Bandwagon', 'Presenting Irrelevant Data (Red Herring)']


In [10]:
if TRAIN_ALL:
    train_data = MyDataset([PATH_JSON_TRAIN, PATH_JSON_VAL, PATH_JSON_DEV], bin_classes)
else:
    train_data = MyDataset([PATH_JSON_TRAIN], bin_classes)
valid_data = MyDataset([PATH_JSON_VAL], bin_classes)
test_data = MyDataset([PATH_JSON_TEST], bin_classes)

train_dataloader = DataLoader(dataset = train_data, batch_size = BATCH_SIZE, shuffle = True)
valid_dataloader = DataLoader(dataset = valid_data, batch_size = BATCH_SIZE, shuffle = False)
test_dataloader = DataLoader(dataset = test_data, batch_size = 1, shuffle = False)

100%|████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:01<00:00, 5020.69it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 4900.91it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 4911.85it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 5206.45it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 436/436 [00:00<00:00, 5190.88it/s]


In [11]:
print(np.sum(train_data.labels))
print(len(train_data.labels))

13938
8500


In [13]:
predictions = {}
    
print(len(train_data))
print(train_data.texts[0]['input_ids'].shape)

model = Model()
model.cuda()
model.train()

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = LR_FULL)

best_loss = 1e9

for epoch in range(EPOCHS_FULL):

    train_loss = 0.0    
    model.train()
    for useless_id, (texts_batch, labels_batch) in tqdm(enumerate(train_dataloader)):
        optimizer.zero_grad()

        labels_batch = labels_batch.to(torch.float32)
        labels_batch = labels_batch.to('cuda')

        labels_predictions = model(texts_batch)


        
        loss = criterion(labels_predictions, labels_batch)
        loss.backward()

        optimizer.step()

        train_loss = train_loss + loss.item()

   
    validation_loss = 0.0
    model.eval()
    correct = 0
    total = 0

    for useless_id, (texts_batch, labels_batch) in tqdm(enumerate(valid_dataloader)):
        labels_batch = labels_batch.to(torch.float32)
        labels_batch = labels_batch.to('cuda')
        labels_predictions = model(texts_batch)

        loss = criterion(labels_predictions, labels_batch)


        validation_loss = validation_loss + loss.item()


        predicted = (labels_predictions > 0.5)
        
        total += labels_batch.size(0)
        correct += (predicted == labels_batch).sum().item()


    train_loss /= len(train_dataloader.dataset)
    validation_loss /= len(train_dataloader.dataset)
    accuracy = (correct / total) / len(bin_classes)
    print(f'Epoch: {epoch} Train Loss: {train_loss} Validation Loss: {validation_loss} Validation Accuracy: {accuracy * 100:.2f}%')

    # Save checkpoint if needed
    # checkpoint = {'checkpoint': model.state_dict()}
    # torch.save(checkpoint, os.path.join(PATH_SAVE_MODEL, f'checkpoint_{epoch}.pt'))
    print(f'Checkpoint reached! Validation loss modified from {best_loss} to {validation_loss}')
    best_loss = validation_loss
    torch.cuda.empty_cache()
                    


8500
torch.Size([1, 128])


532it [03:19,  2.67it/s]
32it [00:04,  6.57it/s]


Epoch: 0 Train Loss: 0.014814638285075917 Validation Loss: 0.0008007739817394929 Validation Accuracy: 92.18%
Checkpoint reached! Validation loss modified from 1000000000.0 to 0.0008007739817394929


532it [03:18,  2.68it/s]
32it [00:04,  6.57it/s]


Epoch: 1 Train Loss: 0.013031194153077462 Validation Loss: 0.0006810870494912653 Validation Accuracy: 93.09%
Checkpoint reached! Validation loss modified from 0.0008007739817394929 to 0.0006810870494912653


532it [03:17,  2.69it/s]
32it [00:04,  6.62it/s]

Epoch: 2 Train Loss: 0.011522897233857828 Validation Loss: 0.0005712775219889248 Validation Accuracy: 94.09%
Checkpoint reached! Validation loss modified from 0.0006810870494912653 to 0.0005712775219889248





In [14]:
for param in model.text_encoder.parameters():
    param.requires_grad = False

optimizer = torch.optim.Adam(model.parameters(), lr = LR_FC)
best_loss = 1e9


for epoch in range(EPOCHS_FC):

    train_loss = 0.0    
    model.train()
    for useless_id, (texts_batch, labels_batch) in tqdm(enumerate(train_dataloader)):
        optimizer.zero_grad()

        labels_batch = labels_batch.to(torch.float32)
        labels_batch = labels_batch.to('cuda')

        labels_predictions = model(texts_batch)

        loss = criterion(labels_predictions, labels_batch)
        loss.backward()

        optimizer.step()

        train_loss = train_loss + loss.item()

    # Validation loop
    validation_loss = 0.0
    model.eval()
    correct = 0
    total = 0

    for useless_id, (texts_batch, labels_batch) in tqdm(enumerate(valid_dataloader)):
        labels_batch = labels_batch.to(torch.float32)
        labels_batch = labels_batch.to('cuda')

        labels_predictions = model(texts_batch)

        loss = criterion(labels_predictions, labels_batch)


        validation_loss = validation_loss + loss.item()


        predicted = (labels_predictions > 0.5)
        total += labels_batch.size(0)
        correct += (predicted == labels_batch).sum().item()


    train_loss /= len(train_dataloader.dataset)
    validation_loss /= len(train_dataloader.dataset)
    accuracy = (correct / total) / len(bin_classes)
    print(f'Epoch: {epoch} Train Loss: {train_loss} Validation Loss: {validation_loss} Validation Accuracy: {accuracy * 100:.2f}%')





    # Save checkpoint if needed
    # checkpoint = {'checkpoint': model.state_dict()}
    # torch.save(checkpoint, os.path.join(PATH_SAVE_MODEL, f'fc_checkpoint_{epoch}.pt'))
    print(f'Checkpoint reached! Validation loss modified from {best_loss} to {validation_loss}')
    best_loss = validation_loss
    torch.cuda.empty_cache()

    checkpoint = {'checkpoint': model.state_dict()}
    torch.save(checkpoint, os.path.join(PATH_SAVE_MODEL, f'checkpoint_extra.pt'))

    #import torch
    # model.train()
    # checkpoint = torch.load(os.path.join(PATH_SAVE_MODEL, f'fc_checkpoint_{4}.pt'))

    # # Apply the state dictionary to the model
    # model.load_state_dict(checkpoint['checkpoint'])

532it [01:12,  7.31it/s]
32it [00:03,  8.05it/s]


Epoch: 0 Train Loss: 0.009788282729247037 Validation Loss: 0.0005440089439644533 Validation Accuracy: 94.52%
Checkpoint reached! Validation loss modified from 1000000000.0 to 0.0005440089439644533


532it [01:12,  7.30it/s]
32it [00:03,  8.03it/s]


Epoch: 1 Train Loss: 0.009459767501143848 Validation Loss: 0.0005278383388238794 Validation Accuracy: 94.64%
Checkpoint reached! Validation loss modified from 0.0005440089439644533 to 0.0005278383388238794


532it [01:12,  7.33it/s]
32it [00:04,  7.98it/s]


Epoch: 2 Train Loss: 0.009215528062161277 Validation Loss: 0.0005110651184530819 Validation Accuracy: 94.92%
Checkpoint reached! Validation loss modified from 0.0005278383388238794 to 0.0005110651184530819


86it [00:11,  7.81it/s]


KeyboardInterrupt: 

5


In [18]:
predictions = {}
ids = []

for useless_id, (texts_batch, labels_batch) in tqdm(enumerate(test_dataloader)):
    model.eval()
    #print(texts_batch)
    labels_predictions = model(texts_batch)


    predicted = (labels_predictions > 0.25)[0]
    
    curr_id = test_data.ids[useless_id]
    if curr_id not in predictions:
        predictions[curr_id] = []
        
    idx_bin_class = 0
    for bin_class in bin_classes:
        if predicted[idx_bin_class]:
            predictions[curr_id].append(bin_class)
        idx_bin_class += 1

436it [00:04, 95.31it/s] 


In [19]:
output_json = []
for k,v in predictions.items():
    output_json.append({"id" : k, "labels" : v})

with open(os.path.join(PATH_SAVE_SUBMISSION, "submission_BG_BERT.txt"),"w") as fout:
    json.dump(output_json, fout)