In [21]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import PIL
import torchvision
import numpy
import pandas
import torch 
import torch.optim as optim
import gc
from torch.optim.lr_scheduler import StepLR
import cv2
import os
import json
import numpy as np
from transformers import BertModel, BertTokenizer
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from transformers import T5EncoderModel
from transformers import GPT2Tokenizer, GPT2Model
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
import requests
from tqdm import tqdm
import re 
import string 

In [22]:
PATH_DATASETS = "../datasets"
PATH_JSON_TRAIN = os.path.join(PATH_DATASETS, "data/subtask2a/train.json") 
PATH_JSON_VAL = os.path.join(PATH_DATASETS, "data/subtask2a/validation.json") 
PATH_JSON_DEV = os.path.join(PATH_DATASETS, "dev_gold_labels/dev_subtask2a_en.json") 
PATH_JSON_TEST = os.path.join(PATH_DATASETS, "test_data/bulgarian/bg_subtask2a_test_unlabeled.json") 


PATH_IMG_TRAIN = os.path.join(PATH_DATASETS, "train_images") 
PATH_IMG_VAL = os.path.join(PATH_DATASETS, "validation_images") 
PATH_IMG_DEV = os.path.join(PATH_DATASETS, "dev_images") 
PATH_IMG_TEST = os.path.join(PATH_DATASETS, "test_images/subtask1_2a/bulgarian") 

PATH_SAVE_MODEL = "subtask2a_models"
PATH_SAVE_SUBMISSION = "subtask2a_submissions"

os.makedirs(PATH_SAVE_MODEL, exist_ok=True)
os.makedirs(PATH_SAVE_SUBMISSION, exist_ok=True)

BERT_MODEL = 'usmiva/bert-web-bg'

BATCH_SIZE = 8

EPOCHS_FULL = 3
LR_FULL = 1e-5

EPOCHS_FC = 0
LR_FC = 3e-6

TRAIN_ALL = True

In [23]:
data = json.load(open(PATH_JSON_TRAIN,"r",encoding='utf-8'))

print(data[0])

{'id': '63292', 'text': "This is why we're free\\n\\nThis is why we're safe\\n", 'image': 'prop_meme_556.png', 'labels': ['Causal Oversimplification', 'Transfer', 'Flag-waving'], 'link': 'https://www.facebook.com/SilentmajorityDJT/photos/2119966118152814/'}


In [24]:
def preprocess(text):
    return text

In [25]:
transform = torchvision.transforms.Compose([
                #torchvision.transforms.ToPILImage(),
                #torchvision.transforms.Resize((224,224),interpolation = PIL.Image.BICUBIC),
                #torchvision.transforms.ToTensor(),
                #torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
            ])

In [26]:
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k',do_resize = True,do_rescale = True,do_normalize = True,image_mean = [0.5,0.5,0.5],image_std = [0.5,0.5,0.5])

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
text_model = AutoModel.from_pretrained(BERT_MODEL)

In [27]:
class MyDataset(Dataset):
    
    def __init__(self, paths_json_img, bin_classes):
        self.filenames = []
        self.texts = []
        self.images = []
        self.ids = []
        self.labels = []
        
        for path_json, path_img in paths_json_img:
            print(path_json)
            data = json.load(open(path_json,"r",encoding='utf-8'))

            for x in tqdm(data):
                currentPath = os.path.join(path_img,x['image'])

                self.ids.append(x['id'])

                if 'labels' in x:
                    curr_labels = []
                    for bin_class in bin_classes:
                        if bin_class in x['labels']:
                            curr_labels.append(1)
                        else:
                            curr_labels.append(0)
                    self.labels.append(curr_labels)
                else:
                    self.labels.append([])

                text = preprocess(x['text'])
                if text is None:
                    text = ""
                self.texts.append(tokenizer(text,return_tensors='pt',padding='max_length',max_length=128,truncation=True))
                self.filenames.append(x['image'])

                currentImage = cv2.imread(currentPath)
                currentImage = torch.tensor(transform(currentImage)).unsqueeze(0)
                features = processor(currentImage)
                self.images.append(features)

    def __len__(self):
        return len(self.images)
    
    def __getitem__(self,idx):
        text_tensors = {}
        for key, value in self.texts[idx].items():
            text_tensors[key] = value.cuda() if isinstance(value, torch.Tensor) else value
        
        
        image_tensors = {}
        for key, value in self.images[idx].items():
            image_tensors[key] = value.cuda() if isinstance(value, torch.Tensor) else value
        
        return ((image_tensors,text_tensors),torch.tensor(self.labels[idx]))

In [28]:
#torchvision.models.efficientnet_b0(pretrained=True)
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # Define text and image encoders
        self.text_encoder = AutoModel.from_pretrained(BERT_MODEL)
        
        self.image_encoder = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        
        self.fc = nn.Linear(249600, 22)  # Adjust num_classes accordingly
        self.fc2 = nn.Linear(128,2)
    def forward(self,  images,text_input):
        # Process text input
        
        text_outputs = []

        for i in range(text_input['input_ids'].shape[0]):
            x = dict()
            x['input_ids'] = text_input['input_ids'][i]
            x['token_type_ids'] = text_input['token_type_ids'][i]
            x['attention_mask'] = text_input['attention_mask'][i]
            text_outputs.append(self.text_encoder(**x).last_hidden_state)
            
            
        text_outputs = torch.stack(text_outputs)
        image_outputs = []
        
        for i in range(images['pixel_values'][0].shape[0]):
            x = dict()
            x['pixel_values'] = images['pixel_values'][0][i].unsqueeze(0).cuda()
          
            image_outputs.append(self.image_encoder(**x).last_hidden_state)
        
        image_outputs = torch.stack(image_outputs)

        # Flatten and concatenate the outputs
        text_outputs = text_outputs.view(text_outputs.size(0), -1)
        
        image_outputs = image_outputs.view(image_outputs.size(0), -1)
        combined = torch.cat((text_outputs, image_outputs), dim=1)
        
        # Pass through fully connected layer
        output = nn.Sigmoid()(self.fc(nn.Tanh()(combined)))
        return output

In [29]:
data = json.load(open(PATH_JSON_TRAIN,"r",encoding='utf-8'))

bin_classes = []

for x in data:
    for label in x['labels']:
        if label not in bin_classes:
            bin_classes.append(label)

print(len(bin_classes))
print(bin_classes)

22
['Causal Oversimplification', 'Transfer', 'Flag-waving', 'Black-and-white Fallacy/Dictatorship', 'Smears', 'Loaded Language', 'Glittering generalities (Virtue)', 'Thought-terminating cliché', 'Whataboutism', 'Slogans', 'Doubt', 'Name calling/Labeling', 'Repetition', 'Appeal to authority', 'Appeal to (Strong) Emotions', 'Reductio ad hitlerum', 'Appeal to fear/prejudice', 'Exaggeration/Minimisation', "Misrepresentation of Someone's Position (Straw Man)", 'Obfuscation, Intentional vagueness, Confusion', 'Bandwagon', 'Presenting Irrelevant Data (Red Herring)']


In [30]:
if TRAIN_ALL:
    train_data = MyDataset([(PATH_JSON_TRAIN, PATH_IMG_TRAIN), (PATH_JSON_VAL, PATH_IMG_VAL), (PATH_JSON_DEV, PATH_IMG_DEV)], bin_classes)
    # train_data = MyDataset([(PATH_JSON_VAL, PATH_IMG_VAL)], bin_classes)
else:
    train_data = MyDataset([(PATH_JSON_TRAIN, PATH_IMG_TRAIN)], bin_classes)
valid_data = MyDataset([(PATH_JSON_VAL, PATH_IMG_VAL)], bin_classes)
test_data = MyDataset([(PATH_JSON_TEST, PATH_IMG_TEST)], bin_classes)

train_dataloader = DataLoader(dataset = train_data, batch_size = BATCH_SIZE, shuffle = True)
valid_dataloader = DataLoader(dataset = valid_data, batch_size = BATCH_SIZE, shuffle = False)
test_dataloader = DataLoader(dataset = test_data, batch_size = 1, shuffle = False)

../datasets\data/subtask2a/train.json


100%|██████████| 7000/7000 [02:09<00:00, 54.15it/s]


../datasets\data/subtask2a/validation.json


100%|██████████| 500/500 [00:10<00:00, 46.14it/s]


../datasets\dev_gold_labels/dev_subtask2a_en.json


100%|██████████| 1000/1000 [00:29<00:00, 33.42it/s]


../datasets\data/subtask2a/validation.json


100%|██████████| 500/500 [00:13<00:00, 36.65it/s]


../datasets\test_data/bulgarian/bg_subtask2a_test_unlabeled.json


100%|██████████| 436/436 [00:11<00:00, 36.93it/s]


In [31]:
# m = nn.Sigmoid()
# loss = nn.BCELoss()
# input = torch.randn(3, requires_grad=True)
# target = torch.empty(3).random_(2)

# print(input)
# print(target)
# output = loss(m(input), target)

In [32]:
print(len(train_data.images[0]['pixel_values']))
print(len(train_data))
print(train_data.texts[0]['input_ids'].shape)

model = Model()
model.cuda()
model.train()

for param in model.text_encoder.parameters():
    param.requires_grad = False

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = LR_FULL)

best_loss = 1e9

for epoch in range(EPOCHS_FULL):

    train_loss = 0.0    
    model.train()
    for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(train_dataloader)):
        optimizer.zero_grad()

        labels_batch = labels_batch.to(torch.float32)
        labels_batch = labels_batch.to('cuda')

        labels_predictions = model(images_batch, texts_batch)

#         print(labels_predictions.shape)
#         print(labels_batch.shape)
        
#         print(labels_predictions.type())
#         print(labels_batch.type())
        
#         print(labels_predictions)
#         print(labels_batch)
        
        loss = criterion(labels_predictions, labels_batch)
        loss.backward()

        optimizer.step()

        train_loss = train_loss + loss.item()

    # Validation loop
    validation_loss = 0.0
    model.eval()
    correct = 0
    total = 0

    for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(valid_dataloader)):
        labels_batch = labels_batch.to(torch.float32)
        labels_batch = labels_batch.to('cuda')
        labels_predictions = model(images_batch, texts_batch)

        loss = criterion(labels_predictions, labels_batch)


        validation_loss = validation_loss + loss.item()


        predicted = (labels_predictions > 0.5)
        
        total += labels_batch.size(0)
        correct += (predicted == labels_batch).sum().item()


    train_loss /= len(train_dataloader.dataset)
    validation_loss /= len(train_dataloader.dataset)
    accuracy = (correct / total) / len(bin_classes)
    print(f'Epoch: {epoch} Train Loss: {train_loss} Validation Loss: {validation_loss} Validation Accuracy: {accuracy * 100:.2f}%')

    # Save checkpoint if needed
    # checkpoint = {'checkpoint': model.state_dict()}
    # torch.save(checkpoint, os.path.join(PATH_SAVE_MODEL, f'checkpoint_{epoch}.pt'))
    print(f'Checkpoint reached! Validation loss modified from {best_loss} to {validation_loss}')
    best_loss = validation_loss
    torch.cuda.empty_cache()

    
    
    
for param in model.text_encoder.parameters():
    param.requires_grad = False

for param in model.image_encoder.parameters():
    param.requires_grad = False 

optimizer = torch.optim.Adam(model.parameters(), lr = LR_FC)
best_loss = 1e9


for epoch in range(EPOCHS_FC):

    train_loss = 0.0    
    model.train()
    for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(train_dataloader)):
        optimizer.zero_grad()

        labels_batch = labels_batch.to(torch.float32)
        labels_batch = labels_batch.to('cuda')

        labels_predictions = model(images_batch, texts_batch)

        loss = criterion(labels_predictions, labels_batch)
        loss.backward()

        optimizer.step()

        train_loss = train_loss + loss.item()

    # Validation loop
    validation_loss = 0.0
    model.eval()
    correct = 0
    total = 0

    for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(valid_dataloader)):
        labels_batch = labels_batch.to(torch.float32)
        labels_batch = labels_batch.to('cuda')

        labels_predictions = model(images_batch, texts_batch)

        loss = criterion(labels_predictions, labels_batch)


        validation_loss = validation_loss + loss.item()


        predicted = (labels_predictions > 0.5)
        total += labels_batch.size(0)
        correct += (predicted == labels_batch).sum().item()


    train_loss /= len(train_dataloader.dataset)
    validation_loss /= len(train_dataloader.dataset)
    accuracy = (correct / total) / len(bin_classes)
    print(f'Epoch: {epoch} Train Loss: {train_loss} Validation Loss: {validation_loss} Validation Accuracy: {accuracy * 100:.2f}%')





# Save checkpoint if needed
# checkpoint = {'checkpoint': model.state_dict()}
# torch.save(checkpoint, os.path.join(PATH_SAVE_MODEL, f'fc_checkpoint_{epoch}.pt'))
print(f'Checkpoint reached! Validation loss modified from {best_loss} to {validation_loss}')
best_loss = validation_loss
torch.cuda.empty_cache()

checkpoint = {'checkpoint': model.state_dict()}
torch.save(checkpoint, os.path.join(PATH_SAVE_MODEL, f'checkpoint.pt'))

#import torch
# model.train()
# checkpoint = torch.load(os.path.join(PATH_SAVE_MODEL, f'fc_checkpoint_{4}.pt'))

# # Apply the state dictionary to the model
# model.load_state_dict(checkpoint['checkpoint'])
                    


1
8500
torch.Size([1, 128])


  return self.fget.__get__(instance, owner)()
1063it [05:55,  2.99it/s]
63it [00:11,  5.72it/s]


Epoch: 0 Train Loss: 0.032200079805710734 Validation Loss: 0.0015899663953220142 Validation Accuracy: 91.34%
Checkpoint reached! Validation loss modified from 1000000000.0 to 0.0015899663953220142


1063it [06:01,  2.94it/s]
63it [00:10,  6.05it/s]


Epoch: 1 Train Loss: 0.025776574396911788 Validation Loss: 0.0011927990326110055 Validation Accuracy: 93.25%
Checkpoint reached! Validation loss modified from 0.0015899663953220142 to 0.0011927990326110055


1063it [05:33,  3.19it/s]
63it [00:06,  9.20it/s]


Epoch: 2 Train Loss: 0.019793729868005303 Validation Loss: 0.0008045081434880986 Validation Accuracy: 96.22%
Checkpoint reached! Validation loss modified from 0.0011927990326110055 to 0.0008045081434880986
Checkpoint reached! Validation loss modified from 1000000000.0 to 0.0008045081434880986


In [33]:
predictions = {}
ids = []

for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(test_dataloader)):
    model.eval()

    labels_predictions = model(images_batch, texts_batch)


    predicted = (labels_predictions > 0.25)[0]
    
    curr_id = test_data.ids[useless_id]
    if curr_id not in predictions:
        predictions[curr_id] = []
        
    idx_bin_class = 0
    for bin_class in bin_classes:
        if predicted[idx_bin_class]:
            predictions[curr_id].append(bin_class)
        idx_bin_class += 1

436it [00:06, 62.68it/s]


In [34]:
output_json = []
for k,v in predictions.items():
    output_json.append({"id" : k, "labels" : v})

with open(os.path.join(PATH_SAVE_SUBMISSION, "submission.txt"),"w") as fout:
    json.dump(output_json, fout)