In [1]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import PIL
import torchvision
import numpy
import pandas
import torch 
import torch.optim as optim
import gc
from torch.optim.lr_scheduler import StepLR
import cv2
import os
import json
import numpy as np
from transformers import BertModel, BertTokenizer
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from transformers import T5EncoderModel
from transformers import GPT2Tokenizer, GPT2Model
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
import requests
from tqdm import tqdm
import re 
import string 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PATH_DATASETS = "../datasets"
PATH_JSON_TRAIN = os.path.join(PATH_DATASETS, "data/subtask2b/train.json") 
PATH_JSON_VAL = os.path.join(PATH_DATASETS, "data/subtask2b/val.json") 
PATH_JSON_DEV = os.path.join(PATH_DATASETS, "dev_gold_labels/dev_subtask2b_en.json") 
PATH_JSON_TEST = os.path.join(PATH_DATASETS, "test_data/north_macedonian/mk_subtask2b_test_unlabeled.json") 

PATH_IMG_TRAIN = os.path.join(PATH_DATASETS, "subtask2b_images/train") 
PATH_IMG_VAL = os.path.join(PATH_DATASETS, "subtask2b_images/val") 
PATH_IMG_DEV = os.path.join(PATH_DATASETS, "subtask2b_images/dev") 
PATH_IMG_TEST = os.path.join(PATH_DATASETS, "test_images/subtask2b/north_macedonian") 

PATH_SAVE_MODEL = "subtask2b_models"
PATH_SAVE_SUBMISSION = "subtask2b_submissions"

os.makedirs(PATH_SAVE_MODEL, exist_ok=True)
os.makedirs(PATH_SAVE_SUBMISSION, exist_ok=True)

BERT_MODEL = 'macedonizer/mk-gpt2'

BATCH_SIZE = 8

EPOCHS_FULL = 5
LR_FULL = 1e-5

EPOCHS_FC = 5
LR_FC = 3e-6

In [3]:
data = json.load(open(PATH_JSON_TRAIN,"r",encoding='utf-8'))

print(data[0])

{'id': '35807', 'text': 'DONALD TRUMP: BARACK\\nOBAMA AND JOE BIDEN\\nWILL BE IMPLICATED IN\\nRUSSIA HOAX\\nAP Photo/Pablo Martinez Monsivais', 'image': 'prop_meme_6570.png', 'label': 'propagandistic'}


In [4]:
def preprocess(text):
    return text

In [5]:
transform = torchvision.transforms.Compose([
                #torchvision.transforms.ToPILImage(),
                #torchvision.transforms.Resize((224,224),interpolation = PIL.Image.BICUBIC),
                #torchvision.transforms.ToTensor(),
                #torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
            ])

In [6]:
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k',do_resize = True,do_rescale = True,do_normalize = True,image_mean = [0.5,0.5,0.5],image_std = [0.5,0.5,0.5])

In [7]:
class MyDataset(Dataset):
    
    def __init__(self, paths_json_img):
        self.filenames = []
        self.texts = []
        self.labels = []
        self.images = []
        self.ids = []
        
        for path_json, path_img in paths_json_img:
            data_train = json.load(open(path_json,"r",encoding='utf-8'))

            for x in tqdm(data_train):
                currentPath = os.path.join(path_img,x['image'])

                self.ids.append(x['id'])

                if 'label' in x:
                    if x['label']=="non_propagandistic":
                        self.labels.append(0)
                    else:
                        self.labels.append(1)
                else:
                    self.labels.append(0)

                text = preprocess(x['text'])
                if text is None:
                    text = ""
                self.texts.append(tokenizer(text,return_tensors='pt',padding='max_length',max_length=128,truncation=True))
                self.filenames.append(x['image'])

                currentImage = cv2.imread(currentPath)
                currentImage = torch.tensor(transform(currentImage)).unsqueeze(0)
                features = processor(currentImage)
                self.images.append(features)

    def __len__(self):
        return len(self.images)
    
    def __getitem__(self,idx):
        
        text_tensors = {}
        for key, value in self.texts[idx].items():
            text_tensors[key] = value.cuda() if isinstance(value, torch.Tensor) else value
        
        
        image_tensors = {}
        for key, value in self.images[idx].items():
            image_tensors[key] = value.cuda() if isinstance(value, torch.Tensor) else value
            
        return ((image_tensors,text_tensors),self.labels[idx])

In [8]:
train_data = MyDataset([(PATH_JSON_TRAIN, PATH_IMG_TRAIN), (PATH_JSON_VAL, PATH_IMG_VAL), (PATH_JSON_DEV, PATH_IMG_DEV)])
valid_data = MyDataset([(PATH_JSON_VAL, PATH_IMG_VAL)])
test_data = MyDataset([(PATH_JSON_TEST, PATH_IMG_TEST)])

train_dataloader = DataLoader(dataset = train_data, batch_size = BATCH_SIZE, shuffle = True)
valid_dataloader = DataLoader(dataset = valid_data, batch_size = BATCH_SIZE, shuffle = False)
test_dataloader = DataLoader(dataset = test_data, batch_size = 1, shuffle = False)

100%|██████████| 1200/1200 [00:24<00:00, 49.09it/s]
100%|██████████| 150/150 [00:03<00:00, 48.67it/s]
100%|██████████| 300/300 [00:06<00:00, 46.10it/s]
100%|██████████| 150/150 [00:02<00:00, 60.83it/s]
100%|██████████| 100/100 [00:01<00:00, 52.49it/s]


In [9]:
print(len(train_data.images[0]['pixel_values']))
print(len(train_data))
print(train_data.texts[260]['input_ids'].shape)

1
1650
torch.Size([1, 128])


In [10]:
model_name = BERT_MODEL 
text_tokenizer = AutoTokenizer.from_pretrained(model_name)
text_model = AutoModel.from_pretrained(model_name)

  return self.fget.__get__(instance, owner)()


In [11]:
#torchvision.models.efficientnet_b0(pretrained=True)
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # Define text and image encoders
        self.text_encoder = AutoModel.from_pretrained(BERT_MODEL)
        
        self.image_encoder = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        
        self.fc = nn.Linear(249600, 2)  # Adjust num_classes accordingly
        self.fc2 = nn.Linear(128,2)
    def forward(self,  images,text_input):
        # Process text input
        
        text_outputs = []

        for i in range(text_input['input_ids'].shape[0]):
            x = dict()
            x['input_ids'] = text_input['input_ids'][i]
            # x['token_type_ids'] = text_input['token_type_ids'][i]
            x['attention_mask'] = text_input['attention_mask'][i]
            text_outputs.append(self.text_encoder(**x).last_hidden_state)
            
            
        text_outputs = torch.stack(text_outputs)
        image_outputs = []
        
        for i in range(images['pixel_values'][0].shape[0]):
            x = dict()
            x['pixel_values'] = images['pixel_values'][0][i].unsqueeze(0).cuda()
          
            image_outputs.append(self.image_encoder(**x).last_hidden_state)
        
        image_outputs = torch.stack(image_outputs)

        # Flatten and concatenate the outputs
        text_outputs = text_outputs.view(text_outputs.size(0), -1)
        
        image_outputs = image_outputs.view(image_outputs.size(0), -1)
        combined = torch.cat((text_outputs, image_outputs), dim=1)
        
        # Pass through fully connected layer
        output = self.fc(nn.Tanh()(combined))
        return output

In [12]:
print(np.sum(train_data.labels))

1100


In [13]:
total_samples = len(train_data)
print(total_samples)
print(np.sum(train_data.labels))

class_sample_counts = [449, 900]  # Replace with your actual class sample counts
class_weights = [total_samples / (len(class_sample_counts) * count) for count in class_sample_counts]
class_weights = torch.tensor(class_weights, dtype=torch.float).to('cuda')

1650
1100


In [14]:
model = Model()
model.cuda()
model.train()

for param in model.text_encoder.parameters():
    param.requires_grad = False

In [15]:
criterion = torch.nn.CrossEntropyLoss(weight = class_weights)

In [16]:
optimizer = torch.optim.Adam(model.parameters(), lr = LR_FULL)

In [17]:
best_loss = 1e9

for epoch in range(EPOCHS_FULL):

    train_loss = 0.0    
    model.train()
    for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(train_dataloader)):
        optimizer.zero_grad()
        
        labels_batch = labels_batch.type(torch.LongTensor)
        labels_batch = labels_batch.to('cuda')
        
        labels_predictions = model(images_batch, texts_batch)
        
        loss = criterion(labels_predictions, labels_batch)
        loss.backward()
        
        optimizer.step()
        
        train_loss = train_loss + loss.item()
    
    # Validation loop
    validation_loss = 0.0
    model.eval()
    correct = 0
    total = 0
    
    for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(valid_dataloader)):
        labels_batch = labels_batch.to('cuda')
        labels_predictions = model(images_batch, texts_batch)
        
        loss = criterion(labels_predictions, labels_batch)
       
        
        validation_loss = validation_loss + loss.item()
        
        
        _, predicted = torch.max(labels_predictions, 1)
        total += labels_batch.size(0)
        correct += (predicted == labels_batch).sum().item()
        

    train_loss /= len(train_dataloader.dataset)
    validation_loss /= len(train_dataloader.dataset)
    accuracy = correct / total
    print(f'Epoch: {epoch} Train Loss: {train_loss} Validation Loss: {validation_loss} Validation Accuracy: {accuracy * 100:.2f}%')
        
    # Save checkpoint if needed
    # checkpoint = {'checkpoint': model.state_dict()}
    # torch.save(checkpoint, os.path.join(PATH_SAVE_MODEL, f'checkpoint_{epoch}.pt'))
    print(f'Checkpoint reached! Validation loss modified from {best_loss} to {validation_loss}')
    best_loss = validation_loss
    torch.cuda.empty_cache()

207it [01:02,  3.31it/s]
19it [00:03,  5.01it/s]


Epoch: 0 Train Loss: 0.06775409778410738 Validation Loss: 0.0039329520409757445 Validation Accuracy: 88.67%
Checkpoint reached! Validation loss modified from 1000000000.0 to 0.0039329520409757445


207it [01:12,  2.85it/s]
19it [00:03,  5.33it/s]


Epoch: 1 Train Loss: 0.03636925510723482 Validation Loss: 0.002323710782961412 Validation Accuracy: 94.67%
Checkpoint reached! Validation loss modified from 0.0039329520409757445 to 0.002323710782961412


207it [01:06,  3.13it/s]
19it [00:02,  8.37it/s]


Epoch: 2 Train Loss: 0.015080504648838982 Validation Loss: 0.0009416953169486739 Validation Accuracy: 97.33%
Checkpoint reached! Validation loss modified from 0.002323710782961412 to 0.0009416953169486739


207it [00:58,  3.54it/s]
19it [00:02,  8.27it/s]


Epoch: 3 Train Loss: 0.004527455367938135 Validation Loss: 0.00046591659923168747 Validation Accuracy: 98.67%
Checkpoint reached! Validation loss modified from 0.0009416953169486739 to 0.00046591659923168747


207it [01:05,  3.18it/s]
19it [00:03,  5.44it/s]

Epoch: 4 Train Loss: 0.0015762402380068757 Validation Loss: 0.000332595917226916 Validation Accuracy: 98.67%
Checkpoint reached! Validation loss modified from 0.00046591659923168747 to 0.000332595917226916





In [18]:
for param in model.text_encoder.parameters():
    param.requires_grad = False
    
for param in model.image_encoder.parameters():
    param.requires_grad = False 
    

In [19]:
optimizer = torch.optim.Adam(model.parameters(), lr = LR_FC)

In [20]:
best_loss = 1e9


for epoch in range(EPOCHS_FC):

    train_loss = 0.0    
    model.train()
    for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(train_dataloader)):
        optimizer.zero_grad()
        
        labels_batch = labels_batch.type(torch.LongTensor)
        labels_batch = labels_batch.to('cuda')
        
        labels_predictions = model(images_batch, texts_batch)
        
        loss = criterion(labels_predictions, labels_batch)
        loss.backward()
        
        optimizer.step()
        
        train_loss = train_loss + loss.item()
    
    # Validation loop
    validation_loss = 0.0
    model.eval()
    correct = 0
    total = 0
    
    for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(valid_dataloader)):
        labels_batch = labels_batch.to('cuda')
        
        labels_predictions = model(images_batch, texts_batch)
        
        loss = criterion(labels_predictions, labels_batch)
       
        
        validation_loss = validation_loss + loss.item()
        
        
        _, predicted = torch.max(labels_predictions, 1)
        total += labels_batch.size(0)
        correct += (predicted == labels_batch).sum().item()
        

    train_loss /= len(train_dataloader.dataset)
    validation_loss /= len(train_dataloader.dataset)
    accuracy = correct / total
    print(f'Epoch: {epoch} Train Loss: {train_loss} Validation Loss: {validation_loss} Validation Accuracy: {accuracy * 100:.2f}%')
        
    # Save checkpoint if needed
    # checkpoint = {'checkpoint': model.state_dict()}
    # torch.save(checkpoint, os.path.join(PATH_SAVE_MODEL, f'fc_checkpoint_{epoch}.pt'))
    print(f'Checkpoint reached! Validation loss modified from {best_loss} to {validation_loss}')
    best_loss = validation_loss
    torch.cuda.empty_cache()

207it [00:24,  8.53it/s]
19it [00:02,  8.93it/s]


Epoch: 0 Train Loss: 0.0007350683830714622 Validation Loss: 0.00019417805473715292 Validation Accuracy: 98.67%
Checkpoint reached! Validation loss modified from 1000000000.0 to 0.00019417805473715292


207it [00:23,  8.68it/s]
19it [00:02,  8.90it/s]


Epoch: 1 Train Loss: 0.0004821440336915354 Validation Loss: 0.00022143103588005583 Validation Accuracy: 98.67%
Checkpoint reached! Validation loss modified from 0.00019417805473715292 to 0.00022143103588005583


207it [00:23,  8.64it/s]
19it [00:02,  8.92it/s]


Epoch: 2 Train Loss: 0.0006059995553126403 Validation Loss: 0.0002158653073252715 Validation Accuracy: 99.33%
Checkpoint reached! Validation loss modified from 0.00022143103588005583 to 0.0002158653073252715


207it [00:23,  8.70it/s]
19it [00:02,  8.86it/s]


Epoch: 3 Train Loss: 0.0005452822690131143 Validation Loss: 0.0002464278722464135 Validation Accuracy: 99.33%
Checkpoint reached! Validation loss modified from 0.0002158653073252715 to 0.0002464278722464135


207it [00:26,  7.90it/s]
19it [00:03,  6.19it/s]

Epoch: 4 Train Loss: 0.00041467505705826847 Validation Loss: 0.0001762576583220807 Validation Accuracy: 99.33%
Checkpoint reached! Validation loss modified from 0.0002464278722464135 to 0.0001762576583220807





In [21]:
checkpoint = {'checkpoint': model.state_dict()}
torch.save(checkpoint, os.path.join(PATH_SAVE_MODEL, f'checkpoint.pt'))

In [22]:
#import torch
# model.train()
# checkpoint = torch.load(os.path.join(PATH_SAVE_MODEL, f'fc_checkpoint_{4}.pt'))

# # Apply the state dictionary to the model
# model.load_state_dict(checkpoint['checkpoint'])

In [23]:
predictions = []
ids = []

for useless_id, ((images_batch, texts_batch), labels_batch) in tqdm(enumerate(test_dataloader)):
    model.eval()
    labels_batch = labels_batch.type(torch.LongTensor)
        
    # Move data to GPU
    #images_batch = images_batch.to('cuda')
    #texts_batch = texts_batch.to(device)
    labels_batch = labels_batch.to('cuda')

    labels_predictions = model(images_batch, texts_batch)


    _, predicted = torch.max(labels_predictions, 1)

    predictions.append(predicted.item())
    ids.append(test_data.ids[useless_id])

100it [00:02, 44.31it/s]


In [24]:
with open(os.path.join(PATH_SAVE_SUBMISSION, "submission.txt"),"w") as fout:
    print("[\n",end='',file=fout)
    
    idx = 0
    for (ID,pred) in zip(ids,predictions):
        idx += 1
        predName = None
        if pred==0:
            predName = 'non_propagandistic'
        else:
            predName = 'propagandistic'
            
        ID = "\"" + ID + "\""
        predName = "\"" + predName + "\""

        print("{\n\"id\":",end='',file=fout)
        print(f" {ID},\n",end='',file=fout)
        print("\"label\":",end='',file=fout)
        print(f" {predName}\n",end='',file=fout)
        
        
        if idx < len(predictions):
            print("},",file=fout)
        else:
            print("}",file=fout)
    print(']',file=fout)