In [1]:
!pip install pyvi underthesea



Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea_core-1

In [2]:
pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import random
import json
import logging
import warnings
from tqdm import tqdm, trange
import seaborn as sns
from PIL import Image
import emoji
import cv2
import numpy as np
import pandas as pd
import regex as re 
import time
import string
from pyvi import ViTokenizer
from underthesea import sent_tokenize
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader 
from transformers import (AutoTokenizer, AutoModel, AutoModelForImageClassification, AutoImageProcessor, BertPreTrainedModel, XLMRobertaModel, 
                          Trainer, TrainingArguments, EarlyStoppingCallback, pipeline, get_linear_schedule_with_warmup)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from sklearn import metrics

warnings.filterwarnings("ignore")
device = torch.device('cuda')

# Preprocessing

In [4]:
### STEP 1: REMOVE HTML
def remove_HTML(text):
    return re.sub(r'<[^>]*>', '', text)

### STEP 2: STANDARDIZE UNICODE
def convert_unicode(text):
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'
    charutf8 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'
    char1252 = char1252.split('|')
    charutf8 = charutf8.split('|')
    
    dic = {}
    for i in range(len(char1252)): dic[char1252[i]] = charutf8[i]
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dic[x.group()], text
    )

### STEP 3: NORMALIZE ACRONYM
replace_list = {
    'ô kêi': 'ok', 'okie': 'ok', 'o kê': 'ok', 'okey': 'ok', 'ôkê': 'ok', 'oki': 'ok', 'oke': 'ok', 'okay': 'ok', 'okê': 'ok',
    'tks': 'cảm ơn', 'thks': 'cảm ơn', 'thanks': 'cảm ơn', 'ths': 'cảm ơn', 'thank': 'cảm ơn',
    'kg': 'không', 'not': 'không', 'k': 'không', 'kh': 'không', 'kô': 'không', 'hok': 'không', 'ko': 'không', 'khong': 'không', 'kp': 'không phải',
    'he he': 'tích cực', 'hehe': 'tích cực', 'hihi': 'tích cực', 'haha': 'tích cực', 'hjhj': 'tích cực', 'thick': 'tích cực',
    'lol': 'tiêu cực', 'cc': 'tiêu cực', 'huhu': 'tiêu cực', 'cute': 'dễ thương',
     
    'sz': 'cỡ', 'size': 'cỡ', 
    'wa': 'quá', 'wá': 'quá', 'qá': 'quá', 
    'đx': 'được', 'dk': 'được', 'dc': 'được', 'đk': 'được', 'đc': 'được', 
    'vs': 'với', 'j': 'gì', '“': ' ', 'time': 'thời gian', 'm': 'mình', 'mik': 'mình', 'r': 'rồi', 'bjo': 'bao giờ', 'very': 'rất',

    'authentic': 'chuẩn chính hãng', 'aut': 'chuẩn chính hãng', 'auth': 'chuẩn chính hãng', 'date': 'hạn sử dụng', 'hsd': 'hạn sử dụng', 
    'store': 'cửa hàng', 'sop': 'cửa hàng', 'shopE': 'cửa hàng', 'shop': 'cửa hàng', 
    'sp': 'sản phẩm', 'product': 'sản phẩm', 'hàg': 'hàng', 
    'ship': 'giao hàng', 'delivery': 'giao hàng', 'síp': 'giao hàng', 'order': 'đặt hàng',

    'gud': 'tốt', 'wel done': 'tốt', 'good': 'tốt', 'gút': 'tốt', 'tot': 'tốt', 'nice': 'tốt', 'perfect': 'rất tốt', 
    'quality': 'chất lượng', 'chất lg': 'chất lượng', 'chat': 'chất', 'excelent': 'hoàn hảo', 'bt': 'bình thường',
    'sad': 'tệ', 'por': 'tệ', 'poor': 'tệ', 'bad': 'tệ', 
    'beautiful': 'đẹp tuyệt vời', 'dep': 'đẹp', 
    'xau': 'xấu', 'sấu': 'xấu', 
     
    'thik': 'thích', 'iu': 'yêu', 'fake': 'giả mạo', 
    'quickly': 'nhanh', 'quick': 'nhanh', 'fast': 'nhanh',
    'fresh': 'tươi', 'delicious': 'ngon',

    'dt': 'điện thoại', 'fb': 'facebook', 'face': 'facebook', 'ks': 'khách sạn', 'nv': 'nhân viên',
    'nt': 'nhắn tin', 'ib': 'nhắn tin', 'tl': 'trả lời', 'trl': 'trả lời', 'rep': 'trả lời',
    'fback': 'feedback', 'fedback': 'feedback',
    'sd': 'sử dụng', 'sài': 'xài', 

    '^_^': 'tích cực', ':)': 'tích cực', ':(': 'tiêu cực',
    '❤️': 'tích cực', '👍': 'tích cực', '🎉': 'tích cực', '😀': 'tích cực', '😍': 'tích cực', '😂': 'tích cực', '🤗': 'tích cực', '😙': 'tích cực', '🙂': 'tích cực', 
    '😔': 'tiêu cực', '😓': 'tiêu cực', 
    '⭐': 'star', '*': 'star', '🌟': 'star',
}


def normalize_acronym(text):
    words = []
    for word in text.strip().split():
        word = word.strip(string.punctuation)
        if word.lower() not in replace_list.keys():
            words.append(word)
        else:
            words.append(replace_list[word.lower()])
    return emoji.demojize(' '.join(words)) # replace emoji with text

def preprocess(text):
    text = remove_HTML(text)
    text = convert_unicode(text)
    text = normalize_acronym(text)
    return text

def split_chunk_new_data(df, max_len=240):
    new_data = []
    
    for i in df.index:
        caption = df.at[i, 'caption']
        image = df.at[i, 'image']
        label = df.at[i, 'label']
        
        if len(caption.split()) <= max_len:
            new_data.append({"caption": caption, "image": image, "label": label})
        else:
            sentences = sent_tokenize(caption)
            token_sentences = [s.split() for s in sentences]

            tmp_context_token = []
            tmp_context = []
            
            for idx in range(len(sentences)):
                check = True
                if len(tmp_context_token) + len(token_sentences[idx]) <= max_len:
                    tmp_context_token += token_sentences[idx]
                    tmp_context.append(sentences[idx])
                    check = False

                if len(tmp_context_token) + len(token_sentences[idx]) > max_len or idx == len(sentences) - 1:
                    context_sub = ' '.join(tmp_context).strip()
                    if len(context_sub) > 0:
                        new_data.append({"caption": context_sub, "image": image, "label": label})
                    
                    if check:
                        tmp_context_token = token_sentences[idx]
                        tmp_context = [sentences[idx]]
                    else:
                        tmp_context_token = []
                        tmp_context = []
    
    return pd.DataFrame(new_data)
        


In [None]:
import os
import random
import torch
from torch.utils.data import Dataset
from PIL import Image
import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2
from deep_translator import GoogleTranslator
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')
nltk.download('omw-1.4')

# Augmentation for image
image_transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.GaussNoise(p=0.2),
    A.Rotate(limit=20, p=0.5),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5),
    A.Resize(224, 224),
    ToTensorV2()
])

def augment_image(image_path):
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    augmented = image_transform(image=image)['image']
    return augmented

# Augmentation for text
def back_translate(text, src="en", mid="fr", tgt="en"):
    translated = GoogleTranslator(source=src, target=mid).translate(text)
    back_translated = GoogleTranslator(source=mid, target=tgt).translate(translated)
    return back_translated

def synonym_replace(sentence):
    words = sentence.split()
    new_words = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            new_word = synonyms[0].lemmas()[0].name()
        else:
            new_word = word
        new_words.append(new_word)
    return " ".join(new_words)

def augment_text(text):
    if random.random() > 0.5:
        return back_translate(text)  
    else:
        return synonym_replace(text)  


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


# Dataset

In [None]:
class Dataset(Dataset):
    def __init__(self, df, image_train_path, tokenizer_text, processor, max_len=256, augment=False):
        """
        Dataset cho bài toán Sarcasm Detection.
        Nếu augment=True, sẽ thực hiện tăng cường dữ liệu cho image-sarcasm và text-sarcasm.
        """
        self.df = df
        self.max_len = max_len
        self.image_train_path = image_train_path
        self.tokenizer_text = tokenizer_text
        self.processor = processor
        self.augment = augment 

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        caption, image, label = self.get_input_data(row)

        text_encoding = self.tokenizer_text.encode_plus(
            caption,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt', 
        )
        processed_image = self.processor(image, return_tensors="pt")["pixel_values"]

        return {
            'input_ids': text_encoding['input_ids'].flatten(),
            'attention_masks': text_encoding['attention_mask'].flatten(),
            'inputs_image': processed_image.squeeze(0),
            'targets': torch.tensor(label, dtype=torch.long),
        }

    def labelencoder(self, text):
        if text == 'not-sarcasm':
            return 0
        elif text == "multi-sarcasm":
            return 1
        elif text == "image-sarcasm":
            return 2
        else:
            return 3
    
    def image_loader(self, image_path):
        return Image.open(image_path).convert("RGB")

    def get_input_data(self, row):
        caption = row['caption']
        label = self.labelencoder(row['label'])
        image_path = os.path.join(self.image_train_path, row["image"])
        image = self.image_loader(image_path)

        if self.augment:
            if row['label'] == "image-sarcasm":
                image = augment_image(image_path)  
            elif row['label'] == "text-sarcasm":
                caption = augment_text(caption)  
        
        return str(caption), image, label

In [7]:
class Adapter(nn.Module):
    def __init__(self, dim=512, rank=8):
        super(Adapter, self).__init__()
        self.adapter_down = nn.Linear(dim, rank)
        self.adapter_up = nn.Linear(rank, dim)
        self.adapter_mid = nn.Linear(rank, rank)
        self.drop = nn.Dropout(0.1)

    def forward(self, x):
        x_down = self.adapter_down(x)
        x_down = self.adapter_mid(x_down)
        x_down = self.drop(x_down)
        x_up = self.adapter_up(x_down)
        return x_up


class MoBA(nn.Module):
    def __init__(self, num_experts=8, dim=512):
        super(MoBA, self).__init__()
        self.num_experts = num_experts
        self.drop = nn.Dropout(0.1)
        self.experts = nn.ModuleList([Adapter(dim, rank=8) for _ in range(num_experts)])
        self.gate = nn.Linear(dim, num_experts)

    def forward(self, x):
        gating_scores = self.gate(x)
        gating_weights = F.softmax(gating_scores, dim=-1)   # [b, n, num]
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=-1)  # [b, n, d, num]
        output = torch.sum(gating_weights.unsqueeze(2) * expert_outputs, dim=-1)
        return output


class EncoderLayer(nn.Module):
    def __init__(self, num_experts):
        super(EncoderLayer, self).__init__()
        self.dropout = nn.Dropout(0.1)
        hidden_size = 768
        self.norm = nn.LayerNorm(hidden_size)
        self.attn_text = nn.MultiheadAttention(hidden_size, 8, batch_first=True)
        self.attn_image = nn.MultiheadAttention(hidden_size, 8, batch_first=True)
        self.activation = nn.ReLU()
        self.linear_text1 = nn.Linear(hidden_size, hidden_size)
        self.linear_text2 = nn.Linear(hidden_size, hidden_size)
        self.linear_image1 = nn.Linear(hidden_size, hidden_size)
        self.linear_image2 = nn.Linear(hidden_size, hidden_size)
        self.linear_ocr1 = nn.Linear(hidden_size, hidden_size)
        self.linear_ocr2 = nn.Linear(hidden_size, hidden_size)

        self.adapter_text1 = MoBA(num_experts, hidden_size)
        self.adapter_text2 = MoBA(num_experts, hidden_size)
        self.adapter_image1 = MoBA(num_experts, hidden_size)
        self.adapter_image2 = MoBA(num_experts, hidden_size)

    def forward(self, text, image):
        text_norm = self.norm(text)
        t_att, _ = self.attn_text(text_norm, text_norm, text_norm)
        t_att = self.dropout(t_att)

        image_norm = self.norm(image)
        v_att, _ = self.attn_image(image_norm, image_norm, image_norm)
        v_att = self.dropout(v_att)
        # pdb.set_trace()
        text_out = text + t_att + self.adapter_text1(image)
        image_out = image + v_att + self.adapter_image1(text)

        text_norm2 = self.norm(text_out)
        t_tmp = self.linear_text1(self.dropout(self.activation(self.linear_text2(self.norm(text_norm2)))))
        text_embeds = text_out + self.dropout(t_tmp)

        image_norm2 = self.norm(image_out)
        i_tmp = self.linear_image1(self.dropout(self.activation(self.linear_image2(self.norm(image_norm2)))))
        image_embeds = image_out + self.dropout(i_tmp)

        text_embeds = self.adapter_text2(image_out) + text_embeds
        image_embeds = self.adapter_image2(text_out) + image_embeds
        return text_embeds, image_embeds


class Encoder(nn.Module):
    def __init__(self, num_experts, n_layers):
        super(Encoder, self).__init__()
        self.encoders = nn.ModuleList([
            EncoderLayer(num_experts) for _ in range(n_layers)
        ])

    def forward(self, text, image):
        for layer in self.encoders:
            text, image = layer(text, image)
        return text, image


class Model(nn.Module):
    def __init__(self, args, model_text, model_image):
        super(Model, self).__init__()
        self.model_text = model_text
        self.model_image = model_image
         
        self.text_linear = nn.Linear(1024, 768)

        self.trans = EncoderLayer(args.num_experts)
        self.encoder = Encoder(args.num_experts, args.layers)
        self.classifier_fuse = nn.Linear(args.text_size, args.label_number)
        self.weights = torch.tensor(args.alpha, dtype=torch.float)
        self.loss_fct = nn.CrossEntropyLoss(weight=self.weights)
        self.att = nn.Linear(args.text_size, 1)

        layers_to_freeze = [self.model_text, self.model_image, self.att, self.classifier_fuse]
        for layer in layers_to_freeze:
            for param in layer.parameters():
                param.requires_grad = False

        for name, module in self.trans.named_children():
            if 'adapter' in name or 'attn' in name:
                for param in module.parameters():
                    param.requires_grad = True
            else:
                for param in module.parameters():
                    param.requires_grad = False

        for encoder_layer in self.encoder.encoders:
            for name, module in encoder_layer.named_children():
                if 'adapter' in name:
                    for param in module.parameters():
                        param.requires_grad = True
                else:
                    for param in module.parameters():
                        param.requires_grad = False

    def forward(self, input_ids, attention_mask, inputs_image, labels=None):
        text_embeds = self.model_text(input_ids, attention_mask, output_hidden_states=True).hidden_states[-1]
        image_embeds = self.model_image(inputs_image, output_hidden_states=True).hidden_states[-1]

        text_embeds = self.text_linear(text_embeds)

        seq_len_text = text_embeds.shape[1]
        seq_len_image = image_embeds.shape[1]

        if seq_len_image > seq_len_text:
            image_embeds = image_embeds[:, :seq_len_text, :]
        elif seq_len_text > seq_len_image:
            padding_size = seq_len_text - seq_len_image
            image_embeds = F.pad(image_embeds, (0, 0, 0, padding_size))

        text_embeds, image_embeds = self.trans(text_embeds, image_embeds)
        text_embeds, image_embeds = self.encoder(text_embeds, image_embeds)

        # Fusion
        text_feature = torch.mean(text_embeds, dim=1)
        image_feature = torch.mean(image_embeds, dim=1)
        text_weight = self.att(text_feature)
        image_weight = self.att(image_feature)
        att = nn.functional.softmax(torch.stack((text_weight, image_weight), dim=-1), dim=-1)
        tw, iw = att.split([1, 1], dim=-1)
        fuse_feature = tw.squeeze(1) * text_feature + iw.squeeze(1) * image_feature

        # Classifier
        logits_fuse = self.classifier_fuse(fuse_feature)
        fuse_score = nn.functional.softmax(logits_fuse, dim=-1)
        outputs = (fuse_score, )

        if labels is not None:
            loss = self.loss_fct(logits_fuse, labels)
            outputs = (loss,) + outputs

        return outputs


In [8]:
class Config:
    def __init__(self):
        # Device configuration
        self.device = 'cuda'  # Device number
        self.model_text = "microsoft/infoxlm-large"
        self.model_image = "google/vit-base-patch16-384"

        # Training and validation configuration
        self.epochs = 10
        self.num_workers = 2
        self.patience = 5 
        self.batch_size = 32
        self.accumulation_steps = 1
        self.lr = 2e-4
        self.max_chunk_len = 240

        # Model architecture configuration
        self.hidden_size = 768
        self.num_heads = 8
        self.text_size = 768  
        self.image_size = 768  
        self.adam_epsilon = 1e-8  
        self.layers = 3
        self.num_experts = 16

        # Loss 
        self.epsilon = 0.1
        self.gamma = 2.0
        self.alpha = [0.1, 0.2, 0.3, 0.4 ]  #fix
        self.label_number = len(self.alpha)
        self.reduction = "mean"
        self.ignore_index = -100
        self.smoothing_weight = 0.3
        self.device = "cuda"  # Hoặc 'cpu'
        self.dtype = torch.float32
        


config = Config()

In [9]:
from sklearn.model_selection import train_test_split

image_train_path = "/kaggle/input/fulldata-uit-2024/Traning_set/Traning_set/train-images/"
train = pd.read_json("/kaggle/input/fulldata-uit-2024/Traning_set/Traning_set/vimmsd-train.json").T
train = split_chunk_new_data(train, max_len = config.max_chunk_len)
train["id"] = train.index 
train['caption'] = train['caption'].apply(preprocess)
train['caption'] = train['caption'].apply(preprocess)
train_df, dev_df = train_test_split(train, test_size=0.1, stratify=train['label'], random_state=42)
print(train_df.label.value_counts())
print(dev_df.label.value_counts())

label
not-sarcasm      5864
multi-sarcasm    3850
image-sarcasm     409
text-sarcasm       72
Name: count, dtype: int64
label
not-sarcasm      652
multi-sarcasm    428
image-sarcasm     45
text-sarcasm       8
Name: count, dtype: int64


In [10]:
def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [11]:
processor = AutoImageProcessor.from_pretrained(config.model_image, use_fast=True)
model_image = AutoModelForImageClassification.from_pretrained(config.model_image)
tokenizer = AutoTokenizer.from_pretrained(config.model_text)
model_text = XLMRobertaModel.from_pretrained(config.model_text)
# model_ocr = (config.model_ocr)

model = Model(config, model_text, model_image).to(device)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Số lượng tham số có thể huấn luyện: {total_params}")

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/347M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/513 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Số lượng tham số có thể huấn luyện: 9071616


In [12]:
train_dataset = Dataset(train_df, image_train_path, tokenizer, processor)
dev_dataset = Dataset(dev_df, image_train_path, tokenizer, processor) 

In [13]:
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers)
dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers)

optimizer = AdamW(model.parameters(), lr=config.lr)

lr_scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=0, 
            num_training_steps=len(train_loader)*config.epochs
        )

info_epoch = {}
logs = []
best_acc = 0
cnt = 0
i = 0

total_time = time.time()
for epoch in range(config.epochs):
    set_seed(42)
    info_epoch[epoch] = {}
    print(f'Epoch {epoch+1}/{config.epochs}')
    print('-'*30)
    start_training_time = time.time()

    model.train()
    train_losses = []
    true_labels = []
    predicted_labels = []

    for data in tqdm(train_loader):
        y_true = data['targets'].to(device)
        input_ids = data['input_ids'].to(device)
        inputs_image = data['inputs_image'].to(device)
        attention_mask = data['attention_masks'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, inputs_image=inputs_image, labels=y_true)
        loss = outputs[0]
        logits = outputs[1] 
        train_losses.append(loss.item())

        loss.backward()
        if (i + 1) % config.accumulation_steps == 0:
            optimizer.step()
            lr_scheduler.step() 
            optimizer.zero_grad()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        i += 1

        _, pred = torch.max(logits, dim=1)
        true_labels.extend(y_true.cpu().numpy())
        predicted_labels.extend(pred.cpu().numpy())

    train_f1 = f1_score(true_labels, predicted_labels, average='macro')
    train_acc = accuracy_score(true_labels, predicted_labels)
    epoch_training_time = time.time() - start_training_time
    print(f'Training time: {epoch_training_time:.2f}s Train Loss: {np.mean(train_losses):.4f} F1: {train_f1:.4f} Acc: {train_acc:.4f}')

    model.eval()
    eval_losses = []
    y_true_list = []
    y_pred_list = []

    start_eval_time = time.time()
    with torch.no_grad():
        for data in tqdm(dev_loader):
            y_true = data['targets'].to(device)
            input_ids = data['input_ids'].to(device)
            inputs_image = data['inputs_image'].to(device)
            attention_mask = data['attention_masks'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, inputs_image=inputs_image, labels=y_true)
            loss = outputs[0]
            logits = outputs[1]
            eval_losses.append(loss.item())

            _, pred = torch.max(logits, dim=1)
            y_true_list.extend(y_true.cpu().numpy())
            y_pred_list.extend(pred.cpu().numpy())

    dev_f1 = f1_score(y_true_list, y_pred_list, average='macro')
    dev_acc = accuracy_score(y_true_list, y_pred_list)
    epoch_eval_time = time.time() - start_eval_time

    print(f'Dev time: {epoch_eval_time}s Dev Loss: {np.mean(eval_losses):.4f} F1: {dev_f1:.4f} Acc: {dev_acc:.4f}')

    info_epoch[epoch] = {
        "time_train": epoch_training_time,
        "epoch": epoch,
        "train_loss": np.mean(train_losses),
        "train_acc": train_acc,
        "f1-train": train_f1,
        "time_val": epoch_eval_time,
        "val_acc": dev_acc,
        "val_loss": np.mean(eval_losses),
        "f1-val": dev_f1
    }

    if dev_f1 > best_acc:
        cnt = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print(f'Saved best_model at epoch {epoch+1}')
        best_acc = dev_f1
    else:
        cnt += 1

    if cnt >= config.patience:
        print('Early stopping')
        break

    torch.cuda.empty_cache()

Epoch 1/10
------------------------------


100%|██████████| 319/319 [17:15<00:00,  3.25s/it]


Training time: 1035.23s Train Loss: 1.0522 F1: 0.3631 Acc: 0.6058


100%|██████████| 36/36 [01:20<00:00,  2.25s/it]


Dev time: 80.92314648628235s Dev Loss: 0.9233 F1: 0.3750 Acc: 0.6452
Saved best_model at epoch 1
Epoch 2/10
------------------------------


100%|██████████| 319/319 [17:15<00:00,  3.25s/it]


Training time: 1035.39s Train Loss: 0.6030 F1: 0.6376 Acc: 0.7498


100%|██████████| 36/36 [01:20<00:00,  2.25s/it]


Dev time: 80.91847157478333s Dev Loss: 0.9860 F1: 0.3713 Acc: 0.6381
Epoch 3/10
------------------------------


100%|██████████| 319/319 [17:16<00:00,  3.25s/it]


Training time: 1036.39s Train Loss: 0.4187 F1: 0.8046 Acc: 0.8247


100%|██████████| 36/36 [01:21<00:00,  2.25s/it]


Dev time: 81.11187720298767s Dev Loss: 1.0661 F1: 0.3940 Acc: 0.6752
Saved best_model at epoch 3
Epoch 4/10
------------------------------


100%|██████████| 319/319 [17:16<00:00,  3.25s/it]


Training time: 1036.11s Train Loss: 0.2738 F1: 0.8736 Acc: 0.8855


100%|██████████| 36/36 [01:20<00:00,  2.25s/it]


Dev time: 80.8390645980835s Dev Loss: 1.2068 F1: 0.3961 Acc: 0.6611
Saved best_model at epoch 4
Epoch 5/10
------------------------------


100%|██████████| 319/319 [17:16<00:00,  3.25s/it]


Training time: 1036.58s Train Loss: 0.1928 F1: 0.9201 Acc: 0.9232


100%|██████████| 36/36 [01:20<00:00,  2.25s/it]


Dev time: 80.95667839050293s Dev Loss: 1.4630 F1: 0.3840 Acc: 0.6549
Epoch 6/10
------------------------------


100%|██████████| 319/319 [17:16<00:00,  3.25s/it]


Training time: 1036.68s Train Loss: 0.1081 F1: 0.9689 Acc: 0.9596


100%|██████████| 36/36 [01:20<00:00,  2.25s/it]


Dev time: 80.84852385520935s Dev Loss: 1.7732 F1: 0.3416 Acc: 0.5975
Epoch 7/10
------------------------------


100%|██████████| 319/319 [17:16<00:00,  3.25s/it]


Training time: 1036.48s Train Loss: 0.0659 F1: 0.9854 Acc: 0.9757


100%|██████████| 36/36 [01:20<00:00,  2.24s/it]


Dev time: 80.81410598754883s Dev Loss: 1.7195 F1: 0.3675 Acc: 0.6267
Epoch 8/10
------------------------------


100%|██████████| 319/319 [17:16<00:00,  3.25s/it]


Training time: 1036.25s Train Loss: 0.0332 F1: 0.9951 Acc: 0.9911


100%|██████████| 36/36 [01:20<00:00,  2.24s/it]


Dev time: 80.80794644355774s Dev Loss: 1.7662 F1: 0.3762 Acc: 0.6381
Epoch 9/10
------------------------------


100%|██████████| 319/319 [17:16<00:00,  3.25s/it]


Training time: 1036.10s Train Loss: 0.0157 F1: 0.9989 Acc: 0.9980


100%|██████████| 36/36 [01:20<00:00,  2.24s/it]

Dev time: 80.8174729347229s Dev Loss: 1.9615 F1: 0.3777 Acc: 0.6540
Early stopping





In [14]:
test = pd.read_json("/kaggle/input/fulldata-uit-2024/Private_Test/Private_Test/vimmsd-private-test.json").T
test['caption'] = test['caption'].apply(preprocess)
test['caption'] = test['caption'].apply(preprocess)
test.head(3)

Unnamed: 0,image,caption,label
0,066d6021fdfeaf39f1dec523879e8fe4d35e877abcea44...,Song Joong Ki amp Song Hye Kyo đều tham dự Bae...,
1,555f4787d4df49e7be743b3d5b77c90755f0d6c351f36b...,Song Joong Ki amp Song Hye Kyo đều tham dự Bae...,
2,7b7cdea2cde1f3f93371259b587a03f2e8c0af682b4d51...,Song Joong Ki amp Song Hye Kyo đều tham dự Bae...,


In [15]:
def predict_sarcasm(model, tokenizer, processor, device, caption, image):
    if not hasattr(model, "loaded"):
        model.load_state_dict(torch.load('best_model.pth', map_location=device))
        model.to(device)
        model.eval()
        model.loaded = True  

    text_encoding = tokenizer.encode_plus(
        caption,
        truncation=True,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt', 
    )
    input_ids = text_encoding['input_ids'].to(device)
    attention_mask = text_encoding['attention_mask'].to(device)

    processed_image = processor(image, return_tensors="pt")["pixel_values"]
    inputs_image = processed_image.to(device)  

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, inputs_image=inputs_image)
    if isinstance(outputs, tuple):
        outputs = outputs[0]
    pred_prob, pred = torch.max(outputs, dim=1)
    return {"label": pred.item(), "probability": pred_prob.item()}

def split_chunk(caption, max_len):
    sentences = sent_tokenize(caption)
    token_sentences = [s.split() for s in sentences]

    tmp_context_token = []
    tmp_context = []
    sub_caption = []
    for idx in range(len(sentences)):
        check = True
        if len(tmp_context_token) + len(token_sentences[idx]) <= max_len:
            tmp_context_token += token_sentences[idx]
            tmp_context.append(sentences[idx])
            check = False

        if len(tmp_context_token) + len(token_sentences[idx]) > max_len or idx == len(sentences) - 1:
            context_sub = ' '.join(tmp_context).strip()
            if len(context_sub) > 0:
                sub_caption.append(context_sub)
            
            if check:
                tmp_context_token = token_sentences[idx]
                tmp_context = [sentences[idx]]
            else:
                tmp_context_token = []
                tmp_context = []

    if not sub_caption:
        words = caption.split()
        i = 0
        while i < len(words):
            chunk = words[i:i + max_len]
            if i > 0: 
                chunk = words[i - 3:i] + chunk
            sub_caption.append(' '.join(chunk))
            i += max_len
    return sub_caption

In [16]:
import zipfile
import sys

decode={
    0:'not-sarcasm',
    1:"multi-sarcasm",
    2:"image-sarcasm",
    3:"text-sarcasm"
}
image_test_path = "/kaggle/input/fulldata-uit-2024/Private_Test/Private_Test/test-images/"
results = {}
for i in tqdm(test.index):
    text = test.caption[i]
    image = image_test_path+test.image[i]
    image = Image.open(image).convert("RGB")
    # image = cv2.imread(image)
    predictions = [] 
    
    if len(text.split()) <= config.max_chunk_len:
        result = predict_sarcasm(model, tokenizer, processor, device, text, image)
        if result:  
            predictions.append(result)
    else:
        sub_captions = split_chunk(text, config.max_chunk_len)
        if not sub_captions:  
            print(text)
            print(f"Lỗi: Không có sub-caption nào cho mẫu {i}")
            continue 
        
        for sub in sub_captions:
            result = predict_sarcasm(model, tokenizer, processor, device, sub, image)
            if result:  
                predictions.append(result)

    if predictions:   
        best_prediction = max(predictions, key=lambda x: x["probability"])
        decoded_label = decode[best_prediction["label"]]
        results[str(i)] = decoded_label
    else:
        print(f"Cảnh báo: Không có dự đoán nào cho mẫu {i}")

res_df = pd.DataFrame.from_dict(results, orient="index", columns=["label"])
print(res_df.label.value_counts())
output = {
    "results": results,
    "phase": 'test'
}

with open('/kaggle/working/results.json', 'w') as f:
    json.dump(output, f, indent=2)
    
with zipfile.ZipFile('results.zip', 'w') as zipf:
    zipf.write('results.json')

100%|██████████| 1504/1504 [02:59<00:00,  8.37it/s]

label
multi-sarcasm    934
not-sarcasm      460
image-sarcasm    104
text-sarcasm       6
Name: count, dtype: int64



