In [1]:
import sys
from pathlib import Path
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import BertModel, ViTModel, BertTokenizer, ViTImageProcessor, Trainer, TrainingArguments, EvalPrediction
from PIL import Image, UnidentifiedImageError, ImageFile

from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler

ImageFile.LOAD_TRUNCATED_IMAGES = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


DATASET_PATH = Path('data/MEC/')
MODEL_OUTPUT_DIR = Path('data/results')
DATASET_CACHE = Path('data/cache')
TEXT_COLUMN = 'text'
IMAGE_PATH_COLUMN = 'image_path'
TARGET_COLUMN = 'formal_register'
SEED = 42
N_EPOCHS = 30
BATCH_SIZE = 32


DATASET = DATASET_PATH / 'mec-dataset.csv'

def dataset_cache(stage: str):
    return str(DATASET_CACHE / stage)

Path(DATASET_CACHE).mkdir(parents=True, exist_ok=True)
Path(DATASET_CACHE).mkdir(parents=True, exist_ok=True)
print(sys.version)

3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]


## Prepare Dataset

In [2]:
df = pd.read_csv(DATASET, index_col=0)
df[['cohesion', 'thematic_coherence', 'formal_register', 'text_typology']] = 'Nível ' + df[['cohesion', 'thematic_coherence', 'formal_register', 'text_typology']].astype(str)
df[IMAGE_PATH_COLUMN] =  df[IMAGE_PATH_COLUMN].apply(DATASET_PATH.joinpath).astype(str)
df = df.convert_dtypes()
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
Index: 1188 entries, 0 to 1187
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   text                  1188 non-null   string
 1   motivating_situation  1188 non-null   string
 2   image_url             1188 non-null   string
 3   image_path            1188 non-null   string
 4   cohesion              1188 non-null   string
 5   thematic_coherence    1188 non-null   string
 6   formal_register       1188 non-null   string
 7   text_typology         1188 non-null   string
dtypes: string(8)
memory usage: 83.5 KB


Unnamed: 0,text,motivating_situation,image_url,image_path,cohesion,thematic_coherence,formal_register,text_typology
0,( O chorrinho nino ) - Eu est...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,data/MEC/MEC/Rc7dMxTP7ZdLNEvmF0jo/iet1QFw2ARNk...,Nível 3,Nível 3,Nível 3,Nível 4
1,.As meninas do potes de Tintas [T] Uma vez eu ...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,data/MEC/MEC/Rc7dMxTP7ZdLNEvmF0jo/F80gTOBoh2Lk...,Nível 3,Nível 3,Nível 3,Nível 4


In [3]:
class_names = sorted(df[TARGET_COLUMN].unique().tolist())

class_label = ClassLabel(num_classes=len(class_names), names=class_names)
class_label

ClassLabel(names=['Nível 1', 'Nível 2', 'Nível 3', 'Nível 4', 'Nível 5'], id=None)

## Training Functions

In [4]:
df[IMAGE_PATH_COLUMN].apply(Path).apply(Path.exists).all()

np.True_

In [5]:
class ClassfierHead(nn.Module):
    def __init__(self, input_size: int, hidden_size = 256, num_classes = 5):
        super().__init__()
        self.fn = nn.Linear(input_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, combined_last_hidden_state, labels=None):
        result = self.fn(combined_last_hidden_state)
        result = torch.relu_(result)
        logits = self.classifier(result)

        if labels is not None:
            loss = nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        
        return {'logits': logits}


BERT_MODEL_NAME = 'neuralmind/bert-base-portuguese-cased'
VIT_MODEL_NAME = 'google/vit-base-patch16-224'

bert_model: BertModel = BertModel.from_pretrained(BERT_MODEL_NAME).requires_grad_(False)
vit_model: ViTModel = ViTModel.from_pretrained(VIT_MODEL_NAME).requires_grad_(False)

tokenizer: BertTokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
processor: ViTImageProcessor = ViTImageProcessor.from_pretrained(VIT_MODEL_NAME)

# Instanciar o cabeça de classificação
model = ClassfierHead(bert_model.config.hidden_size + vit_model.config.hidden_size)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def is_image_valid(image_path):
    try:
        with Image.open(image_path) as img:
            img.verify()
        return True
    except (IOError, UnidentifiedImageError):
        return False

def preprocess_text_and_image(input_ids: torch.Tensor, attention_mask: torch.Tensor, pixel_values: torch.Tensor):
    # Processar o texto usando o BERT

    bert_model.eval().cuda()
    vit_model.eval().cuda()

    with torch.no_grad():
        bert_outputs = bert_model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
        bert_pooled_output = bert_outputs.last_hidden_state[:, 0, :].detach().cpu()
    
        # Processar a imagem usando o ViT
        vit_outputs = vit_model(pixel_values=pixel_values.cuda())
        vit_pooled_output = vit_outputs.last_hidden_state[:, 0, :].detach().cpu()
    
        # Concatenar as saídas
        combined = torch.cat((bert_pooled_output, vit_pooled_output), dim=1)

    return {
        'combined_last_hidden_state': combined
    }


def prepare_dataset(
    df: pd.DataFrame,
    train_indexes: list[int],
    test_indexes: list[int],
    processor: ViTImageProcessor,
    tokenizer: BertTokenizer,
    text_column: str = 'text',
    image_path_column: str = 'image_path',
    target: str = 'formal_register',
    balanced: bool = True,
) -> DatasetDict:
    # Processing dataset e and storing in file cache
    COLUMNS = 'cohesion', 'thematic_coherence', 'formal_register', 'text_typology'
    original_dataset = (Dataset
        .from_pandas(df)
        .select_columns([text_column, image_path_column, *COLUMNS])
        .filter(is_image_valid, input_columns=[image_path_column])
        .map(lambda path: {'pixel_values': Image.open(path)}, input_columns=[image_path_column])
        .map(
            lambda pixel_values: processor(pixel_values, return_tensors='pt'),
            input_columns=['pixel_values'],
            batched=True,
            batch_size=8,
            num_proc=4,
            cache_file_name=dataset_cache('process-image.arrow'),
        )
        .map(
            lambda text: tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=512,
            ),
            batched=True,
            input_columns=[text_column],
            cache_file_name=dataset_cache('process-text.arrow'),
        )
        .with_format('pt')
        .map(
            preprocess_text_and_image,
            batched=True,
            batch_size=64,
            input_columns=['input_ids', 'attention_mask', 'pixel_values'],
            cache_file_name=dataset_cache('process-model-input.arrow'),
        )
        .remove_columns([image_path_column])
        .rename_column(target, 'labels')
        .cast_column('labels', class_label)
    )

    # Balaced indexes
    if balanced:
        train_indexes = RandomOverSampler(random_state=SEED)\
            .fit_resample(train_indexes[None].T, df.loc[train_indexes, target])[0].flatten()

    # Creating dataset split
    dataset = DatasetDict(
        train=original_dataset.select(train_indexes),
        test=original_dataset.select(test_indexes)
    )
    # dataset.set_format('pt', ['input_ids', 'attention_mask', 'pixel_values', 'combined_last_hidden_state', 'labels'], output_all_columns=True)
    return dataset


def compute_metrics(eval_preds: EvalPrediction, compute_result=False, *, PREDS: list = [], LABELS: list = []):
    labels = eval_preds.label_ids
    preds = eval_preds.predictions.argmax(-1)

    PREDS.append(preds)
    LABELS.append(labels)

    if compute_result:
        preds = torch.concat(PREDS).numpy(force=True)
        labels = torch.concat(LABELS).numpy(force=True)

        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)

        PREDS, LABELS = [], []

        return {"accuracy": acc, "f1": f1}


def evaluate_model(model, dataset: DatasetDict, device):
    # Get model predictions and ground truth
    model.eval()
    model.to(device)
    torch.cuda.empty_cache()

    with torch.no_grad():
      batches = (batch['pixel_values'] for batch in dataset['test'].iter(BATCH_SIZE))
      logits = [model(batch.to(device)).logits for batch in batches]
      logits = torch.concat(logits, 0)

    y_pred = torch.argmax(logits, dim=-1).cpu().numpy()
    y_true = dataset["test"]["labels"].cpu().numpy()

    torch.cuda.empty_cache()

    # Evaluate model
    return dict(
        report = classification_report(y_true, y_pred),
        accuracy = accuracy_score(y_true, y_pred),
        weighted_precision = precision_score(y_true, y_pred, average="weighted"),
        weighted_recall = recall_score(y_true, y_pred, average="weighted"),
        weighted_f1 = f1_score(y_true, y_pred, average="weighted"),
        macro_precision = precision_score(y_true, y_pred, average="macro"),
        macro_recall = recall_score(y_true, y_pred, average="macro"),
        macro_f1 = f1_score(y_true, y_pred, average="macro"),
    )


In [7]:
# train_indexes, test_indexes = next(StratifiedShuffleSplit(random_state=SEED, test_size=0.2).split(df, df[TARGET_COLUMN]))
# dataset = prepare_dataset(df, train_indexes, test_indexes, processor, tokenizer)]
# dataset

In [8]:
def train_model(target: str, balanced: bool = True, test_size: float = 0.2):
    train_indexes, test_indexes = next(StratifiedShuffleSplit(random_state=SEED, test_size=test_size).split(df, df[TARGET_COLUMN]))

    dataset = prepare_dataset(df, train_indexes, test_indexes, processor, tokenizer, target=target, balanced=balanced)
    OUTPUT_DIR = f'results/training/{target}/'

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=32,
        num_train_epochs=20,
        batch_eval_metrics=True,
        dataloader_num_workers=8,
        data_seed=SEED,
        eval_strategy="steps",
        eval_steps=50,
        save_steps=200,
        logging_dir=OUTPUT_DIR + "logging",
        logging_steps=25,
        fp16=True,
        learning_rate=2e-4,
        save_total_limit=2,
        use_cpu=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        compute_metrics=compute_metrics,
    )

    results = trainer.train()
    return results

In [9]:

def cross_validate(target: str, balanced: bool = True):
    training_results = []

    for fold, (train_indexes, test_indexes) in enumerate(StratifiedKFold(random_state=SEED, n_splits=5, shuffle=True).split(df, df[TARGET_COLUMN]), start=1):
    
        dataset = prepare_dataset(df, train_indexes, test_indexes, processor, tokenizer, target=target, balanced=balanced)
        input_size = dataset['test'][0]['combined_last_hidden_state'].size()[0]

        model = ClassfierHead(input_size)

        OUTPUT_DIR = f'results/cross_validation/{target}/fold_{fold}/'

        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            per_device_train_batch_size=32,
            num_train_epochs=20,
            batch_eval_metrics=True,
            dataloader_num_workers=8,
            data_seed=SEED,
            eval_strategy="steps",
            eval_steps=50,
            save_steps=200,
            logging_dir=OUTPUT_DIR + "logging",
            logging_steps=25,
            fp16=True,
            learning_rate=2e-4,
            save_total_limit=2,
            use_cpu=False,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            compute_metrics=compute_metrics,
        )

        result = trainer.train()

        training_results.append(result)

    return training_results

In [10]:
cross_validate('formal_register')
cross_validate('thematic_coherence')
cross_validate('text_typology')
cross_validate('cohesion')

Filter:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1188 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1
50,1.098,1.178771,0.563025,0.587992
100,0.7167,1.069179,0.523109,0.552428
150,0.5607,0.928919,0.543417,0.571987
200,0.4816,1.021876,0.527311,0.553818
250,0.3981,1.003131,0.510924,0.536521
300,0.3731,0.778169,0.533613,0.559573
350,0.3262,0.903242,0.534814,0.561197
400,0.3239,0.756768,0.549895,0.576026
450,0.3043,0.735848,0.564426,0.589364
500,0.2697,0.848035,0.568067,0.592635


Filter:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1
50,1.0788,1.110762,0.607248,0.620671
100,0.7216,1.006083,0.607092,0.620563
150,0.5413,0.993788,0.605542,0.619505
200,0.4504,1.049201,0.601622,0.616655
250,0.3738,0.872972,0.601413,0.61656
300,0.3713,0.911448,0.601961,0.617069
350,0.3435,0.874991,0.60221,0.617344
400,0.329,0.767577,0.603701,0.618452
450,0.2652,0.806922,0.604692,0.619082
500,0.2913,0.817938,0.605728,0.619744


Filter:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1
50,1.0597,1.191374,0.612594,0.62367
100,0.6927,1.055496,0.611765,0.62295
150,0.5367,1.165073,0.60997,0.621563
200,0.4744,1.224495,0.607911,0.619888
250,0.4039,0.975414,0.607421,0.619491
300,0.3573,0.985065,0.607093,0.619165
350,0.2984,0.974138,0.606476,0.618597
400,0.3328,0.884558,0.607045,0.618958
450,0.2635,1.006418,0.606781,0.618709
500,0.2784,0.944038,0.606904,0.618722


Filter:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1
50,1.0813,1.353485,0.608197,0.61789
100,0.7192,1.158416,0.606427,0.61663
150,0.533,0.955113,0.606226,0.616582
200,0.4664,0.974331,0.605404,0.61605
250,0.4083,0.805183,0.606076,0.616688
300,0.3549,0.858355,0.606157,0.616855
350,0.3306,0.880285,0.606202,0.617004
400,0.3022,0.816631,0.606583,0.61733
450,0.2679,0.807081,0.607225,0.617939
500,0.2932,0.853417,0.607526,0.618303


Filter:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1
50,1.0612,1.351912,0.616809,0.627201
100,0.6911,1.082508,0.616298,0.626845
150,0.5305,1.045839,0.615503,0.626344
200,0.4936,0.876982,0.615427,0.62633
250,0.4106,0.842779,0.615535,0.626417
300,0.3851,1.03181,0.614733,0.625839
350,0.3279,0.890568,0.614716,0.625846
400,0.3149,0.861251,0.614672,0.625804
450,0.2854,0.889845,0.614655,0.625825
500,0.3048,0.926874,0.61441,0.625662


Filter:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

ValueError: Original column name thematic_coherence not in the dataset. Current columns in the dataset: ['text', 'formal_register', 'pixel_values', 'input_ids', 'token_type_ids', 'attention_mask', 'combined_last_hidden_state']