In [37]:
import logging
import os
import re
import warnings
from typing import Any

import cv2
import numpy as np
import pandas as pd
import skimage as ski
import torch
from catboost import CatBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    cohen_kappa_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.metrics._scorer import _BaseScorer
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm.auto import tqdm
from transformers import BertModel, BertTokenizer
from xgboost import XGBClassifier

In [38]:
ROOT = "../data/"
DATASET = os.path.join(ROOT, "br-pt-narrative-essays.csv")
SCORING = {
    "acc": make_scorer(accuracy_score),
    "macro_prec": make_scorer(precision_score, average="macro"),
    "weighted_prec": make_scorer(precision_score, average="weighted"),
    "macro_recall": make_scorer(recall_score, average="macro"),
    "weighted_recall": make_scorer(recall_score, average="weighted"),
    "macro_f1": make_scorer(f1_score, average="macro"),
    "weighted_f1": make_scorer(f1_score, average="weighted"),
    "kappa": make_scorer(cohen_kappa_score)
}
CV = 5
SEED = 42

In [39]:
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

In [40]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

In [41]:
tqdm.pandas()

In [42]:
class BERTEncoder(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        model_name="neuralmind/bert-base-portuguese-cased",
        max_length=128,
    ):
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.max_length = max_length

    def transform(self, X, y=None):
        """Tokenize and encode the text, and get CLS token embedding for classification tasks"""

        X = list(X)

        if not isinstance(X, list):
            raise ValueError("Not a list of strings")
        elif not all(isinstance(x, str) for x in X):
            raise ValueError("Not all instances are strings.")

        inputs = self.tokenizer(
            X,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :].numpy()

    def fit(self, X, y=None):
        return self


class LBPEncoder(BaseEstimator, TransformerMixin):
    """LBP encoder for image data."""

    def __init__(self, radius: int = 1, sampling_pixels: int = 106):
        self.radius = radius
        self.sampling_pixels = sampling_pixels

    def transform(self, X, y=None):
        """Extract the LBP from the images batch."""
        logger.debug("Encoding images...")
        X = list(X)
        logger.debug("Converting...")
        cvt_imgs = [self._cvt(img) for img in X]
        logger.debug("Running LBP algorithm...")
        imgs_lbps = [self._get_lbp(img) for img in cvt_imgs]
        logger.debug("Getting the histograms...")
        imgs_hists = [self._get_hist(img_lbp) for img_lbp in imgs_lbps]
        logger.debug("Extracting LBP features from histograms...")
        features = self._get_features(imgs_hists)
        logger.debug("Finished with encoding images")

        return features

    def fit(self, X, y=None):
        return self
    
    def _cvt(self, img):
        if len(img.shape) > 2:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            _, img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        i_min = np.min(img)
        i_max = np.max(img)
        if i_max - i_min != 0:
            img = (img - i_min) / (i_max - i_min)

        img = img.astype(np.uint8)

        return img

    def _get_lbp(self, img):
        lbp = ski.feature.local_binary_pattern(
            img, self.sampling_pixels, self.radius, method="uniform"
        )
        return (img, lbp)

    def _get_hist(self, img_lbp):
        img, lbp = img_lbp
        hist, _ = np.histogram(
            lbp.ravel(),
            bins=np.arange(0, self.sampling_pixels + 3),
            range=(0, self.sampling_pixels + 2),
        )
        hist = hist.astype("float")
        hist /= hist.sum() + 1e-6
        return img, hist
    
    def _get_features(self, imgs_hists):
        hists = [img_hist[1] for img_hist in imgs_hists]
        features = []
        for h in hists:
            features.extend(h)
        return hists

In [43]:
def _create_tfidf_pipeline(clf: BaseEstimator) -> Pipeline:
    pipeline = Pipeline(
        [
            ("encoder", TfidfVectorizer()),
            ("clf", clf),
        ]
    )
    return pipeline


def _create_lbp_pipeline(clf: BaseEstimator) -> Pipeline:
    pipeline = Pipeline(
        [
            ("encoder", LBPEncoder()),
            ("clf", clf),
        ]
    )
    return pipeline


def _create_bert_pipeline(clf: BaseEstimator) -> Pipeline:
    pipeline = Pipeline([("encoder", BERTEncoder()), ("clf", clf)])
    return pipeline

In [44]:
def scores_to_df(scores: tuple[tuple[str, str, float]]) -> pd.DataFrame:
    """Transform the scores dictionary into a dataframe object."""
    data = []
    for score in scores:
        data.append(
            {
                "algo": score["algo"],
                "encoder": score["encoder"],
                "acc": np.mean(score["test_acc"]),
                "macro_prec": np.mean(score["test_macro_prec"]),
                "weighted_prec": np.mean(score["test_weighted_prec"]),
                "macro_recall": np.mean(score["test_macro_recall"]),
                "weighted_recall": np.mean(score["test_weighted_recall"]),
                "macro_f1": np.mean(score["test_macro_f1"]),
                "weighted_f1": np.mean(score["test_weighted_f1"]),
                "kappa": np.mean(score["test_kappa"]),
            }
        )
    df = pd.DataFrame(data=data)
    return df


def clean_text(text: str) -> str:
    """Remove unwanted tokens from the text."""
    text = re.sub(r"\n", "", text)
    return text


def transform_image(image):
    resized_image = cv2.resize(image, (100, 100))
    gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)
    _, binary_image = cv2.threshold(gray_image, 127, 255, cv2.THRESH_BINARY)
    return binary_image


def run_experiments(
    clfs: list[BaseEstimator],
    df: pd.DataFrame,
    scoring: dict[str, _BaseScorer],
    cv: int,
    preprocessors: list[Any],
    text_feature_column: str,
    image_feature_column: str,
    target_column: str,
    n_jobs: int = 1,
):
    """Run Scikit-Learn based experiments."""

    ALGO_NAME_DICT = {
        ExtraTreesClassifier: "Extra Trees",
        DecisionTreeClassifier: "Decision Tree",
        SVC: "Support Vector",
        RandomForestClassifier: "Random Forest",
        CatBoostClassifier: "CatBoost",
        XGBClassifier: "XGBoost",
        MLPClassifier: "MLP"
        
    }

    ENCODER_NAME_DICT = {
        TfidfVectorizer: "TF-IDF",
        BERTEncoder: "BERT",
        LBPEncoder: "LBP"
    }

    def _run_sklearn_cv(
        clf: BaseEstimator,
        X: pd.DataFrame,
        y: pd.DataFrame,
        scoring: dict[str, _BaseScorer],
        cv: int,
    ):
        """Run cross-validation on a Scikit-Learn estimator."""

        assert issubclass(type(clf), BaseEstimator) is True

        cv_results = cross_validate(
            clf,
            X,
            y,
            cv=cv,
            scoring=scoring,
            n_jobs=n_jobs,
        )
        cv_results["algo"] = ALGO_NAME_DICT[type(clf.named_steps["clf"])]
        cv_results["encoder"] = ENCODER_NAME_DICT[type(clf.named_steps["encoder"])]
        return cv_results

    cv_results_list = []

    # Apply preprocessors
    for preprocessor in preprocessors["image"]:
        df[image_feature_column] = df[image_feature_column].apply(
            lambda feature: preprocessor(cv2.imread(feature))
        )

    for preprocessor in preprocessors["text"]:
        df[text_feature_column] = df[text_feature_column].apply(
            lambda feature: preprocessor(feature)
        )

    label_encoder = LabelEncoder()
    df[target_column] = label_encoder.fit_transform(df[target_column])

    for i, clf in enumerate(tqdm(clfs, desc="Classifiers")):
        clf_name = ALGO_NAME_DICT[type(clf)]
        logger.debug(f"#{i + 1} classifier: {clf_name}")

        # Create pipelines
        tfidf_pipeline = _create_tfidf_pipeline(clf)
        bert_pipeline = _create_bert_pipeline(clf)
        lbp_pipeline = _create_lbp_pipeline(clf)

        # Run CV using the pipelines
        logger.debug(f"Running TF-IDF pipeline #{i + 1}")
        tfidf_cv_results = _run_sklearn_cv(
            tfidf_pipeline,
            df[text_feature_column],
            df[target_column],
            scoring,
            cv,
        )
        logger.debug(f"Ending TF-IDF pipeline #{i + 1}")

        bert_cv_results = _run_sklearn_cv(
            bert_pipeline,
            df[text_feature_column],
            df[target_column],
            scoring,
            cv,
        )

        logger.debug(f"Running LBP pipeline #{i + 1}")
        lbp_cv_results = _run_sklearn_cv(
            lbp_pipeline,
            df[image_feature_column],
            df[target_column],
            scoring,
            cv,
        )
        logger.debug(f"Ending LBP pipeline #{i + 1}")

        # Append results
        cv_results_list.append(tfidf_cv_results)
        cv_results_list.append(bert_cv_results)
        cv_results_list.append(lbp_cv_results)

    return cv_results_list

In [45]:
df = pd.read_csv(DATASET, index_col=0)
df.head()

Unnamed: 0,text,motivating_situation,image_url,image_path,cohesion,thematic_coherence,formal_register,text_typology
0,( O chorrinho nino )\n-\nEu e...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/iet1QFw2ARNkv9Hx6KGe/...,3,3,3,4
1,.As meninas do potes de Tintas [T]\nUma vez eu...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/F80gTOBoh2Lk5JtrLWUO/...,3,3,3,4
2,1 uma menina encontrou varios potes de tinta ...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/HjnamZPzaZcDeEANF2IC/...,3,3,3,4
3,2011 [T] A menina da (artes). \n\nDePois que e...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/r3DUtFJn9twSvqcNloNM/...,3,3,3,4
4,28/03/2022\n[T] A cobra felena\nem uma tarde m...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/tKZIn59KtOcbe9z0n6oY/...,4,1,4,4


In [46]:
df["image_path"] = df["image_path"].apply(lambda image_path: os.path.join(ROOT, image_path))
df.head()

Unnamed: 0,text,motivating_situation,image_url,image_path,cohesion,thematic_coherence,formal_register,text_typology
0,( O chorrinho nino )\n-\nEu e...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,../data/MEC/Rc7dMxTP7ZdLNEvmF0jo/iet1QFw2ARNkv...,3,3,3,4
1,.As meninas do potes de Tintas [T]\nUma vez eu...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,../data/MEC/Rc7dMxTP7ZdLNEvmF0jo/F80gTOBoh2Lk5...,3,3,3,4
2,1 uma menina encontrou varios potes de tinta ...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,../data/MEC/Rc7dMxTP7ZdLNEvmF0jo/HjnamZPzaZcDe...,3,3,3,4
3,2011 [T] A menina da (artes). \n\nDePois que e...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,../data/MEC/Rc7dMxTP7ZdLNEvmF0jo/r3DUtFJn9twSv...,3,3,3,4
4,28/03/2022\n[T] A cobra felena\nem uma tarde m...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,../data/MEC/Rc7dMxTP7ZdLNEvmF0jo/tKZIn59KtOcbe...,4,1,4,4


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1188 entries, 0 to 1187
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   text                  1188 non-null   object
 1   motivating_situation  1188 non-null   object
 2   image_url             1188 non-null   object
 3   image_path            1188 non-null   object
 4   cohesion              1188 non-null   int64 
 5   thematic_coherence    1188 non-null   int64 
 6   formal_register       1188 non-null   int64 
 7   text_typology         1188 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 83.5+ KB


In [48]:
def remove_truncated_images(df, column_name):
    """
    Remove rows with truncated or corrupted images from the DataFrame.

    Parameters:
    df (pd.DataFrame): DataFrame containing image file paths.
    column_name (str): Column name containing image file paths.

    Returns:
    pd.DataFrame: DataFrame with rows containing truncated or corrupted images removed.
    """
    # Function to check if an image is valid
    def is_valid_image(image_path):
        with open(image_path, 'rb') as image_file:
            check_chars = image_file.read()[-2:]
        if check_chars != b'\xff\xd9':
            return False
        return True

    # Apply the function to the DataFrame and filter out invalid images
    valid_image_paths = df[column_name].apply(is_valid_image)
    filtered_df = df[valid_image_paths].reset_index(drop=True)

    return filtered_df

df = remove_truncated_images(df, "image_path")

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1163 entries, 0 to 1162
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   text                  1163 non-null   object
 1   motivating_situation  1163 non-null   object
 2   image_url             1163 non-null   object
 3   image_path            1163 non-null   object
 4   cohesion              1163 non-null   int64 
 5   thematic_coherence    1163 non-null   int64 
 6   formal_register       1163 non-null   int64 
 7   text_typology         1163 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 72.8+ KB


In [50]:
# df["image"] = df["image_path"].progress_apply(lambda image_path: transform_image(cv2.imread(image_path)))

In [51]:
# lbp_encoder = LBPEncoder()
# features = lbp_encoder.fit_transform(df["image"])

In [52]:
clfs = [
    SVC(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    DecisionTreeClassifier(random_state=SEED),
    ExtraTreesClassifier(random_state=SEED),
    XGBClassifier(
        objective="multi:softprob",
        eval_metric="mlogloss",
    ),
    MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=SEED),
]

test_df = df.sample(n=100)

with warnings.catch_warnings():
    warnings.simplefilter(action="ignore", category=UndefinedMetricWarning)
    warnings.simplefilter(action="ignore", category=FutureWarning)
    warnings.simplefilter(action="ignore", category=UserWarning)
    preprocessors = {"text": [clean_text], "image": [transform_image]}
    cv_results = run_experiments(
        clfs,
        test_df,
        scoring=SCORING,
        cv=CV,
        preprocessors=preprocessors,
        text_feature_column="text",
        image_feature_column="image_path",
        target_column="text_typology",
    )

Classifiers:   0%|          | 0/6 [00:00<?, ?it/s]

1 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/home/hyanbatista42/miniconda3/envs/multimodal-aes/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/hyanbatista42/miniconda3/envs/multimodal-aes/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hyanbatista42/miniconda3/envs/multimodal-aes/lib/python3.11/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.f

In [55]:
cv_results_df = scores_to_df(cv_results)
cv_results_df

Unnamed: 0,algo,encoder,acc,macro_prec,weighted_prec,macro_recall,weighted_recall,macro_f1,weighted_f1,kappa
0,Support Vector,TF-IDF,0.59,0.1415,0.3485,0.24,0.59,0.177984,0.438065,0.0
1,Support Vector,BERT,0.59,0.1415,0.3485,0.24,0.59,0.177984,0.438065,0.0
2,Support Vector,LBP,0.59,0.1415,0.3485,0.24,0.59,0.177984,0.438065,0.0
3,Random Forest,TF-IDF,0.61,0.234342,0.396079,0.285,0.61,0.240161,0.470538,0.070588
4,Random Forest,BERT,0.58,0.18616,0.39674,0.2475,0.58,0.203393,0.461057,0.028393
5,Random Forest,LBP,0.53,0.238535,0.416617,0.265076,0.53,0.241085,0.45817,0.031267
6,Decision Tree,TF-IDF,0.43,0.26986,0.415283,0.288788,0.43,0.273715,0.41479,0.016279
7,Decision Tree,BERT,0.49,0.251042,0.480762,0.288697,0.49,0.26252,0.480115,0.130162
8,Decision Tree,LBP,0.44,0.250103,0.450608,0.253682,0.44,0.233978,0.427401,0.03268
9,Extra Trees,TF-IDF,0.61,0.234342,0.396079,0.285,0.61,0.240161,0.470538,0.070588


In [56]:
print(cv_results_df.to_latex())

\begin{tabular}{lllrrrrrrrr}
\toprule
 & algo & encoder & acc & macro_prec & weighted_prec & macro_recall & weighted_recall & macro_f1 & weighted_f1 & kappa \\
\midrule
0 & Support Vector & TF-IDF & 0.590000 & 0.141500 & 0.348500 & 0.240000 & 0.590000 & 0.177984 & 0.438065 & 0.000000 \\
1 & Support Vector & BERT & 0.590000 & 0.141500 & 0.348500 & 0.240000 & 0.590000 & 0.177984 & 0.438065 & 0.000000 \\
2 & Support Vector & LBP & 0.590000 & 0.141500 & 0.348500 & 0.240000 & 0.590000 & 0.177984 & 0.438065 & 0.000000 \\
3 & Random Forest & TF-IDF & 0.610000 & 0.234342 & 0.396079 & 0.285000 & 0.610000 & 0.240161 & 0.470538 & 0.070588 \\
4 & Random Forest & BERT & 0.580000 & 0.186160 & 0.396740 & 0.247500 & 0.580000 & 0.203393 & 0.461057 & 0.028393 \\
5 & Random Forest & LBP & 0.530000 & 0.238535 & 0.416617 & 0.265076 & 0.530000 & 0.241085 & 0.458170 & 0.031267 \\
6 & Decision Tree & TF-IDF & 0.430000 & 0.269860 & 0.415283 & 0.288788 & 0.430000 & 0.273715 & 0.414790 & 0.016279 \\
7 & Decision