In [1]:
import os
import re
import warnings
from typing import Any

import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    cohen_kappa_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.metrics._scorer import _BaseScorer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from transformers import BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ROOT = "../"
DATASET = os.path.join(ROOT, "data/br-pt-narrative-essays.csv")
SCORING = {
    "acc": make_scorer(accuracy_score),
    "macro_prec": make_scorer(precision_score, average="macro"),
    "weighted_prec": make_scorer(precision_score, average="weighted"),
    "macro_recall": make_scorer(recall_score, average="macro"),
    "weighted_recall": make_scorer(recall_score, average="weighted"),
    "macro_f1": make_scorer(f1_score, average="macro"),
    "weighted_f1": make_scorer(f1_score, average="weighted"),
    "kappa": make_scorer(cohen_kappa_score)
}
CV = 5
SEED = 42

In [3]:
class BERTEncoder(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        model_name="neuralmind/bert-base-portuguese-cased",
        max_length=128,
    ):
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.max_length = max_length

    def transform(self, X, y=None):
        """Tokenize and encode the text, and get CLS token embedding for classification tasks"""

        X = list(X)

        if not isinstance(X, list):
            raise ValueError("Not a list of strings")
        elif not all(isinstance(x, str) for x in X):
            raise ValueError("Not all instances are strings.")

        inputs = self.tokenizer(
            X,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :].numpy()

    def fit(self, X, y=None):
        return self


def scores_to_df(scores: tuple[tuple[str, str, float]]) -> pd.DataFrame:
    """Transform the scores dictionary into a dataframe object."""
    data = []
    for score in scores:
        data.append(
            {
                "algo": score["algo"],
                "encoder": score["encoder"],
                "acc": np.mean(score["test_acc"]),
                "macro_prec": np.mean(score["test_macro_prec"]),
                "weighted_prec": np.mean(score["test_weighted_prec"]),
                "macro_recall": np.mean(score["test_macro_recall"]),
                "weighted_recall": np.mean(score["test_weighted_recall"]),
                "macro_f1": np.mean(score["test_macro_f1"]),
                "weighted_f1": np.mean(score["test_weighted_f1"]),
                "kappa": np.mean(score["test_kappa"]),
            }
        )
    df = pd.DataFrame(data=data)
    return df


def clean_text(text: str) -> str:
    """Remove unwanted tokens from the text."""
    text = re.sub(r"\n", "", text)
    return text


def _create_tfidf_pipeline(clf: BaseEstimator) -> Pipeline:
    pipeline = Pipeline(
        [
            ("encoder", TfidfVectorizer()),
            ("clf", clf),
        ]
    )
    return pipeline


def _create_bert_pipeline(clf: BaseEstimator) -> Pipeline:
    pipeline = Pipeline([("encoder", BERTEncoder()), ("clf", clf)])
    return pipeline


def run_sklearn_experiments(
    clfs: list[BaseEstimator],
    df: pd.DataFrame,
    scoring: dict[str, _BaseScorer],
    cv: int,
    preprocessors: list[Any],
    feature_column: str,
    target_column: str,
    n_jobs: int = 1,
):
    """Run Scikit-Learn based experiments."""

    ALGO_NAME_DICT = {
        ExtraTreesClassifier: "Extra Trees",
        DecisionTreeClassifier: "Decision Tree",
        SVC: "Support Vector",
        RandomForestClassifier: "Random Forest",
    }

    ENCODER_NAME_DICT = {
        TfidfVectorizer: "TF-IDF",
        BERTEncoder: "BERT",
    }

    def _run_sklearn_cv(
        clf: BaseEstimator,
        X: pd.DataFrame,
        y: pd.DataFrame,
        scoring: dict[str, _BaseScorer],
        cv: int,
    ):
        """Run cross-validation on a Scikit-Learn estimator."""

        assert issubclass(type(clf), BaseEstimator) is True

        cv_results = cross_validate(
            clf,
            X,
            y,
            cv=cv,
            scoring=scoring,
            n_jobs=n_jobs,
        )
        cv_results["algo"] = ALGO_NAME_DICT[type(clf.named_steps["clf"])]
        cv_results["encoder"] = ENCODER_NAME_DICT[type(clf.named_steps["encoder"])]
        return cv_results

    cv_results_list = []

    # Apply preprocessors
    for preprocessor in preprocessors:
        df[feature_column] = df[feature_column].apply(
            lambda feature: preprocessor(feature)
        )

    for clf in clfs:
        # Create pipelines
        tfidf_pipeline = _create_tfidf_pipeline(clf)
        bert_pipeline = _create_bert_pipeline(clf)

        # Run CV using the pipelines
        tfidf_cv_results_list = _run_sklearn_cv(
            tfidf_pipeline,
            df[feature_column],
            df[target_column],
            scoring,
            cv,
        )

        bert_cv_results_list = _run_sklearn_cv(
            bert_pipeline,
            df[feature_column],
            df[target_column],
            scoring,
            cv,
        )

        # Join results
        joint_cv_results_list = tfidf_cv_results_list + bert_cv_results_list
        cv_results_list.append(joint_cv_results_list)

    return cv_results_list

In [4]:
df = pd.read_csv(DATASET, index_col=0)
df.head()

Unnamed: 0,text,motivating_situation,image_url,image_path,cohesion,thematic_coherence,formal_register,text_typology
0,( O chorrinho nino )\n-\nEu e...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/iet1QFw2ARNkv9Hx6KGe/...,3,3,3,4
1,.As meninas do potes de Tintas [T]\nUma vez eu...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/F80gTOBoh2Lk5JtrLWUO/...,3,3,3,4
2,1 uma menina encontrou varios potes de tinta ...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/HjnamZPzaZcDeEANF2IC/...,3,3,3,4
3,2011 [T] A menina da (artes). \n\nDePois que e...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/r3DUtFJn9twSvqcNloNM/...,3,3,3,4
4,28/03/2022\n[T] A cobra felena\nem uma tarde m...,Eu encontrei em cima do armário alguns potes c...,https://storage.googleapis.com/ciclos-10698-bu...,MEC/Rc7dMxTP7ZdLNEvmF0jo/tKZIn59KtOcbe9z0n6oY/...,4,1,4,4


In [5]:
# Function to check if a value is a string
def is_string(value):
    return isinstance(value, str)

# Apply the function and filter the DataFrame
df_filtered = df[df['text'].apply(is_string)]

# Reset index if needed
df_filtered.reset_index(drop=True, inplace=True)

In [6]:
clfs = [
    SVC(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    ExtraTreesClassifier()
]

with warnings.catch_warnings():
    warnings.simplefilter(action="ignore", category=UndefinedMetricWarning)
    warnings.simplefilter(action="ignore", category=FutureWarning)
    cv_results = run_sklearn_experiments(clfs, df_filtered, scoring=SCORING, cv=CV, preprocessors=[clean_text], feature_column="text", target_column="cohesion")

TypeError: unsupported operand type(s) for +: 'dict' and 'dict'

In [None]:
cv_results_df = scores_to_df(cv_results)
cv_results_df

Unnamed: 0,algo,encoder,acc,macro_prec,weighted_prec,macro_recall,weighted_recall,macro_f1,weighted_f1,kappa
0,Support Vector,TF-IDF,0.681825,0.259292,0.544278,0.206309,0.681825,0.175538,0.5603,0.02011
1,Random Forest,TF-IDF,0.66667,0.299165,0.530862,0.228474,0.66667,0.21794,0.565789,0.031691
2,Decision Tree,TF-IDF,0.557256,0.296601,0.543236,0.276714,0.557256,0.281307,0.54878,0.079544
3,Extra Trees,TF-IDF,0.651502,0.250197,0.518574,0.225143,0.651502,0.21249,0.559056,0.021207


In [None]:
def run_xgboost_experiments(clf, X: pd.DataFrame, y: pd.DataFrame, random_state: int = 42):
    """Run XGBoost based experiments."""

In [None]:
def run_catboost_experiments(clf, X: pd.DataFrame, y: pd.DataFrame, random_state: int = 42):
    """Run CatBoost based experiments."""