In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Bidirectional,
    Dense, Dropout, Layer, Softmax, GlobalAveragePooling1D,
)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict

# ===========================================================
# 1. Carregar dataset e preparar textos
# ===========================================================
df = pd.read_csv("/Users/matheusmota/src/github/msc/msc-proj/data/academic_works.csv")
df = df.head(100).copy()
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df["included"] = df["included"].astype(bool)
df["text"] = (df["title"].fillna("") + " " +
              df["keywords"].fillna("") + " " +
              df["abstract"].fillna(""))

y = np.array(df["included"].to_list())

# ===========================================================
# 2. Tokenização
# ===========================================================
max_words = 20000
max_len = 300
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df["text"].to_list())
sequences = tokenizer.texts_to_sequences(df["text"].to_list())
x_seq = pad_sequences(sequences, maxlen=max_len)

# ===========================================================
# 3. Carregar embeddings (GloVe)
# ===========================================================
embeddings_index = {}
with open('../data/word_vectors/glove/glove.6B.300d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

embedding_dim = 300
word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        vec = embeddings_index.get(word)
        if vec is not None:
            embedding_matrix[i] = vec

# ===========================================================
# 4. Divisão dos dados: 70/20/10
# ===========================================================
# 70% treino + 30% (val + teste)
x_train_full, x_temp, y_train_full, y_temp = train_test_split(
    x_seq, y, test_size=0.3, stratify=y, random_state=42
)
# Dentro dos 30%, separar 2/3 para validação (20% total) e 1/3 para teste (10% total)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=(1/3), stratify=y_temp, random_state=42
)

# ===========================================================
# 5. Modelo com atenção Bahdanau
# ===========================================================
class BahdanauAttention(Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)
        self.softmax = Softmax(axis=1)

    def call(self, query, values):
        score = tf.nn.tanh(self.W1(query) + self.W2(values))
        attn_weights = self.softmax(self.V(score))
        context = attn_weights * values
        context = tf.reduce_sum(context, axis=1)
        return context

def build_model(bidirectional=True, use_attention=True):
    inputs = Input(shape=(max_len,))
    embedding = Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False
    )(inputs)

    if bidirectional:
        x = Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))(embedding)
    else:
        x = LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(embedding)

    if use_attention:
        x = BahdanauAttention(100)(x, x)
    else:
        x = GlobalAveragePooling1D()(x)

    x = Dropout(0.5 if not bidirectional else 0.02)(x)
    outputs = Dense(1, activation='sigmoid')(x)
    return Model(inputs, outputs)

# ===========================================================
# 6. Experimentos
# ===========================================================
experiments = [
    ("SVM with SGD + TF-IDF", "SVM"),
    ("Bi-LSTM + GloVe", (True, False)),
    ("Bi-LSTM + GloVe + Attention", (True, True)),
    ("LSTM + GloVe", (False, False)),
    ("LSTM + GloVe + Attention", (False, True)),
]

# ===========================================================
# 7. Função de simulação com K-Fold
# ===========================================================
from typing import Tuple, TypedDict

class ResultOut(TypedDict):
    Dataset: str
    Classifier: str
    TP: int
    FP: int
    TN: int
    FN: int
    N: int
    P: float
    R: float
    F2: float
    WSS95: float

def simulate_model(name: str, config: str | Tuple[bool, bool]) -> ResultOut:
    if config == "SVM":
        vect = TfidfVectorizer(ngram_range=(1, 3))
        X_tfidf = vect.fit_transform(df["abstract"].to_list())
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        clf = SGDClassifier(loss='hinge', penalty='l2', max_iter=1000, random_state=42)
        y_scores = cross_val_predict(clf, X_tfidf, y, cv=cv, method='decision_function')
    else:
        bidirectional, attention = config
        model = build_model(bidirectional, attention)
        lr = 1e-4 if bidirectional else 3e-4
        model.compile(optimizer=tf.keras.optimizers.Adam(lr),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        # --- K-Fold apenas sobre o conjunto de treino ---
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        y_scores = np.zeros_like(y_train_full, dtype=float)

        for fold, (train_idx, val_idx) in enumerate(skf.split(x_train_full, y_train_full)):
            print(f"Fold {fold+1}/5")
            x_tr, x_val_k = x_train_full[train_idx], x_train_full[val_idx]
            y_tr, y_val_k = y_train_full[train_idx], y_train_full[val_idx]

            model_fold = build_model(bidirectional, attention)
            model_fold.compile(optimizer=tf.keras.optimizers.Adam(lr),
                               loss='binary_crossentropy',
                               metrics=['accuracy'])
            model_fold.fit(
                x_tr, y_tr,
                validation_data=(x_val_k, y_val_k),
                batch_size=64,
                epochs=5,
                verbose=0,
                class_weight={0:1, 1:44}
            )
            y_scores[val_idx] = model_fold.predict(x_val_k).flatten()

    if config == "SVM":
        y_true_scores = y
    else:
        # Para redes, só sobre o conjunto de treino
        y_true_scores = y_train_full

    prec, rec, thresh = precision_recall_curve(y_true_scores, y_scores)

    best_thresh = next((t for p, r, t in zip(prec, rec, thresh) if r >= 0.95), 0.5)

    y_pred = (y_scores >= best_thresh).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true_scores, y_pred).ravel()

    N = tn + fp
    P = tp / (tp + fp) if (tp + fp) else 0
    R = tp / (tp + fn) if (tp + fn) else 0
    F2 = (5 * P * R) / (4 * P + R) if (P + R) else 0
    WSS95 = (N - fp)/N - (1-0.95) if N else 0

    return ResultOut(
        Dataset="academic_works.csv",
        Classifier=name,
        TP=tp, FP=fp, TN=tn, FN=fn,
        N=N, P=P, R=R, F2=F2, WSS95=WSS95
    )

# ===========================================================
# 8. Execução e salvamento dos resultados
# ===========================================================
import time, os
from pathlib import Path

os.environ['CUDA_VISIBLE_DEVICES'] = ''
results = []

for name, config in experiments:
    print(f"\nRodando {name}...")
    start = time.perf_counter()
    result = simulate_model(name, config)
    end = time.perf_counter()
    result["time"] = end - start
    results.append(result)

results_df = pd.DataFrame(results)
folder = "../data/results"
Path(folder).mkdir(parents=True, exist_ok=True)
results_df.to_excel(f"{folder}/results_table.xlsx", index=False)
print(results_df.to_markdown())


In [2]:
import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification
)
from sklearn.metrics import precision_recall_curve, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import time, os
from pathlib import Path

# ===========================================================
# 1. Carregar dataset e preparar textos
# ===========================================================
df = pd.read_csv("/Users/matheusmota/src/github/msc/msc-proj/data/academic_works.csv")
df = df.head(100).copy()
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df["included"] = df["included"].astype(bool)
df["text"] = (df["title"].fillna("") + " " +
              df["keywords"].fillna("") + " " +
              df["abstract"].fillna(""))

y = np.array(df["included"].to_list())

# ===========================================================
# 2.1. Definir modelos HuggingFace
# ===========================================================
MODEL_LIST = [
    ("BERT-base (Bi-BERT)", "bert-base-uncased"),
    ("DistilBERT", "distilbert-base-uncased"),
    ("RoBERTa-base", "roberta-base"),
    ("ALBERT-base-v2", "albert-base-v2"),
]

max_len = 256

# ===========================================================
# 3. Dataset PyTorch
# ===========================================================
class BertDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

from typing import TypedDict

class ResultOut(TypedDict):
    Dataset: str
    Classifier: str
    TP: int
    FP: int
    TN: int
    FN: int
    N: int
    P: float
    R: float
    F2: float
    WSS95: float
    time: float

# ===========================================================
# 4. Função de simulação K-Fold para QUALQUER modelo
# ===========================================================
def simulate_hf_model(name: str, model_name: str) -> ResultOut:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encodings = tokenizer(
        list(df["text"]),
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='pt'
    )

    input_ids = encodings['input_ids']
    # Nem todos modelos usam token_type_ids (ex: RoBERTa)
    attention_mask = encodings['attention_mask']

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y_scores = np.zeros(len(y), dtype=float)

    for fold, (train_idx, val_idx) in enumerate(kfold.split(input_ids, y)):
        print(f"Fold {fold+1}/5 — {name}")
        ids_train, ids_val = input_ids[train_idx], input_ids[val_idx]
        mask_train, mask_val = attention_mask[train_idx], attention_mask[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        train_dataset = BertDataset(ids_train, mask_train, y_tr)
        val_dataset   = BertDataset(ids_val, mask_val, y_val)
        train_loader  = DataLoader(train_dataset, batch_size=8, shuffle=True)
        val_loader    = DataLoader(val_dataset, batch_size=8)

        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
        model.to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        loss_fn = torch.nn.BCEWithLogitsLoss()

        model.train()
        for epoch in range(2):
            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Train {name}"):
                optimizer.zero_grad()
                input_ids_ = batch['input_ids'].to(device)
                attention_mask_ = batch['attention_mask'].to(device)
                labels_ = batch['labels'].unsqueeze(1).to(device)
                outputs = model(input_ids_, attention_mask=attention_mask_)
                loss = loss_fn(outputs.logits, labels_)
                loss.backward()
                optimizer.step()

        model.eval()
        preds_fold = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids_ = batch['input_ids'].to(device)
                attention_mask_ = batch['attention_mask'].to(device)
                outputs = model(input_ids_, attention_mask=attention_mask_)
                scores = torch.sigmoid(outputs.logits).cpu().numpy().flatten()
                preds_fold.extend(scores)
        y_scores[val_idx] = np.array(preds_fold)

    prec, rec, thresh = precision_recall_curve(y, y_scores)
    best_thresh = next((t for p, r, t in zip(prec, rec, thresh) if r >= 0.95), 0.5)
    y_pred = (y_scores >= best_thresh).astype(int)
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

    N = tn + fp
    P = tp / (tp + fp) if (tp + fp) else 0
    R = tp / (tp + fn) if (tp + fn) else 0
    F2 = (5 * P * R) / (4 * P + R) if (P + R) else 0
    WSS95 = (N - fp)/N - (1-0.95) if N else 0

    return ResultOut(
        Dataset="academic_works.csv",
        Classifier=name,
        TP=tp, FP=fp, TN=tn, FN=fn,
        N=N, P=P, R=R, F2=F2, WSS95=WSS95,
        time=0.0
    )

# ===========================================================
# 5. Execução e Salvamento dos Resultados
# ===========================================================
results = []
for name, model_hf in MODEL_LIST:
    print(f"\nRodando {name} ({model_hf})...")
    start = time.perf_counter()
    result = simulate_hf_model(name, model_hf)
    end = time.perf_counter()
    result["time"] = end - start
    results.append(result)

results_df = pd.DataFrame(results)
folder = "../data/results"
Path(folder).mkdir(parents=True, exist_ok=True)
results_df.to_excel(f"{folder}/results_hf_models.xlsx", index=False)
print(results_df.to_markdown())



Rodando BERT-base (Bi-BERT) (bert-base-uncased)...
Fold 1/5 — BERT-base (Bi-BERT)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:39<00:00,  3.91s/it]
Epoch 2 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:38<00:00,  3.84s/it]


Fold 2/5 — BERT-base (Bi-BERT)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:37<00:00,  3.79s/it]
Epoch 2 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:37<00:00,  3.79s/it]


Fold 3/5 — BERT-base (Bi-BERT)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:39<00:00,  4.00s/it]
Epoch 2 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:44<00:00,  4.40s/it]


Fold 4/5 — BERT-base (Bi-BERT)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:40<00:00,  4.07s/it]
Epoch 2 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:39<00:00,  3.91s/it]


Fold 5/5 — BERT-base (Bi-BERT)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:41<00:00,  4.11s/it]
Epoch 2 Train BERT-base (Bi-BERT): 100%|██████████| 10/10 [00:39<00:00,  3.96s/it]



Rodando DistilBERT (distilbert-base-uncased)...
Fold 1/5 — DistilBERT


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train DistilBERT: 100%|██████████| 10/10 [00:19<00:00,  2.00s/it]
Epoch 2 Train DistilBERT: 100%|██████████| 10/10 [00:19<00:00,  1.96s/it]


Fold 2/5 — DistilBERT


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train DistilBERT: 100%|██████████| 10/10 [00:21<00:00,  2.19s/it]
Epoch 2 Train DistilBERT: 100%|██████████| 10/10 [00:19<00:00,  1.97s/it]


Fold 3/5 — DistilBERT


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train DistilBERT: 100%|██████████| 10/10 [00:21<00:00,  2.11s/it]
Epoch 2 Train DistilBERT: 100%|██████████| 10/10 [00:19<00:00,  1.92s/it]


Fold 4/5 — DistilBERT


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train DistilBERT: 100%|██████████| 10/10 [00:18<00:00,  1.89s/it]
Epoch 2 Train DistilBERT: 100%|██████████| 10/10 [00:20<00:00,  2.08s/it]


Fold 5/5 — DistilBERT


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train DistilBERT: 100%|██████████| 10/10 [00:19<00:00,  1.99s/it]
Epoch 2 Train DistilBERT: 100%|██████████| 10/10 [00:19<00:00,  1.98s/it]



Rodando RoBERTa-base (roberta-base)...
Fold 1/5 — RoBERTa-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train RoBERTa-base: 100%|██████████| 10/10 [00:41<00:00,  4.11s/it]
Epoch 2 Train RoBERTa-base: 100%|██████████| 10/10 [00:38<00:00,  3.83s/it]


Fold 2/5 — RoBERTa-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train RoBERTa-base: 100%|██████████| 10/10 [00:40<00:00,  4.01s/it]
Epoch 2 Train RoBERTa-base: 100%|██████████| 10/10 [00:37<00:00,  3.77s/it]


Fold 3/5 — RoBERTa-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train RoBERTa-base: 100%|██████████| 10/10 [00:41<00:00,  4.10s/it]
Epoch 2 Train RoBERTa-base: 100%|██████████| 10/10 [00:41<00:00,  4.12s/it]


Fold 4/5 — RoBERTa-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train RoBERTa-base: 100%|██████████| 10/10 [00:40<00:00,  4.04s/it]
Epoch 2 Train RoBERTa-base: 100%|██████████| 10/10 [00:38<00:00,  3.88s/it]


Fold 5/5 — RoBERTa-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train RoBERTa-base: 100%|██████████| 10/10 [00:38<00:00,  3.88s/it]
Epoch 2 Train RoBERTa-base: 100%|██████████| 10/10 [00:38<00:00,  3.88s/it]



Rodando ALBERT-base-v2 (albert-base-v2)...
Fold 1/5 — ALBERT-base-v2


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:24<00:00,  2.42s/it]
Epoch 2 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:27<00:00,  2.72s/it]


Fold 2/5 — ALBERT-base-v2


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:27<00:00,  2.79s/it]
Epoch 2 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:29<00:00,  2.92s/it]


Fold 3/5 — ALBERT-base-v2


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:28<00:00,  2.82s/it]
Epoch 2 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:29<00:00,  2.92s/it]


Fold 4/5 — ALBERT-base-v2


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:28<00:00,  2.87s/it]
Epoch 2 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:29<00:00,  2.93s/it]


Fold 5/5 — ALBERT-base-v2


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:27<00:00,  2.79s/it]
Epoch 2 Train ALBERT-base-v2: 100%|██████████| 10/10 [00:27<00:00,  2.75s/it]


|    | Dataset            | Classifier          |   TP |   FP |   TN |   FN |   N |   P |   R |       F2 |   WSS95 |    time |
|---:|:-------------------|:--------------------|-----:|-----:|-----:|-----:|----:|----:|----:|---------:|--------:|--------:|
|  0 | academic_works.csv | BERT-base (Bi-BERT) |   20 |   80 |    0 |    0 |  80 | 0.2 |   1 | 0.555556 |   -0.05 | 410.187 |
|  1 | academic_works.csv | DistilBERT          |   20 |   80 |    0 |    0 |  80 | 0.2 |   1 | 0.555556 |   -0.05 | 220.087 |
|  2 | academic_works.csv | RoBERTa-base        |   20 |   80 |    0 |    0 |  80 | 0.2 |   1 | 0.555556 |   -0.05 | 420.127 |
|  3 | academic_works.csv | ALBERT-base-v2      |   20 |   80 |    0 |    0 |  80 | 0.2 |   1 | 0.555556 |   -0.05 | 298.866 |
