In [1]:
# activate autoreload
%load_ext autoreload
%autoreload 2

# check if session is in Google Colab
try:
    import google.colab
    IN_COLAB = True
    print('Google Colab session!')
except:
    IN_COLAB = False
    print('Not a Google Colab session.')

# add src path to the notebook
import os
import sys
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT: str = '/content/drive/MyDrive/papers/2025b_relevance_2.0'
    !pip install contextily esda deep-translator h3pandas h3~=3.0 datasets optuna setfit
else:
    PROJECT_ROOT: str = os.path.dirname(os.path.abspath(os.path.dirname("__file__")))
if PROJECT_ROOT not in sys.path:
    sys.path.append(os.path.join(PROJECT_ROOT))
print(PROJECT_ROOT)

Not a Google Colab session.
/mnt/c/Users/DavidHanny/OneDrive - IT U interdisciplinary transformation university austria/Documents/projects/papers/2025a_relevance_classification_2.0


# Meta Learning
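Instead of simply concatenating representations or relying on in-context learning, we can also combine the text and non-text features through meta learning (stacking): the base models are trained separately and a meta classifier learns from their predictions.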

In [7]:
import warnings
import pickle
import torch
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
from transformers import pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, root_mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from src.model_training.classification_head import optimise_model, evaluate_model
from src.model_training.bert import train_classifier, extract_probabilities
# project directories used throughout the notebook
DATA_PATH: str = os.path.join(PROJECT_ROOT, 'data')
RESULTS_PATH: str = os.path.join(PROJECT_ROOT, 'results')
print(DATA_PATH)

# progress bars for pandas operations; no parallelism in the tokenizers library
tqdm.pandas()
os.environ.update(TOKENIZERS_PARALLELISM='false')

# suppress ConvergenceWarnings (and all other warnings) unless warning options were given explicitly
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"  # also affects subprocesses

# set pytorch device
device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
# Compare on `.type`: a torch.device object is not equal to the plain string 'cuda', so the
# previous `device == 'cuda'` check never fired and the GPU name was never printed.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

/mnt/c/Users/DavidHanny/OneDrive - IT U interdisciplinary transformation university austria/Documents/projects/papers/2025a_relevance_classification_2.0/data
Device: cuda


## 1. Training Data
First, we need to prepare our training and evaluation data that we will use throughout the study.

In [3]:
# Folder containing the pre-processed fine-tuning splits and the fitted label encoder
fine_tuning_dir: str = os.path.join(DATA_PATH, 'processed', 'fine_tuning')

train_gdf: gpd.GeoDataFrame = gpd.read_parquet(os.path.join(fine_tuning_dir, 'train_data.parquet'))
test_gdf: gpd.GeoDataFrame = gpd.read_parquet(os.path.join(fine_tuning_dir, 'test_data.parquet'))

# NOTE: unpickling can execute arbitrary code - only load encoder files produced by this project
with open(os.path.join(fine_tuning_dir, 'train_label_encoder.pkl'), 'rb') as encoder_file:
    label_encoder: OrdinalEncoder = pickle.load(encoder_file)

# Non-text feature columns and the names of their normalised counterparts
NON_TEXT_COLUMNS: list[str] = [
    'event_distance_km',
    'event_distance_h',
    *(f'n_disaster_tweets_{radius}km' for radius in (1, 10, 50, 10000)),
]
NON_TEXT_COLUMNS_NORM: list[str] = [f'{column}_norm' for column in NON_TEXT_COLUMNS]

# Quick sanity check: class order of the encoder and the size of both splits
print("Class encodings:", label_encoder.categories[0])
for split_gdf in (train_gdf, test_gdf):
    print(split_gdf.shape)
pd.DataFrame(train_gdf)

Class encodings: ['Not related', 'Related but not relevant', 'Related and relevant']
(3659, 45)
(915, 45)


Unnamed: 0,message_id,date,use_case,text,tweet_lang,geometry,photo_url,text_raw,related,x,...,sphere_y,sphere_z,int_label,valid,event_distance_km_norm,event_distance_h_norm,n_disaster_tweets_1km_norm,n_disaster_tweets_10km_norm,n_disaster_tweets_50km_norm,n_disaster_tweets_10000km_norm
0,1.296800e+18,2020-08-21 13:40:45,California 🔥,Closed due to the czu august lightning complex...,,POINT (-122.36110 37.16663),,Closed due to the czu august lightning complex...,1,-1.362118e+07,...,0.095685,-0.954961,1,True,-0.457531,-0.099120,-0.370997,-0.451317,0.758702,1.366236
1,1.417100e+18,2021-07-19 12:23:18,Germany 🌊,Mich beunruhigt nichts mehr.Wir sorgen persönl...,de,POINT (12.22671 51.84923),,Mich beunruhigt nichts mehr.Wir sorgen persönl...,1,1.361556e+06,...,0.645793,0.288072,1,True,1.300809,0.095609,0.362793,0.052028,-0.467472,1.510834
2,1.341270e+18,2020-12-22 06:29:24,California 🔥,The view out my kitchen window of the massive ...,,POINT (-118.41191 34.02069),,The view out my kitchen window of the massive ...,1,-1.318155e+07,...,0.095685,-0.954961,2,True,-0.714648,0.772862,0.798591,0.527691,-0.172292,-0.719290
3,1.320860e+18,2020-10-26 22:49:26,California 🔥,@user @user @user I love 1/2 miles from Anahei...,,POINT (-117.85109 33.84275),,@ZestForLifeNow @City_of_Anaheim @AnaheimFire ...,1,-1.311912e+07,...,0.095685,-0.954961,2,True,-0.714648,-0.099120,-0.105182,-0.068227,0.592908,-0.276396
4,1.296050e+18,2020-08-19 11:26:50,California 🔥,Someone fucking set off my apartment building’...,,POINT (-118.41191 34.02069),,Someone fucking set off my apartment building’...,1,-1.318155e+07,...,0.095685,-0.954961,0,True,-0.714648,0.852190,2.446648,1.889790,1.001015,0.244985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3654,1.624650e+18,2023-02-12 06:18:19,Turkey 🪨,Hatay/Hassa'da en kazın altında çıkan not...#d...,tr,POINT (36.51252 36.78301),http://pbs.twimg.com/media/FovuSvIX0AUnSdz.jpg,Hatay/Hassa'da en kazın altında çıkan not...\r...,1,4.064655e+06,...,-0.317761,0.887312,2,True,-1.168961,-0.188440,-0.317823,-0.436595,0.543710,2.060679
3655,1.610290e+18,2023-01-03 15:19:57,Turkey 🪨,FutBol Sohbet programımızın yeni bölümü YouTub...,tr,POINT (33.78141 41.38023),http://pbs.twimg.com/media/FljqrPJWQAcU-DW.jpg,FutBol Sohbet programımızın yeni bölümü YouTub...,0,3.760672e+06,...,-0.317761,0.887312,0,True,0.882802,-2.645158,-0.544401,-0.635649,-0.646982,-1.109802
3656,1.627130e+18,2023-02-19 02:12:45,Chile 🔥,Pasando ahora 😭😭 #Coronel #Biobio #IncendioFor...,es,POINT (-73.22220 -37.00482),http://pbs.twimg.com/ext_tw_video_thumb/162712...,Pasando ahora 😭😭 #Coronel #Biobio #IncendioFor...,1,-8.154494e+06,...,0.869559,0.356159,2,True,-0.312896,0.300568,-0.480332,-0.410065,0.183655,0.021594
3657,1.416140e+18,2021-07-16 21:10:24,Germany 🌊,Rhein unterspült Uferstrasse in Basel und löst...,de,POINT (7.65276 47.57676),,Rhein unterspült Uferstrasse in Basel und löst...,1,8.524011e+05,...,0.645793,0.288072,1,True,0.852110,-0.180118,-0.141470,-0.150951,-0.432042,0.471640


## 1. Out-of-fold Preparation
With the data prepared, we can try to train an ensemble of (1) a text model and (2) a non-text model using the out-of-fold (OOF) predictions.

In [4]:
# Event and location encoding options: each entry maps a dataframe to a 2-D feature array
def _no_encoding(df) -> np.ndarray:
    """Return an array with one row per sample and zero columns, so np.hstack still works."""
    return np.empty((len(df), 0))


def _event_type_matrix(df) -> np.ndarray:
    """Stack the per-row `event_type_encoding` vectors into one 2-D array."""
    return np.vstack(df['event_type_encoding'].to_numpy())


def _sphere_matrix(df) -> np.ndarray:
    """Return the three spherical coordinate columns as a 2-D array."""
    return df[['sphere_x', 'sphere_y', 'sphere_z']].to_numpy()


def _all_encodings(df) -> np.ndarray:
    """Concatenate the event type encoding and the spherical coordinates column-wise."""
    return np.hstack([_event_type_matrix(df), _sphere_matrix(df)])


encoding_options = {
    "none": _no_encoding,
    "event_type_encoding": _event_type_matrix,
    "sphere_coords": _sphere_matrix,
    "all": _all_encodings,
}

# Candidate estimators, keyed by the name under which their results are reported.
# Stochastic estimators get a fixed random_state so that runs are repeatable.
models = dict(
    logistic_regression=LogisticRegression(random_state=1),
    random_forest=RandomForestClassifier(random_state=2),
    svm=SVC(probability=True, random_state=3),  # probability=True is required for predict_proba
    gradient_boosting=GradientBoostingClassifier(random_state=5),
    knn=KNeighborsClassifier(),
    naive_bayes=GaussianNB(),
)

Let's also prepare the data first.

In [5]:
def _assemble_features(gdf):
    """Build the model inputs for one data split.

    Returns a tuple (base, event, non_text, text, labels): the base non-text columns, the
    event/location encoding, both stacked column-wise, the tweet texts and the integer labels.
    """
    # NOTE(review): this uses the raw NON_TEXT_COLUMNS, not NON_TEXT_COLUMNS_NORM - confirm this is
    # intended, since the matrix is later also fed to scale-sensitive meta learners (SVM, kNN).
    base = gdf[NON_TEXT_COLUMNS].values
    event = encoding_options['all'](gdf)
    return base, event, np.hstack([base, event]), gdf['text'].values, gdf['int_label'].values


# Training split
X_base_train, X_event_train, X_train_non_text, X_train_text, y_train = _assemble_features(train_gdf)

# Test split
X_base_test, X_event_test, X_test_non_text, X_test_text, y_test = _assemble_features(test_gdf)

print(X_train_non_text.shape, X_train_text.shape)
print(X_test_non_text.shape, X_test_text.shape)

(3659, 12) (3659,)
(915, 12) (915,)


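We then repeatedly train the base models on four folds and predict on the held-out fifth fold, so that every training sample receives a prediction from models that were not trained on it.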

In [22]:
# Define the folds for stacking
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Checkpoint and log locations of the per-fold text model; the same paths are reused in every fold
kf_model_path: str = os.path.join(DATA_PATH, 'models', 'twhin-bert-base_ft_kf', 'model')
kf_logging_path: str = os.path.join(DATA_PATH, 'models', 'twhin-bert-base_ft_kf', 'logs')

# Create the output directory *before* the expensive loop, so that the final np.save calls
# cannot fail with FileNotFoundError after all folds have been trained
oof_output_dir: str = os.path.join(RESULTS_PATH, 'classif_head')
os.makedirs(oof_output_dir, exist_ok=True)

# Label mappings handed to the text classifier (identical for every fold)
class_names = label_encoder.categories[0]
id2label: dict = {i: label for i, label in enumerate(class_names)}
label2id: dict = {label: i for i, label in enumerate(class_names)}

# Column order in which the text probabilities are written into the OOF array
text_proba_columns: list[str] = ['p_not_related', 'p_related_but_not_relevant', 'p_related_and_relevant']

# Arrays to store out-of-fold predicted probabilities for each base model, shape [n_train, n_classes]
n_classes: int = len(train_gdf['int_label'].unique())
oof_preds_nontext = np.zeros((X_train_non_text.shape[0], n_classes))
oof_preds_text = np.zeros((X_train_non_text.shape[0], n_classes))

for train_idx, val_idx in kf.split(X_train_non_text, y_train):
    # Split data for both non-text and text features
    X_train_non_text_kf, X_val_non_text_kf = X_train_non_text[train_idx], X_train_non_text[val_idx]
    X_train_text_kf, X_val_text_kf = X_train_text[train_idx], X_train_text[val_idx]
    y_train_kf, y_val_kf = y_train[train_idx], y_train[val_idx]

    # Hold out 20% of the fold's training part as validation data for the BERT fine-tuning.
    # NOTE(review): this inner split is not stratified - consider stratify=y_train_kf.
    X_train_text_kf_bert, X_val_text_kf_bert, y_train_kf_bert, y_val_kf_bert = train_test_split(
        X_train_text_kf, y_train_kf, test_size=0.2, random_state=1
    )

    # Fit a BERT classification model for the text features
    model, tokenizer, eval_results = train_classifier(
        texts_train=X_train_text_kf_bert.tolist(),
        texts_val=X_val_text_kf_bert.tolist(),
        y_train=y_train_kf_bert.tolist(),
        y_val=y_val_kf_bert.tolist(),
        model_name='Twitter/twhin-bert-base',
        model_path=kf_model_path,
        logging_path=kf_logging_path,
        weighted_loss=False,
        id2label=id2label,
        label2id=label2id,
        learning_rate=0.00004657782284000393,
        weight_decay=0.06513203428915136,
        epochs=5,
        batch_size=16
    )
    print(f'Validation macro F1 for text model: {eval_results}')

    # Create a text classification pipeline from the checkpoint that was just written for this fold
    classifier = pipeline("text-classification", model=kf_model_path,
                          device=device, return_all_scores=True)

    # Make OOF predictions using the trained text model
    text_predictions: list = [
        extract_probabilities(text=text, classifier=classifier) for text in X_val_text_kf.tolist()
    ]
    text_prediction_df: pd.DataFrame = pd.DataFrame.from_dict(text_predictions)
    oof_preds_text[val_idx, :] = text_prediction_df[text_proba_columns].values

    # Next, train a model on the non-text features with parametrisation based from the non-text evaluation
    non_text_model, non_text_params, non_text_f1 = evaluate_model(
        RandomForestClassifier(random_state=2, n_estimators=100, max_depth=None, min_samples_split=5),
        X=X_train_non_text_kf, y=y_train_kf
    )
    print(f'Fitted non-text model (random forest) with validation macro F1: {non_text_f1}')
    print(non_text_params)

    # Get the OOF predictions for the non-text model
    oof_preds_nontext[val_idx, :] = non_text_model.predict_proba(X_val_non_text_kf)

print(oof_preds_text[:5])
print(oof_preds_nontext[:5])

# Store the OOF predictions for later
np.save(os.path.join(oof_output_dir, 'oof_preds_text.npy'), oof_preds_text)
np.save(os.path.join(oof_output_dir, 'oof_preds_nontext.npy'), oof_preds_nontext)

Fitted non-text model (random forest) with validation macro F1: 0.6869042751741713
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 2, 'verbose': 0, 'warm_start': False}
Fitted non-text model (random forest) with validation macro F1: 0.6958871168875864
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 2, 'verbose': 0, 'warm_start': False}
Fitted n

We can now use the out-of-fold predictions to train a set of meta classifiers.

## 2. Meta Model Training
First, we go ahead and train a full ensemble based on the OOF prediction probabilities. We also train a partial meta model that is based on the text OOF predictions and the non-text features.

In [8]:
# Reload the stored out-of-fold probabilities so this cell does not depend on re-running the OOF loop
classif_head_dir: str = os.path.join(RESULTS_PATH, 'classif_head')
oof_preds_text = np.load(os.path.join(classif_head_dir, 'oof_preds_text.npy'))
oof_preds_nontext = np.load(os.path.join(classif_head_dir, 'oof_preds_nontext.npy'))

print(oof_preds_text[:5])
print(oof_preds_nontext[:5])

# "probabilities": OOF probabilities of both base models, shape [n_train, 2*n_classes]
meta_prob_features_train: np.ndarray = np.hstack([oof_preds_nontext, oof_preds_text])
print(meta_prob_features_train.shape)

# "partial": non-text feature matrix plus the text OOF probabilities,
# shape [n_train, n_non_text_features + n_classes]
meta_part_features_train: np.ndarray = np.hstack([X_train_non_text, oof_preds_text])
print(meta_part_features_train.shape)

# Every candidate estimator is tuned once per feature set; results keep the order
# (probabilities, partial) for each model.
# NOTE(review): the same estimator instance is passed to both searches - this assumes that
# optimise_model does not modify it in place; confirm against its implementation.
meta_feature_sets = (
    ('probabilities', meta_prob_features_train),
    ('partial', meta_part_features_train),
)
meta_training_results: list[dict] = []
for model_name, model in tqdm(models.items()):
    for method, meta_features in meta_feature_sets:
        fitted_meta_model, best_meta_params, best_meta_f1 = optimise_model(model, meta_features, y_train)
        meta_training_results.append({
            'model_name': model_name,
            'method': method,
            'params': best_meta_params,
            'cv_macro_f1': best_meta_f1,
            'model': fitted_meta_model
        })

# Persist the validation scores as CSV and the fitted models as pickle
meta_result_df: pd.DataFrame = pd.DataFrame.from_dict(meta_training_results)
meta_result_df.to_csv(os.path.join(classif_head_dir, 'meta_learner_validation.csv'), index=False)
with open(os.path.join(classif_head_dir, 'meta_learner_models.pickle'), 'wb') as models_file:
    pickle.dump(meta_training_results, models_file)
meta_result_df

[[9.74210154e-04 9.94033277e-01 4.99257911e-03]
 [1.30112341e-03 9.97927666e-01 7.71237537e-04]
 [3.79778771e-03 4.16162517e-03 9.92040575e-01]
 [7.70559013e-01 8.05612747e-03 2.21384898e-01]
 [9.98559773e-01 3.94126022e-04 1.04616175e-03]]
[[0.06975397 0.36894048 0.56130556]
 [0.01821429 0.97311905 0.00866667]
 [0.49435714 0.41577381 0.08986905]
 [0.14735317 0.38259921 0.47004762]
 [0.24007937 0.45977778 0.30014286]]
(3659, 6)
(3659, 15)


100%|██████████| 6/6 [03:09<00:00, 31.50s/it]


Unnamed: 0,model_name,method,params,cv_macro_f1,model
0,logistic_regression,probabilities,"{'C': 10, 'max_iter': 1000, 'penalty': 'l2', '...",0.81318,"LogisticRegression(C=10, max_iter=1000, random..."
1,logistic_regression,partial,"{'C': 10, 'max_iter': 2000, 'penalty': 'l2', '...",0.811393,"LogisticRegression(C=10, max_iter=2000, random..."
2,random_forest,probabilities,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.813663,"(DecisionTreeClassifier(max_depth=10, max_feat..."
3,random_forest,partial,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.817497,"(DecisionTreeClassifier(max_depth=10, max_feat..."
4,svm,probabilities,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.808651,"SVC(C=10, probability=True, random_state=3)"
5,svm,partial,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.565073,"SVC(C=10, probability=True, random_state=3)"
6,gradient_boosting,probabilities,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.810717,([DecisionTreeRegressor(criterion='friedman_ms...
7,gradient_boosting,partial,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.81457,([DecisionTreeRegressor(criterion='friedman_ms...
8,knn,probabilities,"{'n_neighbors': 9, 'weights': 'uniform'}",0.811446,KNeighborsClassifier(n_neighbors=9)
9,knn,partial,"{'n_neighbors': 9, 'weights': 'distance'}",0.66297,"KNeighborsClassifier(n_neighbors=9, weights='d..."


We might also try to do the equivalent per use case.

In [26]:
# Disabled experiment - please do not run this, it's a waste of time.
# The per-use-case meta-learner training below is wrapped in a string literal so that it never executes.
# NOTE(review): because the string is the last expression of the cell, Jupyter echoes the whole text as
# the cell output; a trailing `;` or a config flag would avoid that.
# NOTE(review): the in-string shape comment says "n_classes+9", but X_base_train only holds the
# len(NON_TEXT_COLUMNS) = 6 base columns, i.e. 6 + n_classes columns in total.
"""
# Train a model for each 'use_case' in the training data using meta_part_features_train
oof_preds_text = np.load(os.path.join(RESULTS_PATH, 'classif_head', 'oof_preds_text.npy'))
oof_preds_nontext = np.load(os.path.join(RESULTS_PATH, 'classif_head', 'oof_preds_nontext.npy'))

# Features based on text probabilities and non-text features
meta_part_features_train_use_case: np.ndarray = np.concatenate([X_base_train, oof_preds_text], axis=1)  # shape: [n_train, n_classes+9]
print(meta_part_features_train_use_case.shape)

# Get unique use cases from the training dataframe
unique_use_cases = train_gdf['use_case'].unique()
meta_training_results_by_use_case = []

for use_case in unique_use_cases:
    # Create a boolean mask for the current use_case
    use_case_mask = train_gdf['use_case'] == use_case
    
    # Filter the feature matrix and labels for the current use_case
    X_use_case = meta_part_features_train_use_case[use_case_mask]
    y_use_case = y_train[use_case_mask]
    
    # print(f"Training models for use_case: {use_case}")
    # Iterate over the models and train on the use_case-specific data
    for model_name, model in tqdm(models.items(), desc=f"Use case {use_case}"):
        use_case_model, use_case_params, use_case_f1 = optimise_model(model, X_use_case, y_use_case)
        meta_training_results_by_use_case.append({
            'use_case': use_case,
            'model_name': model_name,
            'method': 'partial',
            'params': use_case_params,
            'cv_macro_f1': use_case_f1,
            'model': use_case_model
        })

# Convert results to a DataFrame and save to CSV
meta_result_by_use_case_df = pd.DataFrame.from_dict(meta_training_results_by_use_case)
meta_result_by_use_case_df.to_csv(os.path.join(RESULTS_PATH, 'classif_head', 'meta_learner_validation_by_use_case.csv'), index=False)
with open(os.path.join(RESULTS_PATH, 'classif_head', 'meta_learner_models_per_use_case.pickle'), 'wb') as f:
    pickle.dump(meta_training_results_by_use_case, f)

# Display the resulting DataFrame
meta_result_by_use_case_df
"""

'\n# Train a model for each \'use_case\' in the training data using meta_part_features_train\noof_preds_text = np.load(os.path.join(RESULTS_PATH, \'classif_head\', \'oof_preds_text.npy\'))\noof_preds_nontext = np.load(os.path.join(RESULTS_PATH, \'classif_head\', \'oof_preds_nontext.npy\'))\n\n# Features based on text probabilities and non-text features\nmeta_part_features_train_use_case: np.ndarray = np.concatenate([X_base_train, oof_preds_text], axis=1)  # shape: [n_train, n_classes+9]\nprint(meta_part_features_train_use_case.shape)\n\n# Get unique use cases from the training dataframe\nunique_use_cases = train_gdf[\'use_case\'].unique()\nmeta_training_results_by_use_case = []\n\nfor use_case in unique_use_cases:\n    # Create a boolean mask for the current use_case\n    use_case_mask = train_gdf[\'use_case\'] == use_case\n    \n    # Filter the feature matrix and labels for the current use_case\n    X_use_case = meta_part_features_train_use_case[use_case_mask]\n    y_use_case = y_tra

## 3. Full Ensemble Building
Lastly, let's build the full ensemble, using (1) our best text model and (2) our best non-text model. To achieve this, we first need to train individual classifiers.

In [9]:
# Text model: load the checkpoint that has already been fine-tuned and stored on disk.
# NOTE(review): `return_all_scores` is deprecated in recent transformers releases in favour of
# `top_k=None`; the output nesting differs, so extract_probabilities must be checked before switching.
full_text_model_path: str = os.path.join(DATA_PATH, 'models', 'twhin-bert-base_ft', 'model')
classifier_full = pipeline(
    "text-classification",
    model=full_text_model_path,
    device=device,
    return_all_scores=True
)

# Non-text model: tune a random forest on the complete training set via optimise_model,
# which returns the fitted model, its parameters and the validation macro F1
full_forest_search = optimise_model(RandomForestClassifier(random_state=2), X_train_non_text, y_train)
non_text_model_full, non_text_params_full, non_text_f1_full = full_forest_search
print(f'Fitted non-text model (random forest) with validation macro F1: {non_text_f1_full}')
print(non_text_params_full)

Fitted non-text model (random forest) with validation macro F1: 0.704437589623365
{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}


We then get the class probabilities for our test data.

In [10]:
# 1. Class probabilities of the non-text model on the test set
test_preds_nontext = non_text_model_full.predict_proba(X_test_non_text)

# 2. Class probabilities of the text model on the test set (one pipeline call per tweet)
text_predictions_test = [
    extract_probabilities(text=text, classifier=classifier_full) for text in X_test_text.tolist()
]
text_prediction_df_test = pd.DataFrame.from_dict(text_predictions_test)
# The column order must match the one used for the out-of-fold training features
text_proba_columns_test = ['p_not_related', 'p_related_but_not_relevant', 'p_related_and_relevant']
test_preds_text = text_prediction_df_test[text_proba_columns_test].values

# "probabilities" features: both probability blocks, shape [n_test, 2*n_classes]
meta_prob_features_test: np.ndarray = np.hstack([test_preds_nontext, test_preds_text])
print(meta_prob_features_test.shape)

# "partial" features: non-text feature matrix plus text probabilities,
# shape [n_test, n_non_text_features + n_classes]
meta_part_features_test: np.ndarray = np.hstack([X_test_non_text, test_preds_text])
print(meta_part_features_test.shape)

# Use-case specific variant: only the base non-text columns plus text probabilities,
# shape [n_test, len(NON_TEXT_COLUMNS) + n_classes]
meta_part_features_test_use_case: np.ndarray = np.hstack([X_base_test, test_preds_text])
print(meta_part_features_test_use_case.shape)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


(915, 6)
(915, 15)
(915, 9)


Lastly, we are able to apply the meta classifier.

In [11]:
meta_results: list[dict] = []

# Test feature matrix that belongs to each kind of meta model
test_features_by_method: dict[str, np.ndarray] = {
    'probabilities': meta_prob_features_test,   # OOF-style probabilities of both base models
    'partial': meta_part_features_test,         # non-text features + text probabilities
}

# Ground truth is loop-invariant; take it once (adding prediction columns below does not change it)
y_true: np.ndarray = test_gdf['int_label'].to_numpy()

# Iterate over every possible meta model
for meta_config in meta_training_results:
    method: str = meta_config['method']
    # Fail loudly on an unknown method instead of silently re-using the predictions of the
    # previous iteration (the former if/elif chain had no else branch).
    if method not in test_features_by_method:
        raise ValueError(f'Unknown meta method: {method!r}')
    meta_features_test: np.ndarray = test_features_by_method[method]

    predictions: np.ndarray = meta_config['model'].predict(meta_features_test)
    probs: np.ndarray = meta_config['model'].predict_proba(meta_features_test)
    test_gdf[f'pred_{method}_{meta_config["model_name"]}'] = predictions

    # Compute all needed evaluation metrics (support is None for averaged scores, hence discarded)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, predictions, average='macro')
    rmse: float = root_mean_squared_error(y_true=y_true, y_pred=predictions)
    mae: float = mean_absolute_error(y_true=y_true, y_pred=predictions)
    roc_auc = roc_auc_score(y_true=y_true, y_score=probs, multi_class='ovr')
    acc = accuracy_score(y_true=y_true, y_pred=predictions)

    meta_results.append({
        "model_name": meta_config['model_name'],
        "meta_method": method,
        "test_macro_prec": prec,
        "test_macro_rec": rec,
        "test_macro_f1": f1,
        'test_acc': acc,
        "test_roc_auc": roc_auc,
        "test_rmse": rmse,
        "test_mae": mae
    })

# Persist the test metrics and the per-sample predictions
meta_result_df: pd.DataFrame = pd.DataFrame.from_dict(meta_results)
meta_result_df.to_csv(os.path.join(RESULTS_PATH, 'classif_head', 'meta_learner_test.csv'), index=False)
test_gdf.to_parquet(os.path.join(RESULTS_PATH, 'classif_head', 'meta_learning_preds.parquet'))
meta_result_df

Unnamed: 0,model_name,meta_method,test_macro_prec,test_macro_rec,test_macro_f1,test_acc,test_roc_auc,test_rmse,test_mae
0,logistic_regression,probabilities,0.832501,0.799591,0.810704,0.831694,0.938008,0.486989,0.191257
1,logistic_regression,partial,0.823054,0.788032,0.79963,0.821858,0.930901,0.493676,0.2
2,random_forest,probabilities,0.796501,0.789895,0.792667,0.809836,0.920972,0.508937,0.213115
3,random_forest,partial,0.831884,0.801935,0.812524,0.831694,0.931782,0.486989,0.191257
4,svm,probabilities,0.824932,0.788224,0.799943,0.824044,0.909566,0.494782,0.198907
5,svm,partial,0.656736,0.594951,0.57486,0.68306,0.855459,0.613153,0.336612
6,gradient_boosting,probabilities,0.6999,0.705057,0.699323,0.726776,0.905003,0.718225,0.354098
7,gradient_boosting,partial,0.82834,0.806004,0.814486,0.831694,0.927311,0.483611,0.190164
8,knn,probabilities,0.811169,0.779969,0.79043,0.813115,0.916077,0.512148,0.212022
9,knn,partial,0.680597,0.67225,0.675574,0.701639,0.857539,0.66283,0.345355


Again, we could do the same thing per use case. The cell below is disabled and kept for reference only.

In [None]:
# Disabled experiment - please do not run this, it's a waste of time, this was just for experimental purposes.
# The per-use-case evaluation below is wrapped in a string literal so that it never executes.
# NOTE(review): it relies on `meta_training_results_by_use_case`, which is only created inside the
# (equally disabled) per-use-case training cell.
# NOTE(review): `test_gdf.index[use_case_mask]` yields index *labels* that are then used as array
# positions; this is only correct if test_gdf has a default 0..n-1 index - confirm before re-enabling.
"""
# Initialize dictionaries to hold global predictions and probabilities for each (model_name, method)
global_predictions = {}  # key: (model_name, method), value: numpy array of predictions for all test samples
global_probabilities = {}  # key: (model_name, method), value: numpy array of predicted probabilities

# First, identify all keys from the use-case–specific meta training results
for meta_config in meta_training_results_by_use_case:
    key = (meta_config['model_name'], meta_config['method'])
    if key not in global_predictions:
        # Create an empty array for predictions for all test samples (assumes predictions are int type)
        global_predictions[key] = np.empty(len(test_gdf), dtype=int)
        # We'll initialize probabilities once we know the number of classes (from the first use case)
        global_probabilities[key] = None

# Now, iterate over each meta config and fill in predictions for its use case subset
for meta_config in meta_training_results_by_use_case:
    key = (meta_config['model_name'], meta_config['method'])
    use_case = meta_config['use_case']
    # Create a boolean mask for test samples belonging to this use case
    use_case_mask = test_gdf['use_case'] == use_case
    
    # Select the appropriate feature matrix based on the method
    if meta_config['method'] == 'partial':
        X_subset = meta_part_features_test_use_case[use_case_mask]
    else:  # For example, if method == 'probabilities'
        X_subset = meta_prob_features_test[use_case_mask]
    
    # Obtain predictions and predicted probabilities on the subset
    preds_subset = meta_config['model'].predict(X_subset)
    probs_subset = meta_config['model'].predict_proba(X_subset)
    
    # If the probabilities array for this key is not yet initialized, do it now
    if global_probabilities[key] is None:
        global_probabilities[key] = np.empty((len(test_gdf), probs_subset.shape[1]))
    
    # Fill in the predictions and probabilities for these test indices
    indices = test_gdf.index[use_case_mask]
    global_predictions[key][indices] = preds_subset
    global_probabilities[key][indices] = probs_subset

# Optionally, add the global predictions to test_gdf as new columns
for key, preds in global_predictions.items():
    col_name = f'pred_use_case_{key[0]}_{key[1]}'
    test_gdf[col_name] = preds

# Now that we have predictions for every test sample from each meta model, compute evaluation metrics globally.
meta_results_use_case: list[dict] = []
for key, preds in global_predictions.items():
    probs = global_probabilities[key]
    prec, rec, f1, _ = precision_recall_fscore_support(
        test_gdf['int_label'], preds, average='macro'
    )
    rmse: float = root_mean_squared_error(test_gdf['int_label'], preds)
    mae: float = mean_absolute_error(test_gdf['int_label'], preds)
    roc_auc = roc_auc_score(test_gdf['int_label'], probs, multi_class='ovr')
    acc = accuracy_score(test_gdf['int_label'], preds)

    # Compute average confidence for misclassified samples:
    misclassified_mask = preds != test_gdf['int_label'].values
    if misclassified_mask.sum() > 0:
        mis_confidences = [probs[i, preds[i]] for i in np.where(misclassified_mask)[0]]
        avg_mis_conf = np.mean(mis_confidences)
    else:
        avg_mis_conf = np.nan  # if no misclassifications, return NaN
        
    # Compute average confidence for correctly classified samples:
    correct_mask = preds == test_gdf['int_label'].values
    if correct_mask.sum() > 0:
        correct_confidences = [probs[i, preds[i]] for i in np.where(correct_mask)[0]]
        avg_correct_conf = np.mean(correct_confidences)
    else:
        avg_correct_conf = np.nan  # if no correct classifications, return NaN
    
    meta_results_use_case.append({
         "model_name": key[0],
         "meta_method": f"use_case_{key[1]}",
         "test_macro_prec": prec,
         "test_macro_rec": rec,
         "test_macro_f1": f1,
         "test_acc": acc,
         "test_roc_auc": roc_auc,
         "test_rmse": rmse,
         "test_mae": mae,
         "avg_misclassified_confidence": avg_mis_conf,
         "avg_correct_confidence": avg_correct_conf
    })

# Save the evaluation results and predictions
meta_result_use_case_df: pd.DataFrame = pd.DataFrame.from_dict(meta_results_use_case)
meta_result_use_case_df.to_csv(os.path.join(RESULTS_PATH, 'classif_head', 'meta_learner_test_per_use_case.csv'), index=False)
test_gdf.to_parquet(os.path.join(RESULTS_PATH, 'classif_head', 'meta_learning_preds_per_use_case.parquet'))

meta_result_use_case_df
"""