In [4]:
# activate autoreload
%load_ext autoreload
%autoreload 2

# check if session is in Google Colab
try:
    import google.colab
    IN_COLAB = True
    print('Google Colab session!')
except:
    IN_COLAB = False
    print('Not a Google Colab session.')

# add src path to the notebook
import os
import sys
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT: str = '/content/drive/MyDrive/papers/2025b_relevance_2.0'
    !pip install contextily esda deep-translator h3pandas h3~=3.0 datasets optuna setfit
else:
    PROJECT_ROOT: str = os.path.dirname(os.path.abspath(os.path.dirname("__file__")))
if PROJECT_ROOT not in sys.path:
    sys.path.append(os.path.join(PROJECT_ROOT))
print(PROJECT_ROOT)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Not a Google Colab session.
/mnt/c/Users/DavidHanny/OneDrive - IT U interdisciplinary transformation university austria/Documents/projects/papers/2025_GSAI_RES_Relevance_Classification


In [5]:
!nvidia-smi

Wed Jul 30 10:51:39 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.51.02              Driver Version: 576.02         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A500 Laptop GPU     On  |   00000000:01:00.0 Off |                  N/A |
| N/A   51C    P0             10W /   24W |       0MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# **Performance benchmarks**
Lastly, this notebook is for performance benchmarks of:
- the non-text model
- the text model (TwHIN-BERT)
- the combined inference pipeline

In [6]:
import warnings
import pickle
import torch
import timeit
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
from transformers import pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, root_mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from src.model_training.classification_head import optimise_model, evaluate_model
from src.model_training.bert import train_classifier, extract_probabilities
tqdm.pandas()
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# surpress ConvergenceWarnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

# set data path
DATA_PATH: str = os.path.join(PROJECT_ROOT, 'data')
RESULTS_PATH: str = os.path.join(PROJECT_ROOT, 'results')
print(DATA_PATH)

# set pytorch device
device: str = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
if device.type == 'cuda':
    GPU_NAME: str = torch.cuda.get_device_name(0)
    print(GPU_NAME)
else:
    GPU_NAME: str = 'CPU'

CPU_NAME: str = 'Intel(R) Core(TM) Ultra 7 165H'
print(CPU_NAME)

/mnt/c/Users/DavidHanny/OneDrive - IT U interdisciplinary transformation university austria/Documents/projects/papers/2025_GSAI_RES_Relevance_Classification/data
Device: cuda
NVIDIA RTX A500 Laptop GPU
Intel(R) Core(TM) Ultra 7 165H


## 1. Training Data
First, we need to prepare our training and evaluation data that we will use throughout the study.

In [7]:
train_gdf: gpd.GeoDataFrame = gpd.read_parquet(os.path.join(DATA_PATH, 'processed', 'fine_tuning', 'train_data.parquet'))
test_gdf: gpd.GeoDataFrame = gpd.read_parquet(os.path.join(DATA_PATH, 'processed', 'fine_tuning', 'test_data.parquet'))
with open(os.path.join(DATA_PATH, 'processed', 'fine_tuning', 'train_label_encoder.pkl'), 'rb') as f:
    label_encoder: OrdinalEncoder = pickle.load(f)

NON_TEXT_COLUMNS: list[str] = [
    'event_distance_km',
    'event_distance_h',
    'n_disaster_tweets_1km',
    'n_disaster_tweets_10km',
    'n_disaster_tweets_50km',
    'n_disaster_tweets_10000km'
]
NON_TEXT_COLUMNS_NORM: list[str] = [f'{x}_norm' for x in NON_TEXT_COLUMNS]

# Now you can use the loaded label encoder
print("Class encodings:", label_encoder.categories[0])
print(train_gdf.shape)
print(test_gdf.shape)
pd.DataFrame(train_gdf)

Class encodings: ['Not related', 'Related but not relevant', 'Related and relevant']
(3659, 45)
(915, 45)


Unnamed: 0,message_id,date,use_case,text,tweet_lang,geometry,photo_url,text_raw,related,x,...,sphere_y,sphere_z,int_label,valid,event_distance_km_norm,event_distance_h_norm,n_disaster_tweets_1km_norm,n_disaster_tweets_10km_norm,n_disaster_tweets_50km_norm,n_disaster_tweets_10000km_norm
0,1.296800e+18,2020-08-21 13:40:45,California 🔥,Closed due to the czu august lightning complex...,,POINT (-122.36110 37.16663),,Closed due to the czu august lightning complex...,1,-1.362118e+07,...,0.095685,-0.954961,1,True,-0.457531,-0.099120,-0.370997,-0.451317,0.758702,1.366236
1,1.417100e+18,2021-07-19 12:23:18,Germany 🌊,Mich beunruhigt nichts mehr.Wir sorgen persönl...,de,POINT (12.22671 51.84923),,Mich beunruhigt nichts mehr.Wir sorgen persönl...,1,1.361556e+06,...,0.645793,0.288072,1,True,1.300809,0.095609,0.362793,0.052028,-0.467472,1.510834
2,1.341270e+18,2020-12-22 06:29:24,California 🔥,The view out my kitchen window of the massive ...,,POINT (-118.41191 34.02069),,The view out my kitchen window of the massive ...,1,-1.318155e+07,...,0.095685,-0.954961,2,True,-0.714648,0.772862,0.798591,0.527691,-0.172292,-0.719290
3,1.320860e+18,2020-10-26 22:49:26,California 🔥,@user @user @user I love 1/2 miles from Anahei...,,POINT (-117.85109 33.84275),,@ZestForLifeNow @City_of_Anaheim @AnaheimFire ...,1,-1.311912e+07,...,0.095685,-0.954961,2,True,-0.714648,-0.099120,-0.105182,-0.068227,0.592908,-0.276396
4,1.296050e+18,2020-08-19 11:26:50,California 🔥,Someone fucking set off my apartment building’...,,POINT (-118.41191 34.02069),,Someone fucking set off my apartment building’...,1,-1.318155e+07,...,0.095685,-0.954961,0,True,-0.714648,0.852190,2.446648,1.889790,1.001015,0.244985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3654,1.624650e+18,2023-02-12 06:18:19,Turkey 🪨,Hatay/Hassa'da en kazın altında çıkan not...#d...,tr,POINT (36.51252 36.78301),http://pbs.twimg.com/media/FovuSvIX0AUnSdz.jpg,Hatay/Hassa'da en kazın altında çıkan not...\r...,1,4.064655e+06,...,-0.317761,0.887312,2,True,-1.168961,-0.188440,-0.317823,-0.436595,0.543710,2.060679
3655,1.610290e+18,2023-01-03 15:19:57,Turkey 🪨,FutBol Sohbet programımızın yeni bölümü YouTub...,tr,POINT (33.78141 41.38023),http://pbs.twimg.com/media/FljqrPJWQAcU-DW.jpg,FutBol Sohbet programımızın yeni bölümü YouTub...,0,3.760672e+06,...,-0.317761,0.887312,0,True,0.882802,-2.645158,-0.544401,-0.635649,-0.646982,-1.109802
3656,1.627130e+18,2023-02-19 02:12:45,Chile 🔥,Pasando ahora 😭😭 #Coronel #Biobio #IncendioFor...,es,POINT (-73.22220 -37.00482),http://pbs.twimg.com/ext_tw_video_thumb/162712...,Pasando ahora 😭😭 #Coronel #Biobio #IncendioFor...,1,-8.154494e+06,...,0.869559,0.356159,2,True,-0.312896,0.300568,-0.480332,-0.410065,0.183655,0.021594
3657,1.416140e+18,2021-07-16 21:10:24,Germany 🌊,Rhein unterspült Uferstrasse in Basel und löst...,de,POINT (7.65276 47.57676),,Rhein unterspült Uferstrasse in Basel und löst...,1,8.524011e+05,...,0.645793,0.288072,1,True,0.852110,-0.180118,-0.141470,-0.150951,-0.432042,0.471640


Let's also prepare the data first.

In [8]:
# Define the event and location encoding options
encoding_options = {
    "none": lambda df: np.empty((df.shape[0], 0)),  # returns an empty array so hstack works
    "event_type_encoding": lambda df: np.vstack(df['event_type_encoding'].values),
    "sphere_coords": lambda df: df[['sphere_x', 'sphere_y', 'sphere_z']].values,
    "all": lambda df: np.hstack([
        np.vstack(df['event_type_encoding'].values),
        df[['sphere_x', 'sphere_y', 'sphere_z']].values
    ])
}

# Models to evaluate for the non-text features
models = {
    "logistic_regression": LogisticRegression(random_state=1),
    "random_forest": RandomForestClassifier(random_state=2),
    "svm": SVC(probability=True, random_state=3),
    "gradient_boosting": GradientBoostingClassifier(random_state=5),
    "knn": KNeighborsClassifier(),
    "naive_bayes": GaussianNB()
}

# Construct the feature matrix for the training data
X_base_train: np.ndarray = train_gdf[NON_TEXT_COLUMNS].values  # base features
X_event_train: np.ndarray = encoding_options['all'](train_gdf)  # event encoding features
X_train_non_text: np.ndarray = np.hstack([X_base_train, X_event_train])
X_train_text: np.ndarray = train_gdf['text'].values
y_train: np.ndarray = train_gdf['int_label'].values

# Construct the feature matrix for the test data
X_base_test: np.ndarray = test_gdf[NON_TEXT_COLUMNS].values  # base features
X_event_test: np.ndarray = encoding_options['all'](test_gdf)  # event encoding features
X_test_text: np.ndarray = test_gdf['text'].values
X_test_non_text: np.ndarray = np.hstack([X_base_test, X_event_test])
y_test: np.ndarray = test_gdf['int_label'].values

print(X_train_non_text.shape, X_train_text.shape)
print(X_test_non_text.shape, X_test_text.shape)

(3659, 12) (3659,)
(915, 12) (915,)


## 2. Non-text benchmark
For the ease of it, let's just benchmark all models.

In [6]:
def benchmark_non_text_models() -> pd.DataFrame:
    benchmark_entries: list[dict] = []

    for name, model in tqdm(models.items()):
        non_text_model_full, non_text_params_full, non_text_f1_full = optimise_model(
        model, X_train_non_text, y_train)
        print(f'Fitted non-text model ({name}) with validation macro F1: {non_text_f1_full}')
        print(non_text_params_full)

        def benchmark():
            non_text_model_full.predict(X_test_non_text)
        
        # Run timeit (repeat 10 times, get average)
        runs: int = 10
        times = timeit.repeat(benchmark, repeat=runs, number=1)
        average_time = np.mean(times)
        benchmark_entries.append({
            'cpu': CPU_NAME,
            'gpu': GPU_NAME,
            'runs': runs,
            'model': name,
            'pred_samples': len(X_test_non_text),
            'avg_inference_time_ms': average_time * 1000,
            'avg_latency_per_sample': average_time / len(X_test_non_text) * 1000
        })

    return pd.DataFrame(benchmark_entries)
benchmark_df_non_text: pd.DataFrame = benchmark_non_text_models()
benchmark_df_non_text.to_csv(os.path.join(RESULTS_PATH, 'non_text_model', 'inference_benchmark.csv'))
benchmark_df_non_text

 17%|█▋        | 1/6 [00:13<01:06, 13.26s/it]

Fitted non-text model (logistic_regression) with validation macro F1: 0.621695552567351
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}


 33%|███▎      | 2/6 [00:21<00:41, 10.46s/it]

Fitted non-text model (random_forest) with validation macro F1: 0.704437589623365
{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Fitted non-text model (svm) with validation macro F1: 0.565823347179145
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


 67%|██████▋   | 4/6 [01:43<01:08, 34.08s/it]

Fitted non-text model (gradient_boosting) with validation macro F1: 0.6952075587391988
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Fitted non-text model (knn) with validation macro F1: 0.6594967504331689
{'n_neighbors': 9, 'weights': 'distance'}


100%|██████████| 6/6 [01:43<00:00, 17.23s/it]


Fitted non-text model (naive_bayes) with validation macro F1: 0.5618767481550588
{'var_smoothing': 1e-08}


Unnamed: 0,cpu,gpu,runs,model,pred_samples,avg_inference_time_ms,avg_latency_per_sample
0,Intel(R) Core(TM) Ultra 7 165H,NVIDIA RTX A500 Laptop GPU,10,logistic_regression,915,0.357619,0.000391
1,Intel(R) Core(TM) Ultra 7 165H,NVIDIA RTX A500 Laptop GPU,10,random_forest,915,12.2676,0.013407
2,Intel(R) Core(TM) Ultra 7 165H,NVIDIA RTX A500 Laptop GPU,10,svm,915,76.295336,0.083383
3,Intel(R) Core(TM) Ultra 7 165H,NVIDIA RTX A500 Laptop GPU,10,gradient_boosting,915,4.295238,0.004694
4,Intel(R) Core(TM) Ultra 7 165H,NVIDIA RTX A500 Laptop GPU,10,knn,915,4.673786,0.005108
5,Intel(R) Core(TM) Ultra 7 165H,NVIDIA RTX A500 Laptop GPU,10,naive_bayes,915,0.189761,0.000207


## 3. Text model benchmark
Secondly, let's evaluate the inference time for the text model.

In [None]:
def benchmark_text_model() -> pd.DataFrame:
    benchmark_entries: list[dict] = []
    devices: dict = {CPU_NAME: -1, GPU_NAME: 0}

    for device_name, device_id in devices.items():
        # Our text model already has been fine-tuned
        classifier_full = pipeline(
            "text-classification", 
            model=os.path.join(DATA_PATH, 'models', f'twhin-bert-base_ft', 'model'),
            device=device_id, 
            return_all_scores=True
        )

        def benchmark():
            for text in tqdm(X_test_text):
                classifier_full(text, truncation=True)

        runs: int = 10
        times = timeit.repeat(benchmark, repeat=runs, number=1)
        average_time = np.mean(times)

        benchmark_entries.append({
            'device': device_name,
            'runs': runs,
            'model': 'twhin-bert-base_ft',
            'pred_samples': len(X_test_text),
            'avg_inference_time_ms': average_time * 1000,
            'avg_latency_per_sample_ms': average_time / len(X_test_text) * 1000
        })
    return pd.DataFrame(benchmark_entries)

# note: running this requires 1.2 to 1.3GB of VRAM
text_benchmark_df: pd.DataFrame = benchmark_text_model()
text_benchmark_df.to_csv(os.path.join(RESULTS_PATH, 'fine_tuning', 'inference_benchmark.csv'), index=False)
text_benchmark_df

  0%|          | 0/915 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 915/915 [01:08<00:00, 13.29it/s]
100%|██████████| 915/915 [01:00<00:00, 15.20it/s]
100%|██████████| 915/915 [01:02<00:00, 14.57it/s]
100%|██████████| 915/915 [01:04<00:00, 14.10it/s]
100%|██████████| 915/915 [01:08<00:00, 13.29it/s]
100%|██████████| 915/915 [01:02<00:00, 14.71it/s]
100%|██████████| 915/915 [01:05<00:00, 13.90it/s]
100%|██████████| 915/915 [01:14<00:00, 12.30it/s]
100%|██████████| 915/915 [01:05<00:00, 13.93it/s]
100%|██████████| 915/915 [01:00<00:00, 15.19it/s]
  0%|          | 0/915 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  1%|          | 7/915 [00:00<00:26, 34.84it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency pleas

Unnamed: 0,device,runs,model,pred_samples,avg_inference_time_ms,avg_latency_per_sample
0,Intel(R) Core(TM) Ultra 7 165H,10,twhin-bert-base_ft,915,65399.401302,71.474756
1,NVIDIA RTX A500 Laptop GPU,10,twhin-bert-base_ft,915,15834.665445,17.305645


Let's also do the same for our disaster-relatedness model.

In [10]:
def benchmark_text_model_relatedness() -> pd.DataFrame:
    benchmark_entries: list[dict] = []
    devices: dict = {CPU_NAME: -1, GPU_NAME: 0}

    for device_name, device_id in devices.items():
        # Our text model already has been fine-tuned
        classifier_full = pipeline(
            "text-classification", 
            model='hannybal/disaster-twitter-xlm-roberta-al',
            device=device_id, 
            return_all_scores=True,
            tokenizer='cardiffnlp/twitter-xlm-roberta-base'
        )

        def benchmark():
            for text in tqdm(X_test_text):
                classifier_full(text, truncation=True)

        runs: int = 10
        times = timeit.repeat(benchmark, repeat=runs, number=1)
        average_time = np.mean(times)

        benchmark_entries.append({
            'device': device_name,
            'runs': runs,
            'model': 'twhin-bert-base_ft',
            'pred_samples': len(X_test_text),
            'avg_inference_time_ms': average_time * 1000,
            'avg_latency_per_sample_ms': average_time / len(X_test_text) * 1000
        })
    return pd.DataFrame(benchmark_entries)

# note: running this requires 1.2 to 1.3GB of VRAM
text_benchmark_relatedness_df: pd.DataFrame = benchmark_text_model_relatedness()
text_benchmark_relatedness_df.to_csv(os.path.join(RESULTS_PATH, 'fine_tuning', 'inference_benchmark_relatedness.csv'), index=False)
text_benchmark_relatedness_df

  0%|          | 0/915 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 915/915 [00:52<00:00, 17.42it/s]
100%|██████████| 915/915 [00:50<00:00, 18.09it/s]
100%|██████████| 915/915 [01:02<00:00, 14.72it/s]
100%|██████████| 915/915 [01:02<00:00, 14.63it/s]
100%|██████████| 915/915 [00:57<00:00, 15.85it/s]
100%|██████████| 915/915 [00:57<00:00, 15.89it/s]
100%|██████████| 915/915 [00:51<00:00, 17.85it/s]
100%|██████████| 915/915 [00:50<00:00, 17.98it/s]
100%|██████████| 915/915 [01:00<00:00, 15.07it/s]
100%|██████████| 915/915 [00:52<00:00, 17.54it/s]
  0%|          | 0/915 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  1%|          | 9/915 [00:00<00:26, 33.78it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency pleas

Unnamed: 0,device,runs,model,pred_samples,avg_inference_time_ms,avg_latency_per_sample_ms
0,Intel(R) Core(TM) Ultra 7 165H,10,twhin-bert-base_ft,915,55816.755961,61.001919
1,NVIDIA RTX A500 Laptop GPU,10,twhin-bert-base_ft,915,11671.656735,12.755909
