In [None]:
# activate autoreload
%load_ext autoreload
%autoreload 2

# check if session is in Google Colab
try:
    import google.colab
    IN_COLAB = True
    print('Google Colab session!')
except:
    IN_COLAB = False
    print('Not a Google Colab session.')

# add src path to the notebook
import os
import sys
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT: str = '/content/drive/MyDrive/papers/2025b_relevance_2.0'
    !pip install contextily esda deep-translator h3pandas h3~=3.0 datasets optuna
else:
    PROJECT_ROOT: str = os.path.dirname(os.path.abspath(os.path.dirname("__file__")))
if PROJECT_ROOT not in sys.path:
    sys.path.append(os.path.join(PROJECT_ROOT))
print(PROJECT_ROOT)

Google Colab session!
Mounted at /content/drive
Collecting contextily
  Downloading contextily-1.6.2-py3-none-any.whl.metadata (2.9 kB)
Collecting esda
  Downloading esda-2.7.0-py3-none-any.whl.metadata (2.0 kB)
Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting h3pandas
  Downloading h3pandas-0.2.6.tar.gz (138 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.4/138.4 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting h3~=3.0
  Downloading h3-3.7.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting mercantile (from contextily)
  Downloading mercantile-1.2.1-py3-none-any.whl.metadata (4.8 kB)
Collecting rasterio (from contextily)
  Downloading rast

# Pre-Training with generic data from CrisisLex
To improve model performance for relevance classification, we can use the data from CrisisLexT26 (Olteanu et al., 2015) to pre-train a [twhin-bert-base](https://huggingface.co/Twitter/twhin-bert-base) model. This model can then be instantiated as a SentenceTransformers model, fine-tuned using SetFit, and finally used for effective feature engineering with different classification heads on top.

In [None]:
import re
import glob
import pandas as pd
from tqdm import tqdm
from src.utils import clean_text_bert
from src.model_training.bert import train_classifier
# enable tqdm's pandas integration (adds `progress_apply` & co. to DataFrames/Series)
tqdm.pandas()

# set data path: datasets and model artefacts are resolved relative to <PROJECT_ROOT>/data
DATA_PATH: str = os.path.join(PROJECT_ROOT, 'data')
print(f'Data path: {DATA_PATH}')

## 1. Load Data
We go ahead and read in all data from CrisisLexT26.

In [None]:
# Collect the path of every CSV file in the CrisisLexT26 folder and report how many were found.
crisislex_csv_glob = os.path.join(DATA_PATH, 'external', 'CrisisLexT26', '*.csv')
csv_files: list[str] = glob.glob(crisislex_csv_glob)
print(f'{len(csv_files)} csv files found')

26 csv files found


In [None]:
# Collect all CrisisLexT26 CSV files. `glob` returns paths in arbitrary order,
# so they are sorted to keep the row order of the combined frame reproducible.
csv_files: list[str] = sorted(glob.glob(os.path.join(DATA_PATH, 'external', 'CrisisLexT26', '*.csv')))
if not csv_files:
    raise FileNotFoundError(
        f"No CrisisLexT26 csv files found in {os.path.join(DATA_PATH, 'external', 'CrisisLexT26')}"
    )

# Define a regular expression to capture the year and name.
# This regex assumes filenames like:
#   2012_Guatemala_earthquake-tweets_labeled.csv
# where:
#   group(1) = year (4 digits)
#   group(2) = name (anything up to the "-tweets_labeled.csv" part)
pattern = re.compile(r"(\d{4})_([^-]+)-tweets_labeled\.csv")

# Read every file into its own frame and concatenate once at the end.
# Growing a DataFrame with `pd.concat` inside the loop re-copies all previously
# loaded rows on each iteration.
event_frames: list[pd.DataFrame] = []
for file in csv_files:
    # Extract base filename (without the folder path)
    base_name = os.path.basename(file)

    # Use the regex to extract year and name from the filename
    name_match = pattern.match(base_name)
    if name_match:
        year, name = name_match.groups()
    else:
        # Keep the rows but make the missing metadata visible instead of silently
        # writing None into the `year` / `event` columns.
        print(f'Warning: unexpected filename format "{base_name}", year/event set to None')
        year, name = None, None

    # Add the extracted information as new columns
    event_frames.append(pd.read_csv(file).assign(year=year, event=name))

crisislex_df: pd.DataFrame = pd.concat(event_frames, ignore_index=True)

# Rename the raw CSV headers (all but the first carry a leading space) to snake_case names.
crisislex_df = crisislex_df.rename(columns={'Tweet ID': 'message_id',
                                            ' Tweet Text': 'text_raw',
                                            ' Information Source': 'information_source',
                                            ' Information Type': 'information_type',
                                            ' Informativeness': 'informativeness'})

# clean the raw text for our needs
crisislex_df['text_raw'] = crisislex_df['text_raw'].str.strip()
crisislex_df['text'] = crisislex_df['text_raw'].apply(clean_text_bert)

# Create a column that represents our relevance categories.
# 'Not applicable' maps to a missing value.
relevance_category_dict: dict = {
    'Not related': 'Not related',
    'Related - but not informative': 'Related but not relevant',
    'Related and informative': 'Related and relevant',
    'Not applicable': pd.NA
}

crisislex_df['relevance_category'] = crisislex_df['informativeness'].map(relevance_category_dict)
print(crisislex_df.groupby('relevance_category').size())

# Persist the processed frame so later cells can reload it without re-parsing the CSVs.
crisislex_df.to_parquet(os.path.join(DATA_PATH, 'external', 'CrisisLexT26', 'crisislex_t26_processed.parquet'))
crisislex_df.head()

relevance_category
Not related                  2863
Related and relevant        16849
Related but not relevant     7732
dtype: int64


Unnamed: 0,message_id,text_raw,information_source,information_type,informativeness,year,event,text,relevance_category
0,324681353662709760,"@MontesCb7 Waco, Texas!! I use to live near th...",Not applicable,Other Useful Information,Related - but not informative,2013,West_Texas_explosion,"@user Waco, Texas!! I use to live near there, ...",Related but not relevant
1,324693550694543361,"Really? Another explosion, now in Texas? What ...",Outsiders,Sympathy and support,Related - but not informative,2013,West_Texas_explosion,"Really? Another explosion, now in Texas? What ...",Related but not relevant
2,324694339240460289,Explosion reported at fertilizer plant in West...,Outsiders,Other Useful Information,Related and informative,2013,West_Texas_explosion,Explosion reported at fertilizer plant in West...,Related and relevant
3,324694565753851904,RT @Donna4843: triage for injured at Reagan st...,Outsiders,Infrastructure and utilities,Related and informative,2013,West_Texas_explosion,RT @user: triage for injured at Reagan st in W...,Related and relevant
4,324694993551884288,RT @911BUFF: TEXAS: MASSIVE EXPLOSION U/D - LO...,Media,Affected individuals,Related and informative,2013,West_Texas_explosion,RT @user: TEXAS: MASSIVE EXPLOSION U/D - LOCAL...,Related and relevant


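With the above dataset in mind, we can now prepare the data for the train/test/validation split. Note that the cell below only reloads the processed data, drops rows without a relevance category and encodes the labels as integers; the split itself is not performed in this cell (presumably it happens inside `train_classifier`).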

In [None]:
# To ensure consistent label weights, we need integer encodings for our labels.
label_to_index: dict = {
    'Not related': 0,
    'Related but not relevant': 1,
    'Related and relevant': 2
}

# Reload the processed tweets, discard rows without a relevance category and
# append the integer-encoded label as `int_label`.
processed_parquet_path = os.path.join(
    DATA_PATH, 'external', 'CrisisLexT26', 'crisislex_t26_processed.parquet'
)
crisislex_df: pd.DataFrame = (
    pd.read_parquet(processed_parquet_path)
    .dropna(subset=['relevance_category'])
    .assign(int_label=lambda frame: frame['relevance_category'].map(label_to_index))
)
print(crisislex_df.shape)

(27444, 10)


## 2. Model Training
Next, we can go ahead and train a classification model with our pre-training data. To do that as well as possible:
- We use a class-weighted loss function to mitigate the negative effects of the class imbalance.
- We use the training settings recommended by [Sun et al. (2019)](https://arxiv.org/pdf/1905.05583), i.e. a learning rate of 2e-5, batch size 16 and 5 epochs.



In [None]:
# Model weights and training logs are stored side by side below one run directory.
pretrained_run_dir = os.path.join(DATA_PATH, 'models', 'twhin-bert-disaster-pretrained')

# Train the classifier on the cleaned tweets with class-weighted loss enabled.
# NOTE(review): learning rate, batch size and epoch count are not passed here, so they
# come from the defaults of `train_classifier` -- confirm they match the settings described above.
model, tokenizer, eval_results = train_classifier(
    texts=crisislex_df['text'],
    labels=crisislex_df['int_label'],
    model_name='Twitter/twhin-bert-base',
    model_path=os.path.join(pretrained_run_dir, 'model'),
    logging_path=os.path.join(pretrained_run_dir, 'logs'),
    weighted_loss=True,
)

label
1     6186
2    13479
0     2290
Name: count, dtype: int64
[0, 1, 2]
Computed class weights: tensor([3.1958, 1.1830, 0.5429])


Map:   0%|          | 0/21955 [00:00<?, ? examples/s]

Map:   0%|          | 0/5489 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer: WeightedTrainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5575,0.560721,0.835854,0.795526,0.775955,0.779977
2,0.4099,0.479929,0.83804,0.779461,0.817755,0.796243
3,0.3154,0.665267,0.857898,0.819464,0.805167,0.811972
4,0.2162,0.829545,0.856805,0.83731,0.793542,0.812228
5,0.1638,0.89542,0.854436,0.822602,0.801446,0.811149


Evaluation Results: {'eval_loss': 0.8295446634292603, 'eval_accuracy': 0.8568045181271634, 'eval_precision': 0.8373104925203712, 'eval_recall': 0.7935417860039368, 'eval_f1': 0.8122281130395607, 'eval_runtime': 114.9812, 'eval_samples_per_second': 47.738, 'eval_steps_per_second': 5.975, 'epoch': 5.0}


In [None]:
# display the evaluation metrics returned by `train_classifier`
eval_results

{'eval_loss': 0.8295446634292603,
 'eval_accuracy': 0.8568045181271634,
 'eval_precision': 0.8373104925203712,
 'eval_recall': 0.7935417860039368,
 'eval_f1': 0.8122281130395607,
 'eval_runtime': 114.9812,
 'eval_samples_per_second': 47.738,
 'eval_steps_per_second': 5.975,
 'epoch': 5.0}

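Just to be safe, here is a static copy of the evaluation results shown above (in case the cell output gets cleared):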

```
{'eval_loss': 0.8295446634292603,
 'eval_accuracy': 0.8568045181271634,
 'eval_precision': 0.8373104925203712,
 'eval_recall': 0.7935417860039368,
 'eval_f1': 0.8122281130395607,
 'eval_runtime': 114.9812,
 'eval_samples_per_second': 47.738,
 'eval_steps_per_second': 5.975,
 'epoch': 5.0}
```

In [None]:
# display the architecture of the trained classifier returned by `train_classifier`
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (distance_embedding): Embedding(1023, 64)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=