In [1]:
import pandas as pd
import numpy as np
import torch
import time
import Resources.utils_stripped as U


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/ubuntu/INFOARM-GDPR-ML-pipeline/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/INFOARM-GDPR-ML-pipeline/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/ubuntu/INFOARM-GDPR-ML-pipeline/.venv/lib/python3.12/site-packages/ipykernel/kernelapp.py", 

In [2]:
train_df = pd.read_csv("Input/opp115_train.csv")
test_df = pd.read_csv("Input/opp115_test.csv")
train_df, test_df = U.load_opp115_from_df(train_df, test_df)
df_train, df_val = U.split_train_val(train_df, val_size=0.2)

# Training of Models

In [3]:
# check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Verify GPU compatibility
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A")
print("GPU Compute Capability:", torch.cuda.get_device_capability(0) if torch.cuda.is_available() else "N/A")
print("Supported CUDA architectures:", torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else "N/A")

# Test a simple CUDA operation
if torch.cuda.is_available():
    x = torch.tensor([1.0, 2.0, 3.0]).cuda()
    print("✓ CUDA tensor test passed:", x)

PyTorch version: 2.2.0+cu118
CUDA version: 11.8
GPU: NVIDIA GeForce GTX 1050 Ti
GPU Compute Capability: (6, 1)
Supported CUDA architectures: ['sm_50', 'sm_60', 'sm_70', 'sm_75', 'sm_80', 'sm_86', 'sm_37', 'sm_90']
✓ CUDA tensor test passed: tensor([1., 2., 3.], device='cuda:0')


## RoBERTa Model

In [None]:
res = U.train_transformer_mcc(
    model_type="roberta",
    df_train=df_train,
    df_val=df_val,
    epochs=3,
    batch_size=16,
    lr=2e-5,
    out_dir="artifacts/roberta"
)

model_roberta = res["model"]
tokenizer_roberta = res["tokenizer"]

# # inference
# pred_labels, pred_proba = U.predict_transformer(model_roberta, tokenizer_roberta, ["Some privacy sentence..."], return_proba=True)
# pred_labels[0]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 60.00 MiB. GPU 0 has a total capacity of 3.94 GiB of which 28.50 MiB is free. Including non-PyTorch memory, this process has 3.91 GiB memory in use. Of the allocated memory 3.59 GiB is allocated by PyTorch, and 262.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## BERT Model

In [7]:
# Free GPU memory from previous model
if 'model_roberta' in dir():
    del model_roberta
    del tokenizer_roberta
torch.cuda.empty_cache()
import gc
gc.collect()
print(f"GPU memory free: {torch.cuda.mem_get_info()[0] / 1024**3:.2f} GB")

# Train BERT (multi-class)
res = U.train_transformer_mcc(
    model_type="bert",
    df_train=df_train,
    df_val=df_val,
    epochs=3,
    batch_size=16,  # Reduced from 32 due to limited VRAM
    lr=2e-5,
    out_dir="artifacts/bert_mcc"
)

bert_model = res["model"]
bert_tokenizer = res["tokenizer"]

# Inference
# pred_labels, pred_proba = U.predict_transformer(bert_model, bert_tokenizer, ["Some privacy sentence..."], return_proba=True)
# pred_labels[0]

GPU memory free: 0.03 GB


RuntimeError: Numpy is not available

## BiLSTM Model

In [5]:
df_train, df_val = U.split_train_val(train_df, val_size=0.2)

with U.measure_run("Train BiLSTM"):
    res = U.train_bilstm_mcc(
        df_train=df_train,
        df_val=df_val,
        epochs=3,
        batch_size=32,
        lr=1e-4,
        out_dir="artifacts/bilstm"
    )

bilstm = res["model"]

# # inference
# pred_labels, pred_proba = U.predict_bilstm(bilstm, ["Some privacy sentence..."], return_proba=True)
# pred_labels[0]


Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 359/359 [00:08<00:00, 41.64it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:02<00:00, 37.43it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZE

=== PERF ===
tag: Train BiLSTM
wall_time_sec: 38.65933985599986
cpu_user_sec: 28.620000000000005
cpu_system_sec: 6.2700000000000005
ram_delta_mb: 735.3828125
gpu_peak_mem_mb: 659.55078125


# Evaluation of Models

## BiLSTM

In [9]:
print("test_df rows:", len(test_df))
print("train_df rows:", len(train_df))
print("Sentence non-null:", test_df["Sentence"].notna().sum())
print("Unique labels:", test_df["label"].nunique())

bilstm.eval()

sentences = test_df["Sentence"].tolist()
y_true = test_df["label"].astype(int).to_numpy()

with U.measure_run("BiLSTM inference"):
    y_pred_names, _ = U.predict_bilstm(bilstm, sentences, return_proba=True)

# map string labels → ids
CLASS_TO_ID = {name: i for i, name in enumerate(U.LABEL_NAMES)}
y_pred = np.array([CLASS_TO_ID[name] for name in y_pred_names], dtype=int)

results = U.evaluate_predictions(y_true, y_pred)

print("Confusion matrix:")
print(results["confusion_matrix"])

p = results["report"]["macro avg"]["precision"]
r = results["report"]["macro avg"]["recall"]

print("Macro F1:", results["report"]["macro avg"]["f1-score"])
print("Macro F2:", 0.0 if (4*p + r) == 0 else (5*p*r)/(4*p + r))

test_df rows: 2480
train_df rows: 14336
Sentence non-null: 2480
Unique labels: 9
=== PERF ===
tag: BiLSTM inference
wall_time_sec: 4.77458802899946
cpu_user_sec: 2.950000000000003
cpu_system_sec: 0.6100000000000012
ram_delta_mb: 0.68359375
gpu_peak_mem_mb: 625.11767578125
Confusion matrix:
[[782  55  46   0   0   0   0   0  20]
 [ 85 633   3   0   0   0   0   0  50]
 [ 57   4 162   0   0   0   0   0  45]
 [ 27   1   3   0   0   0   0   0   1]
 [  8   3   2   0   0   1   0   0  73]
 [  4   0   3   0   0   8   0   0  32]
 [  5   0   0   0   0   0   0   0   4]
 [  7   5   3   0   0   4   0   0  50]
 [ 39  15  45   0   0   3   0   0 192]]
Macro F1: 0.33682351871785
Macro F2: 0.3473634889639036
