In [147]:
# Libs
import pandas as pd
import random
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import math
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

from statsmodels.stats.proportion import proportion_confint

In [148]:
# General
SEED = 42
use_gpu = False
num_classes = 2

# Seed every RNG this notebook imports, not just the stdlib one.
# The classifier head is newly initialised when the model is loaded, so an
# unseeded torch RNG presumably makes training results vary between runs.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [149]:
# Report whether CUDA is usable and, if so, the name of device 0
if torch.cuda.is_available():
    print(True)
    print(torch.cuda.get_device_name(0))
else:
    print(False)
    print("No GPU")

True
NVIDIA GeForce RTX 4070 Laptop GPU


# Loading Data

In [150]:
# Read the labeled and the unlabeled key-point tables from their feather files
df_kp_labeled, df_kp_unlabeled = (
    pd.read_feather('../Data/TrainingData/df_kp_labeled.feather'),
    pd.read_feather('../Data/TrainingData/df_kp_unlabeled.feather'),
)

In [151]:
# Preview the first rows of the labeled key points (plain-text rendering)
print(df_kp_labeled.head())

   ID_kp  ID_dta                                          KEY_POINT  \
0      6      50  Stress stemmed from the fear of disease progre...   
1      8      50   Not psychologically prepared to manage such s...   
2     16      54  Not psychologically prepared to manage these s...   
3     24      60  Stopping treatment was seen as a sign of recov...   
4     35      68  The expertise of GPs familiar with Crohn's cou...   

                                KEY_POINT_normalized  ID_kp_distinct  label  
0  stress stemmed from the fear of disease progre...               3      1  
1  not psychologically prepared to manage such so...               5      0  
2  not psychologically prepared to manage these s...              11      0  
3  stopping treatment was seen as a sign of recov...              16      1  
4  the expertise of gps familiar with crohn's cou...              26      1  


In [152]:
# Column dtypes and non-null counts of the labeled set
df_kp_labeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ID_kp                 19 non-null     int32 
 1   ID_dta                19 non-null     int32 
 2   KEY_POINT             19 non-null     object
 3   KEY_POINT_normalized  19 non-null     object
 4   ID_kp_distinct        19 non-null     int64 
 5   label                 19 non-null     int64 
dtypes: int32(2), int64(2), object(2)
memory usage: 888.0+ bytes


In [153]:
# Column dtypes and non-null counts of the unlabeled set
df_kp_unlabeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ID_kp                 109 non-null    int32 
 1   ID_dta                109 non-null    int32 
 2   KEY_POINT             109 non-null    object
 3   KEY_POINT_normalized  109 non-null    object
 4   ID_kp_distinct        109 non-null    int64 
dtypes: int32(2), int64(1), object(2)
memory usage: 3.5+ KB


# Importing Model

In [154]:
# Checkpoint identifier; reused below when the classification model is loaded
model_name = "bert-base-uncased"
# Tokenizer belonging to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [155]:
# Load BERT with a fresh classification head sized for num_classes
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# CUDA is only selected when explicitly requested via use_gpu AND actually available
device = "cuda" if (use_gpu and torch.cuda.is_available()) else "cpu"
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Data Preprocessing

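The `KEY_POINT_normalized` column is already present in the loaded feather files, so no normalization is run in this notebook. The normalization step is kept below, commented out, for reference.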

In [156]:
# df_kp['KEY_POINT_normalized'] = df_kp['KEY_POINT'].str.strip().str.replace('\n', ' ').str.replace('\r', ' ').str.lower()

## Tokenizing

In [157]:
# Pool labeled AND unlabeled texts: the padding length derived here is used by
# tokenize_fn with truncation=True, so it has to fit every key point, not just
# the labeled ones.
all_keypoints = df_kp_labeled['KEY_POINT_normalized'].tolist() + df_kp_unlabeled['KEY_POINT_normalized'].tolist()

# Tokenize without padding/truncation to measure the raw token count per text
tokenized = tokenizer(
    all_keypoints,
    padding=False,
    truncation=False,
    return_tensors=None
)

# Calc max length, capped at the longest sequence the tokenizer/model supports
lengths = [len(input_ids) for input_ids in tokenized['input_ids']]
max_length = min(max(lengths), tokenizer.model_max_length)
print(f"Maximum tokenized input length: {max_length}")

Maximum tokenized input length: 65


In [158]:
def tokenize_fn(example):
    """Tokenize the ``KEY_POINT_normalized`` field of ``example``.

    Texts are truncated and padded to the notebook-level ``max_length`` so
    that every encoded sequence has the same length. Returns whatever the
    notebook-level ``tokenizer`` returns for these settings.
    """
    texts = example["KEY_POINT_normalized"]
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(texts, **encoding_options)

# Wrap both DataFrames as Hugging Face datasets (same order: labeled, unlabeled)
dataset_labeled, dataset_unlabeled = (
    Dataset.from_pandas(df_kp_labeled),
    Dataset.from_pandas(df_kp_unlabeled),
)

In [159]:
# Labeled
# Tokenize in batches; note that despite its name this object is a Dataset, not a DataFrame
df_tokenized_labeled = dataset_labeled.map(tokenize_fn, batched=True)

# Back to pandas: force integer labels and keep only the columns used downstream
df_tokenized_labeled_pd = (
    df_tokenized_labeled
    .to_pandas()
    .astype({'label': int})
    .loc[:, ['input_ids', 'attention_mask', 'label', 'ID_kp_distinct']]
)
print(df_tokenized_labeled_pd.head())

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

                                           input_ids  \
0  [101, 6911, 27674, 2013, 1996, 3571, 1997, 429...   
1  [101, 2025, 8317, 2135, 4810, 2000, 6133, 2107...   
2  [101, 2025, 8317, 2135, 4810, 2000, 6133, 2122...   
3  [101, 7458, 3949, 2001, 2464, 2004, 1037, 3696...   
4  [101, 1996, 11532, 1997, 14658, 5220, 2007, 13...   

                                      attention_mask  label  ID_kp_distinct  
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1               3  
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...      0               5  
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...      0              11  
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1              16  
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1              26  


In [160]:
# Unlabeled
# NOTE(review): the tokenization of the unlabeled set is disabled by wrapping it
# in a bare string literal. Nothing below is executed; because the string is the
# cell's last expression, the notebook echoes it as the cell output.
# df_tokenized_unlabeled_pd is therefore NOT defined after this cell.
'''
df_tokenized_unlabeled = dataset_unlabeled.map(tokenize_fn, batched=True)
df_tokenized_unlabeled_pd = df_tokenized_unlabeled.to_pandas()
df_tokenized_unlabeled_pd['label'] = None

df_tokenized_unlabeled_pd = df_tokenized_unlabeled_pd[['input_ids', 'attention_mask', 'label', 'ID_kp_distinct']]
print(df_tokenized_unlabeled_pd.head())
'''

"\ndf_tokenized_unlabeled = dataset_unlabeled.map(tokenize_fn, batched=True)\ndf_tokenized_unlabeled_pd = df_tokenized_unlabeled.to_pandas()\ndf_tokenized_unlabeled_pd['label'] = None\n\ndf_tokenized_unlabeled_pd = df_tokenized_unlabeled_pd[['input_ids', 'attention_mask', 'label', 'ID_kp_distinct']]\nprint(df_tokenized_unlabeled_pd.head())\n"

# Unsupervised Learning

# Supervised Learning

## Train-Test split

In [166]:
# Stratified 70/30 split of the labeled examples, reproducible via SEED
split_options = dict(
    test_size=0.3,
    stratify=df_tokenized_labeled_pd['label'],
    random_state=SEED,
)
train_df, test_df = train_test_split(df_tokenized_labeled_pd, **split_options)

# Convert each partition into a Hugging Face Dataset for the Trainer
train_dataset, test_dataset = (
    Dataset.from_pandas(train_df),
    Dataset.from_pandas(test_df),
)

## Training

In [167]:
# Training configuration for the supervised baseline.
# Changes relative to the previous version:
#   * fp16 is only requested when CUDA is present (mixed precision was
#     unconditional before and cannot be used on a CPU-only machine),
#   * save_steps was dropped: checkpoints are written per epoch
#     (save_strategy="epoch"), so a step interval has no effect,
#   * the Trainer seed is tied to the notebook-level SEED.
# NOTE(review): the Trainer presumably picks its own device and does not look
# at the notebook's `use_gpu` flag / `device` variable - confirm if CPU-only
# training is intended.
training_args = TrainingArguments(
    output_dir="./results/base_model",
    eval_strategy="epoch",
    save_strategy="epoch",          # kept in step with eval_strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    load_best_model_at_end=True,
    gradient_accumulation_steps=2,  # 8 per device x 2 accumulation steps = effective batch of 16
    seed=SEED,
)

In [168]:
# Assemble the Trainer from the model, the training configuration and both splits.
# NOTE(review): the held-out test split doubles as the evaluation set here, and
# training_args enables load_best_model_at_end - so the checkpoint is presumably
# selected on the same data that is later used for testing. Consider a separate
# validation split.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
}
trainer = Trainer(**trainer_kwargs)

In [169]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6915,0.636068
2,0.638,0.640951
3,0.5591,0.647095
4,0.5598,0.654297
5,0.5245,0.665975
6,0.5149,0.676554
7,0.4721,0.685506
8,0.4507,0.693319
9,0.4771,0.697428
10,0.461,0.699219


TrainOutput(global_step=10, training_loss=0.5348620533943176, metrics={'train_runtime': 15.9964, 'train_samples_per_second': 8.127, 'train_steps_per_second': 0.625, 'total_flos': 4342360191000.0, 'train_loss': 0.5348620533943176, 'epoch': 10.0})

## Testing

In [170]:
# Run the fine-tuned model on the held-out split
predictions = trainer.predict(test_dataset)

# Unpack raw class scores and the reference labels from the prediction output
predicted_logits, true_labels = predictions.predictions, predictions.label_ids

# Highest-scoring class per example
predicted_labels = np.argmax(predicted_logits, axis=-1)
for label_array in (predicted_labels, true_labels):
    print(label_array)

[0 1 1 0 1 1]
[0 0 1 0 1 1]


# Semi-Supervised Learning

In [61]:
# NOTE(review): this cell is a sketch of a pseudo-labeling loop, not runnable code.
# It stopped with "NameError: name 'full_labeled_set' is not defined" when executed.
# Names used here that are not defined in the visible part of the notebook:
# full_labeled_set, full_unlabeled_set, num_epochs, softmax, argmax.
gold_data = full_labeled_set        # 19 examples
unlabeled = full_unlabeled_set      # ~100 examples
pseudo_data = []

for epoch in range(num_epochs):
    # 1. Train on gold + retained pseudo
    # TODO(review): confirm that Trainer.train accepts a train_dataset argument;
    # the supervised section passes the dataset to the Trainer constructor instead.
    train_set = gold_data + pseudo_data
    trainer.train(train_dataset=train_set)

    # 2. Predict on a small new batch of unlabeled
    # TODO(review): the testing cell reads `.predictions` from trainer.predict(...);
    # `.logits` is presumably not available on that result - verify.
    batch = unlabeled.sample(n=3, replace=False)
    logits = trainer.predict(batch).logits
    probs = softmax(logits)
    preds = argmax(logits)

    # 3. Select high-confidence pseudolabels (max class probability above 0.9)
    mask_high = probs.max(axis=1) > 0.9
    new_pseudo = batch[mask_high].add_column("labels", preds[mask_high])

    # 4. Update pseudo_data:
    #    - Keep old ones with prob > 0.95
    #    - Add new ones
    # NOTE(review): nothing in this sketch stores a `confidence` attribute on the
    # pseudo-labeled items, and the supervised data uses a "label" column while
    # "labels" is added above - both need to be reconciled.
    pseudo_data = [
      x for x in pseudo_data if x.confidence > 0.95
    ] + new_pseudo

    # 5. Remove used unlabeled examples
    unlabeled = unlabeled.drop(batch.indices)


NameError: name 'full_labeled_set' is not defined

# Simulating CI (Clopper-Pearson)

In [None]:
# Hypothetical outcome used to illustrate the interval width
k = 10  # number of correct predictions
n = 10  # number of predictions in total

# method="beta" selects the Clopper-Pearson interval; alpha=0.05 gives 95% coverage
ci_bounds = proportion_confint(count=k, nobs=n, alpha=0.05, method="beta")
ci_low, ci_high = ci_bounds
print(f"95% CI: [{ci_low:.2%}, {ci_high:.2%}]")