In [97]:
# Libs
import pandas as pd
import random
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import math
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

from statsmodels.stats.proportion import proportion_confint

In [75]:
# General
SEED = 42
num_classes = 2
random.seed(SEED)

# Loading Data

In [76]:
df_kp_labeled = pd.read_feather('../Data/TrainingData/df_kp_labeled.feather')
df_kp_unlabeled = pd.read_feather('../Data/TrainingData/df_kp_unlabeled.feather')

In [77]:
print(df_kp_labeled.head())

   ID_kp  ID_dta                                          KEY_POINT  \
0      6      50  Stress stemmed from the fear of disease progre...   
1      8      50   Not psychologically prepared to manage such s...   
2     16      54  Not psychologically prepared to manage these s...   
3     24      60  Stopping treatment was seen as a sign of recov...   
4     35      68  The expertise of GPs familiar with Crohn's cou...   

                                KEY_POINT_normalized  ID_kp_distinct  label  
0  stress stemmed from the fear of disease progre...               3      1  
1  not psychologically prepared to manage such so...               5      0  
2  not psychologically prepared to manage these s...              11      0  
3  stopping treatment was seen as a sign of recov...              16      1  
4  the expertise of gps familiar with crohn's cou...              26      1  


In [78]:
df_kp_labeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ID_kp                 19 non-null     int32 
 1   ID_dta                19 non-null     int32 
 2   KEY_POINT             19 non-null     object
 3   KEY_POINT_normalized  19 non-null     object
 4   ID_kp_distinct        19 non-null     int64 
 5   label                 19 non-null     int64 
dtypes: int32(2), int64(2), object(2)
memory usage: 888.0+ bytes


# Importing Model

In [79]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [80]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Data Preprocessing

The data is normalized as follows.

In [81]:
# df_kp['KEY_POINT_normalized'] = df_kp['KEY_POINT'].str.strip().str.replace('\n', ' ').str.replace('\r', ' ').str.lower()

## Tokenizing

In [82]:
tokenized = tokenizer(
    df_kp_labeled['KEY_POINT_normalized'].tolist(),
    padding=False,
    truncation=False,
    return_tensors=None
)

# Calc max length
lengths = [len(input_ids) for input_ids in tokenized['input_ids']]
max_length = max(lengths)
print(f"Maximum tokenized input length: {max_length}")

Maximum tokenized input length: 65


In [83]:
def tokenize_fn(example):
    return tokenizer(example["KEY_POINT_normalized"], truncation=True, padding="max_length", max_length=64)

dataset = Dataset.from_pandas(df_kp_labeled)

df_tokenized = dataset.map(tokenize_fn, batched=True)
df_tokenized_pd = df_tokenized.to_pandas()
df_tokenized_pd['label'] = df_tokenized_pd['label'].astype(int)

df_tokenized_pd = df_tokenized_pd[['input_ids', 'attention_mask', 'label', 'ID_kp_distinct']]
print(df_tokenized_pd.head())

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

                                           input_ids  \
0  [101, 6911, 27674, 2013, 1996, 3571, 1997, 429...   
1  [101, 2025, 8317, 2135, 4810, 2000, 6133, 2107...   
2  [101, 2025, 8317, 2135, 4810, 2000, 6133, 2122...   
3  [101, 7458, 3949, 2001, 2464, 2004, 1037, 3696...   
4  [101, 1996, 11532, 1997, 14658, 5220, 2007, 13...   

                                      attention_mask  label  ID_kp_distinct  
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1               3  
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...      0               5  
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...      0              11  
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1              16  
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1              26  


In [84]:
print(df_tokenized.column_names)
print(df_tokenized['label'])

['ID_kp', 'ID_dta', 'KEY_POINT', 'KEY_POINT_normalized', 'ID_kp_distinct', 'label', 'input_ids', 'token_type_ids', 'attention_mask']
[1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0]


## Examples

In [85]:
print(dataset[0])
print("\n")
print(df_tokenized[0])

{'ID_kp': 6, 'ID_dta': 50, 'KEY_POINT': 'Stress stemmed from the fear of disease progression or worsening over time.', 'KEY_POINT_normalized': 'stress stemmed from the fear of disease progression or worsening over time.', 'ID_kp_distinct': 3, 'label': 1}


{'ID_kp': 6, 'ID_dta': 50, 'KEY_POINT': 'Stress stemmed from the fear of disease progression or worsening over time.', 'KEY_POINT_normalized': 'stress stemmed from the fear of disease progression or worsening over time.', 'ID_kp_distinct': 3, 'label': 1, 'input_ids': [101, 6911, 27674, 2013, 1996, 3571, 1997, 4295, 14967, 2030, 4788, 5582, 2058, 2051, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1,

In [86]:
print(dataset[1])
print("\n")
print(df_tokenized[1])

{'ID_kp': 8, 'ID_dta': 50, 'KEY_POINT': ' Not psychologically prepared to manage such socially awkward situations.', 'KEY_POINT_normalized': 'not psychologically prepared to manage such socially awkward situations.', 'ID_kp_distinct': 5, 'label': 0}


{'ID_kp': 8, 'ID_dta': 50, 'KEY_POINT': ' Not psychologically prepared to manage such socially awkward situations.', 'KEY_POINT_normalized': 'not psychologically prepared to manage such socially awkward situations.', 'ID_kp_distinct': 5, 'label': 0, 'input_ids': [101, 2025, 8317, 2135, 4810, 2000, 6133, 2107, 14286, 9596, 8146, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1

# Unsupervised Learning

# Supervised Learning

## Train-Test split

In [87]:
# Print the column names of the dataset
df_tokenized['label']

[1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0]

In [88]:
train_df, test_df = train_test_split(
    df_tokenized_pd,
    test_size=0.8,
    stratify=df_tokenized_pd['label'],
    random_state=SEED
)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

## Training

In [89]:
training_args = TrainingArguments(
    output_dir="./results/base_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1,
    save_steps=1,
    fp16=True,
    dataloader_num_workers=0,
    load_best_model_at_end=True,
    gradient_accumulation_steps=2
)

In [90]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [91]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3288,0.686376
2,0.3101,0.691823
3,0.2977,0.689448
4,0.2348,0.683872
5,0.2571,0.680028
6,0.2427,0.677546
7,0.2073,0.675979
8,0.2143,0.673962
9,0.2282,0.672942
10,0.2299,0.672157




TrainOutput(global_step=10, training_loss=0.25509653687477113, metrics={'train_runtime': 92.0937, 'train_samples_per_second': 0.326, 'train_steps_per_second': 0.109, 'total_flos': 986666457600.0, 'train_loss': 0.25509653687477113, 'epoch': 10.0})

## Testing

In [92]:
predictions = trainer.predict(test_dataset)

predicted_logits = predictions.predictions
true_labels = predictions.label_ids

predicted_labels = np.argmax(predicted_logits, axis=-1)
print(predicted_labels[:10])
print(true_labels[:10])



[0 0 0 0 1 1 0 1 1 1]
[1 1 0 0 1 1 0 1 1 0]


In [95]:
y_true = np.array([0, 0, 1, 0, 1, 1])
y_pred = np.array([0, 0, 1, 0, 1, 0])

n = len(y_true)
n_bootstraps = 1000
accuracies = []

for _ in range(n_bootstraps):
    idx = np.random.choice(n, size=n, replace=True)
    acc = np.mean(y_true[idx] == y_pred[idx])
    accuracies.append(acc)

conf_int = np.percentile(accuracies, [45, 55])
print("Bootstrap 95% CI for accuracy:", conf_int)


Bootstrap 95% CI for accuracy: [0.83333333 0.83333333]


# Semi-Supervised Learning