In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv("./data/text_label.csv",  encoding='latin-1')

In [3]:
# Split into labeled and unlabeled data
labeled_data = df[df['LABEL'].notna()]
unlabeled_data = df[df['LABEL'].isna()]

# Split labeled data into train and validation sets
train_data, val_data = train_test_split(labeled_data, test_size=0.2, random_state=42)

# Preprocess the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_data['FOI_TEXT'])
X_val = vectorizer.transform(val_data['FOI_TEXT'])
X_unlabeled = vectorizer.transform(unlabeled_data['FOI_TEXT'])

y_train = train_data['LABEL']
y_val = val_data['LABEL']

In [46]:
labeled_data['LABEL'].unique()

array([1., 4., 5., 2., 3.])

In [44]:
labeled_data[labeled_data['LABEL'] == 1]

Unnamed: 0,MDR_REPORT_KEY,MDR_TEXT_KEY,TEXT_TYPE_CODE,PATIENT_SEQUENCE_NUMBER,FOI_TEXT,DATE_RECEIVED,LABEL
0,6383024,106903842,N,1,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...,6/3/2017,1.0
1,6383024,106903843,D,1,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...,6/3/2017,1.0
2,6383024,109652829,N,1,"IF INFORMATION IS PROVIDED IN THE FUTURE, A SU...",6/3/2017,1.0
3,6383024,69202956,N,1,MANUFACTURER REFERENCE NUMBER: (B)(4). INCIDEN...,6/3/2017,1.0
5,6343125,107384375,N,1,"IF INFORMATION IS PROVIDED IN THE FUTURE, A SU...",20/2/2017,1.0
...,...,...,...,...,...,...,...
3588,10722253,212603921,N,1,"BASED ON THE INVESTIGATION, THE DEVICE WAS NOT...",22/10/2020,1.0
3590,10722253,320662340,N,0,THIS REPORT IS A DUPLICATE REPORT TO 2183959-2...,22/10/2020,1.0
3592,10722574,212684512,D,1,IT WAS REPORTED THAT THE PATIENT UNDERWENT A R...,22/10/2020,1.0
3594,10732217,212963377,N,1,THE EXACT EVENT DATE IS UNKNOWN.,25/10/2020,1.0


In [39]:
unlabeled_data['LABEL']

(9202,)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize the classifier
clf = LogisticRegression(max_iter=1000)

# Train the classifier on labeled data
clf.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = clf.predict(X_val)
initial_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}')


Validation Accuracy: 0.5185


In [29]:
data = pd.read_csv("./data/text_label.csv",  encoding='latin-1')
X = data['FOI_TEXT']  # Text data
y = data['LABEL']  # Labels

# Split into labeled and unlabeled
X_labeled = X[y.notnull()]
y_labeled = y[y.notnull()]
X_unlabeled = X[y.isnull()]

In [32]:
from sklearn.model_selection import train_test_split

# Split the labeled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42, stratify=y_labeled)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)  # Limit to top 1000 features for efficiency
X_labeled_vec = vectorizer.fit_transform(X_labeled)
X_unlabeled_vec = vectorizer.transform(X_unlabeled)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [34]:
# Initialize the model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)

# Self-training loop
while True:
    # Fit the model on the training data
    model.fit(X_train_vec, y_train)

    # Get probability predictions on the unlabeled data
    y_pred_proba = model.predict_proba(X_unlabeled_vec)

    # Get indices of samples with high confidence predictions
    high_confidence_indices = np.where(np.max(y_pred_proba, axis=1) > 0.90)[0]

    if len(high_confidence_indices) == 0:
        break  # Exit if no high-confidence predictions

    # Get the corresponding text and predicted labels
    X_high_conf = X_unlabeled.iloc[high_confidence_indices]
    y_high_conf = np.argmax(y_pred_proba[high_confidence_indices], axis=1)

    # Append high-confidence samples to the labeled data
    X_train = pd.concat([X_train, X_high_conf])
    y_train = pd.concat([y_train, pd.Series(y_high_conf)])

    # Remove high-confidence samples from the unlabeled data
    X_unlabeled = X_unlabeled.drop(X_high_conf.index)

    # Re-vectorize the updated training data
    X_train_vec = vectorizer.fit_transform(X_train)
    X_unlabeled_vec = vectorizer.transform(X_unlabeled)


In [35]:
# Evaluate on the test set
y_pred = model.predict(X_test_vec)

# Print classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.81      1.00      0.90        44
         1.0       0.50      0.40      0.44        25
         2.0       0.47      0.47      0.47        15
         3.0       0.31      0.31      0.31        16
         4.0       0.83      0.38      0.53        13
         5.0       0.71      0.83      0.77        12

    accuracy                           0.65       125
   macro avg       0.61      0.57      0.57       125
weighted avg       0.64      0.65      0.63       125



In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=np.unique(y_test), 
            yticklabels=np.unique(y_test))
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [48]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.65


In [42]:
from sklearn.preprocessing import LabelEncoder

# Split into labeled and unlabeled data
labeled_data = df[df['LABEL'].notna()]
unlabeled_data = df[df['LABEL'].isna()]

labeled_data["LABEL"] = labeled_data["LABEL"].astype(int)

label_encoder = LabelEncoder()
labeled_data['encoded_label'] = label_encoder.fit_transform(labeled_data['LABEL'])
# # Split labeled data into train and validation sets
# train_data, val_data = train_test_split(labeled_data, test_size=0.2, random_state=42)

# # Preprocess the text data using TF-IDF
# vectorizer = TfidfVectorizer(max_features=10000)
# X_train = vectorizer.fit_transform(train_data['FOI_TEXT'])
# X_val = vectorizer.transform(val_data['FOI_TEXT'])
# X_unlabeled = vectorizer.transform(unlabeled_data['FOI_TEXT'])

# y_train = train_data['LABEL']
# y_val = val_data['LABEL']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data["LABEL"] = labeled_data["LABEL"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['encoded_label'] = label_encoder.fit_transform(labeled_data['LABEL'])


In [35]:
from transformers import BertTokenizer

# Load the BioBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Tokenize labeled and unlabeled data
labeled_inputs = tokenize_texts(labeled_data['FOI_TEXT'].tolist())
unlabeled_inputs = tokenize_texts(unlabeled_data['FOI_TEXT'].tolist())


In [44]:
labeled_data['encoded_label'].unique()

array([0, 3, 4, 1, 2])

In [45]:
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load the BioBERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=len(labeled_data['LABEL'].unique()))

# Prepare labels
labels = torch.tensor(labeled_data['encoded_label'].tolist())


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
# Create a dataset for the Trainer
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Ensure we return a dictionary that matches the model's input format
        return {
            'input_ids': self.inputs['input_ids'][idx],
            'attention_mask': self.inputs['attention_mask'][idx],
            'labels': self.labels[idx]  # Must be included as 'labels'
        }


# Create datasets
train_dataset = CustomDataset(labeled_inputs, labels)

In [47]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

  0%|          | 0/51 [00:00<?, ?it/s]

{'train_runtime': 1965.8232, 'train_samples_per_second': 0.205, 'train_steps_per_second': 0.026, 'train_loss': 1.6527928371055454, 'epoch': 1.0}


TrainOutput(global_step=51, training_loss=1.6527928371055454, metrics={'train_runtime': 1965.8232, 'train_samples_per_second': 0.205, 'train_steps_per_second': 0.026, 'total_flos': 106036611412992.0, 'train_loss': 1.6527928371055454, 'epoch': 1.0})

In [None]:
# Define maximum iterations and confidence threshold
max_iterations = 5
confidence_threshold = 0.8
previous_pseudo_labels_count = 0

for iteration in range(max_iterations):
    print(f"Iteration {iteration + 1}/{max_iterations}")

    # Set model to evaluation mode
    model.eval()

    # Predict on unlabeled data
    with torch.no_grad():
        outputs = model(**unlabeled_inputs)
        predictions = torch.argmax(outputs.logits, dim=1)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)

    # Get high-confidence predictions
    high_confidence_indices = (probabilities.max(dim=1)[0] >= confidence_threshold).nonzero(as_tuple=True)[0]

    # Filter out high-confidence predictions
    pseudo_labels = predictions[high_confidence_indices]
    unlabeled_texts = unlabeled_data['text'].tolist()
    high_confidence_texts = [unlabeled_texts[i] for i in high_confidence_indices]

    # Create a new DataFrame for pseudo-labeled data
    pseudo_labeled_data = pd.DataFrame({
        'text': high_confidence_texts,
        'label': pseudo_labels.tolist()
    })

    # Stop if no new pseudo-labels are generated
    if len(pseudo_labeled_data) == 0:
        print("No high-confidence pseudo-labels found. Stopping the process.")
        break

    # Combine labeled and pseudo-labeled data
    combined_data = pd.concat([combined_data, pseudo_labeled_data], ignore_index=True)

    # Tokenize the combined dataset
    combined_inputs = tokenize_texts(combined_data['text'].tolist())
    combined_labels = torch.tensor(combined_data['label'].tolist())

    # Create dataset for Trainer
    combined_dataset = CustomDataset(combined_inputs, combined_labels)

    # Train the model again
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=combined_dataset,
    )

    trainer.train()

    # Optional: Check if the number of pseudo-labels is improving
    new_pseudo_labels_count = len(pseudo_labeled_data)
    if new_pseudo_labels_count == previous_pseudo_labels_count:
        print("No improvement in the number of pseudo-labels. Stopping the process.")
        break
    previous_pseudo_labels_count = new_pseudo_labels_count

# Final evaluation (optional)
trainer.evaluate()


In [None]:
# Evaluation code (e.g., using metrics like accuracy, precision, recall)
trainer.evaluate()


In [None]:
# Tokenize new data
new_data = ['new_unlabeled_text1', 'new_unlabeled_text2']
new_inputs = tokenize_texts(new_data)

# Predict
model.eval()
with torch.no_grad():
    outputs = model(**new_inputs)
    final_predictions = torch.argmax(outputs.logits, dim=1)

# Output predictions
print(final_predictions)
