## Text Classification Prediction Explanations

In this notebook, we'll see some methods of explaining the predictions of a text classification model.

In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, roc_auc_score, precision_recall_fscore_support, precision_recall_curve

from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter 
from scipy.spatial.distance import pdist


In [2]:
import accuracy

In [3]:
from sklearn import metrics

def compute_metrics(p):
    """Return the plain accuracy score for a prediction object.

    ``p`` must expose ``predictions`` (per-class scores; the class axis is
    axis 1) and ``label_ids`` (the reference class ids). The predicted class
    of each example is the index of its largest score.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    accuracy_value = metrics.accuracy_score(y_pred=predicted_classes, y_true=p.label_ids)
    return accuracy_value


In [4]:
import torch
from datasets import Dataset
from accelerate import Accelerator
import tqdm as notebook_tqdm

In [5]:
from bs4 import BeautifulSoup
import re
from sklearn import preprocessing

In [6]:
#  from gensim.models import word2vec

In [7]:
from transformers import AutoConfig, Pipeline, RobertaTokenizer, RobertaModel, AutoTokenizer, DistilBertModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, DistilBertConfig, DistilBertTokenizer, DistilBertTokenizerFast, DistilBertPreTrainedModel, DistilBertForTokenClassification, DistilBertForSequenceClassification
import evaluate
from torch.utils.data import DataLoader

In [8]:
from sklearn import metrics

In [9]:
# Accuracy metric object obtained through the `evaluate` library.
metric = evaluate.load("accuracy")


def compute_metrics(p):
    """Score a prediction object with the module-level ``metric``.

    ``p.predictions`` holds per-class scores (class axis is axis 1) and
    ``p.label_ids`` the reference class ids. Returns whatever
    ``metric.compute`` returns for the arg-max predictions.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    return metric.compute(references=p.label_ids, predictions=predicted_classes)

In [10]:
# Load the discharge-note dataset from a path relative to the notebook. The preview
# in the next cell shows a free-text `text` column and a 0.0/1.0
# `HospReadmissionWithin30Days` label column.
HospitalDischargeNotes = pd.read_csv("../data/Merged_DischargeNotes_FollowedByHospReadmissionWithin30DaysOfDischarge_sparse.csv")

In [11]:
HospitalDischargeNotes  

Unnamed: 0,text,HospReadmissionWithin30Days
0,\nName: ___ Unit No: _...,0.0
1,\nName: ___ Unit No: _...,1.0
2,\nName: ___ Unit No: _...,1.0
3,\nName: ___ Unit No: ___\n...,0.0
4,\nName: ___. Unit No: ___\n \nAdm...,1.0
...,...,...
185855,\nName: ___ Unit No: __...,1.0
185856,\nName: ___ Unit No: __...,0.0
185857,\nName: ___ Unit No: __...,0.0
185858,\nName: ___ Unit No: ___...,0.0


In [12]:
class TextCleaner():
    """Normalise free-text notes into lowercase, space-separated alphabetic tokens.

    The pipeline lowercases the text, drops non-ASCII characters, removes
    ``http...`` links and ``<..._:>`` tags, strips every character that is not
    a letter, digit or whitespace, deletes any token containing a digit, and
    finally collapses whitespace.
    """

    # Anything starting with "http" up to the next whitespace. The previous
    # extra pattern r'http*\S+' also matched a bare "htt", which truncated
    # ordinary words such as "nighttime" to "nig"; it is intentionally gone.
    _LINK = re.compile(r'http\S+')
    _TAG = re.compile(r'<.*?_:>')
    _NON_ALNUM = re.compile(r"[^a-zA-Z0-9\s]")
    _DIGIT_TOKEN = re.compile(r'\w*\d+\w*')
    _WHITESPACE = re.compile(r"\s+")

    def __init__(self):
        pass

    def clean_text(self, text):
        """Return the cleaned version of ``text``.

        Args:
            text: The raw note. Non-string values (for example the float NaN
                that pandas uses for a missing cell) are treated as empty.

        Returns:
            A lowercase ASCII string of letter-only tokens separated by single
            spaces, with no leading or trailing whitespace.
        """
        if not isinstance(text, str):
            # Missing cells arrive as NaN/None; calling .lower() on them would raise.
            return ""

        text = text.lower()
        text = text.encode('ascii', 'ignore').decode()  # drop non-ASCII characters
        text = self._LINK.sub(' ', text)
        text = self._TAG.sub('', text)
        # After this step only letters, digits and whitespace remain, so the
        # former apostrophe-suffix and lone-punctuation substitutions could
        # never match and have been removed.
        text = self._NON_ALNUM.sub('', text)
        text = self._DIGIT_TOKEN.sub('', text)
        # Collapse and strip last, so removing digit tokens cannot leave
        # doubled or trailing spaces behind.
        return self._WHITESPACE.sub(' ', text).strip()
    
# Build one cleaner and add a `cleaned_text` column by running clean_text on
# every value of the raw `text` column.
cleaner = TextCleaner()
HospitalDischargeNotes['cleaned_text'] = HospitalDischargeNotes['text'].apply(cleaner.clean_text)

In [13]:
# Encode the raw readmission flags as integer class ids in a new `labels` column.
# The fitted encoder is kept in `le` so its classes can be reused later.
le = preprocessing.LabelEncoder()
le.fit(HospitalDischargeNotes['HospReadmissionWithin30Days'].tolist())
HospitalDischargeNotes['labels'] = le.transform(
    HospitalDischargeNotes['HospReadmissionWithin30Days'].tolist()
)

In [14]:
# Keep only the derived columns (`cleaned_text`, `labels`).
# errors='ignore' makes this cell safe to re-run: without it, a second execution
# raises KeyError because the two source columns have already been removed.
HospitalDischargeNotes = HospitalDischargeNotes.drop(
    columns=['text', 'HospReadmissionWithin30Days'],
    errors='ignore',
)

In [15]:
# Hold out 20% of the rows for testing. The split is shuffled with a fixed seed
# and stratified on `labels`.
train_df, test_df = train_test_split(
    HospitalDischargeNotes,
    test_size=0.2,
    shuffle=True,
    random_state=321,
    stratify=HospitalDischargeNotes['labels'],
)

In [16]:
# Load the tokenizer and model configuration for the GatorTron-medium checkpoint.
tokenizer = AutoTokenizer.from_pretrained("UFNLP/gatortron-medium")
# NOTE(review): `config` is not referenced again in the visible part of this notebook.
config=AutoConfig.from_pretrained('UFNLP/gatortron-medium')

# Wrap the pandas splits as `datasets.Dataset` objects (tokenization happens below).
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

def tokenize_data(examples):
    """Tokenize a batch of cleaned notes, truncating each one to 512 tokens.

    Args:
        examples: A batch mapping that contains a ``"cleaned_text"`` entry.

    Returns:
        The tokenizer output for the batch, as variable-length sequences.

    Padding is deliberately not applied here. The notebook batches through
    ``DataCollatorWithPadding``, which pads each mini-batch to its own longest
    sequence; padding inside ``Dataset.map`` would store statically padded
    sequences and defeat that. ``return_tensors="pt"`` is likewise omitted,
    because a mapped dataset stores plain lists rather than tensors.
    """
    return tokenizer(examples["cleaned_text"], truncation=True, max_length=512)

# Run tokenize_data over both splits; batched=True hands the function many
# examples per call instead of one.
tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/148688 [00:00<?, ? examples/s]

Map:   0%|          | 0/37172 [00:00<?, ? examples/s]

In [17]:
def compute_metrics(eval_pred):
    """Compute accuracy for a ``(predictions, labels)`` pair.

    Args:
        eval_pred: A two-item sequence of per-class scores (class axis is
            axis 1) and the reference class ids.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.

    The score comes from the module-level ``metric`` object created with
    ``evaluate.load("accuracy")``. The previous code called
    ``accuracy.compute``, but in this notebook ``accuracy`` is only bound by
    ``import accuracy`` and is never assigned an evaluate metric.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

import os
# Export the MPS high-watermark ratio setting through the process environment.
# NOTE(review): torch is imported in an earlier cell; confirm the setting is
# still picked up when it is exported after that import.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
# NOTE(review): this line only binds a Python variable of the same name. It does
# not touch the environment and is not read anywhere in the visible notebook.
PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

In [18]:
# Load the pre-trained GatorTron-medium checkpoint with a sequence-classification head
# sized to the number of classes seen by the label encoder. The head weights are newly
# initialised (see the warning below), so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained("UFNLP/gatortron-medium", num_labels=len(le.classes_))  

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at UFNLP/gatortron-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Freeze every parameter of the base model so that, until layers are explicitly
# re-enabled, only parameters outside it (the classification head) receive gradients.
model.base_model.requires_grad_(False)

In [None]:
# Unfreeze the last 20 encoder layers.
# Slicing from the end replaces the hard-coded index list 28..47. It selects the
# same layers on a 48-layer encoder and stays correct (no IndexError, still the
# final layers) if a checkpoint with a different depth is loaded.
# To unfreeze specific layers instead, index `encoder.layer[i]` directly.
for layer in model.base_model.encoder.layer[-20:]:
    for param in layer.parameters():
        param.requires_grad = True

In [21]:

# Collator that pads the sequences of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Evaluation loader over the test split (no shuffling requested).
val_dataloader = DataLoader(tokenized_test, batch_size=8, collate_fn=data_collator)

# Training loader over the train split with shuffling enabled.
# Adjust batch_size to the available GPU memory.
train_dataloader = DataLoader(
    tokenized_train,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:


# Define training arguments: one epoch, batch size 8 per device for both training
# and evaluation, learning rate 1e-5 with 500 warm-up steps and 0.01 weight decay.
# Evaluation and checkpointing both run once per epoch, and checkpoints are
# written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    warmup_steps=500,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): with compute_metrics disabled below, the "best" model is
    # presumably selected by evaluation loss - confirm.
    load_best_model_at_end=True
    # gradient_accumulation_steps=2,  # Useful for limited VRAM
    # push_to_hub=True
)

# Define Trainer object for training the model.
# The Trainer receives the tokenized datasets and the padding collator directly;
# the train_dataloader / val_dataloader objects built earlier are not passed here.
# NOTE(review): the held-out test split doubles as the evaluation set during
# training - confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    #tokenizer=tokenizer,
    data_collator=data_collator
  # compute_metrics=compute_metrics
)

# Train the model (long-running: a full epoch over the training split)
trainer.train()

# Save the trained model to the local 'model' directory
trainer.save_model('model')



Epoch,Training Loss,Validation Loss


trainer.predict(tokenized_test)

In [None]:
# Predicted class id per test example: run inference over the tokenized test split
# and take the index of the largest score along axis 1.
y_pred = np.argmax(trainer.predict(tokenized_test).predictions, axis=1)



In [None]:
# Reference class ids for the test split, read from its `labels` column.
y_true = tokenized_test['labels']

In [None]:
# Per-class precision / recall / F1 on the test split, with readable class names.
# zero_division=0 reports 0 for any undefined ratio.
print(
    classification_report(
        y_true,
        y_pred,
        target_names=[
            'No Recorded Hospital Readmission Within 30 Days',
            'Readmitted to Hospital Within 30 Days',
        ],
        zero_division=0,
    )
)

                                                 precision    recall  f1-score   support

No Recorded Hospital Readmission Within 30 Days       0.71      0.90      0.80     24575
          Readmitted to Hospital Within 30 Days       0.60      0.29      0.39     12597

                                       accuracy                           0.69     37172
                                      macro avg       0.66      0.60      0.59     37172
                                   weighted avg       0.67      0.69      0.66     37172



# Variant that recomputes the predictions inline (this re-runs inference).
# classification_report needs string class names, while le.classes_ holds the
# original numeric label values, so each one is converted with str().
print(classification_report(
    y_true=tokenized_test['labels'],
    y_pred=np.argmax(trainer.predict(tokenized_test).predictions, axis=1),
    target_names=[str(c) for c in le.classes_]
))

# Variant that reuses the already computed `y_pred`.
# `y_pred` is now passed by keyword: a positional argument after `y_true=...`
# is a SyntaxError. Class names are converted to strings because le.classes_
# holds the original numeric label values.
print(classification_report(
    y_true=tokenized_test['labels'],
    y_pred=y_pred,
    target_names=[str(c) for c in le.classes_]
))

In [None]:
tokenized_train

Dataset({
    features: ['cleaned_text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 148688
})

In [None]:
# Give the test frame back its original column names before export:
# `cleaned_text` becomes `text` and `labels` becomes `HospReadmissionWithin30Days`.
test_df = test_df.rename(
    columns={
        'labels': 'HospReadmissionWithin30Days',
        'cleaned_text': 'text',
    },
)
test_df

Unnamed: 0,text,HospReadmissionWithin30DayssWithin30Days
16313,name unit no admission date discharge date dat...,1
63720,name unit no admission date discharge date dat...,0
155633,name unit no admission date discharge date dat...,0
132922,name unit no admission date discharge date dat...,0
27775,name unit no admission date discharge date dat...,0
...,...,...
15792,name unit no admission date discharge date dat...,0
19798,name unit no admission date discharge date dat...,0
138452,name unit no admission date discharge date dat...,0
46128,name unit no admission date discharge date dat...,0


In [None]:
# Write the renamed test frame to CSV without the index column.
# NOTE(review): the file name mentions "Ypred", but no prediction column (e.g. y_pred)
# is added to test_df in the visible notebook - the exported label column holds the
# reference labels. Confirm whether predictions were meant to be included.
test_df.to_csv('../data/HospHospReadmissionWithin30DayssWithin30DaysOfDischarge_Xtest_Ypred_df_HospitalDischargeNotesCSV_exported_GatorTron_Last20Layers1iter.csv', index=False)