## Text Classification Prediction Explanations

In this notebook, we'll see some methods of explaining the predictions of a text classification model.

In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter 
from scipy.spatial.distance import pdist


In [2]:
import accuracy

In [3]:
from sklearn import metrics

def compute_metrics(p):
    """Score a prediction bundle by plain accuracy.

    The predicted class of each row is the arg-max over axis 1 of
    ``p.predictions``. Those classes are compared with ``p.label_ids`` using
    ``sklearn.metrics.accuracy_score`` and the resulting score is returned
    as-is.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    score = metrics.accuracy_score(y_true=p.label_ids, y_pred=predicted_classes)
    return score


In [4]:
import torch
from datasets import Dataset
from accelerate import Accelerator
import tqdm as notebook_tqdm

In [5]:
from bs4 import BeautifulSoup
import re
from sklearn import preprocessing

In [6]:
#  from gensim.models import word2vec

In [7]:
from transformers import AutoConfig, Pipeline, RobertaTokenizer, RobertaModel, AutoTokenizer, DistilBertModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, DistilBertConfig, DistilBertTokenizer, DistilBertTokenizerFast, DistilBertPreTrainedModel, DistilBertForTokenClassification, DistilBertForSequenceClassification
import evaluate

In [8]:
from sklearn import metrics

In [9]:
# Load the "accuracy" metric through the `evaluate` library; the compute_metrics
# function defined right below calls metric.compute() on it.
metric = evaluate.load("accuracy")

def compute_metrics(p):
    """Evaluate the loaded ``metric`` on a prediction bundle.

    The predicted class of each row is the arg-max over axis 1 of
    ``p.predictions``. Those classes are passed to ``metric.compute`` as
    ``predictions`` together with ``p.label_ids`` as ``references``; whatever
    ``metric.compute`` returns is handed back unchanged.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    result = metric.compute(predictions=predicted_classes, references=p.label_ids)
    return result

In [None]:
# Read the source CSV (path is relative to the notebook's working directory) into a DataFrame.
HospitalDischargeNotes = pd.read_csv("../data/Merged_DischargeNotes_ReadmissionEnsue_sparse.csv")

In [None]:
# Display the loaded frame; the rendered preview shows the `text` and `ReadmissionEnsue` columns.
HospitalDischargeNotes  

Unnamed: 0,text,ReadmissionEnsue
0,\nName: ___ Unit No: _...,1
1,\nName: ___ Unit No: _...,1
2,\nName: ___ Unit No: _...,1
3,\nName: ___ Unit No: _...,0
4,\nName: ___ Unit No: __...,0
...,...,...
347360,\nName: ___ Unit No: ___...,1
347361,\nName: ___ Unit No: ___...,0
347362,\nName: ___ Unit No: ___\...,1
347363,\nName: ___ Unit No: ___\...,0


In [None]:
class TextCleaner():
    """Normalise raw note text into lowercase ASCII words separated by single spaces."""

    def __init__(self):
        pass

    def clean_text(self, text):
        """Return a cleaned copy of ``text``.

        Steps, in order:
          1. lowercase and drop every non-ASCII character;
          2. blank out link-like tokens;
          3. delete ``<..._:>`` style markers;
          4. delete every character that is not an ASCII letter, digit or
             whitespace (this also removes apostrophes and underscores);
          5. delete every token that contains a digit;
          6. collapse whitespace runs to one space and strip both ends.

        Parameters
        ----------
        text : str
            Raw text. Any non-string value (for example ``None`` or a NaN from
            an empty CSV cell) is treated as missing.

        Returns
        -------
        str
            The cleaned text; ``''`` for missing or fully-removed input.
        """
        # Missing cells arrive as None/float NaN and have no .lower(); map them to
        # an empty document instead of raising AttributeError in the middle of .apply().
        if not isinstance(text, str):
            return ''

        text = text.lower()  # lowercase everything
        text = text.encode('ascii', 'ignore').decode()  # remove non-ASCII characters

        # Remove links. The second pattern is broader than it looks: it matches any
        # "htt" followed by non-space characters. It is kept so the cleaned output
        # of existing data does not change.
        text = re.sub(r'https*\S+', ' ', text)
        text = re.sub(r'http*\S+', ' ', text)
        text = re.sub(r'<.*?_:>', '', text)

        # Only letters, digits and whitespace survive this step, so no later
        # apostrophe- or punctuation-based pattern could ever match.
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

        # Drop tokens containing digits. This can leave gaps at either end of the
        # string, so whitespace is normalised *after* it rather than before.
        text = re.sub(r'\w*\d+\w*', '', text)
        return re.sub(r"\s+", " ", text).strip()
    
# Instantiate the cleaner and store the cleaned version of every note in a new
# 'cleaned_text' column (element-wise via Series.apply).
cleaner = TextCleaner()
HospitalDischargeNotes['cleaned_text'] = HospitalDischargeNotes['text'].apply(cleaner.clean_text)

In [None]:
# Fit a LabelEncoder on the ReadmissionEnsue column and store the encoded values in 'labels'.
# `le` is kept around because le.classes_ is used later to size the classification head.
le = preprocessing.LabelEncoder()
HospitalDischargeNotes['labels'] = le.fit_transform(HospitalDischargeNotes['ReadmissionEnsue'].tolist())

In [None]:
# Drop the raw columns now that 'cleaned_text' and 'labels' stand in for them.
# NOTE(review): the frame is rebound to the same name, so running this cell a second
# time raises KeyError because the two columns are already gone.
HospitalDischargeNotes = HospitalDischargeNotes.drop(['text', 'ReadmissionEnsue'], axis=1)

In [None]:
# 80/20 shuffled split, stratified on the encoded label, with a fixed seed.
train_df, test_df = train_test_split(
    HospitalDischargeNotes,
    test_size=0.2,
    stratify=HospitalDischargeNotes['labels'],
    shuffle=True,
    random_state=321,
)

In [None]:
# Load the tokenizer and the model configuration for the GatorTron checkpoint.
# NOTE(review): `config` is not referenced by any later cell visible in this notebook.
tokenizer = AutoTokenizer.from_pretrained("UFNLP/gatortron-medium")
config=AutoConfig.from_pretrained('UFNLP/gatortron-medium')

# Wrap the pandas splits as `datasets.Dataset` objects so they can be tokenized with .map().
# The DataFrame index travels along as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

def tokenize_data(examples):
    """Tokenize one batch of cleaned notes.

    Parameters
    ----------
    examples : mapping
        A batch as supplied by ``Dataset.map(..., batched=True)``; only the
        ``"cleaned_text"`` entry (a list of strings) is read.

    Returns
    -------
    The tokenizer output for the batch, truncated to at most 512 tokens per
    note.

    Notes
    -----
    No padding is applied and no tensors are requested here. The Trainer is
    given a ``DataCollatorWithPadding``, which pads each training/eval batch to
    its own longest sequence. Padding inside ``map`` would instead pad every
    note to the longest note of the whole map batch, and tensors returned from
    ``map`` are converted back to plain lists when the dataset is stored.
    """
    return tokenizer(examples["cleaned_text"], truncation=True, max_length=512)

# Apply tokenize_data to both splits in batches; the tokenizer outputs are added as new columns.
tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/265434 [00:00<?, ? examples/s]

Map:   0%|          | 0/66359 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for a ``(predictions, labels)`` evaluation pair.

    Parameters
    ----------
    eval_pred : pair
        Unpacked as ``(predictions, labels)``; ``predictions`` holds per-class
        scores and is arg-maxed along axis 1 to get one class id per row.

    Returns
    -------
    Whatever ``metric.compute`` returns for those class ids against ``labels``.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # Use the `metric` object created earlier with evaluate.load("accuracy").
    # The bare name `accuracy` is bound to an imported module in this notebook,
    # not to a metric, so it cannot be used here.
    return metric.compute(predictions=predictions, references=labels)

import os
# Set PyTorch's MPS high-watermark ratio through the process environment.
# NOTE(review): a ratio of 0.0 is documented by PyTorch as disabling the allocation
# limit — confirm that is intended, and that this runs before MPS memory is first used.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
# NOTE(review): the line below only binds a Python variable of the same name; it does
# not change the environment.
PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

In [None]:
# Load the pre-trained GatorTron checkpoint for sequence classification, with one output
# per encoded label class. The classification head is newly initialised (see the warning
# printed below this cell), so the model has to be fine-tuned before its predictions mean anything.
# Author's note: budget roughly 133~411 hours for this checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("UFNLP/gatortron-medium", num_labels=len(le.classes_)) # id2label / label2id are not supplied

# Collator that pads the sequences of each batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration: a single epoch at batch size 1, with evaluation and checkpointing
# once per epoch; checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    warmup_steps=500,
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
    # push_to_hub=True
)

# Build the Trainer. The test split doubles as the evaluation set.
# No compute_metrics is passed, so evaluation reports only the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
  # tokenizer=tokenizer,
    data_collator=data_collator
  # compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

# Save the trained model to the local 'model' directory
trainer.save_model('model')

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at UFNLP/gatortron-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Run inference on the tokenized test split; the displayed PredictionOutput carries
# `predictions`, `label_ids` and `metrics`.
trainer.predict(tokenized_test)



PredictionOutput(predictions=array([[-2.6514592 ,  0.7948754 , -2.8717432 , ..., -2.0544043 ,
         2.3956332 ,  0.62176746],
       [-2.7259252 ,  1.6522938 , -3.179516  , ..., -2.1251814 ,
        -0.07951368,  1.5783473 ],
       [-2.3269641 ,  0.89061105, -2.7617931 , ..., -1.9125942 ,
         1.4845914 ,  0.66674936],
       ...,
       [-1.0550561 ,  0.03162514, -2.9278007 , ..., -2.1041276 ,
        -1.0636228 ,  0.0074897 ],
       [-0.07444175, -0.9415072 , -2.2345665 , ..., -1.0343314 ,
         4.449569  , -0.94645566],
       [-2.8205743 ,  0.7177669 , -2.4654167 , ..., -1.9858145 ,
         1.8188262 , -0.60201883]], shape=(182315, 25), dtype=float32), label_ids=array([18, 11, 18, ...,  9, 11, 11], shape=(182315,)), metrics={'test_loss': 0.8415372967720032, 'test_runtime': 1570.1474, 'test_samples_per_second': 116.113, 'test_steps_per_second': 14.515})

In [None]:
# Predicted class id per test row: arg-max over axis 1 of the predictions array.
# NOTE(review): this calls trainer.predict() again instead of reusing an earlier result.
np.argmax(trainer.predict(tokenized_test).predictions, axis=1)

array([18, 11, 18, ...,  9, 11, 11], shape=(182315,))

In [None]:
# Run inference once and reuse the result: every trainer.predict() call is a full pass
# over the test split.
test_output = trainer.predict(tokenized_test)
y_test = test_output.label_ids                        # reference labels of the test split
y_pred = np.argmax(test_output.predictions, axis=1)   # predicted class id per row

# Display names for the report, listed in encoded-label order (0 first, then 1).
# classification_report needs string names, so the raw values in le.classes_ are not used.
# NOTE(review): assumes ReadmissionEnsue == 0 means "no readmission" — confirm.
readmission_class_names = [
    'No Recorded Hospital Readmission Ensued',
    'Followed by Readmission to Hospital',
]

print(classification_report(y_test, y_pred, zero_division=0, target_names=readmission_class_names))

In [None]:
# Inspect the tokenized training split (feature names and row count).
tokenized_train

Dataset({
    features: ['cleaned_text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 729257
})

In [None]:
# Rename the working columns back to their original names ahead of the CSV export, then display the frame.
# NOTE(review): no prediction column is added to test_df in the cells shown here, although
# the export file name mentions "Ypred" — confirm whether predictions should be attached first.
test_df = test_df.rename(columns={'cleaned_text': 'text', 'labels': 'ReadmissionEnsue'})
test_df

Unnamed: 0,Consumer complaint narrative,Issue
684406,i am attaching a copy of the letter i mailed t...,18
652421,xxxx xxxx xxxx credit card company closed our ...,11
14724,i have no clue whats going on so i have no cho...,18
295539,over the last 21 months i have on time payment...,6
140132,transunion is not correcting my name nor incor...,11
...,...,...
812163,i submitted a letter to the xxxx credit bureau...,18
38424,i value your help to removed a portion of the ...,11
575155,xxxx xxxx xxxx xxxx xxxx xxxx reporting late p...,9
512484,i opened a line of credit with upgrade on xxxx...,11


In [None]:
# Write test_df to CSV under ../data, omitting the index column.
test_df.to_csv('../data/Xtest_Ypred_df_HospitalDischargeNotesCSV_exported_GatorTron_3iter.csv', index=False)