<a href="https://colab.research.google.com/github/InsupCode/ML_Interpretability/blob/main/BERT_doc_classification_Covid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd

In [10]:
# Read the train/test splits from the Colab working directory.
train_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/train_df_transmission.csv",
        "/content/test_df_transmission.csv",
    )
]

In [11]:
train_df.columns

Index(['pmid', 'title', 'abstract', 'keywords', 'label', 'date2',
       'label_category', 'no_of_labels'],
      dtype='object')

In [12]:
train_df.shape

(229, 8)

In [13]:
# Keep only the model input (abstract) and the target (label).
# Explicit copies make these independent frames, so adding the embeddings
# column later does not trigger pandas' SettingWithCopyWarning on a slice.
train_df = train_df[['abstract', 'label']].copy()
test_df = test_df[['abstract', 'label']].copy()

In [14]:
# Sanity check: (rows, columns) of each split after column selection, one per line.
print(train_df.shape, test_df.shape, sep="\n")

(229, 2)
(66, 2)


## BERT embeddings

In [15]:
import numpy as np
from transformers import BertTokenizer, BertModel
import torch

In [16]:
# PubMed model: the tokenizer and the encoder must come from the same
# checkpoint, so the identifier is defined once and reused for both.
PUBMEDBERT_CHECKPOINT = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract'
tokenizer = BertTokenizer.from_pretrained(PUBMEDBERT_CHECKPOINT)
model = BertModel.from_pretrained(PUBMEDBERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [17]:
def apply_bert_embeddings_to_column(df, column_name, tokenizer, model):
    """
    Add a column of BERT [CLS] embeddings computed from a text column.

    Each value of ``df[column_name]`` is tokenized on its own (truncated to at
    most 512 tokens) and passed through ``model``; the hidden state at sequence
    position 0 of ``last_hidden_state`` is stored as a flat NumPy array.

    Parameters:
        df (pd.DataFrame): The dataframe containing the text data. It is
            modified in place: the new column is added to this object.
        column_name (str): The column to which embeddings should be applied.
        tokenizer: The BERT tokenizer (called with ``return_tensors="pt"``).
        model: The BERT model; its output must expose ``last_hidden_state``.

    Returns:
        pd.DataFrame: The same ``df`` object, now carrying a
        ``"<column_name>_bert_embeddings"`` column with one 1-D array per row.
    """
    def generate_embeddings(text):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: disable autograd so no computation graph is built
        # (and kept alive) for every row that is embedded.
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()  # CLS token embeddings
        return embeddings.flatten()

    # Apply embeddings to the specific column (one forward pass per row)
    df[f"{column_name}_bert_embeddings"] = df[column_name].apply(generate_embeddings)
    return df

In [18]:
# Embed every abstract. The helper adds the column to the frame it is given and
# returns that same frame, so train_embeddings/test_embeddings refer to the
# same objects as train_df/test_df.
train_embeddings = apply_bert_embeddings_to_column(train_df, "abstract", tokenizer, model)
test_embeddings = apply_bert_embeddings_to_column(test_df, "abstract", tokenizer, model)

In [19]:
train_embeddings.head()

Unnamed: 0,abstract,label,abstract_bert_embeddings
0,We are beginning to understand how the virus k...,General Info,"[-0.5246747, 0.36198908, 0.37002364, -0.244909..."
1,"To investigate the genetic diversity, time ori...",Transmission,"[-0.35783195, -0.40571737, -0.14951429, 0.0989..."
2,This study is to investigate the clinical char...,Transmission,"[-1.169202, -0.77483773, -0.95409626, 0.020898..."
3,Background: Studies on COVID-19 infection in p...,Transmission,"[-0.5092844, -0.509097, -0.64117104, 0.5976164..."
4,"COVID-19 has unfortunately halted lab work, co...",General Info,"[-0.36893532, -0.058026157, -0.15989025, -1.05..."


In [20]:
test_embeddings.shape

(66, 3)

In [21]:
# Persist both frames without the index column.
# NOTE(review): the embedding column holds NumPy arrays, which CSV stores as
# their text representation - confirm these files can be read back as vectors.
train_embeddings.to_csv(path_or_buf="train_embeddings_bert.csv", index=False)
test_embeddings.to_csv(path_or_buf="test_embeddings_bert.csv", index=False)

## Creating train and test sets for logistic regression

In [22]:
# Turn the per-row embedding arrays into 2-D feature matrices (one row per abstract).
X_train = np.stack(list(train_embeddings['abstract_bert_embeddings']))
X_test = np.stack(list(test_embeddings['abstract_bert_embeddings']))

In [23]:
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the string labels into integer class ids for scikit-learn.
encoder = LabelEncoder()

In [24]:
# Learn the label -> integer mapping from the training labels only, then reuse
# that exact mapping for the test labels. Re-fitting on the test split could
# assign different ids if its label set differed from the training set;
# transform() instead raises on a label that was not seen during fitting.
y_train_enc = encoder.fit_transform(train_embeddings['label'])
y_test_enc = encoder.transform(test_embeddings['label'])

## Model and training

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix


# Logistic regression on top of the frozen BERT embeddings; max_iter is raised
# to 1000. fit() returns the estimator itself, so it can be chained.
bert_lr = LogisticRegression(max_iter=1000).fit(X_train, y_train_enc)
bert_lr

In [23]:
# Predicted integer class ids for the held-out test embeddings.
y_pred = bert_lr.predict(X_test)

In [24]:
# Evaluation on the test split: overall accuracy, per-class metrics, confusion matrix.
print(f"Accuracy: {accuracy_score(y_test_enc, y_pred):.2f}")
print("Classification Report:", classification_report(y_test_enc, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test_enc, y_pred), sep="\n")


Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.86      0.93        36
           1       0.86      1.00      0.92        30

    accuracy                           0.92        66
   macro avg       0.93      0.93      0.92        66
weighted avg       0.94      0.92      0.92        66

Confusion Matrix:
[[31  5]
 [ 0 30]]


In [26]:
# Show which integer id the encoder assigned to each original label.
class_mapping = {
    label: code
    for label, code in zip(encoder.classes_, encoder.transform(encoder.classes_))
}
print("\nClass Mapping:", class_mapping)


Class Mapping: {'General Info': 0, 'Transmission': 1}


In [27]:
# Save the trained logistic-regression classifier as a pickle file
# (written relative to the current working directory).
import pickle
model_pkl_file = "BERT_LR_Doc_classification.pkl"

with open(model_pkl_file, mode='wb') as pkl_out:
    pickle.dump(bert_lr, pkl_out)

## Using LIME for explanation

### Data for explanation

In [30]:
y_test_enc[:15]

array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1])

In [31]:
y_pred[:15]

array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1])

In [32]:
test_embeddings.head()

Unnamed: 0,abstract,label,abstract_bert_embeddings
0,Hubei residents are trying to stay positive as...,General Info,"[-0.5736185, 0.5652146, 0.16511987, -0.4830525..."
1,The objective of this work was to estimate the...,Transmission,"[-0.34339374, -1.01559, -0.47287488, -0.145838..."
2,There is an obvious concern globally regarding...,General Info,"[-0.022629198, -0.54114157, -0.18166055, -0.44..."
3,England is continuing to remove coronavirus re...,General Info,"[-0.4036722, 0.84371, -0.14474271, -0.4557204,..."
4,The novel coronavirus that emerged in Wuhan ha...,General Info,"[-0.41236743, 0.27626747, 0.11037006, -0.31172..."


## Load model using pickle and run LIME

In [25]:
import pickle

# Restore the classifier from the pickle file.
# Unpickling can execute arbitrary code - only load files you produced yourself.
with open('/content/BERT_LR_Doc_classification.pkl', mode='rb') as pkl_in:
    bert_lr = pickle.load(pkl_in)

### LIME

In [2]:
pip install lime



In [26]:
from lime.lime_text import LimeTextExplainer
from transformers import AutoTokenizer
import numpy as np


In [27]:
# Human-readable names shown by LIME, indexed by encoded class id.
# Order must match the encoder mapping: {'General Info': 0, 'Transmission': 1}.
class_names = ['General Info', 'Transmission']

In [28]:
def predict_proba(texts, batch_size=32):
    """
    Process raw texts, generate embeddings, and output probabilities.

    This is the classifier function handed to LIME. LIME's explain_instance
    passes all perturbed samples in one call (num_samples defaults to 5000),
    so the texts are pushed through BERT in small batches rather than as one
    giant padded tensor.

    Parameters:
        texts (str or sequence of str): Raw text(s) to classify.
        batch_size (int): Number of texts embedded per forward pass.

    Returns:
        np.ndarray: Array with one row per text and one column per class of
        ``bert_lr``, as returned by ``bert_lr.predict_proba``.
    """
    # A bare string is a single document, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    # Nothing to embed: return an empty (0, n_classes) array.
    if not texts:
        return np.empty((0, len(bert_lr.classes_)))

    batch_probs = []
    # Inference only: no autograd graph is needed for the forward passes.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Tokenize and generate embeddings
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()  # CLS token embeddings

            # Predict probabilities using the trained classifier
            batch_probs.append(bert_lr.predict_proba(embeddings))

    return np.vstack(batch_probs)

In [29]:
# Initialize the LIME Text Explainer.
# LIME samples random perturbations of the text; a fixed random_state makes
# the explanation reproducible across runs.
explainer = LimeTextExplainer(class_names=class_names, random_state=42)

In [30]:
# Example: pick the abstract at index label 1 of the test set as the instance to explain.
text_instance = test_df.loc[1, 'abstract']

In [31]:
text_instance

'The objective of this work was to estimate the incubation period and the serial interval of Covid-19 from a sample of symptomatic patients in Bahia Blanca city during the period March-May 2020. We collected dates of illness onset for primary cases and secondary cases for the first 18 secondary patients infected with SARS-Cov-2. Estimations of incubation period are based on a log-normal distribution while we assume a Gamma distribution for the serial interval. In both cases maximum likelihood estimator was applied to estimate main parameters. Of the total of 18 cases of local transmission analyzed, 17% occurred in the presymptomatic and asymptomatic phase. The mean incubation period for symptomatic patients is 7.9 days (95%CI: 4.6, 11.1) considering the full sample and 7.5 days (95%CI: 4.1, 10.9) if the sample is restricted to the most certain cases. The median is 6.1 and 5.8 days respectively. The point estimation for the mean serial interval is 6.8 days (95%CI: 4.0-9.6). or 5.5 days 

In [None]:
# Generate the explanation: predict_proba maps raw text to class probabilities,
# and num_features=10 limits the explanation to 10 highlighted words.
explanation = explainer.explain_instance(text_instance, predict_proba, num_features=10)

# Display the explanation inline in the notebook
explanation.show_in_notebook()