In [1]:
import json
import sys
from os import listdir
from os.path import isfile, join
import re
import string
import pandas as pd
import numpy as np

import time
from tqdm import tqdm
from typing import List

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset

import transformers
from transformers import AutoTokenizer, AutoModel, utils
from datasets import Dataset
transformers.logging.set_verbosity_error()
utils.logging.set_verbosity_error()  # Suppress standard warnings

from bertviz import model_view, head_view


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split


from eli5.lime import TextExplainer
from captum.attr import IntegratedGradients
import matplotlib.pyplot as plt

## Data

In [2]:
# Read the three pre-split sentence-level CSV files into DataFrames.
train_data, test_data, val_data = map(
    pd.read_csv,
    ('train_sentence_data.csv', 'test_sentence_data.csv', 'val_sentence_data.csv'),
)

In [3]:
# The 'File_id' column is not used downstream; remove it from every split.
train_data, test_data, val_data = (
    frame.drop(columns=['File_id'])
    for frame in (train_data, test_data, val_data)
)

In [4]:
# Shuffle the rows of every split (frac=1 keeps all rows, only the order changes).
# A fixed seed makes the row order -- and therefore the CSV files written from
# these frames -- identical on every Restart & Run All.
SHUFFLE_SEED = 42
train_data = train_data.sample(frac=1, random_state=SHUFFLE_SEED)
test_data = test_data.sample(frac=1, random_state=SHUFFLE_SEED)
val_data = val_data.sample(frac=1, random_state=SHUFFLE_SEED)

In [5]:
# Persist the prepared splits (without the pandas index) so they can be
# re-read from disk further down.
for frame, csv_path in (
    (train_data, 'Train_auto.csv'),
    (test_data, 'Test_auto.csv'),
    (val_data, 'val_auto.csv'),
):
    frame.to_csv(csv_path, index=False)

In [6]:
# Class balance of the test split.
test_data['Status'].value_counts()

0    9108
1    5940
Name: Status, dtype: int64

In [7]:
def _load_split(csv_path):
    """Load one CSV split as a Dataset with 'Sentence'/'Status' renamed to 'text'/'label'."""
    # 'text' and 'label' are the column names expected for sequence classification.
    return (
        Dataset.from_csv(csv_path)
        .rename_column('Sentence', 'text')
        .rename_column('Status', 'label')
    )


# Load the train, validation and test splits from their CSV files.
train_dataset = _load_split('Train_auto.csv')
val_dataset = _load_split('val_auto.csv')
test_dataset = _load_split('Test_auto.csv')


Downloading and preparing dataset csv/default to /Users/admin/.cache/huggingface/datasets/csv/default-18fd2725423c198a/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/admin/.cache/huggingface/datasets/csv/default-18fd2725423c198a/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /Users/admin/.cache/huggingface/datasets/csv/default-cb4b0c7dc1db0543/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/admin/.cache/huggingface/datasets/csv/default-cb4b0c7dc1db0543/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /Users/admin/.cache/huggingface/datasets/csv/default-c3825700183e4555/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/admin/.cache/huggingface/datasets/csv/default-c3825700183e4555/0.0.0. Subsequent calls will reuse this data.


In [14]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Tokenizer that belongs to the pretrained checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(
    'sentence-transformers/bert-base-nli-mean-tokens'
)

# Batched tokenization callback for Dataset.map.
def tokenize(batch):
    """Tokenize the 'text' entries of ``batch``.

    Sequences are truncated to at most 512 tokens and padded to the longest
    sequence in the batch. Returns the tokenizer's output for the batch.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True, max_length=512)

# Tokenize the train, validation and test datasets. Each split is processed as
# a single batch (batch_size == number of rows), so padding=True inside
# `tokenize` pads every example to the longest sequence of its split.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize, batched=True, batch_size=len(split))
    for split in (train_dataset, val_dataset, test_dataset)
)


Map:   0%|          | 0/138204 [00:00<?, ? examples/s]

Map:   0%|          | 0/15444 [00:00<?, ? examples/s]

Map:   0%|          | 0/15048 [00:00<?, ? examples/s]

In [15]:
# Same checkpoint as the tokenizer, loaded for two-label sequence classification.
model = AutoModelForSequenceClassification.from_pretrained(
    'sentence-transformers/bert-base-nli-mean-tokens',
    num_labels=2,
)

In [16]:
# Hyper-parameters for the fine-tuning run.
_training_kwargs = {
    'output_dir': './results',            # where the run's outputs are written
    'num_train_epochs': 3,                # total number of training epochs
    'per_device_train_batch_size': 16,    # batch size per device while training
    'per_device_eval_batch_size': 64,     # batch size per device while evaluating
    'warmup_steps': 500,                  # warmup steps of the learning-rate scheduler
    'weight_decay': 0.01,                 # strength of weight decay
    'logging_dir': './logs',              # where logs are stored
    'logging_steps': 10,                  # log every 10 steps
}
training_args = TrainingArguments(**_training_kwargs)

In [None]:
# Fine-tune `model` on the training split; the validation split is registered
# as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

{'loss': 0.6888, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 0.6965, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


In [None]:
# Score the fine-tuned model on the held-out test split and print the metrics.
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print(eval_results)

In [None]:
def model_adapter(texts: List[str], batch_size: int = 64) -> np.ndarray:
    """Return class probabilities of the fine-tuned ``model`` for raw texts.

    This is the ``predict_proba``-style callable handed to
    ``TextExplainer.fit``: it takes strings and returns one row of softmax
    probabilities per string.

    Args:
        texts: Raw input strings to score.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        Array of shape ``(len(texts), num_labels)``.
    """
    texts = list(texts)  # the tokenizer expects a plain list of str
    if not texts:
        # Keep the 2-D shape callers expect even when there is nothing to score.
        return np.empty((0, model.config.num_labels))

    # Inputs must live on the same device as the model (Trainer may have moved
    # it to an accelerator), otherwise the forward pass raises.
    device = model.device

    # Inference only: switch off dropout and restore the previous mode afterwards.
    was_training = model.training
    model.eval()

    all_scores = []
    try:
        # No gradients are needed, so do not build the autograd graph.
        with torch.no_grad():
            for start in tqdm(range(0, len(texts), batch_size)):
                batch = texts[start:start + batch_size]

                # use bert encoder to tokenize text
                encoded_input = tokenizer(
                    batch,
                    return_tensors='pt',
                    padding=True,
                    truncation=True,
                    max_length=model.config.max_position_embeddings - 2,
                ).to(device)

                # run the model
                output = model(**encoded_input)

                # The model returns raw logits, so apply the softmax ourselves;
                # .cpu() is required before .numpy() for accelerator tensors.
                scores = output[0].softmax(1).cpu().numpy()
                all_scores.extend(scores)
    finally:
        model.train(was_training)

    return np.array(all_scores)

In [None]:
# Sentences to explain and their labels, taken from the test split.
sen, lab = test_data['Sentence'], test_data['Status']

In [None]:
# Explain the fine-tuned model's predictions with LIME, one sentence at a time.
# Every fit scores n_samples texts through model_adapter, so a pass over the
# whole test split is very expensive: set N_EXPLAIN to a small integer for a
# quick run (None = every sentence, the original behaviour).
N_EXPLAIN = None

te = TextExplainer(n_samples=5000, random_state=42)
class_names = list(model.config.id2label.values())

# A bare expression inside a loop is neither displayed nor stored, so keep each
# explanation in a list; display one with e.g. `explanations[0]`.
explanations = []
for sentence in sen.iloc[:N_EXPLAIN]:
    te.fit(sentence, model_adapter)
    explanations.append(te.explain_prediction(target_names=class_names))

In [None]:
# Feature weights of the explainer as it was last fitted.
label_names = list(model.config.id2label.values())
te.explain_weights(target_names=label_names)

In [None]:
# Display the `metrics_` attribute of the explainer as last fitted
# (presumably quality metrics of the explanation -- confirm against the eli5 docs).
te.metrics_ 

## SVM CLASSIFIER

In [None]:
class CustomEmbedding(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that turns raw sentences into BERT sentence embeddings.

    Each text is tokenized, run through the pretrained encoder and reduced to a
    single vector by attention-mask-weighted mean pooling of the token
    embeddings. The transformer is stateless: ``fit`` learns nothing.
    """

    def __init__(self):
        self.model_name = "sentence-transformers/bert-base-nli-mean-tokens"
        # Configure the model to also return attention values.
        self.model = AutoModel.from_pretrained(self.model_name, output_attentions=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print('\n>>>>>>>init() called.\n')

    def fit(self, X, y = None):
        """No-op fit (the encoder is pretrained); returns ``self`` for pipeline use."""
        print('\n>>>>>>>fit() called.\n')
        return self

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings over the sequence axis, ignoring masked positions.

        Args:
            model_output: Model output whose first element holds the token embeddings.
            attention_mask: Mask with 1 for real tokens and 0 for padding.

        Returns:
            Tensor with one pooled embedding per input sequence.
        """
        token_embeddings = model_output[0]  # first element contains all token embeddings
        # Broadcast the mask over the embedding dimension so it can weight each token.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count to avoid dividing by zero for an all-zero mask.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def bert(self, text):
        """Embed ``text`` and return its sentence embedding as a list of floats.

        The text is tokenized once, truncated to the model's maximum sequence
        length, and the same encoding is used both for the forward pass and for
        pooling, so token embeddings and attention mask always agree in shape.
        """
        encoded_input = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=self.model.config.max_position_embeddings,
            return_tensors='pt',
        )
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**encoded_input)
        sentence_embeddings = self.mean_pooling(outputs, encoded_input['attention_mask'])
        return sentence_embeddings.numpy()[0].tolist()

    def transform(self, X, y = None):
        """Return a list with one embedding (list of floats) per item of ``X``."""
        return [self.bert(text) for text in X]
        

In [None]:
# Two-step pipeline: BERT sentence embeddings feeding an RBF-kernel SVM.
# Instantiating CustomEmbedding here triggers its __init__.
embedding_step = ('Bert Embeddings', CustomEmbedding())
# probability=True enables predict_proba on the classifier.
classifier_step = ('Support Vector Classifier', SVC(kernel='rbf', probability=True))
pipe2 = Pipeline(steps=[embedding_step, classifier_step])


In [None]:
# Show the column names of the training frame.
train_data.columns

In [None]:
# Fit the embedding + SVM pipeline on the training sentences and their labels.
text, label = train_data['Sentence'], train_data['Status']
pipe2.fit(text, label)

In [None]:
# Predict labels for the test sentences with the fitted pipeline.
te_se, lab = test_data['Sentence'], test_data['Status']
preds2 = pipe2.predict(te_se)

In [None]:
# Class labels known to the fitted pipeline; their order is the column order
# of predict_proba.
pipe2.classes_

In [None]:
# Display the labels predicted for the test sentences.
preds2

In [None]:
def print_prediction(doc):
    """Print ``doc`` followed by one line per class: probability, then class name.

    Probabilities come from ``pipe2.predict_proba`` and are paired, in order,
    with the names 'Reject' and 'Accept'.
    """
    print(doc)
    class_names = ['Reject', 'Accept']
    probabilities = pipe2.predict_proba([doc])[0]
    for name, probability in zip(class_names, probabilities):
        print(f"{probability:.3f} {name}")

In [None]:
# Explain the SVM pipeline's prediction for a single example sentence.
doc = 'however, models that make use of this strategy eventually fail after a certain level of complexity (e'
target = ['Reject', 'Accept']  # display names passed to the explainer

te = TextExplainer(random_state=42)
te.fit(doc, pipe2.predict_proba)
te.show_prediction(target_names=target)

In [None]:
# Feature weights of the explainer fitted on the SVM pipeline, labelled with `target`.
te.explain_weights(target_names=target)

In [None]:
# Display the `metrics_` attribute of the explainer fitted on the SVM pipeline
# (presumably quality metrics of the explanation -- confirm against the eli5 docs).
te.metrics_ 

#### Integrated Gradients