In [1]:
import json
import sys
from os import listdir
from os.path import isfile, join
import re
import string
import pandas as pd
import numpy as np

import time
from tqdm import tqdm
from typing import List

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
# from keras.preprocessing.sequence import pad_sequences

import transformers
from transformers import AutoTokenizer, AutoModel, utils
from transformers import AutoTokenizer, AutoModelForSequenceClassification,Trainer, TrainingArguments
from datasets import Dataset
transformers.logging.set_verbosity_error()
utils.logging.set_verbosity_error()  # Suppress standard warnings

from bertviz import model_view, head_view


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


import eli5
from eli5.lime import TextExplainer
# from captum.attr import IntegratedGradients
# import matplotlib.pyplot as plt


from wordcloud import WordCloud
from PIL import Image
import matplotlib.pyplot as plt

## Data

In [2]:
train_data = pd.read_csv('train_sentence_data.csv')
test_data = pd.read_csv('test_sentence_data.csv')
val_data = pd.read_csv('val_sentence_data.csv')

In [3]:
train_data = train_data.drop(['File_id'],axis=1)
test_data = test_data.drop(['File_id'],axis=1)
val_data = val_data.drop(['File_id'],axis=1)

In [5]:
# train_data=train_data.sample(frac = 1)
# test_data=test_data.sample(frac = 1)
# val_data=val_data.sample(frac = 1)

In [6]:
def get_20(data):
    data_1 = data.loc[data['Status'] ==1].iloc[:10]
    data_2 = data.loc[data['Status'] ==0].iloc[:10]
    frames = [data_1, data_2]
    return pd.concat(frames)

train_data = get_20(train_data) 
test_data = get_20(test_data)
val_data = get_20(val_data)

In [7]:
train_data.to_csv('Train_auto.csv',index=False)
test_data.to_csv('Test_auto.csv',index=False)
val_data.to_csv('val_auto.csv',index=False)

In [8]:
train_data.Status.value_counts()

1    10
0    10
Name: Status, dtype: int64

In [9]:
# Load train and validation datasets from CSV files
train_dataset = Dataset.from_csv('Train_auto.csv')
val_dataset = Dataset.from_csv('val_auto.csv')
test_dataset = Dataset.from_csv('Test_auto.csv')

# Rename the columns to 'text' and 'label' to match the expected format for sequence classification
train_dataset = train_dataset.rename_column('Sentence', 'text').rename_column('Status', 'label')
val_dataset = val_dataset.rename_column('Sentence', 'text').rename_column('Status', 'label')
test_dataset = test_dataset.rename_column('Sentence', 'text').rename_column('Status', 'label')


Downloading and preparing dataset csv/default to /Users/admin/.cache/huggingface/datasets/csv/default-2f8aa1df66ec4f51/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/admin/.cache/huggingface/datasets/csv/default-2f8aa1df66ec4f51/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /Users/admin/.cache/huggingface/datasets/csv/default-8386e1c1a2d603cd/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/admin/.cache/huggingface/datasets/csv/default-8386e1c1a2d603cd/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /Users/admin/.cache/huggingface/datasets/csv/default-25a2f21b0e0aafd6/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/admin/.cache/huggingface/datasets/csv/default-25a2f21b0e0aafd6/0.0.0. Subsequent calls will reuse this data.


In [10]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/allenai-specter')

# Define a function to tokenize the text and create input sequences
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True,max_length=512)

# Apply the tokenization function to the train and validation datasets
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [11]:
model = AutoModelForSequenceClassification.from_pretrained('sentence-transformers/allenai-specter', num_labels=2)

In [12]:
# Train the model
training_args = TrainingArguments(
    output_dir='./results/specter',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

In [13]:
trainer = Trainer(
    model=model,                     # the instantiated Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    eval_dataset=val_dataset,        # evaluation dataset
)
trainer.train()



{'train_runtime': 31.9613, 'train_samples_per_second': 1.877, 'train_steps_per_second': 0.188, 'train_loss': 0.7047828038533529, 'epoch': 3.0}


TrainOutput(global_step=6, training_loss=0.7047828038533529, metrics={'train_runtime': 31.9613, 'train_samples_per_second': 1.877, 'train_steps_per_second': 0.188, 'train_loss': 0.7047828038533529, 'epoch': 3.0})

In [14]:
# Evaluate the model
eval_results = trainer.evaluate(test_dataset)
print(eval_results)

{'eval_loss': 0.7017756104469299, 'eval_runtime': 2.9702, 'eval_samples_per_second': 6.734, 'eval_steps_per_second': 0.337, 'epoch': 3.0}
{'eval_loss': 0.7017756104469299, 'eval_runtime': 2.9702, 'eval_samples_per_second': 6.734, 'eval_steps_per_second': 0.337, 'epoch': 3.0}


In [15]:
def model_adapter(texts: List[str]):
    
    all_scores = []

    for i in tqdm(range(0, len(texts), 64)):

        batch = texts[i:i+64]
        #print(batch)
        
        # use bert encoder to tokenize text 
        encoded_input = tokenizer(batch, 
          return_tensors='pt', 
          padding=True, 
          truncation=True, 
          max_length=model.config.max_position_embeddings-2)

        # run the model
        output = model(**encoded_input)
        #print(output)
        # by default this model gives raw logits rather 
        # than a nice smooth softmax so we apply it ourselves here
        scores = output[0].softmax(1).detach().numpy()
        #print(scores)

        all_scores.extend(scores)

    return np.array(all_scores)

In [16]:
sen = test_data.Sentence
lab = test_data.Status

In [17]:
specter_exp = TextExplainer(n_samples=5000, random_state=42)
specter_exp.fit(sen[0], model_adapter)
specter_exp.explain_prediction(target_names=list(model.config.id2label.values()))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 79/79 [03:01<00:00,  2.29s/it]


Contribution?,Feature
0.122,<BIAS>
0.091,Highlighted in text (sum)


In [18]:
weights = specter_exp.explain_weights()




## very inportant

In [19]:
# print(weights.targets[0].feature_weights.pos[0].weight,weights.targets[0].feature_weights.pos[0].feature)

In [20]:
# d = specter_exp.explain_prediction()

### Logistic Regression

In [21]:
data = pd.read_csv("./complete_sentence/train_processed_data.csv")

In [22]:
data = get_20(data)

In [23]:
data.head()

Unnamed: 0,File_id,Paper_text,Status
0,304.pdf.json,training neural networks to synthesize robust ...,1
1,305.pdf.json,data compression is a fundamental and well-stu...,1
2,306.pdf.json,deep learning has shown great success in a var...,1
3,307.pdf.json,the most useful applications of dialog systems...,1
4,308.pdf.json,generative adversarial networks (gans)(goodfel...,1


In [24]:
text = data.Paper_text.values.tolist()
status = data.Status.values.tolist()

In [25]:
class Padding(BaseEstimator, TransformerMixin):
    
    def __init__(self,model,tokenizer):
        self.model = model # Configure model to return attention values
        self.tokenizer = tokenizer
        self.mxlenght = 400
        print('\n>>>>>>>init() called.\n')

    def fit(self, X, y = None):
        print('\n>>>>>>>fit() called.\n')
        return self
    

    def transform(self, X, y = None):
        embeddings = []
        for i in tqdm(X):
            temp = [] 
            sentence_list = i.split(".")
            for i in sentence_list:
                if len(i)==0:
                    continue
                encoded_input = tokenizer(i,return_tensors='pt', padding=True, truncation=True,max_length=model.config.max_position_embeddings-2)
                output = model(**encoded_input)
                pred = np.argmax(output[0].softmax(1).detach().numpy())
                temp.append(pred)
            size = self.mxlenght - len(temp)
            if size > 0:
                temp.extend([-1]*size)
            elif size < 0:
                temp = temp[0:self.mxlenght]
            else:
                pass
            embeddings.append(temp)
        return embeddings

In [26]:
pipe1 = Pipeline(steps=[
                       ('Documnet Embeddings', Padding(model,tokenizer)), # this will trigger a call to __init__
                       ('Logistic Regression', LogisticRegression(solver='lbfgs')),

])

pipe1.fit(text, status)


>>>>>>>init() called.


>>>>>>>fit() called.



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [37:37<00:00, 112.86s/it]


Pipeline(steps=[('Documnet Embeddings',
                 Padding(model=BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31116, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inpl...
                         tokenizer=BertTokenizerFast(name_or_path='sentence-transformers/allenai-specter', vocab_size=31116, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True))),
                ('Logistic Regression', LogisticRegression())])

In [27]:
status[0]

1

In [None]:
target =['Reject','Accept']
doc = text[0]
pipe1_exp = TextExplainer(n_samples=5000,random_state=42)
pipe1_exp.fit(doc, pipe1.predict_proba)
pipe1_exp.show_prediction(target_names= target)

 25%|██████████████████████▉                                                                     | 1249/5000 [15:10:14<32:31:38, 31.22s/it]

In [None]:
weights = exp.explain_weights()
positivie_words= {}

for i in weights.targets[0].feature_weights.pos:
    #print(i.feature)
    g = positivie_words.get(i.feature,-1)
    if g==-1:
        positivie_words[i.feature]=1
    else:
        positivie_words[i.feature]+=1
        

negative_words= {}

for i in weights.targets[0].feature_weights.neg:
    #print(i.feature)
    g = negative_words.get(i.feature,-1)
    if g==-1:
        negative_words[i.feature]=1
    else:
        negative_words[i.feature]+=1
# print(weights.targets[0].feature_weights.pos[0].weight,weights.targets[0].feature_weights.pos[0].feature)

In [None]:
wc = WordCloud(background_color="white",width=1000,height=1000,relative_scaling=0.5,normalize_plurals=False).generate_from_frequencies(positivie_words)
plt.imshow(wc)

wc = WordCloud(background_color="white",width=1000,height=1000,relative_scaling=0.5,normalize_plurals=False).generate_from_frequencies(negative_words)
plt.imshow(wc)

In [None]:
def print_prediction(doc):
    print(doc)
    y_pred = pipe1.predict_proba([doc])[0]
    tar =['Reject','Accept']
    for target, prob in zip(tar, y_pred):
        print("{:.3f} {}".format(prob, target))

In [None]:
print_prediction(text[0])

## SVM CLASSIFIER

In [None]:
class CustomEmbedding(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.model_name = "sentence-transformers/bert-base-nli-mean-tokens"
        self.model = AutoModel.from_pretrained(self.model_name, output_attentions=True)  # Configure model to return attention values
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print('\n>>>>>>>init() called.\n')

    def fit(self, X, y = None):
        print('\n>>>>>>>fit() called.\n')
        return self
    
    def mean_pooling(self,model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


    def bert(self,text):
        encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        inputs = self.tokenizer.encode(text, return_tensors='pt')  # Tokenize input text

        outputs = self.model(inputs)  # Run model
        attention = outputs[-1]  # Retrieve attention from model outputs

        tokens = self.tokenizer.convert_ids_to_tokens(inputs[0]) 
        sentence_embeddings = self.mean_pooling(outputs, encoded_input['attention_mask'])
        return sentence_embeddings.detach().numpy()[0].tolist()

    def transform(self, X, y = None):
        embeddings = []
        for i in X:
            emb = self.bert(i)
            embeddings.append(emb)
        #print(embeddings)
        return embeddings
        

In [None]:
pipe2 = Pipeline(steps=[
                       ('Bert Embeddings', CustomEmbedding()),    # this will trigger a call to __init__
                       ('Support Vector Classifier', SVC(kernel='rbf',probability=True))
])


In [None]:
text = train_data['Sentence']
label = train_data['Status']
pipe2.fit(text, label)

In [None]:
te_se = test_data['Sentence']
lab = test_data['Status']
preds2 = pipe2.predict(te_se)

In [None]:
target =['Reject','Accept']
doc = 'however, models that make use of this strategy eventually fail after a certain level of complexity (e'
pipe2_exp = TextExplainer(random_state=42)
pipe2_exp.fit(doc, pipe2.predict_proba)
pipe2_exp.show_prediction(target_names= target)

In [None]:
def print_prediction_1(doc):
    print(doc)
    y_pred = pipe2.predict_proba([doc])[0]
    tar =['Reject','Accept']
    for target, prob in zip(tar, y_pred):
        print("{:.3f} {}".format(prob, target))

In [None]:
pipe2_exp.explain_weights(target_names=target)

In [None]:
print(pipe2.classes_) 
print(preds2)
print(pipe2_exp.metrics_ )

In [None]:
class Padding_2(BaseEstimator, TransformerMixin):
    
    def __init__(self,model):
        self.model = model # Configure model to return attention values
        self.tokenizer = tokenizer
        self.mxlenght = 400
        print('\n>>>>>>>init() called.\n')

    def fit(self, X, y = None):
        print('\n>>>>>>>fit() called.\n')
        return self
    

    def transform(self, X, y = None):
        embeddings = []
        for i in X:
            temp = [] 
            sentence_list = i.split(".")
            for i in sentence_list:
                if len(i)==0:
                    continue
                pred = model.predict(i)
                temp.append(pred)
            size = self.mxlenght - len(temp)
            if size > 0:
                temp.extend([-1]*size)
            elif size < 0:
                temp = temp[0:self.mxlenght+1]
            else:
                pass
            embeddings.append(temp)
        return embeddings

In [None]:
pipe3 = Pipeline(steps=[
                       ('Documnet Embeddings', Padding_2(pipe2)), # this will trigger a call to __init__
                       ('Logistic Regression', LogisticRegression(solver='lbfgs')),

])

pipe3.fit(text,status)

In [None]:
def print_prediction_2(doc):
    print(doc)
    y_pred = pipe3.predict_proba([doc])[0]
    tar =['Reject','Accept']
    for target, prob in zip(tar, y_pred):
        print("{:.3f} {}".format(prob, target))

In [None]:
target =['Reject','Accept']
doc = text[0]
pipe3_exp = TextExplainer(random_state=42)
pipe3_exp.fit(doc, pipe3.predict_proba)
pipe3_exp.show_prediction(target_names= target)

#### Integrated Gradients