In [1]:
import json
import sys
from os import listdir
from os.path import isfile, join
import re
import string
import pandas as pd
import numpy as np

import time
from tqdm import tqdm
from typing import List


import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
# from keras.preprocessing.sequence import pad_sequences

import transformers
from transformers import AutoTokenizer, AutoModel, utils
from transformers import AutoTokenizer, AutoModelForSequenceClassification,Trainer, TrainingArguments
from datasets import Dataset
transformers.logging.set_verbosity_error()
utils.logging.set_verbosity_error()  # Suppress standard warnings

from bertviz import model_view, head_view


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


import eli5
from eli5.lime import TextExplainer
# from captum.attr import IntegratedGradients
# import matplotlib.pyplot as plt


from wordcloud import WordCloud
from PIL import Image
import matplotlib.pyplot as plt

## Data

In [2]:
# Sentence-level splits: one CSV per split, read in train / test / val order
sen_train_data, sen_test_data, sen_val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../sentence_split/train_sentence_data.csv',
        '../sentence_split/test_sentence_data.csv',
        '../sentence_split/val_sentence_data.csv',
    )
)

In [3]:
# Document-level (whole paper) data.
# The test frame previously re-read the *train* file, so every document-level
# score was measured on the training data. The test filename below mirrors the
# naming of the sentence-level splits.
# NOTE(review): confirm that test_processed_data.csv exists in ../complete_sentence.
doc_train_data = pd.read_csv("../complete_sentence/train_processed_data.csv")
doc_test_data = pd.read_csv("../complete_sentence/test_processed_data.csv")

In [4]:
# Remove the File_id column from every sentence-level split
sen_train_data, sen_test_data, sen_val_data = (
    frame.drop(columns=['File_id'])
    for frame in (sen_train_data, sen_test_data, sen_val_data)
)

In [5]:
# def get_20(data):
#     data_1 = data.loc[data['Status'] ==1].iloc[:20]
#     data_2 = data.loc[data['Status'] ==0].iloc[:20]
#     frames = [data_1, data_2]
#     return pd.concat(frames)

# sen_train_data = get_20(sen_train_data) 
# sen_test_data = get_20(sen_test_data)
# sen_val_data = get_20(sen_val_data)

In [6]:
def _column_as_list(frame, column):
    """Return one column of ``frame`` as a plain Python list."""
    return frame[column].values.tolist()


# Paper texts and their Status labels as parallel lists, per document split
doc_train_text = _column_as_list(doc_train_data, 'Paper_text')
doc_train_status = _column_as_list(doc_train_data, 'Status')

doc_test_text = _column_as_list(doc_test_data, 'Paper_text')
doc_test_status = _column_as_list(doc_test_data, 'Status')

In [7]:
# Shuffle the rows of every split. A fixed seed makes the shuffled order (and
# therefore the CSVs exported from these frames) identical on every
# Restart & Run All instead of changing from run to run.
SHUFFLE_SEED = 42
sen_train_data = sen_train_data.sample(frac=1, random_state=SHUFFLE_SEED)
sen_test_data = sen_test_data.sample(frac=1, random_state=SHUFFLE_SEED)
sen_val_data = sen_val_data.sample(frac=1, random_state=SHUFFLE_SEED)

In [8]:
# Write each shuffled split to its own CSV, without the index column
for frame, csv_name in (
    (sen_train_data, 'Train_auto.csv'),
    (sen_test_data, 'Test_auto.csv'),
    (sen_val_data, 'val_auto.csv'),
):
    frame.to_csv(csv_name, index=False)

In [9]:
# Class balance of the training sentences
sen_train_data['Status'].value_counts()

0    83160
1    55044
Name: Status, dtype: int64

In [10]:
def _load_split(csv_path):
    """Read one split from CSV and rename 'Sentence' -> 'text', 'Status' -> 'label'."""
    split = Dataset.from_csv(csv_path)
    split = split.rename_column('Sentence', 'text')
    return split.rename_column('Status', 'label')


# The 'text' / 'label' column names match the expected format for sequence classification
train_dataset = _load_split('Train_auto.csv')
val_dataset = _load_split('val_auto.csv')
test_dataset = _load_split('Test_auto.csv')


Downloading and preparing dataset csv/default to /home/015953353/.cache/huggingface/datasets/csv/default-893d49f64543a901/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/015953353/.cache/huggingface/datasets/csv/default-893d49f64543a901/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /home/015953353/.cache/huggingface/datasets/csv/default-198c1f644b0bafd4/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/015953353/.cache/huggingface/datasets/csv/default-198c1f644b0bafd4/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /home/015953353/.cache/huggingface/datasets/csv/default-ba837e8abe899d4a/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/015953353/.cache/huggingface/datasets/csv/default-ba837e8abe899d4a/0.0.0. Subsequent calls will reuse this data.


In [11]:
# Fetch the uncased BERT tokenizer, caching the files in a sibling directory
tokenizer = AutoTokenizer.from_pretrained(
    'bert-base-uncased',
    cache_dir='../bert-base-uncased',
)

# Define a function to tokenize the text and create input sequences
def tokenize(batch):
    """Tokenize the 'text' entries of ``batch``.

    Sequences are padded to a common length within the batch and truncated
    to at most 512 tokens.
    """
    texts = batch['text']
    encoded = tokenizer(texts, max_length=512, truncation=True, padding=True)
    return encoded

def _tokenize_in_one_batch(dataset):
    """Run ``tokenize`` over ``dataset`` as a single batch covering every row."""
    return dataset.map(tokenize, batched=True, batch_size=len(dataset))


# Tokenize the train, validation and test datasets
train_dataset = _tokenize_in_one_batch(train_dataset)
val_dataset = _tokenize_in_one_batch(val_dataset)
test_dataset = _tokenize_in_one_batch(test_dataset)


Map:   0%|          | 0/138204 [00:00<?, ? examples/s]

Map:   0%|          | 0/15444 [00:00<?, ? examples/s]

Map:   0%|          | 0/15048 [00:00<?, ? examples/s]

In [12]:
# Pre-trained uncased BERT with a two-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    cache_dir='../bert-base-uncased',
)

In [13]:
# Hyper-parameters for the fine-tuning run, gathered in one mapping
training_options = {
    'output_dir': './results/bert',        # output directory
    'logging_dir': './logs',               # directory for storing logs
    'logging_steps': 10,
    'num_train_epochs': 3,                 # total number of training epochs
    'per_device_train_batch_size': 16,     # batch size per device during training
    'per_device_eval_batch_size': 64,      # batch size for evaluation
    'warmup_steps': 500,                   # warmup steps for the learning rate scheduler
    'weight_decay': 0.01,                  # strength of weight decay
}
training_args = TrainingArguments(**training_options)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  return torch._C._cuda_getDeviceCount() > 0


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy from the logits and labels collected during evaluation.

    Without a metrics callback the trainer's evaluation reports only the loss,
    which says little about a classifier whose two classes are imbalanced.

    Args:
        eval_pred: object exposing ``predictions`` (per-class logits) and
            ``label_ids`` (gold labels), as passed in by the trainer.

    Returns:
        dict with a single ``'accuracy'`` entry in [0, 1].
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


trainer = Trainer(
    model=model,                      # the instantiated Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=val_dataset,         # evaluation dataset
    compute_metrics=compute_metrics,  # adds accuracy to every evaluation result
)
trainer.train()



In [None]:
# Score the trained model on the test dataset and print the resulting metrics
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print(eval_results)

In [None]:
def model_adapter(texts: List[str], batch_size: int = 64):
    """Return class probabilities from the module-level ``model`` for ``texts``.

    The texts are tokenized with the module-level ``tokenizer`` and scored in
    batches; the raw logits are turned into probabilities with a softmax.

    Args:
        texts: the strings to score.
        batch_size: number of texts sent through the model at once.

    Returns:
        numpy array with one row of softmax scores per input text
        (an empty array when ``texts`` is empty).
    """
    texts = list(texts)  # the tokenizer expects a list, not an arbitrary sequence
    # Inputs must live on the same device as the weights (GPU after training on one).
    device = next(model.parameters()).device
    # Disable dropout so repeated calls on the same text give the same scores.
    model.eval()

    all_scores = []

    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            batch = texts[start:start + batch_size]

            # Leave room for the special tokens around the sequence.
            encoded_input = tokenizer(
                batch,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings - 2,
            )
            encoded_input = {
                name: tensor.to(device) for name, tensor in encoded_input.items()
            }

            output = model(**encoded_input)
            # The model returns raw logits; convert them to probabilities and
            # bring them back to the CPU before handing them to numpy.
            scores = output[0].softmax(1).cpu().numpy()

            all_scores.extend(scores)

    return np.array(all_scores)

In [None]:
# Sentence texts and their Status labels from the test split
sen = sen_test_data['Sentence']
lab = sen_test_data['Status']

In [None]:
# specter_exp = TextExplainer(n_samples=5000, random_state=42)
# specter_exp.fit(sen[0], model_adapter)
# specter_exp.explain_prediction(target_names=list(model.config.id2label.values()))

In [None]:
# weights = specter_exp.explain_weights()

# sen_positivie_words= {}

# for i in weights.targets[0].feature_weights.pos:
#     #print(i.feature)
#     g = sen_positivie_words.get(i.feature,-1)
#     if g==-1:
#         sen_positivie_words[i.feature]=1
#     else:
#         sen_positivie_words[i.feature]+=1
        

# sen_negative_words= {}

# for i in weights.targets[0].feature_weights.neg:
#     #print(i.feature)
#     g = sen_negative_words.get(i.feature,-1)
#     if g==-1:
#         sen_negative_words[i.feature]=1
#     else:
#         sen_negative_words[i.feature]+=1

        
# wc = WordCloud(background_color="white",width=1000,height=1000,relative_scaling=0.5,normalize_plurals=False).generate_from_frequencies(sen_positivie_words)
# plt.imshow(wc)


In [None]:
# wc = WordCloud(background_color="white",width=1000,height=1000,relative_scaling=0.5,normalize_plurals=False).generate_from_frequencies(sen_negative_words)
# plt.imshow(wc)

## Very important

In [None]:
# print(weights.targets[0].feature_weights.pos[0].weight,weights.targets[0].feature_weights.pos[0].feature)

In [None]:
# d = specter_exp.explain_prediction()

### Logistic Regression

In [None]:
# Reload the document-level data for the logistic-regression experiment.
# Two fixes over the previous version of this cell:
#   * the directory is "../complete_sentence", the location the document-level
#     load near the top of the notebook reads from (this cell used "./");
#   * the test frame no longer re-reads the train file, which made the
#     pipeline's score a training-set score.
# NOTE(review): confirm that test_processed_data.csv exists in ../complete_sentence.
doc_train_data = pd.read_csv("../complete_sentence/train_processed_data.csv")
doc_test_data = pd.read_csv("../complete_sentence/test_processed_data.csv")

In [None]:
def get_20(data, n=20):
    """Return the first ``n`` rows with Status == 1 followed by the first ``n`` with Status == 0.

    The only other definition of this helper in the notebook is commented out,
    so calling it raised NameError on a fresh kernel; it is defined here, next
    to its use.

    Args:
        data: DataFrame with a 'Status' column.
        n: maximum number of rows to keep per Status value.

    Returns:
        DataFrame holding at most ``2 * n`` rows.
    """
    status_one = data.loc[data['Status'] == 1].iloc[:n]
    status_zero = data.loc[data['Status'] == 0].iloc[:n]
    return pd.concat([status_one, status_zero])


# Work on a small subset of documents per class
doc_train_data = get_20(doc_train_data)
doc_test_data = get_20(doc_test_data)

In [None]:
# Preview the first rows of the document-level training frame
doc_train_data.head()

In [None]:
# Paper texts and Status labels as parallel Python lists, per document split
doc_train_text, doc_train_status = (
    doc_train_data[column].values.tolist() for column in ('Paper_text', 'Status')
)

doc_test_text, doc_test_status = (
    doc_test_data[column].values.tolist() for column in ('Paper_text', 'Status')
)

In [None]:
class Padding(BaseEstimator, TransformerMixin):
    """Turn each document into a fixed-length list of per-sentence class predictions.

    A document is split on ".", each non-blank fragment is classified with the
    wrapped sequence-classification model, and the predicted class indices are
    padded with -1 (or cut off) to exactly ``mxlenght`` entries.

    Args:
        model: classification model whose first output holds the logits.
        tokenizer: tokenizer matching ``model``.
        mxlenght: number of entries in every output row (spelling kept for
            compatibility with existing code that reads the attribute).
    """

    def __init__(self, model, tokenizer, mxlenght=400):
        self.model = model
        self.tokenizer = tokenizer
        self.mxlenght = mxlenght
        print('\n>>>>>>>init() called.\n')

    def fit(self, X, y=None):
        """Nothing is learned here; returns ``self`` unchanged."""
        print('\n>>>>>>>fit() called.\n')
        return self

    def transform(self, X, y=None):
        """Return one list of ``mxlenght`` sentence-level predictions per document in ``X``."""
        # Read the limit from the wrapped model rather than a notebook global,
        # leaving room for the special tokens around the sequence.
        max_tokens = self.model.config.max_position_embeddings - 2
        # Inputs must live on the same device as the weights.
        device = next(self.model.parameters()).device
        # Disable dropout so the same sentence always yields the same prediction.
        self.model.eval()

        embeddings = []
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            for document in tqdm(X):
                predictions = []
                for sentence in document.split("."):
                    # Anything past the cap would be truncated away below, so
                    # stop before paying for another forward pass.
                    if len(predictions) >= self.mxlenght:
                        break
                    # Empty or whitespace-only fragments carry no text to classify.
                    if not sentence.strip():
                        continue
                    encoded_input = self.tokenizer(
                        sentence,
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        max_length=max_tokens,
                    )
                    encoded_input = {
                        name: tensor.to(device) for name, tensor in encoded_input.items()
                    }
                    output = self.model(**encoded_input)
                    # argmax over logits equals argmax over their softmax.
                    predictions.append(int(output[0].argmax(dim=1).item()))
                # Pad short documents with -1 (a non-positive count adds nothing).
                predictions.extend([-1] * (self.mxlenght - len(predictions)))
                embeddings.append(predictions)
        return embeddings

In [None]:
# Per-sentence predictions from Padding feed a logistic-regression document classifier
sentence_features = Padding(model, tokenizer)  # constructing it triggers __init__
document_classifier = LogisticRegression(solver='lbfgs')

pipe1 = Pipeline(steps=[
    ('Documnet Embeddings', sentence_features),
    ('Logistic Regression', document_classifier),
])

pipe1.fit(doc_train_text, doc_train_status)
pipe1.score(doc_test_text, doc_test_status)

In [None]:
# Explain the pipeline's prediction for the first test document
doc = doc_test_text[0]
target = ['Reject', 'Accept']

pipe1_exp = TextExplainer(random_state=42, n_samples=10)
pipe1_exp.fit(doc, pipe1.predict_proba)
pipe1_exp.show_prediction(target_names=target)

In [None]:
def _tally_features(feature_weights):
    """Count how many times each ``.feature`` value occurs in ``feature_weights``."""
    counts = {}
    for entry in feature_weights:
        counts[entry.feature] = counts.get(entry.feature, 0) + 1
    return counts


weights = pipe1_exp.explain_weights(top= None)
first_target_weights = weights.targets[0].feature_weights

# Occurrence counts for the positively and negatively weighted features of the
# first target.
# NOTE(review): these are counts, not the explanation weights themselves —
# confirm that is what the word clouds are meant to show.
positivie_words = _tally_features(first_target_weights.pos)
negative_words = _tally_features(first_target_weights.neg)

In [None]:
# Word cloud sized by the tallies of positively weighted features
positive_cloud = WordCloud(
    background_color="white",
    width=1000,
    height=1000,
    relative_scaling=0.5,
    normalize_plurals=False,
)
wc = positive_cloud.generate_from_frequencies(positivie_words)
plt.imshow(wc)

In [None]:
# Word cloud sized by the tallies of negatively weighted features
negative_cloud = WordCloud(
    background_color="white",
    width=1000,
    height=1000,
    relative_scaling=0.5,
    normalize_plurals=False,
)
wc = negative_cloud.generate_from_frequencies(negative_words)
plt.imshow(wc)

In [None]:
def print_prediction(doc):
    """Print one "<probability> <label>" line per class for a single document.

    The probabilities come from ``pipe1.predict_proba`` and are paired, in
    order, with the labels 'Reject' and 'Accept'.
    """
    labels = ['Reject', 'Accept']
    class_probs = pipe1.predict_proba([doc])[0]
    line_template = "{:.3f} {}"
    for label, probability in zip(labels, class_probs):
        print(line_template.format(probability, label))

In [None]:
# Show the class probabilities the pipeline assigns to the first test document
print_prediction(doc_test_text[0])