In [None]:
import json
import sys
from os import listdir
from os.path import isfile, join
import re
import string
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

# import openai
import time
# from sentence_transformers import SentenceTransformer
# import tiktoken
# from openai.embeddings_utils import get_embedding
from tqdm import tqdm

from sklearn.svm import SVC

import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

import transformers
from transformers import AutoTokenizer, AutoModel, utils
transformers.logging.set_verbosity_error()
#from bertviz import model_view, head_view
utils.logging.set_verbosity_error()  # Suppress standard warnings
from bertviz import model_view, head_view


from captum.attr import LayerIntegratedGradients, visualization

In [2]:
# Read the pre-split sentence-level datasets, in train / test / validation order.
train_data, test_data, val_data = map(
    pd.read_csv,
    ('train_sentence_data.csv', 'test_sentence_data.csv', 'val_sentence_data.csv'),
)


In [3]:
SAMPLES_PER_CLASS = 10  # rows kept for each Status value (tiny subset for quick experiments)
SEED = 42               # fixes the shuffle so a rerun yields the same row order


def balanced_subsample(data, per_class=SAMPLES_PER_CLASS, seed=SEED):
    """Return a small, class-balanced, shuffled subset of `data`.

    Takes the first `per_class` rows with Status == 0 and the first
    `per_class` rows with Status == 1, concatenates them (0s first) and
    shuffles the result. The original row labels are kept.

    Args:
        data: DataFrame with a `Status` column holding 0/1 labels.
        per_class: Maximum number of rows taken from each class.
        seed: `random_state` passed to `DataFrame.sample` for a reproducible shuffle.

    Returns:
        A new DataFrame with at most `2 * per_class` rows.
    """
    per_label = [data.loc[data['Status'] == label][0:per_class] for label in (0, 1)]
    return pd.concat(per_label).sample(frac=1, random_state=seed)


train_data = balanced_subsample(train_data)
test_data = balanced_subsample(test_data)

In [4]:
# Preview the first five rows of the shuffled training subset.
train_data.head(n=5)

Unnamed: 0,File_id,Sentence,Status
55044,502.pdf.json,training neural networks to synthesize robust ...,0
3,304.pdf.json,"\nthus far, to evaluate the efficacy of neural...",1
9,304.pdf.json,"however, models that make use of this strateg...",1
55051,502.pdf.json,this makes it difficult to reason about what ...,0
55052,502.pdf.json,\none common strategy to improve generalizatio...,0


In [5]:
# Checkpoint shared by the encoder and its tokenizer.
model_name = "sentence-transformers/bert-base-nli-mean-tokens"

# output_attentions=True makes every forward pass also return the attention weights.
model = AutoModel.from_pretrained(
    model_name,
    output_attentions=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, counting only positions the mask keeps.

    Args:
        model_output: Indexable model output whose element 0 holds the
            per-token embeddings.
        attention_mask: Mask with one fewer dimension than the token
            embeddings; it is broadcast over the last (feature) axis.

    Returns:
        Tensor of masked means taken over dimension 1. The divisor is
        clamped to at least 1e-9, so a fully masked row yields zeros
        instead of a division by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = (token_embeddings * mask).sum(1)
    kept_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_total / kept_count


def bert(text):
    """Embed one sentence and collect its attention weights and tokens.

    Relies on the module-level `tokenizer`, `model` and `mean_pooling`.

    Args:
        text: The sentence to encode.

    Returns:
        A tuple `(embedding, attention, tokens)`:
          * embedding - mean-pooled sentence embedding as a list of floats;
          * attention - last element of the model output (the attention
            weights when the model was loaded with `output_attentions=True`);
          * tokens    - token strings for the encoded input ids.
        If encoding fails for any reason, a warning is emitted and the
        sentinel `([-1] * 768, -1, -1)` is returned so batch processing
        can continue.
    """
    try:
        # Tokenize once so the ids fed to the model, the pooling mask and the
        # token strings all describe the same (truncated) sequence. Previously
        # the model received untruncated ids from a second encode() call, so
        # over-length sentences failed and silently became the sentinel.
        encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

        # Inference only: skip autograd so the attention tensors kept in the
        # DataFrame do not hold computation graphs alive.
        with torch.no_grad():
            outputs = model(**encoded_input)
        attention = outputs[-1]  # Retrieve attention from model outputs

        tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])
        sentence_embeddings = mean_pooling(outputs, encoded_input['attention_mask'])
        return sentence_embeddings.detach().numpy()[0].tolist(), attention, tokens
    except Exception as exc:  # not a bare except: let KeyboardInterrupt/SystemExit through
        import warnings

        warnings.warn(
            f"bert() could not encode {str(text)[:80]!r} ({exc!r}); returning sentinel values"
        )
        # Same sentinel as before: 768 values of -1 in place of the embedding.
        return [-1] * 768, -1, -1

In [6]:
# Run every sentence through bert() and store the three results as new columns,
# first for the training subset, then for the test subset.
for frame in (train_data, test_data):
    embeddings, attention, tokens = zip(*(bert(sentence) for sentence in frame['Sentence']))
    frame['embeddings'] = embeddings
    frame['attention'] = attention
    frame['tokens'] = tokens

In [7]:
# Preview the training subset with the new embeddings / attention / tokens columns.
train_data.head(n=5)

Unnamed: 0,File_id,Sentence,Status,embeddings,attention,tokens
55044,502.pdf.json,training neural networks to synthesize robust ...,0,"[0.3189399540424347, -0.2120111733675003, 0.58...","([[tensor([[0.0434, 0.0313, 0.0194, 0.0346, 0....","[[CLS], training, neural, networks, to, synth,..."
3,304.pdf.json,"\nthus far, to evaluate the efficacy of neural...",1,"[-0.22580105066299438, 0.8819662928581238, 0.6...","([[tensor([[0.0161, 0.0119, 0.0109, ..., 0.01...","[[CLS], thus, far, ,, to, evaluate, the, effic..."
9,304.pdf.json,"however, models that make use of this strateg...",1,"[0.1685861498117447, -0.3472355902194977, 1.61...","([[tensor([[0.0432, 0.0586, 0.0389, 0.0146, 0....","[[CLS], however, ,, models, that, make, use, o..."
55051,502.pdf.json,this makes it difficult to reason about what ...,0,"[0.20485571026802063, 0.1942422091960907, 1.02...","([[tensor([[0.0506, 0.0582, 0.0199, 0.0552, 0....","[[CLS], this, makes, it, difficult, to, reason..."
55052,502.pdf.json,\none common strategy to improve generalizatio...,0,"[-0.4726422429084778, -0.29828211665153503, 1....","([[tensor([[0.0340, 0.0601, 0.0187, 0.0254, 0....","[[CLS], one, common, strategy, to, improve, ge..."


In [None]:
# Display the BertViz model view for the row labelled 0.
# The lookup is by index label, not position (the frame was shuffled above).
model_view(train_data.loc[0, 'attention'], train_data.loc[0, 'tokens'])

In [None]:
# Display the BertViz head view for the row labelled 0 (label-based lookup).
head_view(train_data.loc[0, 'attention'], train_data.loc[0, 'tokens'])

In [8]:
# Inspect the sentence embedding stored for the row labelled 0 (label-based lookup).
train_data.loc[0, 'embeddings']

[0.3189399540424347,
 -0.2120111733675003,
 0.5892661213874817,
 0.47461968660354614,
 -0.2805885672569275,
 -0.04401398450136185,
 0.4509320557117462,
 -0.5321618318557739,
 0.9889230132102966,
 -0.5384123921394348,
 0.21799714863300323,
 0.4650745391845703,
 0.545346736907959,
 0.274621844291687,
 -0.9002418518066406,
 0.07171375304460526,
 0.8405423760414124,
 -1.0200188159942627,
 0.09160549938678741,
 -0.48537304997444153,
 -0.21207354962825775,
 0.08328301459550858,
 0.046449191868305206,
 -0.043165452778339386,
 0.24641211330890656,
 -0.056924037635326385,
 -0.5269643664360046,
 -0.445924311876297,
 0.20761819183826447,
 0.31089669466018677,
 -0.304706335067749,
 0.0420294813811779,
 -0.4118236005306244,
 -0.3456064760684967,
 -0.3048211932182312,
 0.8224318623542786,
 1.4471275806427002,
 0.1354876607656479,
 0.3440203368663788,
 0.11017918586730957,
 0.2280081957578659,
 -0.3091297149658203,
 0.3675878942012787,
 0.6526841521263123,
 -0.4775177538394928,
 -0.4601874053478241,


In [9]:
# RBF-kernel SVM on the sentence embeddings; probability=True enables predict_proba.
clf = SVC(probability=True, kernel='rbf')
clf.fit(
    train_data['embeddings'].tolist(),  # one embedding list per sentence
    train_data['Status'],               # 0/1 labels
)

In [10]:
# Score the SVM on the held-out subset: fraction of predictions equal to the label.
y_test = test_data['Status'].tolist()
y_pred = clf.predict(test_data['embeddings'].tolist())
accuracy = np.mean(np.equal(y_pred, y_test))
print("Accuracy:", accuracy)

Accuracy: 0.5


## Fine-tuning with AutoModelForSequenceClassification

In [None]:
from torch.utils.data import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)


class MyCustomDataset(Dataset):
    """Sentence-classification dataset backed by one of the loaded DataFrames.

    `MyCustomDataset` was referenced but never defined, so this cell could not
    run. This version reads the `Sentence` (text) and `Status` (0/1 label)
    columns of the frames loaded earlier in the notebook instead of separate
    text files.

    Args:
        frame: DataFrame with `Sentence` and `Status` columns.
        tokenizer: Tokenizer applied to each sentence on access.
    """

    def __init__(self, frame, tokenizer):
        self.sentences = frame['Sentence'].astype(str).tolist()
        self.labels = frame['Status'].astype(int).tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        # Tokenize lazily and without padding; the data collator pads each batch.
        item = dict(self.tokenizer(self.sentences[idx], truncation=True))
        item['labels'] = self.labels[idx]
        return item


# Prepare the data
# NOTE: this rebinds the globals `tokenizer` and `model` that bert() reads, so
# do not re-run the embedding cells after this one without reloading them.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_dataset = MyCustomDataset(train_data, tokenizer)
val_dataset = MyCustomDataset(val_data, tokenizer)

# Define the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Train the model
# NOTE(review): with the 20-row training subset, 3 epochs at batch size 16 is
# only a handful of optimizer steps, far fewer than warmup_steps=500, so the
# learning rate never leaves warmup. Revisit when training on the full data.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)
trainer = Trainer(
    model=model,                     # the instantiated Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    eval_dataset=val_dataset,        # evaluation dataset
    data_collator=DataCollatorWithPadding(tokenizer),  # pad each batch to its longest sequence
)
trainer.train()

# Evaluate the model
test_dataset = MyCustomDataset(test_data, tokenizer)
eval_results = trainer.evaluate(test_dataset)
print(eval_results)

## Explanation

In [32]:
from eli5.lime import TextExplainer


def 

ImportError: cannot import name 'itemfreq' from 'scipy.stats' (/Users/admin/anaconda3/envs/Ra/lib/python3.8/site-packages/scipy/stats/__init__.py)