# Sentence BERT

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment warnings and
# library deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the sentence dataset. The first CSV column is consumed as the index on
# read and then discarded in favour of a fresh 0..n-1 positional index.
d = pd.read_csv('./Data/Insight_sentence_total.csv', index_col=0).reset_index(drop=True)

In [None]:
# Display the freshly loaded DataFrame (notebook cell output).
d

In [None]:
# Delete empty rows
# Drop every row whose label is the literal 'Removed' and every row that has
# no usable sentence text, then renumber the index from 0.
#
# The missing-sentence test is taken on the raw column *before* the cast to
# str: a NaN cell only turns into the string 'nan' after conversion, so
# comparing the unconverted value against 'nan' would never match.
removed_mask = d['Label'] == 'Removed'
missing_mask = d['Sentence'].isna()

# Whole-column assignment (no chained indexing) so the cast reliably lands in
# `d`; downstream tokenisation expects every sentence to be a str.
d['Sentence'] = d['Sentence'].astype(str)
blank_mask = d['Sentence'].isin(['', 'nan'])

# Index labels of all rows to discard.
emp = d.index[removed_mask | missing_mask | blank_mask].tolist()

d.drop(index=emp, inplace=True)
d = d.reset_index(drop=True)

In [None]:
# Show how many rows carry each label after cleanup (NaN labels are not counted).
d['Label'].value_counts()

In [None]:
# Training batch
# `symbol` is the total number of rows that carry a label (value_counts skips
# NaN), and `batch_1` is the first `symbol` rows of the frame.
# NOTE(review): this assumes all labeled rows come before the unlabeled ones
# in the file — confirm against the data.
symbol = d['Label'].value_counts().sum()
batch_1 = d.iloc[:symbol]
batch_1['Label'].value_counts()

In [None]:
# Labels of the training batch; displayed as the cell output.
# This name is rebound later by the train/test split.
train_labels=batch_1['Label']
train_labels

In [None]:
# Sample list of candidate pretrained checkpoints (to be used if suitable).
# Neither `model_list` nor `model_dict` is read by the cells visible below.
model_list = [
    'distilbert-base-cased',
    'distilbert-base-uncased',
    'allenai/scibert_scivocab_cased',
    'allenai/scibert_scivocab_uncased',
    'gpt2',
    'andreas122001/roberta-academic-detector',
]
model_dict = dict()

In [None]:
# SciBERT (uncased scivocab) checkpoint with the BERT model/tokenizer classes
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'allenai/scibert_scivocab_uncased'

# Load the pretrained tokenizer and encoder for that checkpoint
# (fetched from the Hugging Face hub unless already cached locally).
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Sentence tokenizing
# Encode every sentence into a list of token ids, with the special tokens
# added around it. Sequences are truncated to the encoder's position limit:
# without truncation a single over-long sentence makes the later forward pass
# fail, because the model has no position embedding beyond that length.
max_tokens = model.config.max_position_embeddings
tokenized = d['Sentence'].apply(
    lambda x: tokenizer.encode(
        x,
        add_special_tokens=True,
        truncation=True,
        max_length=max_tokens,
    )
)
tokenized

In [None]:
# Right-pad every token-id list with 0 up to the length of the longest
# sequence, so the result is a rectangular integer array.
# NOTE(review): 0 is presumably the tokenizer's [PAD] id — confirm.
max_len = max((len(ids) for ids in tokenized.values), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
padded

In [None]:
# Sanity check: shape of the padded id matrix (rows = sentences, cols = max_len).
np.array(padded).shape

In [None]:
# Mask that is 0 wherever the id is the padding value 0 and 1 elsewhere, so
# the encoder ignores the padded positions.
attention_mask = np.where(padded == 0, 0, 1)
attention_mask.shape

In [None]:
# Optional: move the model and tensors to an accelerator, e.g.
# device=torch.device('mps') followed by model.to(device)
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Run the encoder in mini-batches instead of one forward pass over the whole
# dataset: a single pass materialises attention and feed-forward activations
# for every sentence at once and can exhaust memory on larger inputs.
batch_size = 32
model.eval()  # inference mode (dropout disabled)
hidden_chunks = []
with torch.no_grad():  # no gradients needed for feature extraction
    for start in range(0, input_ids.shape[0], batch_size):
        stop = start + batch_size
        batch_output = model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        # Element 0 of the model output is the final-layer hidden states.
        hidden_chunks.append(batch_output[0])

# Keep the access pattern used downstream: `last_hidden_states[0]` is the
# final-layer hidden-state tensor for all sentences, in input order.
last_hidden_states = (torch.cat(hidden_chunks, dim=0),)

In [None]:
# Per-sentence feature vector: the hidden state at sequence position 0 of the
# model's first output (the leading special token, since the sentences were
# encoded with add_special_tokens=True), converted to a NumPy array.
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
# Labels of the first `symbol` rows, i.e. the rows treated as labeled data.
labels=d['Label'][0:symbol]

# Evaluation

In [None]:
# Hold out part of the labeled data for evaluation (scikit-learn's default
# test fraction is used). The split is seeded so that the grid search and the
# reported scores are reproducible from run to run.
train_features, test_features, train_labels, test_labels = train_test_split(
    features[0:symbol],
    labels,
    random_state=42,
)

In [None]:
# Class distribution of the training split.
train_labels.value_counts()

In [None]:
# Class distribution of the held-out test split.
test_labels.value_counts()

In [None]:
# Number of training samples.
len(train_features)

In [None]:
# Grid search for SVM hyper-parameters.
# Each dict is a separate sub-grid, so only the parameters that are meaningful
# for a kernel are searched with it; as a consequence `best_params_` contains
# 'gamma' and 'degree' only when the winning kernel's sub-grid lists them.
# parameters = {'C': np.linspace(0.0001, 100, 20)}
tuned_parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]},
    {'C': [1, 10, 100, 1000], 'kernel': ['poly'], 'degree': [2, 3, 4], 'gamma': [0.001, 0.0001]},
    {'C': [1, 10, 100, 1000], 'kernel': ['sigmoid'], 'gamma': [0.001, 0.0001]},
]
# 5-fold cross-validation on the training split only.
grid_search = GridSearchCV(SVC(), tuned_parameters, cv=5)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best score: ', grid_search.best_score_)

In [None]:
# Build the final SVM from the best grid-search configuration.
# The parameters are unpacked as-is rather than looked up one by one:
# `best_params_` only holds the keys of the winning sub-grid, so indexing
# 'degree' or 'gamma' raises KeyError when e.g. the linear or rbf kernel wins.
# Parameters that are absent fall back to SVC's defaults.
lr_clf = SVC(**grid_search.best_params_)
lr_clf.fit(train_features, train_labels)

In [None]:
# Predicted labels for the held-out test split.
y_pred=lr_clf.predict(test_features)

In [None]:
# Mean accuracy of the classifier on the held-out test split.
lr_clf.score(test_features, test_labels)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split.
# Argument order is (y_true, y_pred); passing them the other way round swaps
# precision with recall and reports support counts of the predictions.
print(classification_report(test_labels, y_pred))

# Predict remaining data

In [None]:
# Split the embedding matrix at `symbol`: rows before it belong to the labeled
# sentences, rows from it up to len(d) are the ones still to be labeled.
total_train_features, total_pred_features = (
    features[:symbol],
    features[symbol:len(d)],
)

In [None]:
# Refit the classifier on all labeled data (train + test splits together),
# then predict labels for the remaining sentences. `fit` returns the estimator
# itself, so the two steps can be chained.
y_pred_r = lr_clf.fit(total_train_features, labels).predict(total_pred_features)

In [None]:
# Write the predicted labels into the 'Label' column of the rows from position
# `symbol` onward, then save the completed table.
# A single .loc assignment is used instead of `d['Label'][i] = ...`: chained
# indexing may write to a temporary copy (and is a no-op under pandas
# copy-on-write), leaving `d` and the exported CSV without the predictions.
d.loc[d.index[symbol:], 'Label'] = y_pred_r

d.to_csv('./Data/Insight_label_pred.csv')

In [None]:
# Display the final DataFrame, including the predicted labels.
d