In [1]:
import ast
import pandas as pd
import numpy as np

train_df = pd.read_csv("../../data/train-val-splits/qrels_args_docs_train_emb.tsv", sep="\t").dropna()
val_df = pd.read_csv("../../data/train-val-splits/qrels_args_docs_val_emb.tsv", sep="\t").dropna()
train_df['instr_mean'] = train_df['instr_mean'].apply(ast.literal_eval)
val_df['instr_mean'] = val_df['instr_mean'].apply(ast.literal_eval)
#print(len(train_df), len(val_df))

3011 692


In [2]:
import tensorflow as tf

X_train, Y_train = list(), list()

for x in train_df.instr_mean.tolist():
    x_train = np.array([np.array(val) for val in x[0]])
    X_train.append(x_train)
    
for y in train_df.qual.tolist():
    y_train = np.array([np.array(val) for val in [y]])
    Y_train.append(y_train)

X_train = tf.cast(X_train , dtype=tf.float32)
Y_train = tf.cast(Y_train , dtype=tf.float32)

In [3]:
X_val, Y_val = list(), list()

for x in val_df.instr_mean.tolist():
    x_val = np.array([np.array(val) for val in x[0]])
    X_val.append(x_val)
    
for y in val_df.qual.tolist():
    y_val = np.array([np.array(val) for val in [y]])
    Y_val.append(y_val)

X_val = tf.cast(X_val , dtype=tf.float32)
Y_val = tf.cast(Y_val , dtype=tf.float32)

In [4]:
import pandas as pd
import re
import numpy as np

from sklearn.base import clone
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import StratifiedKFold

from transformers import pipeline, AutoTokenizer
from tqdm import tqdm

from matplotlib import pyplot as plt

import ast
import warnings

warnings.filterwarnings('ignore')

In [5]:
from numpy.random import seed
seed(42)

tf.random.set_seed(42)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import TruePositives, Precision
import ast

# This neural network is used for the embeddings by the "large" transformer models

model2 = Sequential()
model2.add(Dense(512, input_shape=(768,), activation='relu', kernel_initializer='he_uniform'))
#model.add(Dropout(0.2))
model2.add(Dense(256, activation='relu'))
#model.add(Dropout(0.2))
model2.add(Dense(64, activation='relu'))
#model.add(Dropout(0.2))
model2.add(Dense(16, activation='relu'))
model2.add(Dense(3, activation='softmax'))
# compile the keras model
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=["sparse_categorical_accuracy"])

In [6]:
%%time
from keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)

model2.fit(X_train, Y_train, epochs=1000, batch_size=5, verbose=0, validation_split=0.2, callbacks=[es])
probs = model2.predict(X_val)
predictions = np.argmax(probs, axis=1)
#print(classification_report(y_true=Y_val, y_pred=predictions))

Epoch 55: early stopping
              precision    recall  f1-score   support

         0.0       0.63      0.37      0.46       126
         1.0       0.46      0.49      0.47       275
         2.0       0.53      0.60      0.56       291

    accuracy                           0.51       692
   macro avg       0.54      0.48      0.50       692
weighted avg       0.52      0.51      0.51       692

CPU times: user 8min 45s, sys: 19.4 s, total: 9min 4s
Wall time: 2min 14s


In [7]:
# preds on the val set
val_df['dnn_pred'] = predictions.tolist()
val_df['dnn_prob'] = probs.tolist()

In [11]:
# on the val set; hyperparameters as on the test set

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=20, penalty="l2", solver="liblinear")

clf.fit(X_train.numpy(), train_df.qual.tolist())

probs = clf.predict_proba(X_val.numpy())
predictions = clf.predict(X_val.numpy())

#print(classification_report(y_true=val_df.qual.tolist(), y_pred=predictions))

              precision    recall  f1-score   support

           0       0.81      0.30      0.44       126
           1       0.60      0.30      0.40       275
           2       0.53      0.91      0.67       291

    accuracy                           0.56       692
   macro avg       0.64      0.51      0.50       692
weighted avg       0.61      0.56      0.52       692



In [12]:
val_df['lr_pred'] = predictions.tolist()
val_df['lr_prob'] = probs.tolist()

In [14]:
# on the val set; hyperparameters as on the test set

from sklearn.svm import SVC

clf = SVC(C=10, gamma=1, kernel='rbf', probability=True)
clf.fit(X_train.numpy(), train_df.qual.tolist())

probs = clf.predict_proba(X_val.numpy())
predictions = clf.predict(X_val.numpy())

#print(classification_report(y_true=val_df.qual.tolist(), y_pred=predictions))

              precision    recall  f1-score   support

           0       0.72      0.48      0.58       126
           1       0.51      0.29      0.37       275
           2       0.53      0.81      0.64       291

    accuracy                           0.55       692
   macro avg       0.59      0.53      0.53       692
weighted avg       0.56      0.55      0.52       692



In [15]:
val_df['svm_pred'] = predictions.tolist()
val_df['svm_prob'] = probs.tolist()

In [16]:
# on the val set; hyperparameters as on the test set
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(criterion='entropy', max_depth=8, max_features='auto', n_estimators=500)

clf.fit(X_train.numpy(), train_df.qual.tolist())

probs = clf.predict_proba(X_val.numpy())
predictions = clf.predict(X_val.numpy())

#print(classification_report(y_true=val_df.qual.tolist(), y_pred=predictions))

              precision    recall  f1-score   support

           0       0.85      0.26      0.40       126
           1       0.68      0.06      0.11       275
           2       0.46      0.99      0.62       291

    accuracy                           0.49       692
   macro avg       0.66      0.44      0.38       692
weighted avg       0.62      0.49      0.38       692



In [17]:
val_df['rf_pred'] = predictions.tolist()
val_df['rf_prob'] = probs.tolist()

In [18]:
# on the val set; hyperparameters as on the test set

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

clf = MultinomialNB(alpha=0.01)

clf.fit(scaler.fit_transform(X_train.numpy()), train_df.qual.tolist())

probs = clf.predict_proba(X_val.numpy())
predictions = clf.predict(X_val.numpy())

#print(classification_report(y_true=val_df.qual.tolist(), y_pred=predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       126
           1       0.00      0.00      0.00       275
           2       0.42      1.00      0.59       291

    accuracy                           0.42       692
   macro avg       0.14      0.33      0.20       692
weighted avg       0.18      0.42      0.25       692



In [19]:
val_df['nb_pred'] = predictions.tolist()
val_df['nb_prob'] = probs.tolist()

In [20]:
from lightgbm import LGBMClassifier

lgb = LGBMClassifier(
    max_depth=7,
    lambda_l1=0.1,
    lambda_l2=0.01,
    learning_rate=0.01,
    n_estimators=500,
    reg_aplha=1.1,
    colsample_bytree=0.9,
    subsample=0.9,
    n_jobs=5
)

In [22]:
lgb.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], eval_metric='auc_mu', verbose=False, early_stopping_rounds=50)

probs = lgb.predict_proba(X_val.numpy())
predictions = lgb.predict(X_val.numpy())



In [24]:
val_df['lgb_pred'] = predictions.tolist()
val_df['lgb_prob'] = probs.tolist()

In [25]:
val_df.to_csv('../../data/train-val-splits/qrels_args_docs_val_emb_predictions.tsv', sep='\t', index=False)