In [1]:
import numpy as np, json, ast
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from time import time
from sentence_transformers import SentenceTransformer

In [2]:
with open('id_ques_map.json') as fh:
    id_ques_map = json.load(fh)
id_ques_map = {int(k):str(v) for k,v in  id_ques_map.items()}
id_ques_map[0] = ''
    
train_pairs = np.array([list(ast.literal_eval(l.strip())) for l in open('train_pairs.txt').readlines()])
valid_pairs = np.array([list(ast.literal_eval(l.strip())) for l in open('valid_pairs.txt').readlines()])
y_train = np.loadtxt('y_train.txt')
y_valid = np.loadtxt('y_valid.txt')

In [3]:
max(id_ques_map.keys())

537933

In [4]:
sents = []
for i in range(max(id_ques_map)+1):
    try:
        sents.append(str(id_ques_map[i]))
    except KeyError:
        print(f'key {i} is missing')

In [5]:
set(range(537934))-set(id_ques_map.keys())

set()

In [6]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
t0 = time()
X = model.encode(sents)
encode_time = round(time() - t0,2)
print(f'encoding time: {encode_time} sec')

encoding time: 909.16 sec


In [7]:
def get_2d_train_valid():
    X_1 = X[train_pairs[:,0]]
    X_2 = X[train_pairs[:,1]]
    X_train = np.hstack((X_1,X_2))

    X_1 = X[valid_pairs[:,0]]
    X_2 = X[valid_pairs[:,1]]
    X_valid = np.hstack((X_1,X_2))

    print(X_train.shape, X_valid.shape)
    return X_train, X_valid


def get_3d_train_valid():
    X_1 = X[train_pairs[:,0]]
    X_2 = X[train_pairs[:,1]]
    X_diff = abs(X_1-X_2)
    X_train = np.hstack((X_1,X_2, X_diff))

    X_1 = X[valid_pairs[:,0]]
    X_2 = X[valid_pairs[:,1]]
    X_diff = abs(X_1-X_2)
    X_valid = np.hstack((X_1,X_2, X_diff))

    print(X_train.shape, X_valid.shape)
    return X_train, X_valid

In [10]:
def classify(X_train, y_train, X_valid, y_valid):
    for clf in (LogisticRegression(),):
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = round(time()-t0,2)

        t0 = time()
        y_pred = clf.predict(X_valid)
        inf_time = round(time()-t0,2)

        acc = round(accuracy_score(y_valid, y_pred)*100,2)
        f1 = round(f1_score(y_valid, y_pred)*100,2)
        auc = round(roc_auc_score(y_valid, y_pred)*100,2)
        p = round(precision_score(y_valid, y_pred)*100,2)
        r = round(recall_score(y_valid, y_pred)*100,2)

        print(f'For classifier: {clf}, acc: {acc}, auc: {auc}, p: {p}, r:{r}, f1: {f1}')
        print(f'train time: {train_time} & inf time: {inf_time}')
        print('*'*80)

In [11]:
X_train, X_valid = get_2d_train_valid()
classify(X_train, y_train, X_valid, y_valid)

X_train, X_valid = get_3d_train_valid()
classify(X_train, y_train, X_valid, y_valid)

(283003, 1536) (121287, 1536)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


For classifier: LogisticRegression(), acc: 72.18, auc: 67.83, p: 66.16, r:51.02, f1: 57.61
train time: 29.77 & inf time: 1.08
********************************************************************************
(283003, 2304) (121287, 2304)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


For classifier: LogisticRegression(), acc: 80.12, auc: 78.28, p: 74.14, r:71.17, f1: 72.62
train time: 43.49 & inf time: 1.04
********************************************************************************


In [12]:
model = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
t0 = time()
X = model.encode(sents)
encode_time = round(time() - t0,2)
print(f'encoding time: {encode_time} sec')

RuntimeError: CUDA out of memory. Tried to allocate 50.00 MiB (GPU 0; 14.76 GiB total capacity; 1.98 GiB already allocated; 48.75 MiB free; 2.10 GiB reserved in total by PyTorch)

In [None]:
X_train, X_valid = get_2d_train_valid()
classify(X_train, y_train, X_valid, y_valid)

X_train, X_valid = get_3d_train_valid()
classify(X_train, y_train, X_valid, y_valid)