In [14]:
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]=""

from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm
import ast, json, numpy as np

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from time import time
from sklearn.preprocessing import normalize

In [16]:
X_train_fname = './train_pairs.txt'
X_valid_fname = './valid_pairs.txt'
id_ques_map_fname = './id_ques_map.json'
y_train_fname = './y_train.txt'
y_valid_fname = './y_valid.txt'

def load_id_ques_map(fname):
    with open(fname) as fh:
        id_ques_map = json.load(fh)
    id_ques_map = {int(k): str(v) for k, v in id_ques_map.items()}
    id_ques_map[0] = ''
    print(f'Loaded {max(id_ques_map.keys())} ids to questions from {fname}')
    return id_ques_map


def load_Xy(max_samples=None):
    train_pairs = np.array([list(ast.literal_eval(l.strip())) for l in
                            open(X_train_fname).readlines()])[:max_samples]
    valid_pairs = np.array([list(ast.literal_eval(l.strip())) for l in
                            open(X_valid_fname).readlines()])[:max_samples]

    id_ques_map = load_id_ques_map(id_ques_map_fname)
    X_train = [(id_ques_map[i], id_ques_map[j]) for i,j in train_pairs]
    X_valid = [(id_ques_map[i], id_ques_map[j]) for i,j in valid_pairs]

    y_train = np.loadtxt(y_train_fname).astype(int)[:max_samples]
    y_valid = np.loadtxt(y_valid_fname).astype(int)[:max_samples]

    return X_train, X_valid, y_train, y_valid


In [17]:
max_samples = None

In [18]:
X_train, X_valid, y_train, y_valid = load_Xy(max_samples)

Loaded 537933 ids to questions from ./id_ques_map.json


In [19]:
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [20]:
t0 = time()
sents_0 = np.asarray([p[0] for p in X_train])
sents_0 = use_model(sents_0)
sents_1 = np.asarray([p[1] for p in X_train])
sents_1 = use_model(sents_1)
X_train = np.hstack((sents_0, sents_1))
print(X_train.shape)
enc_time = time() - t0
print(f'USE X_train encoding time: {enc_time:.2f}')

(283003, 1024)
USE X_train encoding time: 100.65


In [21]:
t0 = time()
sents_0 = np.asarray([p[0] for p in X_valid])
sents_0 = use_model(sents_0)
sents_1 = np.asarray([p[1] for p in X_valid])
sents_1 = use_model(sents_1)
X_valid = np.hstack((sents_0, sents_1))
print(X_valid.shape)
enc_time = time() - t0
print(f'USE X_valid encoding time: {enc_time:.2f}')

(121287, 1024)
USE X_valid encoding time: 38.76


In [22]:
for c in (LogisticRegression,): #(LogisticRegression, LinearSVC):
    clf = c()
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = round(time()-t0,2)
    
    t0 = time()
    y_pred = clf.predict(X_valid)
    inf_time = round(time()-t0,2)

    acc = round(accuracy_score(y_valid, y_pred)*100,2)
    f1 = round(f1_score(y_valid, y_pred)*100,2)
    auc = round(roc_auc_score(y_valid, y_pred)*100,2)
    p = round(precision_score(y_valid, y_pred)*100,2)
    r = round(recall_score(y_valid, y_pred)*100,2)
    
    print(f'For classifier: {c}, acc: {acc}, auc: {auc}, p: {p}, r:{r}, f1: {f1}')
    print(f'train time: {train_time} & inf time: {inf_time}')
    print('*'*80)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


For classifier: <class 'sklearn.linear_model._logistic.LogisticRegression'>, acc: 72.66, auc: 68.35, p: 66.96, r:51.73, f1: 58.36
train time: 19.25 & inf time: 0.49
********************************************************************************


In [23]:
X_train, X_valid, y_train, y_valid = load_Xy(max_samples)

Loaded 537933 ids to questions from ./id_ques_map.json


In [24]:
t0 = time()
sents_0 = np.asarray([p[0] for p in X_train])
sents_0 = use_model(sents_0)
sents_1 = np.asarray([p[1] for p in X_train])
sents_1 = use_model(sents_1)
diff = abs(sents_0 - sents_1)
X_train = np.hstack((sents_0, sents_1, diff))
print(X_train.shape)
enc_time = time() - t0
print(f'USE X_train encoding time: {enc_time:.2f}')

(283003, 1536)
USE X_train encoding time: 100.86


In [25]:
t0 = time()
sents_0 = np.asarray([p[0] for p in X_valid])
sents_0 = use_model(sents_0)
sents_1 = np.asarray([p[1] for p in X_valid])
sents_1 = use_model(sents_1)
diff = abs(sents_0 - sents_1)
X_valid = np.hstack((sents_0, sents_1, diff))
print(X_valid.shape)
enc_time = time() - t0
print(f'USE X_valid encoding time: {enc_time:.2f}')

(121287, 1536)
USE X_valid encoding time: 38.89


In [26]:
for c in (LogisticRegression,): #(LogisticRegression, LinearSVC):
    clf = c()
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = round(time()-t0,2)
    
    t0 = time()
    y_pred = clf.predict(X_valid)
    inf_time = round(time()-t0,2)

    acc = round(accuracy_score(y_valid, y_pred)*100,2)
    f1 = round(f1_score(y_valid, y_pred)*100,2)
    auc = round(roc_auc_score(y_valid, y_pred)*100,2)
    p = round(precision_score(y_valid, y_pred)*100,2)
    r = round(recall_score(y_valid, y_pred)*100,2)
    
    print(f'For classifier: {c}, acc: {acc}, auc: {auc}, p: {p}, r:{r}, f1: {f1}')
    print(f'train time: {train_time} & inf time: {inf_time}')
    print('*'*80)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


For classifier: <class 'sklearn.linear_model._logistic.LogisticRegression'>, acc: 80.11, auc: 78.52, p: 73.54, r:72.37, f1: 72.95
train time: 23.71 & inf time: 0.73
********************************************************************************
