In [1]:
import os
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm
import ast, json, numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from time import time
from sklearn.preprocessing import normalize

In [3]:
X_train_fname = './train_pairs.txt'
X_valid_fname = './valid_pairs.txt'
id_ques_map_fname = './id_ques_map.json'
y_train_fname = './y_train.txt'
y_valid_fname = './y_valid.txt'

def load_id_ques_map(fname):
    with open(fname) as fh:
        id_ques_map = json.load(fh)
    id_ques_map = {int(k): str(v) for k, v in id_ques_map.items()}
    id_ques_map[0] = ''
    print(f'Loaded {max(id_ques_map.keys())} ids to questions from {fname}')
    return id_ques_map


def load_Xy(max_samples=None):
    train_pairs = np.array([list(ast.literal_eval(l.strip())) for l in
                            open(X_train_fname).readlines()])[:max_samples]
    valid_pairs = np.array([list(ast.literal_eval(l.strip())) for l in
                            open(X_valid_fname).readlines()])[:max_samples]

    id_ques_map = load_id_ques_map(id_ques_map_fname)
    X_train = [(id_ques_map[i], id_ques_map[j]) for i,j in train_pairs]
    X_valid = [(id_ques_map[i], id_ques_map[j]) for i,j in valid_pairs]

    y_train = np.loadtxt(y_train_fname).astype(int)[:max_samples]
    y_valid = np.loadtxt(y_valid_fname).astype(int)[:max_samples]

    return X_train, X_valid, y_train, y_valid

In [4]:
max_samples = None

In [5]:
X_train, X_valid, y_train, y_valid = load_Xy(max_samples)

Loaded 537933 ids to questions from ./id_ques_map.json


In [6]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

def get_model(model_url, max_seq_length):
  labse_layer = hub.KerasLayer(model_url, trainable=True)

  # Define input.
  input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                         name="input_word_ids")
  input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                     name="input_mask")
  segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                      name="segment_ids")

  # LaBSE layer.
  pooled_output,  _ = labse_layer([input_word_ids, input_mask, segment_ids])

  # The embedding is l2 normalized.
  pooled_output = tf.keras.layers.Lambda(
      lambda x: tf.nn.l2_normalize(x, axis=1))(pooled_output)

  # Define model.
  return tf.keras.Model(
        inputs=[input_word_ids, input_mask, segment_ids],
        outputs=pooled_output), labse_layer

max_seq_length = 64
labse_model, labse_layer = get_model(
    model_url="https://tfhub.dev/google/LaBSE/1", max_seq_length=max_seq_length)

In [7]:
import bert

vocab_file = labse_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = labse_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

def create_input(input_strings, tokenizer, max_seq_length):

  input_ids_all, input_mask_all, segment_ids_all = [], [], []
  for input_string in input_strings:
    # Tokenize input.
    input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    sequence_length = min(len(input_ids), max_seq_length)

    # Padding or truncation.
    if len(input_ids) >= max_seq_length:
      input_ids = input_ids[:max_seq_length]
    else:
      input_ids = input_ids + [0] * (max_seq_length - len(input_ids))

    input_mask = [1] * sequence_length + [0] * (max_seq_length - sequence_length)

    input_ids_all.append(input_ids)
    input_mask_all.append(input_mask)
    segment_ids_all.append([0] * max_seq_length)

  return np.array(input_ids_all), np.array(input_mask_all), np.array(segment_ids_all)

def encode(input_text):
  input_ids, input_mask, segment_ids = create_input(
    input_text, tokenizer, max_seq_length)
  return labse_model([input_ids, input_mask, segment_ids])

In [8]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [9]:
sents_0 = np.asarray([p[0] for p in X_train])
sents_1 = np.asarray([p[1] for p in X_train])
t0 = time()
labse_vec = []
for batch in tqdm(list(chunks(sents_0, 100))):
    labse_vec.append(encode(batch))
sents_0 = np.vstack(labse_vec)

labse_vec = []
for batch in tqdm(list(chunks(sents_1, 100))):
    labse_vec.append(encode(batch))
sents_1 = np.vstack(labse_vec)
enc_time = time() - t0
print(f'LaBSE encoding time: {enc_time:.2f}')

100%|██████████| 2831/2831 [23:05<00:00,  2.04it/s]
100%|██████████| 2831/2831 [23:14<00:00,  2.03it/s]


LaBSE encoding time: 2781.76


In [10]:
X_train = np.hstack((sents_0, sents_1))
X_train.shape

(283003, 1536)

In [11]:
sents_0 = np.asarray([p[0] for p in X_valid])
sents_1 = np.asarray([p[1] for p in X_valid])
t0 = time()
labse_vec = []
for batch in tqdm(list(chunks(sents_0, 100))):
    labse_vec.append(encode(batch))
sents_0 = np.vstack(labse_vec)

labse_vec = []
for batch in tqdm(list(chunks(sents_1, 100))):
    labse_vec.append(encode(batch))
sents_1 = np.vstack(labse_vec)
enc_time = time() - t0
print(f'LaBSE encoding time: {enc_time:.2f}')

100%|██████████| 1213/1213 [09:59<00:00,  2.02it/s]
100%|██████████| 1213/1213 [09:56<00:00,  2.03it/s]


LaBSE encoding time: 1198.00


In [12]:
X_valid = np.hstack((sents_0, sents_1))
X_valid.shape

(121287, 1536)

In [13]:
for c in (LogisticRegression,): #(LogisticRegression, LinearSVC):
    clf = c()
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = round(time()-t0,2)
    
    t0 = time()
    y_pred = clf.predict(X_valid)
    inf_time = round(time()-t0,2)

    acc = round(accuracy_score(y_valid, y_pred)*100,2)
    f1 = round(f1_score(y_valid, y_pred)*100,2)
    auc = round(roc_auc_score(y_valid, y_pred)*100,2)
    p = round(precision_score(y_valid, y_pred)*100,2)
    r = round(recall_score(y_valid, y_pred)*100,2)
    
    print(f'For classifier: {c}, acc: {acc}, auc: {auc}, p: {p}, r:{r}, f1: {f1}')
    print(f'train time: {train_time} & inf time: {inf_time}')
    print('*'*80)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


For classifier: <class 'sklearn.linear_model._logistic.LogisticRegression'>, acc: 72.34, auc: 67.78, p: 66.9, r:50.18, f1: 57.34
train time: 24.8 & inf time: 0.72
********************************************************************************


In [14]:
X_train, X_valid, y_train, y_valid = load_Xy(max_samples)

Loaded 537933 ids to questions from ./id_ques_map.json


In [16]:
sents_0 = np.asarray([p[0] for p in X_train])
sents_1 = np.asarray([p[1] for p in X_train])
t0 = time()
labse_vec = []
for batch in tqdm(list(chunks(sents_0, 100))):
    labse_vec.append(encode(batch))
sents_0 = np.vstack(labse_vec)

labse_vec = []
for batch in tqdm(list(chunks(sents_1, 100))):
    labse_vec.append(encode(batch))
sents_1 = np.vstack(labse_vec)
enc_time = time() - t0
print(f'LaBSE encoding time: {enc_time:.2f}')

100%|██████████| 2831/2831 [23:04<00:00,  2.04it/s]
100%|██████████| 2831/2831 [23:08<00:00,  2.04it/s]


LaBSE encoding time: 2775.15


In [17]:
X_train = np.hstack((sents_0, sents_1, abs(sents_0 - sents_1)))
X_train.shape

(283003, 2304)

In [18]:
sents_0 = np.asarray([p[0] for p in X_valid])
sents_1 = np.asarray([p[1] for p in X_valid])
t0 = time()
labse_vec = []
for batch in tqdm(list(chunks(sents_0, 100))):
    labse_vec.append(encode(batch))
sents_0 = np.vstack(labse_vec)

labse_vec = []
for batch in tqdm(list(chunks(sents_1, 100))):
    labse_vec.append(encode(batch))
sents_1 = np.vstack(labse_vec)
enc_time = time() - t0
print(f'LaBSE encoding time: {enc_time:.2f}')

100%|██████████| 1213/1213 [09:54<00:00,  2.04it/s]
100%|██████████| 1213/1213 [09:50<00:00,  2.06it/s]


LaBSE encoding time: 1186.18


In [19]:
X_valid = np.hstack((sents_0, sents_1, abs(sents_0 - sents_1)))
X_valid.shape

(121287, 2304)

In [20]:
for c in (LogisticRegression,): #(LogisticRegression, LinearSVC):
    clf = c()
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = round(time()-t0,2)
    
    t0 = time()
    y_pred = clf.predict(X_valid)
    inf_time = round(time()-t0,2)

    acc = round(accuracy_score(y_valid, y_pred)*100,2)
    f1 = round(f1_score(y_valid, y_pred)*100,2)
    auc = round(roc_auc_score(y_valid, y_pred)*100,2)
    p = round(precision_score(y_valid, y_pred)*100,2)
    r = round(recall_score(y_valid, y_pred)*100,2)
    
    print(f'For classifier: {c}, acc: {acc}, auc: {auc}, p: {p}, r:{r}, f1: {f1}')
    print(f'train time: {train_time} & inf time: {inf_time}')
    print('*'*80)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


For classifier: <class 'sklearn.linear_model._logistic.LogisticRegression'>, acc: 81.08, auc: 79.34, p: 75.38, r:72.66, f1: 73.99
train time: 31.69 & inf time: 2.06
********************************************************************************
