In [1]:
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import ast
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from time import time

In [2]:
with open('id_ques_map.json') as fh:
    id_ques_map = json.load(fh)
id_ques_map = {int(k):str(v) for k,v in  id_ques_map.items()}
id_ques_map[0] = ''
    
train_pairs = np.array([list(ast.literal_eval(l.strip())) for l in open('train_pairs.txt').readlines()])
valid_pairs = np.array([list(ast.literal_eval(l.strip())) for l in open('valid_pairs.txt').readlines()])
y_train = np.loadtxt('y_train.txt')
y_valid = np.loadtxt('y_valid.txt')

In [3]:
max(id_ques_map.keys())

537933

In [4]:
sents = []
for i in range(max(id_ques_map)+1):
    try:
        sents.append(str(id_ques_map[i]))
    except KeyError:
        print(f'key {i} is missing')

In [5]:
set(range(537934))-set(id_ques_map.keys())

set()

In [6]:
t0 = time()
X = TfidfVectorizer().fit_transform(sents)
enc_time = time() - t0
print(f'TFIDF (whole dataset) encoding time: {enc_time:.2f}')

TFIDF (whole dataset) encoding time: 5.85


In [7]:
X.shape

(537934, 86153)

In [8]:
max(train_pairs[:,0]), max(valid_pairs[:,0])

(537930, 537932)

In [9]:
max(train_pairs[:,1]), max(valid_pairs[:,1])

(537931, 537933)

In [11]:
X_1 = X[train_pairs[:,0]]
X_2 = X[train_pairs[:,1]]
X_train = hstack((X_1,X_2))

X_1 = X[valid_pairs[:,0]]
X_2 = X[valid_pairs[:,1]]
X_valid = hstack((X_1,X_2))

X_train.shape, X_valid.shape

((283003, 172306), (121287, 172306))

In [12]:
for c in (LogisticRegression, LinearSVC):
    clf = c()
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = round(time()-t0,2)
    
    t0 = time()
    y_pred = clf.predict(X_valid)
    inf_time = round(time()-t0,2)

    acc = round(accuracy_score(y_valid, y_pred)*100,2)
    f1 = round(f1_score(y_valid, y_pred)*100,2)
    auc = round(roc_auc_score(y_valid, y_pred)*100,2)
    
    print(f'For classifier: {c}, acc: {acc}, f1: {f1}, auc: {auc}')
    print(f'train time: {train_time} & inf time: {inf_time}')
    print('*'*80)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


For classifier: <class 'sklearn.linear_model._logistic.LogisticRegression'>, acc: 75.72, f1: 63.76, auc: 72.0
train time: 14.56 & inf time: 0.11
********************************************************************************
For classifier: <class 'sklearn.svm._classes.LinearSVC'>, acc: 74.94, f1: 64.98, auc: 72.43
train time: 8.3 & inf time: 0.07
********************************************************************************


In [14]:
X_1 = X[train_pairs[:,0]]
X_2 = X[train_pairs[:,1]]
X_diff = abs(X_1-X_2)
X_train = hstack((X_1,X_2, X_diff))

X_1 = X[valid_pairs[:,0]]
X_2 = X[valid_pairs[:,1]]
X_diff = abs(X_1-X_2)
X_valid = hstack((X_1,X_2, X_diff))

X_train.shape, X_valid.shape

((283003, 258459), (121287, 258459))

In [15]:
for c in (LogisticRegression, LinearSVC):
    clf = c()
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = round(time()-t0,2)
    
    t0 = time()
    y_pred = clf.predict(X_valid)
    inf_time = round(time()-t0,2)

    acc = round(accuracy_score(y_valid, y_pred)*100,2)
    f1 = round(f1_score(y_valid, y_pred)*100,2)
    auc = round(roc_auc_score(y_valid, y_pred)*100,2)
    
    print(f'For classifier: {c}, acc: {acc}, f1: {f1}, auc: {auc}')
    print(f'train time: {train_time} & inf time: {inf_time}')
    print('*'*80)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


For classifier: <class 'sklearn.linear_model._logistic.LogisticRegression'>, acc: 80.86, f1: 73.23, auc: 78.76
train time: 20.58 & inf time: 0.15
********************************************************************************
For classifier: <class 'sklearn.svm._classes.LinearSVC'>, acc: 81.03, f1: 74.6, auc: 79.83
train time: 11.71 & inf time: 0.12
********************************************************************************
