# Load modules

In [1]:
import re
import unidecode
import spacy
import pandas as pd
import numpy as np
import time
import pickle
import operator
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [2]:
# Name of the spaCy pipeline package that is loaded with spacy.load() below.
spacy_model = "en_core_web_lg"

# Word tokenizer: every maximal run of \w characters becomes one token.
tokenizer = re.compile(r'\w+')

In [3]:
def saveOBJ(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    ``name`` is the target path without its extension. The file is opened in
    'wb' mode, so an existing file at that path is overwritten. The highest
    pickle protocol available to the running interpreter is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def loadOBJ(name):
    """Return the object unpickled from the file ``name + '.pkl'``.

    ``name`` is the source path without its extension.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted source (e.g. by ``saveOBJ`` in this notebook).
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [4]:
# Load the spaCy pipeline named by `spacy_model` and report the wall-clock
# time the load took (rounded to whole seconds).
start = time.time()
print("Importing spaCy \""+spacy_model+"\"...")
# `nlp` is the global pipeline object used by the embedding feature functions.
nlp = spacy.load(spacy_model)
print("Done!")
print("Time elapsed: "+str(round(time.time()-start))+"s")
print("\n")

Importing spaCy "en_core_web_lg"...
Done!
Time elapsed: 12s




# Load Data

In [86]:
# Question pairs, indexed by the "id" column.
df_train = pd.read_csv("data/train_data.csv",index_col="id")
# Drop the "is_duplicate" column that ships in train_data.csv; the labels are
# re-attached from train_labels.csv below.
df_train = df_train.drop("is_duplicate",axis=1)

# Labels, indexed by the same "id" column.
df_labels = pd.read_csv("data/train_labels.csv",index_col="id")

# Index-aligned join: adds the columns of df_labels to df_train.
df_train = df_train.join(df_labels)

In [87]:
# Preview the first rows of the joined frame.
df_train.head()

Unnamed: 0_level_0,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [88]:
# Class balance: count the rows labelled 0 and 1, then report each share.
len0 = int((df_train["is_duplicate"] == 0).sum())
len1 = int((df_train["is_duplicate"] == 1).sum())
fraction = len1/(len0+len1)

print('Fraction duplicate: %s' % (fraction))
print('Fraction !duplicate: %s' % (1-fraction))


Fraction duplicate: 0.3688313054671931
Fraction !duplicate: 0.6311686945328069


# Build TF-IDF

In [89]:
# token_counts = {}

# print("Starting with "+str(len(df_train))+ " rows..")

# for index, row in df_train.iterrows():
#     docs = [row['question1'], row['question2']]
#     for doc in docs:
#         tokens = tokenizer.findall(str(doc).lower())
#         for token in tokens:
#             if token in token_counts:
#                 token_counts[token] += 1
#             else:
#                 token_counts[token] = 1
                
#     if (index+1) % 10000 == 0:
#         print(str(index+1)+" rows done..")

# print()
# print("Done!")

In [90]:
# n_docs = 2*len(df_train)

# token_idf = {}
# for token in token_counts:
#     token_idf[token] = np.log(n_docs/(token_counts[token]))
    
# saveOBJ(token_idf,"data/token_IDF")

In [91]:
# Load the precomputed token -> IDF mapping from data/token_IDF.pkl (the
# commented-out cells above show how that file was produced).
token_idf = loadOBJ("data/token_IDF")
# (token, idf) pairs ordered from the highest IDF to the lowest.
sorted_idf = sorted(token_idf.items(), key=operator.itemgetter(1),reverse=True)
# Largest IDF in the table; used as the weight for tokens missing from token_idf.
max_idf = sorted_idf[0][1]

# Feature functions

In [92]:
# Register `progress_apply` on pandas objects so the feature cells below can
# show progress bars.
tqdm.pandas()

# Work on the first 100,000 rows only. The explicit .copy() makes df_train an
# independent frame: the feature cells below assign new columns to it, and
# assigning into a plain slice triggers pandas' SettingWithCopyWarning
# (it is ambiguous whether the write reaches the parent frame).
df_train = df_train[:100000].copy()

* first word same
* last word same
* length ratio
* n_words ratio
* context embedding similarity
* token one-hot encoding similarity with TF-IDF
* both contain (or don't contain) math
* NER->both contain (or don't contain) same entity

In [102]:
def tokenize(text):
    """Return the lower-cased word tokens of ``text`` as a list of strings.

    ``text`` is converted with ``str()`` first, so non-string values (e.g. NaN
    or None) are tokenized by their string representation. Tokens are the
    matches of the module-level ``tokenizer`` regex.
    """
    lowered = str(text).lower()
    return tokenizer.findall(lowered)

def firstWordSame(row):
    """Return 1 if both token lists are non-empty and begin with the same token, else 0.

    ``row`` must provide the token lists under the keys 'tokens1' and 'tokens2'.
    """
    first, second = row['tokens1'], row['tokens2']
    if len(first) > 0 and len(second) > 0 and first[0] == second[0]:
        return 1
    return 0

def lastWordSame(row):
    """Return 1 if both token lists are non-empty and end with the same token, else 0.

    ``row`` must provide the token lists under the keys 'tokens1' and 'tokens2'.
    """
    first, second = row['tokens1'], row['tokens2']
    if len(first) == 0 or len(second) == 0:
        return 0
    return 1 if first[-1] == second[-1] else 0

def embedding_similarity(row):
    """Cosine similarity between the mean token vectors of two questions.

    The tokens under 'tokens1' and 'tokens2' are re-joined with spaces and run
    through the global ``nlp`` pipeline. For each document the vectors of the
    tokens that report ``has_vector`` are averaged.

    Returns 0 when either token list is empty, when a document has no token
    with a vector, or when an averaged vector has zero norm.
    """
    if len(row['tokens1']) == 0 or len(row['tokens2']) == 0:
        return 0

    # Both questions are parsed up front, before any vector is inspected.
    parsed = [nlp(" ".join(row[key])) for key in ('tokens1', 'tokens2')]

    means = []
    for doc in parsed:
        token_vectors = [tok.vector for tok in doc if tok.has_vector]
        if len(token_vectors) == 0:
            return 0
        mean_vector = np.average(np.array(token_vectors), axis=0)
        if np.linalg.norm(mean_vector) == 0:
            return 0
        means.append(mean_vector)

    left, right = means
    return np.dot(left, right)/(np.linalg.norm(left)*np.linalg.norm(right))

def embedding_similarity2(row):
    """Cosine similarity between the mean vocabulary vectors of two token lists.

    Unlike ``embedding_similarity`` this does not run the spaCy pipeline: each
    token is looked up directly in ``nlp.vocab.vectors``. Tokens without an
    entry in the vector table are skipped.

    Returns 0 when either token list is empty, when none of a question's
    tokens has a vector, or when an averaged vector has zero norm.
    """
    if len(row['tokens1']) == 0 or len(row['tokens2']) == 0:
        return 0

    vectors = []
    for token_set in (row['tokens1'], row['tokens2']):
        subvectors = []
        for token in token_set:
            token_id = nlp.vocab.strings[token]
            try:
                subvectors.append(nlp.vocab.vectors[token_id])
            except KeyError:
                # No vector stored for this token: leave it out of the mean.
                # Only KeyError is caught so that real errors (and Ctrl-C /
                # kernel interrupts) are no longer silently swallowed.
                continue
        if len(subvectors) == 0:
            return 0
        vector = np.average(np.array(subvectors), axis=0)
        if np.linalg.norm(vector) == 0:
            return 0
        vectors.append(vector)

    similarity = np.dot(vectors[0], vectors[1])/(np.linalg.norm(vectors[0])*np.linalg.norm(vectors[1]))
    return similarity

def word_ratio(row):
    """Ratio of the two token counts, folded into the range (0, 1].

    Returns 0 if either token list is empty. Otherwise computes
    ``len(tokens1) / len(tokens2)`` and returns its reciprocal when it
    exceeds 1, so the result does not depend on which question is longer.
    """
    n_first, n_second = len(row['tokens1']), len(row['tokens2'])
    if n_first == 0 or n_second == 0:
        return 0
    ratio = n_first/n_second
    return 1/ratio if ratio > 1 else ratio

def char_ratio(row):
    """Ratio of the two questions' character counts, folded into (0, 1].

    Characters are counted over the tokens only (i.e. the length of all
    tokens joined without separators). Returns 0 if either token list is
    empty; otherwise the ratio is inverted when it exceeds 1.
    """
    if len(row['tokens1']) == 0 or len(row['tokens2']) == 0:
        return 0
    chars_first = sum(len(tok) for tok in row['tokens1'])
    chars_second = sum(len(tok) for tok in row['tokens2'])
    ratio = chars_first/chars_second
    return 1/ratio if ratio > 1 else ratio

def math_similarity(row):
    """Return 1 if both questions agree on containing the "[math]" tag, else 0.

    Agreement means either both raw question strings contain "[math]" or
    neither does. ``row`` must provide the raw texts under 'question1' and
    'question2'; values are converted with ``str()`` before the check.
    """
    has_math_first = "[math]" in str(row["question1"])
    has_math_second = "[math]" in str(row["question2"])
    return 1 if has_math_first == has_math_second else 0

def getVector(tokens,TFIDF=False):
    """Average the spaCy vocabulary vectors of ``tokens`` into one vector.

    Parameters
    ----------
    tokens : list of str
        Tokens of one question.
    TFIDF : bool, default False
        When True each token vector is multiplied by the token's IDF weight
        from ``token_idf`` (``max_idf`` for tokens missing from that table)
        before the plain average is taken.

    Returns
    -------
    numpy.ndarray or int
        The averaged vector, or the scalar 0 when ``tokens`` is empty, when
        no token has a vector, or when the average has zero norm.
    """
    if len(tokens) == 0:
        return 0
    vectors = []
    for token in tokens:
        token_id = nlp.vocab.strings[token]
        try:
            embedding = nlp.vocab.vectors[token_id]
        except KeyError:
            # No vector stored for this token: skip it. Only KeyError is
            # caught so that genuine errors and kernel interrupts are not
            # silently swallowed the way a bare `except:` would.
            continue
        weight = 1
        if TFIDF:
            weight = token_idf.get(token, max_idf)
        vectors.append(embedding*weight)
    if len(vectors) == 0:
        return 0
    vector = np.average(np.array(vectors),axis=0)
    if np.linalg.norm(vector) == 0:
        return 0
    return vector

def cosine_similarity(vectors):
    """Cosine similarity of the first two entries of ``vectors``.

    ``vectors`` may be any positionally indexable pair (list, tuple) or a
    pandas Series/row. For pandas objects ``.iloc`` is used, because plain
    ``row[0]`` on a string-labelled Series relies on a deprecated positional
    fallback.

    Returns 0 when either entry has zero norm (this includes the scalar 0
    that ``getVector`` returns for questions without usable tokens).
    """
    pair = getattr(vectors, "iloc", vectors)
    v1 = pair[0]
    v2 = pair[1]
    norm1 = np.linalg.norm(v1)
    if norm1 == 0:
        return 0
    norm2 = np.linalg.norm(v2)
    if norm2 == 0:
        return 0
    return np.dot(v1, v2)/(norm1*norm2)


def multiply_vectors(vectors, dim=300):
    """Element-wise product of the first two entries of ``vectors``.

    Parameters
    ----------
    vectors : sequence or pandas Series
        Pair of vectors; pandas objects are read positionally via ``.iloc``
        (plain ``row[0]`` on a string-labelled Series is deprecated).
    dim : int, default 300
        Length of the all-zero vector returned when either entry has zero
        norm (e.g. the scalar 0 produced by ``getVector``).

    Returns
    -------
    numpy.ndarray
        ``v1 * v2``, or ``np.zeros(dim)`` if either input has zero norm.
    """
    pair = getattr(vectors, "iloc", vectors)
    v1, v2 = pair[0], pair[1]
    if np.linalg.norm(v1) == 0 or np.linalg.norm(v2) == 0:
        return np.zeros(dim)
    return v1*v2

def add_vectors(vectors, dim=300):
    """Element-wise absolute value of the sum of the first two entries of ``vectors``.

    Parameters
    ----------
    vectors : sequence or pandas Series
        Pair of vectors; pandas objects are read positionally via ``.iloc``
        (plain ``row[0]`` on a string-labelled Series is deprecated).
    dim : int, default 300
        Length of the all-zero vector returned when either entry has zero
        norm (e.g. the scalar 0 produced by ``getVector``).

    Returns
    -------
    numpy.ndarray
        ``|v1 + v2|``, or ``np.zeros(dim)`` if either input has zero norm.
    """
    pair = getattr(vectors, "iloc", vectors)
    v1, v2 = pair[0], pair[1]
    if np.linalg.norm(v1) == 0 or np.linalg.norm(v2) == 0:
        return np.zeros(dim)
    # NOTE(review): this is |v1 + v2|; if an absolute *difference* feature
    # was intended, this should be v1 - v2 — confirm with the author.
    return np.absolute(v1+v2)

def token_similarity(row):
    """TF-IDF weighted bag-of-words cosine similarity of two token lists.

    Each question becomes a vector with one slot per unique token across
    both questions. Every occurrence of a token adds its IDF weight from
    ``token_idf`` (``max_idf`` for tokens missing from the table) to that
    slot, so repeated tokens count once per occurrence. Both vectors are
    L2-normalised and their dot product is returned.

    Returns 0 when either token list is empty or a vector has zero norm.
    """
    if len(row['tokens1']) == 0 or len(row['tokens2']) == 0:
        return 0

    # tokens of each question, including repeated words
    token_sets = [row['tokens1'], row['tokens2']]

    # Unique tokens across both questions, each mapped to its vector slot.
    # A dict lookup replaces list.index(), which rescanned the list for
    # every token (quadratic in question length).
    slot = {token: i for i, token in enumerate(set(row['tokens1'] + row['tokens2']))}

    vectors = []
    for token_set in token_sets:
        token_vector = np.zeros(len(slot))
        for token in token_set:
            # += instead of = so term frequency is included automatically
            token_vector[slot[token]] += token_idf.get(token, max_idf)
        norm = np.linalg.norm(token_vector)
        if norm == 0:
            return 0
        vectors.append(token_vector/norm)

    # the dot product alone is the cosine because both vectors are normalised
    return np.dot(vectors[0], vectors[1])
    

In [94]:
# Tokenize both question columns (tokenize is passed directly; no lambda needed).
df_train['tokens1'] = df_train['question1'].progress_apply(tokenize)
df_train['tokens2'] = df_train['question2'].progress_apply(tokenize)


100%|██████████| 100000/100000 [00:01<00:00, 88091.68it/s]
100%|██████████| 100000/100000 [00:00<00:00, 109725.43it/s]


In [95]:
# Unweighted mean embedding of each question's tokens.
df_train['vector1'] = df_train['tokens1'].progress_apply(getVector)
df_train['vector2'] = df_train['tokens2'].progress_apply(getVector)

100%|██████████| 100000/100000 [00:11<00:00, 8689.21it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9395.74it/s]


In [96]:
# IDF-weighted mean embedding of each question's tokens.
df_train['vector_tfidf1'] = df_train['tokens1'].progress_apply(lambda tokens: getVector(tokens, TFIDF=True))
df_train['vector_tfidf2'] = df_train['tokens2'].progress_apply(lambda tokens: getVector(tokens, TFIDF=True))


100%|██████████| 100000/100000 [00:12<00:00, 7843.21it/s]
100%|██████████| 100000/100000 [00:12<00:00, 7822.25it/s]


In [97]:
# Row-wise cosine similarity of the two question vectors (plain and IDF-weighted).
df_train['embedding_similarity'] = df_train[['vector1','vector2']].progress_apply(cosine_similarity, axis=1)
df_train['embedding_similarity_tfidf'] = df_train[['vector_tfidf1','vector_tfidf2']].progress_apply(cosine_similarity, axis=1)


100%|██████████| 100000/100000 [00:12<00:00, 7914.04it/s]
100%|██████████| 100000/100000 [00:12<00:00, 8035.01it/s]


In [98]:
# Element-wise combinations of the two question vectors:
# combo1/combo2 use the plain vectors, combo3/combo4 the IDF-weighted ones.
df_train['vector_combo1'] = df_train[['vector1','vector2']].progress_apply(multiply_vectors, axis=1)
df_train['vector_combo2'] = df_train[['vector1','vector2']].progress_apply(add_vectors, axis=1)
df_train['vector_combo3'] = df_train[['vector_tfidf1','vector_tfidf2']].progress_apply(multiply_vectors, axis=1)
df_train['vector_combo4'] = df_train[['vector_tfidf1','vector_tfidf2']].progress_apply(add_vectors, axis=1)


100%|██████████| 100000/100000 [00:18<00:00, 5297.34it/s]
100%|██████████| 100000/100000 [00:18<00:00, 5469.09it/s]
100%|██████████| 100000/100000 [00:17<00:00, 5696.39it/s]
100%|██████████| 100000/100000 [00:20<00:00, 4951.55it/s]


In [99]:
# Binary features: do the questions share their first / last token?
df_train['firstWordSame'] = df_train[['tokens1','tokens2']].progress_apply(firstWordSame, axis=1)
df_train['lastWordSame'] = df_train[['tokens1','tokens2']].progress_apply(lastWordSame, axis=1)


100%|██████████| 100000/100000 [00:05<00:00, 17506.10it/s]
100%|██████████| 100000/100000 [00:05<00:00, 17839.72it/s]


In [100]:
# Length-ratio features on token count and character count.
df_train['word_ratio'] = df_train[['tokens1','tokens2']].progress_apply(word_ratio, axis=1)
df_train['char_ratio'] = df_train[['tokens1','tokens2']].progress_apply(char_ratio, axis=1)


100%|██████████| 100000/100000 [00:05<00:00, 16835.56it/s]
100%|██████████| 100000/100000 [00:06<00:00, 16648.21it/s]


In [103]:
# 1 when both raw questions agree on containing the "[math]" tag, else 0.
df_train['math_similarity'] = df_train[['question1','question2']].progress_apply(math_similarity, axis=1)



  0%|          | 0/100000 [00:00<?, ?it/s][A
  3%|▎         | 3335/100000 [00:00<00:02, 33347.57it/s][A
  5%|▍         | 4826/100000 [00:00<00:03, 24321.45it/s][A
  6%|▌         | 5974/100000 [00:00<00:05, 16161.27it/s][A
  8%|▊         | 7512/100000 [00:00<00:05, 15908.61it/s][A
 10%|▉         | 9935/100000 [00:00<00:05, 17735.94it/s][A
 12%|█▏        | 11807/100000 [00:00<00:04, 18011.60it/s][A
 14%|█▎        | 13577/100000 [00:00<00:04, 17915.63it/s][A
 16%|█▌        | 15733/100000 [00:00<00:04, 18856.81it/s][A
 18%|█▊        | 17829/100000 [00:00<00:04, 19432.73it/s][A
 20%|██        | 20197/100000 [00:01<00:03, 20536.71it/s][A
 22%|██▏       | 22476/100000 [00:01<00:03, 21075.94it/s][A
 25%|██▍       | 24583/100000 [00:01<00:04, 17454.49it/s][A
 26%|██▋       | 26432/100000 [00:01<00:04, 14811.57it/s][A
 29%|██▉       | 28770/100000 [00:01<00:04, 16621.24it/s][A
 31%|███       | 30604/100000 [00:01<00:04, 15235.62it/s][A
 32%|███▏      | 32270/100000 [00:01<00:04,

In [104]:
# TF-IDF weighted bag-of-words cosine similarity of the two token lists.
df_train['token_similarity'] = df_train[['tokens1','tokens2']].progress_apply(token_similarity, axis=1)



  0%|          | 0/100000 [00:00<?, ?it/s][A
  0%|          | 289/100000 [00:00<00:34, 2888.41it/s][A
  1%|          | 578/100000 [00:00<00:34, 2888.01it/s][A
  1%|          | 799/100000 [00:00<00:37, 2640.11it/s][A
  1%|          | 1000/100000 [00:00<00:41, 2411.54it/s][A
  1%|▏         | 1279/100000 [00:00<00:39, 2512.27it/s][A
  2%|▏         | 1531/100000 [00:00<00:39, 2513.56it/s][A
  2%|▏         | 1798/100000 [00:00<00:38, 2558.13it/s][A
  2%|▏         | 2034/100000 [00:00<00:40, 2429.36it/s][A
  2%|▏         | 2265/100000 [00:00<00:40, 2391.26it/s][A
  2%|▏         | 2496/100000 [00:01<00:42, 2301.16it/s][A
  3%|▎         | 2742/100000 [00:01<00:41, 2342.68it/s][A
  3%|▎         | 2994/100000 [00:01<00:40, 2393.03it/s][A
  3%|▎         | 3231/100000 [00:01<00:41, 2345.91it/s][A
  4%|▎         | 3542/100000 [00:01<00:38, 2529.21it/s][A
  4%|▍         | 3879/100000 [00:01<00:35, 2729.37it/s][A
  4%|▍         | 4224/100000 [00:01<00:32, 2911.33it/s][A
  5%|▍      

 44%|████▍     | 44335/100000 [00:15<00:30, 1832.47it/s][A
 45%|████▍     | 44630/100000 [00:15<00:26, 2062.84it/s][A
 45%|████▍     | 44889/100000 [00:15<00:25, 2196.53it/s][A
 45%|████▌     | 45126/100000 [00:15<00:28, 1957.53it/s][A
 45%|████▌     | 45372/100000 [00:15<00:26, 2084.55it/s][A
 46%|████▌     | 45631/100000 [00:15<00:24, 2213.31it/s][A
 46%|████▌     | 45932/100000 [00:16<00:22, 2404.16it/s][A
 46%|████▌     | 46186/100000 [00:16<00:23, 2330.84it/s][A
 46%|████▋     | 46429/100000 [00:16<00:22, 2337.00it/s][A
 47%|████▋     | 46670/100000 [00:16<00:26, 2034.62it/s][A
 47%|████▋     | 46916/100000 [00:16<00:24, 2142.08it/s][A
 47%|████▋     | 47140/100000 [00:16<00:27, 1919.42it/s][A
 47%|████▋     | 47395/100000 [00:16<00:25, 2073.20it/s][A
 48%|████▊     | 47680/100000 [00:16<00:23, 2257.23it/s][A
 48%|████▊     | 47979/100000 [00:16<00:21, 2433.49it/s][A
 48%|████▊     | 48236/100000 [00:17<00:22, 2282.17it/s][A
 48%|████▊     | 48476/100000 [00:17<00:

# Training

In [106]:
def calc_log_loss(true,predicted,eps=1e-15):
    """Mean binary cross-entropy (log-loss) of ``predicted`` against ``true``.

    Parameters
    ----------
    true : array-like
        Ground-truth labels (0 or 1).
    predicted : array-like
        Predicted probabilities for the positive class.
    eps : float, default 1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` so the logarithms
        stay finite.

    Returns
    -------
    float
        The log-loss. If the two inputs differ in length a message is
        printed and 0 is returned (legacy behaviour kept for callers).
    """
    if len(true) != len(predicted):
        print("True and predicted values need to be of same shape.")
        return 0

    # asarray also accepts plain lists, which .astype() did not.
    true = np.asarray(true, dtype="float64")
    predicted = np.asarray(predicted, dtype="float64")

    # Same number of elements but different shapes, e.g. (n, 1) vs (n,):
    # flatten both, otherwise broadcasting builds an (n, n) matrix and the
    # mean below is taken over the wrong pairs.
    if true.shape != predicted.shape and true.size == predicted.size:
        true = true.ravel()
        predicted = predicted.ravel()

    predicted = np.clip(predicted, eps, 1-eps)
    intermediate = true * np.log(predicted) + (1-true) * np.log(1-predicted)
    return -np.mean(intermediate)

def getModelStats(model, testing_data, testing_target):
    """Print error rate, accuracy and log-loss of ``model`` on held-out data.

    ``testing_data`` is a NumPy feature array that is converted to a float
    tensor and fed to ``model``. The outputs are rounded to 0/1 for the
    error rate and used unrounded for the log-loss. Nothing is returned.
    """
    inputs = torch.from_numpy(testing_data).float()
    probabilities = model(inputs).detach().numpy()

    # Hard 0/1 decisions from rounding the model outputs.
    hard_predictions = np.round(probabilities)
    n_samples = len(testing_target)
    error = np.sum(np.abs(hard_predictions-testing_target))/n_samples

    log_loss = calc_log_loss(testing_target,probabilities)

    print("Error: "+str(error))
    print("Accuracy: "+str(1-error))
    print("Log-loss: "+str(log_loss))

# MODEL 1
## Fully Connected - 1 Hidden Layer - MSE Loss

In [107]:
# The eight scalar features fed to MODEL 1.
pred_columns = ['firstWordSame','lastWordSame','embedding_similarity','embedding_similarity_tfidf','word_ratio','char_ratio','math_similarity','token_similarity']

# Positional split: first 80,000 rows for training, the remaining rows for testing.
df_tr = df_train[:80000]
df_te = df_train[80000:]

# Targets are reshaped to column vectors of shape (n_rows, 1).
training_target = df_tr['is_duplicate'].values.reshape((len(df_tr), 1))
training_data = df_tr[pred_columns].values

testing_target = df_te['is_duplicate'].values.reshape((len(df_te), 1))
testing_data = df_te[pred_columns].values


In [108]:
# Convert the NumPy training arrays to float tensors.
x = torch.from_numpy(training_data).float()
y = torch.from_numpy(training_target).float()

# Use the nn package to define our model and loss function.
# The input width 8 matches the eight entries of pred_columns.
# NOTE(review): Softmax is used as the hidden-layer activation here; confirm
# this is intended rather than e.g. ReLU/Tanh.
model = torch.nn.Sequential(
          torch.nn.Linear(8, 8),
          torch.nn.Softmax(dim=1),
          torch.nn.Linear(8, 1),
          torch.nn.Sigmoid()
        )
#loss_fn = torch.nn.BCELoss()
loss_fn = torch.nn.MSELoss()

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 5e-2
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Full-batch training: every iteration runs the model on all of x.
for t in range(4001):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute the loss and print it every 500 iterations.
    loss = loss_fn(y_pred, y)
    if (t) % 500 == 0:
        print((t), loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the Tensors it will update (which are the learnable weights
    # of the model)
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its parameters
    optimizer.step()

0 0.26189756393432617
500 0.18413563072681427
1000 0.18403802812099457
1500 0.18010516464710236
2000 0.17978347837924957
2500 0.17948423326015472
3000 0.17828193306922913
3500 0.17627638578414917
4000 0.175069659948349


In [109]:
# Report error rate, accuracy and log-loss of MODEL 1 on the held-out rows.
getModelStats(model, testing_data, testing_target)

Error: 0.273
Accuracy: 0.727
Log-loss: 0.5103850184193128


# MODEL 2
## Vectors, convolutions

In [110]:
def _column_to_matrix(df, column):
    """Stack the per-row vectors stored in ``df[column]`` into one 2-D array."""
    return np.array(df[column].tolist())


def buildVectorModel(column, df_tr, df_te, training_target, testing_target,
                     n_iterations=1001, learning_rate=5e-2, log_every=100):
    """Train a small fully connected classifier on one vector-valued column.

    Parameters
    ----------
    column : str
        Name of the DataFrame column whose cells each hold one feature vector.
    df_tr, df_te : pandas.DataFrame
        Training and testing frames; both must contain ``column``.
    training_target, testing_target : numpy.ndarray
        0/1 labels shaped ``(n_rows, 1)`` to match the model output.
    n_iterations : int, default 1001
        Number of full-batch optimisation steps.
    learning_rate : float, default 5e-2
        Learning rate passed to Adam.
    log_every : int, default 100
        The training loss is printed every ``log_every`` iterations.

    Returns
    -------
    torch.nn.Sequential
        The trained model. Its test-set statistics are printed through
        ``getModelStats`` before returning.
    """
    training_data = _column_to_matrix(df_tr, column)
    testing_data = _column_to_matrix(df_te, column)

    x = torch.from_numpy(training_data).float()
    y = torch.from_numpy(training_target).float()

    # Input width follows the data instead of being hard-coded to 300.
    n_features = x.shape[1]
    v_model = torch.nn.Sequential(
              torch.nn.Linear(n_features, 100),
              torch.nn.ReLU(),
              torch.nn.Linear(100, 20),
              torch.nn.ReLU(),
              torch.nn.Linear(20, 1),
              torch.nn.Sigmoid()
            )
    loss_fn = torch.nn.MSELoss()

    optimizer = torch.optim.Adam(v_model.parameters(), lr=learning_rate)
    # Full-batch training: every step runs the model on all of x.
    for t in range(n_iterations):
        y_pred = v_model(x)

        loss = loss_fn(y_pred, y)
        if t % log_every == 0:
            print(t, loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    getModelStats(v_model, testing_data, testing_target)

    return v_model

In [111]:
# MODEL 2a: trained on 'vector_combo1' (element-wise product of the plain question vectors).
model2 = buildVectorModel('vector_combo1', df_tr, df_te, training_target, testing_target)

0 0.26593101024627686
100 0.174383282661438
200 0.160793274641037
300 0.15550464391708374
400 0.1549474149942398
500 0.1526605784893036
600 0.14689694344997406
700 0.14630483090877533
800 0.1447707861661911
900 0.1471860557794571
1000 0.14673252403736115
Error: 0.2597
Accuracy: 0.7403
Log-loss: 0.5596444850941115


In [112]:
# MODEL 2b: trained on 'vector_combo2' (absolute element-wise sum of the plain question vectors).
# NOTE: the recorded run of this cell was interrupted (KeyboardInterrupt in the saved output).
model3 = buildVectorModel('vector_combo2', df_tr, df_te, training_target, testing_target)

0 0.2406657189130783
100 0.23376934230327606
200 0.2337692379951477
300 0.2337692379951477


KeyboardInterrupt: 

In [113]:
# MODEL 2c: trained on 'vector_combo3' (element-wise product of the IDF-weighted question vectors).
# NOTE: the recorded run of this cell was interrupted (KeyboardInterrupt in the saved output).
model4 = buildVectorModel('vector_combo3', df_tr, df_te, training_target, testing_target)

0 0.2991880178451538


KeyboardInterrupt: 

In [None]:
# MODEL 2d: trained on 'vector_combo4' (absolute element-wise sum of the IDF-weighted question vectors).
model5 = buildVectorModel('vector_combo4', df_tr, df_te, training_target, testing_target)

# Stacking model

In [None]:
# `test_x` was only ever a local variable inside getModelStats, so at notebook
# scope it is undefined. Build the held-out feature tensor for MODEL 1 here
# before asking the model for its predictions.
test_x = torch.from_numpy(testing_data).float()
pred1 = model(test_x).detach().numpy()