# load modules

In [1]:
import re
import unidecode
import spacy
import pandas as pd
import numpy as np
import time
import pickle
import operator
import torch
import torch.nn.functional as F
from nltk import ngrams
from nltk.metrics import edit_distance
from nltk.corpus import stopwords
from pyjarowinkler import distance
from tqdm import tqdm

In [2]:
stopWords = set(stopwords.words('english'))


In [3]:
ent_types = ['PERSON','NORP','FAC','ORG','GPE','LOC','PRODUCT','DATE']

In [4]:
spacy_model = "en_core_web_lg"

tokenizer = re.compile(r'\w+')

In [5]:
def saveOBJ(obj, name):
    """Pickle `obj` into the file `<name>.pkl`.

    The '.pkl' suffix is appended to `name`; the file is opened in binary
    write mode and the highest available pickle protocol is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def loadOBJ(name):
    """Load and return the object pickled in the file `<name>.pkl`.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    produced by this project (e.g. via `saveOBJ`).
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [6]:
# Load the spaCy pipeline once and report how long the import took.
start = time.time()
print(f'Importing spaCy "{spacy_model}"...')
nlp = spacy.load(spacy_model)
print("Done!")
print(f"Time elapsed: {round(time.time() - start)}s")
print("\n")

Importing spaCy "en_core_web_lg"...
Done!
Time elapsed: 14s




# Load Training Data

In [6]:
# training data: the question pairs and their labels live in separate files
df_labels = pd.read_csv("data/train_labels.csv", index_col="id")

# drop the `is_duplicate` column shipped with the questions, then attach the
# one from the labels file (joined on the shared `id` index)
df_train = (
    pd.read_csv("data/train_data.csv", index_col="id")
    .drop(columns="is_duplicate")
    .join(df_labels)
)

In [7]:
df_train.head()

Unnamed: 0_level_0,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Add graph info

In [10]:
df_training_graphs = pd.read_csv("gen_data/train_graph.csv",index_col="id")

df_train = df_train.join(df_training_graphs)

# Load Testing Data

In [9]:
# testing data
# NOTE(review): this rebinds `df_train` to the *test* questions, replacing the
# training frame built from data/train_data.csv earlier in the notebook.
# Presumably only one of the two load sections is meant to be run per
# session - confirm before running the notebook top to bottom.
df_train = pd.read_csv("data/test_data.csv",index_col="test_id")
df_train.head()

Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
15,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...
20,Why do rockets look white?,Why are rockets and boosters painted white?
21,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?
23,How much is 30 kV in HP?,Where can I find a conversion chart for CC to ...
34,What is the best travel website in spain?,What is the best travel website?


In [11]:
df_testing_graphs = pd.read_csv("gen_data/test_graph_all.csv",index_col="test_id")

df_train = df_train.join(df_testing_graphs)

# Build TF-IDF

In [7]:
# token_counts = {}

# print("Starting with "+str(len(df_train))+ " rows..")

# for index, row in df_train.iterrows():
#     docs = [row['question1'], row['question2']]
#     for doc in docs:
#         tokens = tokenizer.findall(str(doc).lower())
#         for token in tokens:
#             if token in token_counts:
#                 token_counts[token] += 1
#             else:
#                 token_counts[token] = 1
                
#     if (index+1) % 10000 == 0:
#         print(str(index+1)+" rows done..")

# print()
# print("Done!")

In [8]:
# n_docs = 2*len(df_train)

# token_idf = {}
# for token in token_counts:
#     token_idf[token] = np.log(n_docs/(token_counts[token]))
    
# saveOBJ(token_idf,"data/token_IDF")

In [9]:
# Token -> IDF weight table computed by the (commented-out) cells above.
token_idf = loadOBJ("data/token_IDF")
# (token, idf) pairs ordered from highest to lowest IDF.
sorted_idf = sorted(token_idf.items(), key=lambda item: item[1], reverse=True)
# Highest IDF in the table; used as the weight of tokens missing from it.
_, max_idf = sorted_idf[0]

# Feature functions

In [10]:
tqdm.pandas()

#df_train = df_train[:10000]

* first word same (also fuzzy)
* last word same (also fuzzy)
* length ratio
* n_words ratio
* context embedding similarity (also with TF-IDF)
* token one-hot encoding similarity (also with TF-IDF)
* both contain (or don't contain) math
* graph network features

Todo:
* NER->both contain (or don't contain) same entity
* Jaro distance
* Levenshtein distance
* bi,tri,4-grams
* Fuzzy one-hot (&TF-IDF) token matching

In [11]:
def absmaxND(a, axis=None):
    """Return the entries of `a` with the largest magnitude along `axis`, sign kept.

    For each position the minimum is chosen when its magnitude strictly
    exceeds the maximum; otherwise the maximum is chosen.
    """
    highest = a.max(axis)
    lowest = a.min(axis)
    negative_dominates = -lowest > highest
    return np.where(negative_dominates, lowest, highest)

def tokenize(text):
    """Lower-case `text` (coerced with `str`) and return its word tokens.

    Tokens are the matches of the module-level `tokenizer` regex.
    """
    lowered = str(text).lower()
    return tokenizer.findall(lowered)

def entities(question):
    """Run the spaCy pipeline on `question` and return its named entities.

    Returns a list of `[text, label]` pairs, one per entity in `doc.ents`.
    An empty list is returned when `question` is not a string (e.g. a NaN
    from a missing CSV cell) or when the pipeline raises.
    """
    if not isinstance(question, str):
        return []
    try:
        return [[ent.text, ent.label_] for ent in nlp(question).ents]
    except Exception:
        # Best effort: one unparsable question must not abort the whole pass.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return []

def _typed_entity_texts(ents):
    """Lower-cased texts of the `[text, label]` pairs whose label is in `ent_types`."""
    return [ent[0].lower() for ent in ents if ent[1] in ent_types]


def _count_matched_entities(own_entities, other_entities, other_question):
    """Count the entities in `own_entities` that are matched on the other side.

    An entity is matched when it occurs in `other_question`, or when some
    entity of `other_entities` contains it, is contained in it, or is within
    an edit distance of less than a quarter of the entity's own length.
    """
    matched = 0
    for ent in own_entities:
        # Cheap substring checks come first so that edit_distance is only
        # computed when nothing else has matched; any single hit is enough.
        if ent in other_question or any(
            ent in other
            or other in ent
            or edit_distance(ent, other) < len(ent) / 4
            for other in other_entities
        ):
            matched += 1
    return matched


def entity_score(row):
    """Fraction of named entities that the two questions have in common.

    Reads `question1`, `question2`, `entities1` and `entities2` from `row`;
    the entity columns hold `[text, label]` pairs and only labels listed in
    `ent_types` are considered. Each entity scores 1 when it is matched in
    the other question (see `_count_matched_entities`); the result is the
    number of matched entities divided by the total number of entities in
    both questions. Returns 1 when neither question has a usable entity.
    """
    question1 = str(row["question1"]).lower()
    question2 = str(row["question2"]).lower()

    entities1 = _typed_entity_texts(row["entities1"])
    entities2 = _typed_entity_texts(row["entities2"])

    if not entities1 and not entities2:
        return 1

    matched = (
        _count_matched_entities(entities1, entities2, question2)
        + _count_matched_entities(entities2, entities1, question1)
    )
    return matched / (len(entities1) + len(entities2))

def check_word(row,word="why"):
    """Return 1 when `word` is in both token lists or in neither, else 0.

    Reads `tokens1` and `tokens2` from `row`. Returns 1 when either token
    list is empty.
    """
    tokens1, tokens2 = row['tokens1'], row['tokens2']
    if not len(tokens1) or not len(tokens2):
        return 1
    in_first = word in tokens1
    in_second = word in tokens2
    return 1 if in_first == in_second else 0

def word_length_ratio(row):
    """Ratio of the average token lengths of the two questions, folded into (0, 1].

    Reads `tokens1` and `tokens2` from `row`; the smaller of `ratio` and
    `1/ratio` is returned. Returns 0 when either token list is empty.
    """
    tokens1, tokens2 = row['tokens1'], row['tokens2']
    if len(tokens1) == 0 or len(tokens2) == 0:
        return 0
    mean_length1 = sum(len(token) for token in tokens1) / len(tokens1)
    mean_length2 = sum(len(token) for token in tokens2) / len(tokens2)
    ratio = mean_length1 / mean_length2
    return np.minimum(ratio, 1/ratio)
        
def stop_word_ratio(row):
    """Ratio of the stop-word shares of the two questions, folded into (0, 1].

    Reads `tokens1` and `tokens2` from `row`. The stop-word share of a
    question is the fraction of its tokens found in `stopWords`. Returns 0
    when either token list is empty or when either question contains no stop
    word at all (the ratio is undefined or zero in that case).
    """
    tokens1, tokens2 = row['tokens1'], row['tokens2']
    if len(tokens1) == 0 or len(tokens2) == 0:
        return 0
    share1 = sum(1 for token in tokens1 if token in stopWords) / len(tokens1)
    share2 = sum(1 for token in tokens2 if token in stopWords) / len(tokens2)
    # Explicit zero check instead of catching ZeroDivisionError with a bare except.
    if share1 == 0 or share2 == 0:
        return 0
    ratio = share1 / share2
    return np.minimum(ratio, 1/ratio)

def UC_ratio(row):
    """Ratio of the upper-case character counts of the two questions, folded into (0, 1].

    Reads `question1` and `question2` from `row` (coerced with `str`).
    Returns 0 when either question has no upper-case character, which also
    covers empty questions.
    """
    upper1 = sum(1 for char in str(row['question1']) if char.isupper())
    upper2 = sum(1 for char in str(row['question2']) if char.isupper())
    # Explicit zero check instead of catching ZeroDivisionError with a bare except.
    if upper1 == 0 or upper2 == 0:
        return 0
    ratio = upper1 / upper2
    return np.minimum(ratio, 1/ratio)
    
def tok_ngrams(row,n=2):
    """Token n-gram overlap score of the two questions.

    Reads `tokens1` and `tokens2` from `row` and builds the n-grams of each.
    The score is `total / distinct - 1`, where `total` counts every n-gram of
    both questions and `distinct` counts the unique n-grams across both; it
    is 0 when no n-gram occurs twice. Note that an n-gram repeated inside a
    single question also raises the score. Returns 0 when either token list
    is empty or when both are shorter than `n`.
    """
    tokens1, tokens2 = row['tokens1'], row['tokens2']
    if len(tokens1) == 0 or len(tokens2) == 0:
        return 0
    grams1 = list(ngrams(tokens1, n))
    grams2 = list(ngrams(tokens2, n))
    total = len(grams1) + len(grams2)
    if total == 0:
        return 0
    distinct = len(set(grams1 + grams2))
    return total/distinct - 1
    
def edit_distance_score(row):
    """Similarity score derived from the edit distance between the two questions.

    Reads `question1` and `question2` from `row` and compares them
    case-insensitively. Identical non-empty questions score 2; otherwise the
    score is `(longest_length / edit_distance) / 50`. Returns 0 when either
    question is not a string (e.g. NaN) or when both are empty.
    """
    question1, question2 = row["question1"], row["question2"]
    if not isinstance(question1, str) or not isinstance(question2, str):
        return 0
    # Named `n_edits` so the imported `distance` module is not shadowed.
    n_edits = edit_distance(question1.lower(), question2.lower())
    if n_edits == 0:
        # Zero edits means both sides are equal: either both non-empty or both
        # empty. The empty case has no meaningful score (it used to divide by zero).
        return 2 if question1 else 0
    longest = max(len(question1), len(question2))
    return (longest/n_edits)/50

def jaro_similarity(row):
    """Jaro similarity (without the Winkler adjustment) of the two questions.

    Reads `question1` and `question2` from `row` and compares them
    case-insensitively via `distance.get_jaro_distance`. Returns 0 when
    either question is not a string (e.g. NaN) or when the library raises.
    """
    question1, question2 = row["question1"], row["question2"]
    if not isinstance(question1, str) or not isinstance(question2, str):
        return 0
    try:
        return distance.get_jaro_distance(question1.lower(), question2.lower(), winkler=False)
    except Exception:
        # Best effort: inputs the library rejects score 0. `Exception` (not a
        # bare except) keeps Ctrl-C / kernel interrupt working.
        return 0

def firstWordSame(row):
    """Return 1 when both questions start with the same token, else 0.

    Reads `tokens1` and `tokens2` from `row`; returns 0 when either is empty.
    """
    tokens1, tokens2 = row['tokens1'], row['tokens2']
    if len(tokens1) == 0 or len(tokens2) == 0:
        return 0
    return 1 if tokens1[0] == tokens2[0] else 0

def lastWordSame(row):
    """Return 1 when both questions end with the same token, else 0.

    Reads `tokens1` and `tokens2` from `row`; returns 0 when either is empty.
    """
    tokens1, tokens2 = row['tokens1'], row['tokens2']
    if len(tokens1) == 0 or len(tokens2) == 0:
        return 0
    return 1 if tokens1[-1] == tokens2[-1] else 0


def word_ratio(row):
    """Ratio of the token counts of the two questions, folded into (0, 1].

    Reads `tokens1` and `tokens2` from `row`; returns 0 when either is empty.
    """
    count1, count2 = len(row['tokens1']), len(row['tokens2'])
    if count1 == 0 or count2 == 0:
        return 0
    ratio = count1/count2
    return 1/ratio if ratio > 1 else ratio

def char_ratio(row):
    """Ratio of the total token character counts of the two questions, folded into (0, 1].

    Reads `tokens1` and `tokens2` from `row`; only characters inside tokens
    are counted. Returns 0 when either token list is empty.
    """
    tokens1, tokens2 = row['tokens1'], row['tokens2']
    if len(tokens1) == 0 or len(tokens2) == 0:
        return 0
    chars1 = sum(len(token) for token in tokens1)
    chars2 = sum(len(token) for token in tokens2)
    ratio = chars1/chars2
    return 1/ratio if ratio > 1 else ratio

def math_similarity(row):
    """Return 1 when both questions contain "[math]" or neither does, else 0.

    Reads `question1` and `question2` from `row` (coerced with `str`).
    """
    has_math1 = "[math]" in str(row["question1"])
    has_math2 = "[math]" in str(row["question2"])
    return 1 if has_math1 == has_math2 else 0

def getVector(tokens,TFIDF=False,mode="AVG"):
    """Combine the spaCy word vectors of `tokens` into a single vector.

    Parameters:
        tokens: list of token strings.
        TFIDF: when True each token vector is scaled by its weight in
            `token_idf` (`max_idf` for tokens missing from that table);
            otherwise every token has weight 1.
        mode: "AVG" for the weighted average of the token vectors, "MAX" for
            the element-wise largest-magnitude value (see `absmaxND`).

    Returns the combined vector, or the scalar 0 when `tokens` is empty, no
    token has a vector, the total weight is zero, or the result has zero norm.

    Raises ValueError for an unknown `mode`.
    """
    if mode not in ("AVG", "MAX"):
        raise ValueError('mode must be "AVG" or "MAX", got ' + repr(mode))
    if len(tokens) == 0:
        return 0
    vectors = []
    totalWeight = 0
    for token in tokens:
        token_id = nlp.vocab.strings[token]
        try:
            token_vector = nlp.vocab.vectors[token_id]
        except KeyError:
            # Token has no entry in the vector table: leave it out.
            continue
        weight = token_idf.get(token, max_idf) if TFIDF else 1
        vectors.append(token_vector*weight)
        totalWeight += weight
    if len(vectors) == 0:
        return 0
    vectors = np.array(vectors)
    if mode == "AVG":
        if totalWeight == 0:
            # All weights cancelled out; an average is undefined.
            return 0
        vector = np.sum(vectors,axis=0)/totalWeight
    else:
        vector = absmaxND(vectors,axis=0)
    if np.linalg.norm(vector) == 0:
        return 0
    return vector

def cosine_similarity(vectors):
    """Cosine similarity of the first two entries of `vectors`.

    `vectors` may be a list/tuple/array or a pandas Series (e.g. a DataFrame
    row); Series entries are taken by position, not by label. Returns 0 when
    either vector has zero norm, which includes the scalar 0 that
    `getVector` returns for "no vector".
    """
    if hasattr(vectors, "iloc"):
        # Positional access on a label-indexed Series must go through .iloc;
        # plain `row[0]` relies on a deprecated positional fallback.
        v1, v2 = vectors.iloc[0], vectors.iloc[1]
    else:
        v1, v2 = vectors[0], vectors[1]
    norm1 = np.linalg.norm(v1)
    if norm1 == 0:
        return 0
    norm2 = np.linalg.norm(v2)
    if norm2 == 0:
        return 0
    return np.dot(v1, v2)/(norm1*norm2)


def multiply_vectors(vectors, dim=300):
    """Element-wise product of the first two entries of `vectors`.

    `vectors` may be a list/tuple/array or a pandas Series (entries are taken
    by position). When either entry has zero norm - including the scalar 0
    used for "no vector" - a zero vector of length `dim` is returned; `dim`
    defaults to 300, the value previously hard-coded here.
    """
    if hasattr(vectors, "iloc"):
        # Positional access on a label-indexed Series must go through .iloc.
        v1, v2 = vectors.iloc[0], vectors.iloc[1]
    else:
        v1, v2 = vectors[0], vectors[1]
    if np.linalg.norm(v1) == 0 or np.linalg.norm(v2) == 0:
        return np.zeros(dim)
    return v1*v2

def add_vectors(vectors, dim=300):
    """Element-wise absolute value of the sum of the first two entries of `vectors`.

    `vectors` may be a list/tuple/array or a pandas Series (entries are taken
    by position). When either entry has zero norm - including the scalar 0
    used for "no vector" - a zero vector of length `dim` is returned; `dim`
    defaults to 300, the value previously hard-coded here.
    """
    if hasattr(vectors, "iloc"):
        # Positional access on a label-indexed Series must go through .iloc.
        v1, v2 = vectors.iloc[0], vectors.iloc[1]
    else:
        v1, v2 = vectors[0], vectors[1]
    if np.linalg.norm(v1) == 0 or np.linalg.norm(v2) == 0:
        return np.zeros(dim)
    return np.absolute(v1+v2)

def token_similarity(row, TFIDF=False):
    """Cosine similarity of the bag-of-words vectors of the two questions.

    Reads `tokens1` and `tokens2` from `row`. Each question becomes a vector
    over the distinct tokens of both questions, where every occurrence of a
    token adds its weight: 1, or with `TFIDF=True` its value in `token_idf`
    (`max_idf` for tokens missing from that table). Returns 0 when either
    token list is empty or either vector has zero norm.
    """
    tokens1, tokens2 = row['tokens1'], row['tokens2']
    if len(tokens1) == 0 or len(tokens2) == 0:
        return 0

    # Position of every distinct token across both questions. A dict lookup
    # replaces the former list.index() call inside the token loop.
    position = {token: i for i, token in enumerate(set(tokens1 + tokens2))}

    unit_vectors = []
    # Repeated words are kept, so they contribute once per occurrence.
    for token_set in (tokens1, tokens2):
        token_vector = np.zeros(len(position))
        for token in token_set:
            weight = token_idf.get(token, max_idf) if TFIDF else 1
            token_vector[position[token]] += weight
        norm = np.linalg.norm(token_vector)
        if norm == 0:
            return 0
        unit_vectors.append(token_vector/norm)

    # Both vectors are unit length, so the dot product is the cosine.
    return np.dot(unit_vectors[0], unit_vectors[1])

def word_match_fuzzy(row, first=True):
    """Embedding similarity of the first (or last) tokens of the two questions.

    Reads `tokens1` and `tokens2` from `row`; `first=True` compares the first
    tokens, `first=False` the last ones. Returns their cosine similarity, or
    0 when either token list is empty or either token has no non-zero vector.
    """
    if len(row['tokens1']) == 0 or len(row['tokens2']) == 0:
        return 0

    position = 0 if first else -1
    vectors = []
    for token in (row['tokens1'][position], row['tokens2'][position]):
        token_id = nlp.vocab.strings[token]
        try:
            vector = nlp.vocab.vectors[token_id]
        except KeyError:
            # No entry in the vector table: the pair cannot be compared.
            return 0
        if np.linalg.norm(vector) == 0:
            return 0
        vectors.append(vector)
    return cosine_similarity(np.array(vectors))

def cross_correlation(row, TFIDF=False):
    """Weighted mean cosine similarity over all token pairs of the two questions.

    Reads `tokens1` and `tokens2` from `row`. Every token of question 1 that
    has a word vector is compared with every such token of question 2; each
    pair is weighted by the square root of the product of the two token
    weights (1, or with `TFIDF=True` the value in `token_idf`, `max_idf` for
    tokens missing from that table). Returns 0 when either token list is
    empty, either question has no token with a vector, or the weights sum to 0.
    """
    if len(row['tokens1']) == 0 or len(row['tokens2']) == 0:
        return 0

    vectors = []
    weights = []
    # Repeated words are kept, so they take part once per occurrence.
    for token_set in (row['tokens1'], row['tokens2']):
        subset = []
        subweights = []
        for token in token_set:
            token_id = nlp.vocab.strings[token]
            try:
                token_vector = nlp.vocab.vectors[token_id]
            except KeyError:
                # Token has no entry in the vector table: leave it out.
                continue
            subset.append(token_vector)
            subweights.append(token_idf.get(token, max_idf) if TFIDF else 1)
        if len(subset) == 0:
            return 0
        vectors.append(subset)
        weights.append(subweights)

    scores = []
    weight_products = []
    for v1, weight1 in zip(vectors[0], weights[0]):
        for v2, weight2 in zip(vectors[1], weights[1]):
            combi_weight = np.sqrt(weight1*weight2)
            scores.append(cosine_similarity([v1,v2])*combi_weight)
            weight_products.append(combi_weight)

    total_weight = np.sum(weight_products)
    if total_weight == 0:
        # Every pair weight is zero; a weighted mean is undefined.
        return 0
    return np.sum(scores)/total_weight


In [34]:
# Word tokens of each question (lower-cased, see `tokenize`).
df_train['tokens1'] = df_train['question1'].progress_apply(tokenize)
df_train['tokens2'] = df_train['question2'].progress_apply(tokenize)


100%|██████████| 323164/323164 [00:03<00:00, 106869.55it/s]
100%|██████████| 323164/323164 [00:03<00:00, 96320.98it/s] 


In [18]:
# Unweighted average word vector of each question.
df_train['vector1'] = df_train['tokens1'].progress_apply(getVector)
df_train['vector2'] = df_train['tokens2'].progress_apply(getVector)

100%|██████████| 81126/81126 [00:08<00:00, 9139.20it/s] 
100%|██████████| 81126/81126 [00:09<00:00, 8967.86it/s] 


In [19]:
# IDF-weighted average word vector of each question.
df_train['vector_tfidf1'] = df_train['tokens1'].progress_apply(
    lambda tokens: getVector(tokens, TFIDF=True)
)
df_train['vector_tfidf2'] = df_train['tokens2'].progress_apply(
    lambda tokens: getVector(tokens, TFIDF=True)
)


100%|██████████| 81126/81126 [00:10<00:00, 7945.47it/s]
100%|██████████| 81126/81126 [00:09<00:00, 8522.56it/s]


In [20]:
# Cosine similarity between the two question vectors (plain and IDF-weighted).
df_train['embedding_similarity'] = (
    df_train[['vector1', 'vector2']].progress_apply(cosine_similarity, axis=1)
)
df_train['embedding_similarity_tfidf'] = (
    df_train[['vector_tfidf1', 'vector_tfidf2']].progress_apply(cosine_similarity, axis=1)
)


100%|██████████| 81126/81126 [00:09<00:00, 8431.31it/s]
100%|██████████| 81126/81126 [00:10<00:00, 7624.53it/s]


In [21]:
# Element-wise product of the two question vectors (plain and IDF-weighted).
df_train['vector_combo1'] = (
    df_train[['vector1', 'vector2']].progress_apply(multiply_vectors, axis=1)
)
df_train['vector_combo2'] = (
    df_train[['vector_tfidf1', 'vector_tfidf2']].progress_apply(multiply_vectors, axis=1)
)


100%|██████████| 81126/81126 [00:15<00:00, 5231.78it/s]
100%|██████████| 81126/81126 [00:13<00:00, 5862.80it/s]


In [22]:
# Exact match of the first / last token of the two questions.
df_train['firstWordSame'] = df_train[['tokens1', 'tokens2']].progress_apply(firstWordSame, axis=1)
df_train['lastWordSame'] = df_train[['tokens1', 'tokens2']].progress_apply(lastWordSame, axis=1)


100%|██████████| 81126/81126 [00:04<00:00, 18630.49it/s]
100%|██████████| 81126/81126 [00:04<00:00, 19394.83it/s]


In [23]:
# Embedding similarity of the first / last token of the two questions.
df_train['firstWordSame_fuzzy'] = (
    df_train[['tokens1', 'tokens2']].progress_apply(word_match_fuzzy, axis=1)
)
df_train['lastWordSame_fuzzy'] = (
    df_train[['tokens1', 'tokens2']].progress_apply(
        lambda pair: word_match_fuzzy(pair, first=False), axis=1
    )
)


100%|██████████| 81126/81126 [00:11<00:00, 7224.03it/s]
100%|██████████| 81126/81126 [00:13<00:00, 5905.15it/s]


In [24]:
# Token-count and character-count ratios of the two questions.
df_train['word_ratio'] = df_train[['tokens1', 'tokens2']].progress_apply(word_ratio, axis=1)
df_train['char_ratio'] = df_train[['tokens1', 'tokens2']].progress_apply(char_ratio, axis=1)


100%|██████████| 81126/81126 [00:04<00:00, 17145.48it/s]
100%|██████████| 81126/81126 [00:04<00:00, 17760.74it/s]


In [25]:
# 1 when both questions contain "[math]" or neither does.
df_train['math_similarity'] = (
    df_train[['question1', 'question2']].progress_apply(math_similarity, axis=1)
)


100%|██████████| 81126/81126 [00:02<00:00, 37128.67it/s]


In [26]:
# Bag-of-words cosine similarity (plain and IDF-weighted).
df_train['token_similarity'] = (
    df_train[['tokens1', 'tokens2']].progress_apply(token_similarity, axis=1)
)
df_train['token_similarity_idf'] = (
    df_train[['tokens1', 'tokens2']].progress_apply(
        lambda pair: token_similarity(pair, TFIDF=True), axis=1
    )
)


100%|██████████| 81126/81126 [00:16<00:00, 5016.58it/s]
100%|██████████| 81126/81126 [00:16<00:00, 4884.63it/s]


In [27]:
# Weighted mean pairwise token similarity (plain and IDF-weighted).
df_train['cross_correlation'] = (
    df_train[['tokens1', 'tokens2']].progress_apply(cross_correlation, axis=1)
)
df_train['cross_correlation_idf'] = (
    df_train[['tokens1', 'tokens2']].progress_apply(
        lambda pair: cross_correlation(pair, TFIDF=True), axis=1
    )
)


100%|██████████| 81126/81126 [09:02<00:00, 113.29it/s]
100%|██████████| 81126/81126 [12:35<00:00, 107.45it/s] 


In [18]:
# Token n-gram overlap scores for n = 2, 3 and 4.
df_train['2grams'] = df_train[['tokens1', 'tokens2']].progress_apply(
    lambda pair: tok_ngrams(pair, n=2), axis=1
)
df_train['3grams'] = df_train[['tokens1', 'tokens2']].progress_apply(
    lambda pair: tok_ngrams(pair, n=3), axis=1
)
df_train['4grams'] = df_train[['tokens1', 'tokens2']].progress_apply(
    lambda pair: tok_ngrams(pair, n=4), axis=1
)


100%|██████████| 81126/81126 [00:06<00:00, 11928.40it/s]
100%|██████████| 81126/81126 [00:06<00:00, 13142.66it/s]
100%|██████████| 81126/81126 [00:06<00:00, 12957.94it/s]


In [19]:
# Edit-distance based score of the raw question strings.
df_train['edit_score'] = (
    df_train[['question1', 'question2']].progress_apply(edit_distance_score, axis=1)
)


100%|██████████| 81126/81126 [05:57<00:00, 226.90it/s]


In [20]:
# Jaro similarity of the raw question strings.
df_train['jaro_score'] = (
    df_train[['question1', 'question2']].progress_apply(jaro_similarity, axis=1)
)


100%|██████████| 81126/81126 [00:20<00:00, 3913.01it/s]


In [10]:
# Ratio of the average token lengths of the two questions.
df_train['wl_ratio'] = (
    df_train[['tokens1', 'tokens2']].progress_apply(word_length_ratio, axis=1)
)


100%|██████████| 81126/81126 [00:06<00:00, 12910.84it/s]


In [11]:
# Ratio of the stop-word shares of the two questions.
df_train['stop_ratio'] = (
    df_train[['tokens1', 'tokens2']].progress_apply(stop_word_ratio, axis=1)
)


100%|██████████| 81126/81126 [00:05<00:00, 14080.82it/s]


In [12]:
# Ratio of the upper-case character counts of the two questions.
df_train['UC_ratio'] = (
    df_train[['question1', 'question2']].progress_apply(UC_ratio, axis=1)
)


100%|██████████| 81126/81126 [00:05<00:00, 15447.59it/s]


In [13]:
# One column per question word: 1 when the word occurs in both questions or
# in neither (see `check_word`). Columns are created in the order listed.
for question_word in ("why", "what", "when", "where", "how", "who"):
    df_train[question_word] = df_train[['tokens1', 'tokens2']].progress_apply(
        # Bind the current word as a default so each call uses its own value.
        lambda pair, word=question_word: check_word(pair, word=word), axis=1
    )

100%|██████████| 81126/81126 [00:05<00:00, 15844.94it/s]
100%|██████████| 81126/81126 [00:04<00:00, 16883.29it/s]
100%|██████████| 81126/81126 [00:05<00:00, 14621.69it/s]
100%|██████████| 81126/81126 [00:04<00:00, 17450.90it/s]
100%|██████████| 81126/81126 [00:04<00:00, 17560.26it/s]
100%|██████████| 81126/81126 [00:05<00:00, 14096.22it/s]


In [25]:
# Named entities of each question as [text, label] pairs (slow: full spaCy pass).
df_train['entities1'] = df_train['question1'].progress_apply(entities)
df_train['entities2'] = df_train['question2'].progress_apply(entities)


100%|██████████| 81126/81126 [16:58<00:00, 79.66it/s]  
100%|██████████| 81126/81126 [15:03<00:00, 89.81it/s] 


In [17]:
# Largest-magnitude ("MAX") pooled word vectors, plain and IDF-weighted.
df_train['vector_max1'] = df_train['tokens1'].progress_apply(
    lambda tokens: getVector(tokens, mode="MAX")
)
df_train['vector_max2'] = df_train['tokens2'].progress_apply(
    lambda tokens: getVector(tokens, mode="MAX")
)
df_train['vector_max_tfidf1'] = df_train['tokens1'].progress_apply(
    lambda tokens: getVector(tokens, TFIDF=True, mode="MAX")
)
df_train['vector_max_tfidf2'] = df_train['tokens2'].progress_apply(
    lambda tokens: getVector(tokens, TFIDF=True, mode="MAX")
)

100%|██████████| 81126/81126 [00:09<00:00, 8208.34it/s]
100%|██████████| 81126/81126 [00:09<00:00, 8657.89it/s]
100%|██████████| 81126/81126 [00:10<00:00, 7595.47it/s]
100%|██████████| 81126/81126 [00:10<00:00, 7419.91it/s]


In [18]:
# Cosine similarity between the MAX-pooled question vectors.
df_train['embedding_similarity_max'] = (
    df_train[['vector_max1', 'vector_max2']].progress_apply(cosine_similarity, axis=1)
)
df_train['embedding_similarity_max_tfidf'] = (
    df_train[['vector_max_tfidf1', 'vector_max_tfidf2']].progress_apply(cosine_similarity, axis=1)
)


100%|██████████| 81126/81126 [00:09<00:00, 8370.73it/s] 
100%|██████████| 81126/81126 [00:10<00:00, 7989.54it/s]


In [26]:
# Share of named entities that the two questions have in common.
df_train['entity_score'] = (
    df_train[['question1', 'question2', 'entities1', 'entities2']]
    .progress_apply(entity_score, axis=1)
)


100%|██████████| 81126/81126 [00:23<00:00, 3429.23it/s]


In [12]:
#df_train = loadOBJ("gen_data/df_train")
df_train = loadOBJ("gen_data/df_test")

In [28]:
df_train.head()

Unnamed: 0_level_0,question1,question2,tokens1,tokens2,embedding_similarity,embedding_similarity_tfidf,vector_combo1,vector_combo2,firstWordSame,lastWordSame,...,what,when,where,how,who,embedding_similarity_max,embedding_similarity_max_tfidf,entities1,entities2,entity_score
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,"[what, would, a, trump, presidency, mean, for,...","[how, will, a, trump, presidency, affect, the,...",0.894801,0.826423,"[0.0011587327, 0.009684816, 0.001439825, 0.000...","[0.00010098619, 0.0010915771, 0.020145686, 0.0...",0,0,...,0,1,1,0,1,0.520753,0.397651,"[[Trump, ORG]]","[[Trump, ORG], [US, GPE], [US, GPE]]",0.5
20,Why do rockets look white?,Why are rockets and boosters painted white?,"[why, do, rockets, look, white]","[why, are, rockets, and, boosters, painted, wh...",0.87104,0.846412,"[0.06817112, -5.824842e-05, 0.106069826, 0.044...","[0.11919739, 0.0032099667, 0.10251035, 0.04612...",1,1,...,1,1,1,1,1,0.743635,0.737955,[],[],1.0
21,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?,"[what, s, causing, someone, to, be, jealous]","[what, can, i, do, to, avoid, being, jealous, ...",0.948524,0.924701,"[0.010460816, 0.029607525, 0.08545068, 0.00066...","[0.06137621, -0.0012372601, 0.070046395, 0.002...",1,0,...,1,1,1,1,1,0.634457,0.749039,[],[],1.0
23,How much is 30 kV in HP?,Where can I find a conversion chart for CC to ...,"[how, much, is, 30, kv, in, hp]","[where, can, i, find, a, conversion, chart, fo...",0.804563,0.55917,"[-0.0070741503, 0.16010112, -0.04363538, 0.002...","[0.015298527, 0.22946647, -0.00915852, 0.01152...",0,0,...,1,1,0,0,1,0.411248,0.259039,"[[30 kV, QUANTITY], [HP, PRODUCT]]","[[CC, ORG]]",0.0
34,What is the best travel website in spain?,What is the best travel website?,"[what, is, the, best, travel, website, in, spain]","[what, is, the, best, travel, website]",0.962578,0.87473,"[0.0036688992, 0.0861908, 0.0041611576, 0.0147...","[0.0065529128, 0.06635068, 0.003924161, 0.0161...",1,0,...,1,1,1,1,1,0.803588,0.651903,"[[spain, GPE]]",[],0.0


In [23]:
print(df_train.columns)

Index(['question1', 'question2', 'tokens1', 'tokens2', 'embedding_similarity',
       'embedding_similarity_tfidf', 'vector_combo1', 'vector_combo2',
       'firstWordSame', 'lastWordSame', 'firstWordSame_fuzzy',
       'lastWordSame_fuzzy', 'word_ratio', 'char_ratio', 'math_similarity',
       'token_similarity', 'token_similarity_idf', 'cross_correlation',
       'cross_correlation_idf', 'q1_degree', 'q2_degree', 'intersection_count',
       '2grams', '3grams', '4grams', 'edit_score', 'jaro_score', 'wl_ratio',
       'stop_ratio', 'UC_ratio', 'why', 'what', 'when', 'where', 'how', 'who',
       'embedding_similarity_max', 'embedding_similarity_max_tfidf'],
      dtype='object')


In [21]:
# Intermediate vector columns that are removed before the frame is saved.
dropCols = ["vector1","vector_tfidf1","vector_max1","vector_max_tfidf1","vector2","vector_tfidf2","vector_max2","vector_max_tfidf2","vector_combo1","vector_combo2"]
# errors="ignore" skips columns that are not present, so the cell can be
# re-run or used on a frame that never had some of them.
df_train = df_train.drop(columns=dropCols, errors="ignore")


In [29]:
#saveOBJ(df_train,"gen_data/df_train")
saveOBJ(df_train,"gen_data/df_test")

# Pairs to omit from training

In [45]:
# Print the index of every pair in which at least one question has no tokens.
# Zipping the two columns avoids building a Series per row as iterrows() does.
for index, tokens1, tokens2 in zip(df_train.index, df_train["tokens1"], df_train["tokens2"]):
    if tokens1 == [] or tokens2 == []:
        print(index)

3306
13016
47056
96725
104101
134403
190570
208485
213220
226925
273065
301583
384293
402423


# Pairs to omit from testing

In [16]:
# Print the index of every pair in which at least one question has no tokens.
# Zipping the two columns avoids building a Series per row as iterrows() does.
for index, tokens1, tokens2 in zip(df_train.index, df_train["tokens1"], df_train["tokens2"]):
    if tokens1 == [] or tokens2 == []:
        print(index)

20072
20794
189659
254161
