# Imports

In [None]:
import time
import nltk
import string
import re
import gensim.downloader

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn import random_projection
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.decomposition import PCA

!pip3 install fuzzywuzzy
from fuzzywuzzy import fuzz
import xgboost

from collections import Counter
from tqdm.notebook import tqdm

try:
    from bayes_opt import BayesianOptimization
except :
     !pip3 install bayesian-optimization
     from bayes_opt import BayesianOptimization

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Loading and Preprocessing

Define the preprocessing function(s).

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def benchmark_clean_text(text):
    # Lower words
    try:
        cleaned = text.lower()
    except AttributeError:
        return  "this is an empty question".split()
    cleaned = text
    # Remove stopwords. Needed to be done before the apostrophes are removed
    cleaned = ' '.join([w for w in cleaned.split(' ') if not w in stop_words])
    
    # Remove punctuation
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation +'–—‘’“”…')) 
    
    # Replace all the remaining numbers that did not match any of the above categories  
    number_pattern = re.compile(r'(\d{1,3},)?(\d{3},)*\d+(\.\d+)?')
    cleaned = number_pattern.sub(r' ', cleaned)

    # lemmatization
    cleaned = " ".join([lemmatizer.lemmatize(word) for word in cleaned.split()])
    
    cleaned = cleaned.split() #note the added .split() here. Returns a list of list of words
    
    if cleaned == [] or cleaned == ['nan'] or cleaned == ['null'] or cleaned == ['a']:
        cleaned = "this is an empty question".split()
        
    return cleaned    

def quora_clean_text(text):
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()
    
    # Check for empty texts
    if text == [] or text == ['nan'] or text == ['null'] or text == ['a']:
        text = "this is an empty question".split()
        
    return text

def preprocess(text):
    return benchmark_clean_text(text)

Load, prepocess and store the data in a proper form.

In [None]:
train = pd.read_csv("train.csv")#.dropna()
test  = pd.read_csv("test_without_labels.csv")#.dropna()

# We need the test NaN values for ordering when submitting
#train_df = pd.read_csv("../storage/datasets/q2b/train.csv")
#test_df  = pd.read_csv("../storage/datasets/q2b/test_without_labels.csv")

In [None]:
# Create the preprocessed text in memory
train_q1 = [preprocess(t) for t in train['Question1']]
train_q2 = [preprocess(t) for t in train['Question2']]
train_labels = np.array(train['IsDuplicate'])

test_q1 = [preprocess(t) for t in test['Question1']]

test_q2 = [preprocess(t) for t in test['Question2']]

In [None]:
# Create the preprocessed train texts and save it in disk (care, we save a string insted of a list of words)
cleaned_train_df = train_df.copy(deep=True)

cleaned_train_q1 = cleaned_train_df.Question1.apply(lambda text: " ".join(preprocess(text)))
cleaned_train_df = cleaned_train_df.assign(Q1=cleaned_train_q1)

cleaned_train_q2 = cleaned_train_df.Question2.apply(lambda text: " ".join(preprocess(text)))
cleaned_train_df = cleaned_train_df.assign(Q2=cleaned_train_q2)
                                                       
cleaned_train_df.drop(columns=['Question1', 'Question2'], inplace=True)
#cleaned_train_df.to_csv('../storage/datasets/q2b/preprocessed/train_benchmark_clean.csv', sep=',')

In [None]:
# Create the preprocessed test texts and save it in disk (care, we save a string insted of a list of words)
cleaned_test_df = test_df.copy(deep=True)

cleaned_test_q1 = cleaned_test_df.Question1.apply(lambda text: " ".join(preprocess(text)))
cleaned_test_df = cleaned_test_df.assign(Q1=cleaned_test_q1)

cleaned_test_q2 = cleaned_test_df.Question2.apply(lambda text: " ".join(preprocess(text)))
cleaned_test_df = cleaned_test_df.assign(Q2=cleaned_test_q2)
                                                       
cleaned_test_df.drop(columns=['Question1', 'Question2'], inplace=True)
cleaned_test_df.to_csv('../storage/datasets/q2b/preprocessed/test_benchmark_clean.csv', sep=',')

# Feature Engineering

In [None]:
# Reading the preprocessed train and test sets
clean_train_df = pd.read_csv('../storage/datasets/q2b/preprocessed/train_quora_clean.csv')
clean_test_df = pd.read_csv('../storage/datasets/q2b/preprocessed/test_quora_clean.csv')

### Feature Class

For easier organisation and creation / storing of features we create a class which offers some basic functionality.

In [None]:
class TextFeature:
    def __init__(self, train_arr, test_arr, name):
        self.train_arr = train_arr
        self.test_arr = test_arr
        self.name = name

## Word2Vec


Create the word2vec model.

In [None]:
# This is a 1.6GB download for the first time
quora_w2v = gensim.downloader.load('word2vec-google-news-300')
google_news_emb_dim = 300

In [None]:
# This part creates the vocabulary that can be both used in the feature extraction part and as an embedding layer in a NN
vocabulary = dict()
inverse_vocabulary = ['<unk>']
stop_words = set(stopwords.words('english'))

def text_to_vec(df, w2v, vocabulary, inverse_vocabulary):
    numb_represantations = []
    for index, row in df.iterrows():
        questions = []
        for question in ['Q1', 'Q2']:
            q2n = []
#             print(row.loc[question])
            for word in row.loc[question].split():
#                 print(row.loc[question].split())
                # Stopwords have not yet been removed since they might be included in the pretrained word2vec
                if word in stop_words and word not in w2v.vocab:
                    continue
                    
                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])
            questions.append(q2n)
        numb_represantations.append(questions)
    
    return numb_represantations, vocabulary, inverse_vocabulary

numb_represantation_train, vocabulary, inverse_vocabulary = text_to_vec(clean_train_df, quora_w2v, vocabulary, inverse_vocabulary)
numb_represantation_test, vocabulary, inverse_vocabulary = text_to_vec(clean_test_df, quora_w2v, vocabulary, inverse_vocabulary)

**Care:** In the method below we create and save the **aggregated** (mean) word embeddings for each sentence. This means that we create a numpy array of dimensions: `numb_of_texts, 2, embedding_dim`.

Saving the embeddings individually is not possible due to memory constraints.

In [None]:
# From integer represantation to embedding represantation
def int_to_embed(w2v, dims, numb_repr, inverse_vocabulary):
    embeddings = []
    for numb in numb_repr:
        word = inverse_vocabulary[numb]
        if word in w2v.vocab:
                embeddings.append(w2v.word_vec(word))
        else:
            embeddings.append(np.random.normal(0, 1, dims))
    
    return embeddings

def create_avg_embedding_matrix(numb_represantation_of_texts, w2v, dims, inverse_vocabulary):
    ret_embedding_mat = []
    for questions in numb_represantation_of_texts:
        both_question_embeddings = []
        for q in questions:
            q_embeddings = np.mean(np.array(int_to_embed(w2v, dims, q, inverse_vocabulary)), axis=0)
            both_question_embeddings.append(q_embeddings)
            
        ret_embedding_mat.append(both_question_embeddings)
    return ret_embedding_mat
                

train_embedding_mat = create_avg_embedding_matrix(numb_represantation_train, quora_w2v, google_news_emb_dim, inverse_vocabulary)
test_embedding_mat = create_avg_embedding_matrix(numb_represantation_test, quora_w2v, google_news_emb_dim, inverse_vocabulary)

In [None]:
# Transforming to numpy arrays and saving in disk
train_embedding_arr = np.array(train_embedding_mat)
np.save('../storage/datasets/q2b/word_embeddings/train_embedding_avg.npy', train_embedding_arr)

test_embedding_arr = np.array(test_embedding_mat)
np.save('../storage/datasets/q2b/word_embeddings/test_embedding_avg.npy', test_embedding_arr)

## Cosine similarity of averaged word embeddings

In [None]:
# We first load the averaged embeddings from disk
train_embeddings = np.load('../storage/datasets/q2b/word_embeddings/train_embedding_avg.npy')
test_embeddings = np.load('../storage/datasets/q2b/word_embeddings/test_embedding_avg.npy')

In [None]:
# We then calculate the cosine similarities of the two questions of each row
def matrix_cosine(x, y):
    return np.einsum('ij,ij->i', x, y) / (
              np.linalg.norm(x, axis=1) * np.linalg.norm(y, axis=1)
    )

train_cosine_similarities = matrix_cosine(train_embeddings[:, 0, :], train_embeddings[:, 1, :])
test_cosine_similarities = matrix_cosine(test_embeddings[:, 0, :], test_embeddings[:, 1, :])

In [None]:
# We store them in memory in a `TextFeature` object
avg_embeddings_cos_similarities = TextFeature(train_cosine_similarities, test_cosine_similarities, "AvgEmbeddingsCosine")

## BoW cosine similarity

We first create a bag of words represantation using the the tf-idf vectorizer on all the questions. Then we calculate their cosine similarity.

In [None]:
df_train_cleaned = pd.read_csv('../storage/datasets/q2b/preprocessed/train_benchmark_clean.csv')
df_test_cleaned = pd.read_csv('../storage/datasets/q2b/preprocessed/test_benchmark_clean.csv') 

In [None]:
concatenated_train_questions = pd.concat([df_train_cleaned.Q1, df_train_cleaned.Q2])
tfidf_vectorizer = TfidfVectorizer()

# Fit on all the quetions in the train set
tfidf_vectorizer.fit(concatenated_train_questions)

# Trnaform the train questions separately
q1_train_tfidf = tfidf_vectorizer.transform(df_train_cleaned.Q1)
q2_train_tfidf = tfidf_vectorizer.transform(df_train_cleaned.Q2)

# Trnaform the test questions separately
q1_test_tfidf = tfidf_vectorizer.transform(df_test_cleaned.Q1)
q2_test_tfidf = tfidf_vectorizer.transform(df_test_cleaned.Q2)

In [None]:
# Calculate the cosine similarities
from scipy.spatial import distance

def cosine_of_vectors(t1, t2):
    t1 = t1.toarray()[0]
    t2 = t2.toarray()[0]
    return (t1 @ t2) / (np.linalg.norm(t1) * np.linalg.norm(t2))

train_bow_similarities = np.array([cosine_of_vectors(t1, t2) for t1, t2 in zip(q1_train_tfidf, q2_train_tfidf)])
test_bow_similarities = np.array([cosine_of_vectors(t1, t2) for t1, t2 in zip(q1_test_tfidf, q2_test_tfidf)])

  return (t1 @ t2) / (np.linalg.norm(t1) * np.linalg.norm(t2))


In [None]:
# We store them in memory in a `TextFeature` object
tfidf_cos_sim = TextFeature(np.nan_to_num(train_bow_similarities), np.nan_to_num(test_bow_similarities), "TfIdfCosSimilarity")

## Edit distance (Levenshtein)

In [None]:
df_train_cleaned = pd.read_csv('../storage/datasets/q2b/preprocessed/train_benchmark_clean.csv')
df_test_cleaned = pd.read_csv('../storage/datasets/q2b/preprocessed/test_benchmark_clean.csv') 

In [None]:
train_edit_ratio = np.array([fuzz.ratio(row.Q1, row.Q2) for ind, row in df_train_cleaned.iterrows()])
test_edit_ratio = np.array([fuzz.ratio(row.Q1, row.Q2) for ind, row in df_test_cleaned.iterrows()])

train_edit_partial_ratio = np.array([fuzz.partial_ratio(row.Q1, row.Q2) for ind, row in df_train_cleaned.iterrows()])
test_edit_partial_ratio = np.array([fuzz.partial_ratio(row.Q1, row.Q2) for ind, row in df_test_cleaned.iterrows()])

train_edit_token_sort_ratio = np.array([fuzz.token_sort_ratio(row.Q1, row.Q2) for ind, row in df_train_cleaned.iterrows()])
test_edit_token_sort_ratio = np.array([fuzz.token_sort_ratio(row.Q1, row.Q2) for ind, row in df_test_cleaned.iterrows()])

train_edit_token_set_ratio = np.array([fuzz.token_set_ratio(row.Q1, row.Q2) for ind, row in df_train_cleaned.iterrows()])
test_edit_token_set_ratio = np.array([fuzz.token_set_ratio(row.Q1, row.Q2) for ind, row in df_test_cleaned.iterrows()])

In [None]:
# We store them in memory in `TextFeature` objects
edit_ratio = TextFeature(train_edit_ratio, test_edit_ratio, "edit_ratio")
edit_partial_ratio = TextFeature(train_edit_partial_ratio, test_edit_partial_ratio, "edit_partial_ratio")
edit_token_sort_ratio = TextFeature(train_edit_token_sort_ratio, test_edit_token_sort_ratio, "edit_token_sort_ratio")
edit_token_set_ratio = TextFeature(train_edit_token_set_ratio, test_edit_token_set_ratio, "edit_token_set_ratio")

## Text Length

In [None]:
df_train_cleaned = pd.read_csv('../storage/datasets/q2b/preprocessed/train_benchmark_clean.csv')
df_test_cleaned = pd.read_csv('../storage/datasets/q2b/preprocessed/test_benchmark_clean.csv')

df_train = pd.read_csv('../storage/datasets/q2b/train.csv')
df_test = pd.read_csv('../storage/datasets/q2b/test_without_labels.csv')

df_train_quora = pd.read_csv('../storage/datasets/q2b/preprocessed/train_quora_clean.csv')
df_test_quora = pd.read_csv('../storage/datasets/q2b/preprocessed/test_quora_clean.csv')

In [None]:
train_lengths_q1_clean = np.array([len(row.Q1.split()) for ind, row in df_train_cleaned.iterrows()])
train_lengths_q2_clean = np.array([len(row.Q2.split()) for ind, row in df_train_cleaned.iterrows()])
test_lengths_q1_clean = np.array([len(row.Q1.split()) for ind, row in df_test_cleaned.iterrows()])
test_lengths_q2_clean = np.array([len(row.Q2.split()) for ind, row in df_test_cleaned.iterrows()])

train_lengths_q1_original = np.array([len(row.Question1.split()) for ind, row in df_train.iterrows()])
train_lengths_q2_original = np.array([len(row.Question2.split()) if not isinstance(row.Question2, float) else 0 for ind, row in df_train.iterrows()])
test_lengths_q1_original = np.array([len(row.Question1.split()) if not isinstance(row.Question1, float) else 0 for ind, row in df_test.iterrows()])
test_lengths_q2_original = np.array([len(row.Question2.split()) if not isinstance(row.Question2, float) else 0 for ind, row in df_test.iterrows()])

train_lengths_q1_quora = np.array([len(row.Q1.split()) for ind, row in df_train_quora.iterrows()])
train_lengths_q2_quora = np.array([len(row.Q2.split()) for ind, row in df_train_quora.iterrows()])
test_lengths_q1_quora = np.array([len(row.Q1.split()) for ind, row in df_test_quora.iterrows()])
test_lengths_q2_quora = np.array([len(row.Q2.split()) for ind, row in df_test_quora.iterrows()])

In [None]:
# Store all of the above into feature objects
length_q1_clean = TextFeature(train_lengths_q1_clean, test_lengths_q1_clean, "length_q1_clean")
length_q2_clean = TextFeature(train_lengths_q2_clean, test_lengths_q2_clean, "length_q2_clean")
length_q1_original = TextFeature(train_lengths_q1_original, test_lengths_q1_original, "length_q1_original")
length_q2_original = TextFeature(train_lengths_q2_original, test_lengths_q2_original, "length_q2_original")
length_q1_quora = TextFeature(train_lengths_q1_quora, test_lengths_q1_quora, "length_q1_quora")
length_q2_quora = TextFeature(train_lengths_q2_quora, test_lengths_q2_quora, "length_q2_quora")

## Names

In [None]:
clean_train_df = pd.read_csv('train_quora_clean.csv')
clean_test_df = pd.read_csv('test_quora_clean.csv')

In [None]:
vecs = [[w if w[0].isupper() else "" for w in q[1:]] for q in train_q1]
train_q1_names = []
train_q1_num_of_names = []
for q in vecs:
    aver = np.zeros(300)
    words = 0
    for w in q:
        if w=="":
            continue
        try:
            aver += np.array(quora_w2v.word_vec(w))
            words+=1
        except:
            pass
    aver /= max(words,1)
    train_q1_names.append(aver)
    train_q1_num_of_names.append(words)

vecs = [[w if w[0].isupper() else "" for w in q[1:]] for q in train_q2]
train_q2_names = []
train_q2_num_of_names = []
for q in vecs:
    aver = np.zeros(300)
    words = 0
    for w in q:
        if w=="":
            continue
        try:
            aver += np.array(quora_w2v.word_vec(w))
            words+=1
        except:
            pass
    aver /= max(words,1)
    train_q2_names.append(aver)
    train_q2_num_of_names.append(words)

train_names_cosine = []
for i in range(len(train_q1_names)):
    names1 = train_q1_names[i]
    names2 = train_q2_names[i]
    train_names_cosine.append(cosine_of_vectors(names1,names2))

vecs = [[w if w[0].isupper() else "" for w in q[1:]] for q in test_q1]
test_q1_names = []
test_q1_num_of_names = []
for q in vecs:
    aver = np.zeros(300)
    words = 0
    for w in q:
        if w=="":
            continue
        try:
            aver += np.array(quora_w2v.word_vec(w))
            words+=1
        except:
            pass
    aver /= max(words,1)
    test_q1_names.append(aver)
    test_q1_num_of_names.append(words)

vecs = [[w if w[0].isupper() else "" for w in q[1:]] for q in test_q2]
test_q2_names = []
test_q2_num_of_names = []
for q in vecs:
    aver = np.zeros(300)
    words = 0
    for w in q:
        if w=="":
            continue
        try:
            aver += np.array(quora_w2v.word_vec(w))
            words+=1
        except:
            pass
    aver /= max(words,1)
    test_q2_names.append(aver)
    test_q2_num_of_names.append(words)

In [None]:
def all_zeros(a):
    for i in a:
        if i!=0:
            return False
    return True

train_names_cosine = []
for i in range(len(train_q1_names)):
    names1 = train_q1_names[i]
    names2 = train_q2_names[i]
    if all_zeros(names1) or all_zeros(names2):
        cosine=0
    else:
        cosine = cosine_of_vectors(names1,names2)
    train_names_cosine.append(cosine)

test_names_cosine = []
for i in range(len(test_q1_names)):
    names1 = test_q1_names[i]
    names2 = test_q2_names[i]
    if all_zeros(names1) or all_zeros(names2):
        cosine=0
    else:
        cosine = cosine_of_vectors(names1,names2)
    test_names_cosine.append(cosine)

In [None]:
for i in range(100):
    c = train_names_cosine[i]
    print(c, train_q1_num_of_names[i],train_q2_num_of_names[i])

In [None]:
q1_num_names = TextFeature(train_q1_num_of_names,test_q1_num_of_names, "q1_num_names")
q2_num_names = TextFeature(train_q2_num_of_names,test_q2_num_of_names, "q2_num_names")

names_cosine = TextFeature(train_names_cosine,test_names_cosine, "names_cosine")

## Number of Stopwords

In [None]:
df_train = pd.read_csv('../storage/datasets/q2b/preprocessed/train_quora_clean.csv')
df_test = pd.read_csv('../storage/datasets/q2b/preprocessed/test_quora_clean.csv')

In [None]:
stop_words = set(stopwords.words('english'))

def stopwords_in_str(txt):
    tokens = txt.split()
    return sum([1 if word in stop_words else 0 for word in tokens])

train_stopwords_q1 = np.array([stopwords_in_str(q) for q in df_train.Q1])
train_stopwords_q2 = np.array([stopwords_in_str(q) for q in df_train.Q2])

test_stopwords_q1 = np.array([stopwords_in_str(q) for q in df_test.Q1])
test_stopwords_q2 = np.array([stopwords_in_str(q) for q in df_test.Q2])

train_stopwords_diff = np.abs(train_stopwords_q1 - train_stopwords_q2)
test_stopwords_diff = np.abs(test_stopwords_q1 - test_stopwords_q2)

In [None]:
stopwords_q1 = TextFeature(train_stopwords_q1, test_stopwords_q1, "stopwords_q1")
stopwords_q2 = TextFeature(train_stopwords_q2, test_stopwords_q2, "stopwords_q2")
stopwords_diff = TextFeature(train_stopwords_diff, test_stopwords_diff, "stopwords_diff")

## Number of Punctuation symbols

In [None]:
df_train = pd.read_csv('../storage/datasets/q2b/train.csv')
df_test = pd.read_csv('../storage/datasets/q2b/test_without_labels.csv')

In [None]:
punctuations = set(string.punctuation + '–—‘’“”…')
def numb_of_punct(txt):
    try:
        return sum([1 if char in punctuations else 0 for char in txt])
    except TypeError:
        return 0
    
train_puncts_q1 = np.array([numb_of_punct(q) for q in df_train.Question1])
train_puncts_q2 = np.array([numb_of_punct(q) for q in df_train.Question2])

test_puncts_q1 = np.array([numb_of_punct(q) for q in df_test.Question1])
test_puncts_q2 = np.array([numb_of_punct(q) for q in df_test.Question2])

train_puncts_diff = np.abs(train_puncts_q1 - train_puncts_q2)
test_puncts_diff = np.abs(test_puncts_q1 - test_puncts_q2)

In [None]:
puncts_q1 = TextFeature(train_puncts_q1, test_puncts_q1, "punctuations_q1")
puncts_q2 = TextFeature(train_puncts_q2, test_puncts_q2, "punctuations_q2")
puncts_diff = TextFeature(train_puncts_diff, test_puncts_diff, "punctuations_diff")

## Number of words not in Google News embeddings

In [None]:
df_train = pd.read_csv('../storage/datasets/q2b/preprocessed/train_quora_clean.csv')
df_test = pd.read_csv('../storage/datasets/q2b/preprocessed/test_quora_clean.csv')

quora_w2v = gensim.downloader.load('word2vec-google-news-300')

In [None]:
def numb_of_words_not_in_embeddings(txt):
    tokens = txt.split()
    return sum([1 if word not in quora_w2v.vocab else 0 for word in tokens])

train_not_in_w2v_q1 = np.array([numb_of_words_not_in_embeddings(q) for q in df_train.Q1])
train_not_in_w2v_q2 = np.array([numb_of_words_not_in_embeddings(q) for q in df_train.Q2])

test_not_in_w2v_q1 = np.array([numb_of_words_not_in_embeddings(q) for q in df_test.Q1])
test_not_in_w2v_q2 = np.array([numb_of_words_not_in_embeddings(q) for q in df_test.Q2])

train_not_in_w2v_diff = np.abs(train_not_in_w2v_q1 - train_not_in_w2v_q2)
test_not_in_w2v_diff = np.abs(test_not_in_w2v_q1 - test_not_in_w2v_q2)

In [None]:
not_in_w2v_q1 = TextFeature(train_not_in_w2v_q1, test_not_in_w2v_q1, "not_in_w2v_q1")
not_in_w2v_q2 = TextFeature(train_not_in_w2v_q2, test_not_in_w2v_q2, "not_in_w2v_q2")
not_in_w2v_diff = TextFeature(train_not_in_w2v_diff, test_not_in_w2v_diff, "not_in_w2v_diff")

## Edit distance of words not in Google News embeddings

In [None]:
df_train = pd.read_csv('../storage/datasets/q2b/preprocessed/train_quora_clean.csv')
df_test = pd.read_csv('../storage/datasets/q2b/preprocessed/test_quora_clean.csv')

quora_w2v = gensim.downloader.load('word2vec-google-news-300')

In [None]:
def dist_of_words_not_in_w2v(q1, q2, distance):
    # Get which words are not in google news w2v, and remove any extra whitespace
    not_words_q1 = re.sub(' +', ' ', " ".join([word if word not in quora_w2v.vocab else "" for word in q1.split()]))
    not_words_q2 = re.sub(' +', ' ', " ".join([word if word not in quora_w2v.vocab else "" for word in q2.split()]))
    
    return distance(q1, q2)
    
train_not_in_w2v_ratio_dist = np.array([dist_of_words_not_in_w2v(row.Q1, row.Q2, fuzz.token_sort_ratio) for ind, row in df_train.iterrows()])
test_not_in_w2v_ratio_dist = np.array([dist_of_words_not_in_w2v(row.Q1, row.Q2, fuzz.token_sort_ratio) for ind, row in df_test.iterrows()])

In [None]:
not_in_w2v_ratio = TextFeature(train_not_in_w2v_ratio_dist, test_not_in_w2v_ratio_dist, "not_in_w2v_ratio")

## Number of digits

In [None]:
df_train = pd.read_csv('../storage/datasets/q2b/train.csv')
df_test = pd.read_csv('../storage/datasets/q2b/test_without_labels.csv')

In [None]:
digits = set(range(10))
def numb_of_digits(txt):
    try:
        return sum([1 if char in digits else 0 for char in txt])
    except TypeError:
        return 0
    
train_digits_q1 = np.array([numb_of_digits(q) for q in df_train.Question1])
train_digits_q2 = np.array([numb_of_digits(q) for q in df_train.Question2])

test_digits_q1 = np.array([numb_of_digits(q) for q in df_test.Question1])
test_digits_q2 = np.array([numb_of_digits(q) for q in df_test.Question2])

train_digits_diff = np.abs(train_digits_q1 - train_digits_q2)
test_digits_diff = np.abs(test_digits_q1 - test_digits_q2)

In [None]:
digits_q1 = TextFeature(train_digits_q1, test_digits_q1, "digits_q1")
digits_q2 = TextFeature(train_digits_q2, test_digits_q2, "digits_q2")
digits_diff = TextFeature(train_digits_diff, test_digits_diff, "digits_diff")

## Number of nouns

In [None]:
df_train = pd.read_csv('../storage/datasets/q2b/preprocessed/train_quora_clean.csv')
df_test = pd.read_csv('../storage/datasets/q2b/preprocessed/test_quora_clean.csv')

In [None]:
def numb_of_nouns(txt):
    tags = nltk.pos_tag(txt.split())
    return sum([1 if tag=='NN' else 0 for word, tag in tags])

train_nouns_q1 = np.array([numb_of_nouns(q) for q in df_train.Q1])
train_nouns_q2 = np.array([numb_of_nouns(q) for q in df_train.Q2])

test_nouns_q1 = np.array([numb_of_nouns(q) for q in df_test.Q1])
test_nouns_q2 = np.array([numb_of_nouns(q) for q in df_test.Q2])

train_nouns_diff = np.abs(train_nouns_q1 - train_nouns_q2)
test_nouns_diff = np.abs(test_nouns_q1 - test_nouns_q2)

In [None]:
nouns_q1 = TextFeature(train_nouns_q1, test_nouns_q1, "nouns_q1")
nouns_q2 = TextFeature(train_nouns_q2, test_nouns_q2, "nouns_q2")
nouns_diff = TextFeature(train_nouns_diff, test_nouns_diff, "nouns_diff")

## Edit distance of nouns

In [None]:
df_train = pd.read_csv('../storage/datasets/q2b/preprocessed/train_quora_clean.csv')
df_test = pd.read_csv('../storage/datasets/q2b/preprocessed/test_quora_clean.csv')

In [None]:
def dist_of_nouns(q1, q2, distance):
    tags_q1 = nltk.pos_tag(q1.split())
    tags_q2 = nltk.pos_tag(q2.split())

    nouns_q1 = re.sub(' +', ' ', " ".join([word if tag=='NN' else "" for word, tag in tags_q1]))
    nouns_q2 = re.sub(' +', ' ', " ".join([word if tag=='NN' else "" for word, tag in tags_q2]))
    
    return distance(q1, q2)

train_nouns_ratio_dist = np.array([dist_of_nouns(row.Q1, row.Q2, fuzz.token_sort_ratio) for ind, row in df_train.iterrows()])
test_nouns_ratio_dist = np.array([dist_of_nouns(row.Q1, row.Q2, fuzz.token_sort_ratio) for ind, row in df_test.iterrows()])

In [None]:
nouns_ratio = TextFeature(train_nouns_ratio_dist, test_nouns_ratio_dist, "nouns_ratio")

## Edit distance of question ending

In [None]:
df_train = pd.read_csv('../storage/datasets/q2b/preprocessed/train_quora_clean.csv')
df_test = pd.read_csv('../storage/datasets/q2b/preprocessed/test_quora_clean.csv')

In [None]:
def distance_of_last_part(q1, q2, divider, distance):
    splitted_q1 = q1.split()
    q1_divided = " ".join(splitted_q1[(len(splitted_q1) // divider):])
    
    splitted_q2 = q2.split()
    q2_divided = " ".join(splitted_q2[(len(splitted_q2) // divider):])
    
    return distance(q1_divided, q2_divided)

train_divided_2_sort_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 2, fuzz.token_sort_ratio) for ind, row in df_train.iterrows()])
test_divided_2_sort_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 2, fuzz.token_sort_ratio) for ind, row in df_test.iterrows()])
train_divided_4_sort_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 4, fuzz.token_sort_ratio) for ind, row in df_train.iterrows()])
test_divided_4_sort_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 4, fuzz.token_sort_ratio) for ind, row in df_test.iterrows()])

train_divided_2_simple_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 2, fuzz.ratio) for ind, row in df_train.iterrows()])
test_divided_2_simple_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 2, fuzz.ratio) for ind, row in df_test.iterrows()])
train_divided_4_simple_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 4, fuzz.ratio) for ind, row in df_train.iterrows()])
test_divided_4_simple_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 4, fuzz.ratio) for ind, row in df_test.iterrows()])

train_divided_2_partial_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 2, fuzz.partial_ratio) for ind, row in df_train.iterrows()])
test_divided_2_partial_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 2, fuzz.partial_ratio) for ind, row in df_test.iterrows()])
train_divided_4_partial_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 4, fuzz.partial_ratio) for ind, row in df_train.iterrows()])
test_divided_4_partial_ratio_dist = np.array([distance_of_last_part(row.Q1, row.Q2, 4, fuzz.partial_ratio) for ind, row in df_test.iterrows()])

In [None]:
divided_2_sort_ratio_dist = TextFeature(train_divided_2_sort_ratio_dist, test_divided_2_sort_ratio_dist, "divided_2_sort_ratio_dist")
divided_4_sort_ratio_dist = TextFeature(train_divided_4_sort_ratio_dist, test_divided_4_sort_ratio_dist, "divided_4_sort_ratio_dist")

divided_2_simple_dist = TextFeature(train_divided_2_simple_ratio_dist, test_divided_2_simple_ratio_dist, "divided_2_simple_dist")
divided_4_simple_dist = TextFeature(train_divided_4_simple_ratio_dist, test_divided_4_simple_ratio_dist, "divided_4_simple_dist")

divided_2_partial_ratio_dist = TextFeature(train_divided_2_partial_ratio_dist, test_divided_2_partial_ratio_dist, "divided_2_partial_ratio_dist")
divided_4_partial_ratio_dist = TextFeature(train_divided_4_partial_ratio_dist, test_divided_4_partial_ratio_dist, "divided_4_partial_ratio_dist")

## Edit Distance of Question Beginning

In [None]:
df_train = pd.read_csv('train_quora_clean.csv')
df_test = pd.read_csv('test_quora_clean.csv')

In [None]:
def distance_of_first_part(q1, q2, distance):
    splitted_q1 = q1.split()
    q1_divided = " ".join(splitted_q1[:4])
    
    splitted_q2 = q2.split()
    q2_divided = " ".join(splitted_q2[:4])

    return distance(q1_divided, q2_divided)

train_start_dist = np.array([distance_of_first_part(row.Q1, row.Q2, fuzz.ratio) for ind, row in df_train.iterrows()])
test_start_dist = np.array([distance_of_first_part(row.Q1, row.Q2, fuzz.ratio) for ind, row in df_test.iterrows()])

In [None]:
start_dist = TextFeature(train_start_dist, test_start_dist, "start_distances")

# Useful functions (CV,BO,...)

In [14]:
def metrics_calculating(y_true, y_pred):
    return Counter({
        'accuracy': accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, average='macro'),
        'precision': precision_score(y_true, y_pred, average='macro'),
        'f1': f1_score(y_true, y_pred, average='macro')
    })


def pretty_print_grid_search(gs_obj):
    print(f"Best parameters found: {gs_obj.best_params_}, with a score of: {np.round(gs_obj.best_score_, decimals=3)}")
    
    df_gs = pd.DataFrame.from_dict(gs_obj.cv_results_)
    display(df_gs)
    

def run_cross_val(model, splits_numb, X, y, lab_encoder=None):
    train_metrics = Counter()
    validation_metrics = Counter()
    conf_mats = []
    pred_vals = None
    
    kf = KFold(n_splits=splits_numb, shuffle=False)
    for train_index, val_index in tqdm(list(kf.split(X)), desc="Splits"):
        ### Fit on the input model ###
        model.fit(X[train_index], y[train_index])
        
        ### Predict on the Train set (mainly for debugging) ###
        y_pred_train = model.predict(X[train_index])
        
        ### Predict on the Validation set ###
        y_pred_val = model.predict(X[val_index])

        try:
            proba_preds = model.decision_function(X[val_index])
        except:
            proba_preds = model.predict_proba(X[val_index])
        try:
            pred_vals = np.concatenate((pred_vals,proba_preds))
        except:
            pred_vals = proba_preds
        
        ### Metrics Bookkeeping ###
        train_metrics += metrics_calculating(y[train_index], y_pred_train)
        validation_metrics += metrics_calculating(y[val_index], y_pred_val)
        
        print(train_metrics)
        print(validation_metrics)
        
        ### Confusion Plots Generation ###
        if lab_encoder is not None:
            y_labels_true = lab_encoder.inverse_transform(y[val_index])
            y_labels_pred = lab_encoder.inverse_transform(y_pred_val)
            
            conf_mats.append(confusion_matrix(y_labels_true, y_labels_pred, labels=list(lab_encoder.classes_)))
            

    if lab_encoder is not None:
        disp = ConfusionMatrixDisplay(confusion_matrix=np.sum(np.array(conf_mats), axis=0), 
                                      display_labels=list(lab_encoder.classes_))
        fig, ax=plt.subplots(1,1,figsize=(7,7))
        disp.plot(ax=ax)
        
    train_avg_metrics = {k: v / splits_numb for k, v in train_metrics.items()}
    validation_avg_metrics = {k: v / splits_numb for k, v in validation_metrics.items()}

    try:
        LOG['predictions'].append(pred_vals.copy())
    except:
        pass

    #print(accuracy_score(pred_vals,y))

    return train_avg_metrics, validation_avg_metrics


def run_grid_search(X, y, model, params, folds_numb, scoring="accuracy", verbose_res=True):
    gs = GridSearchCV(model, params, cv=folds_numb, scoring=scoring,
                  n_jobs=10, verbose=3)
    
    gs.fit(X, y)
    
    if verbose_res:
        pretty_print_grid_search(gs)
            
    return gs

def run_bayesian_optimization(params, obj_function,log=None,iters=8,inits=2):   #log needs to be a dictionary

    if log!=None:
        if 'predictions' not in log:
            log['predictions'] = []

    optimizer = BayesianOptimization(obj_function,params,verbose=2)#,random_state=42)

    optimizer.maximize(n_iter = iters,init_points=inits) #higher values are recommended, but time.

    if log!=None:
        if 'hyperparameters' not in log:
            log['hyperparameters'] = []
        if 'scores' not in log:
            log['scores'] = []
        for r in optimizer.res:
            log['hyperparameters'].append(r['params'])
            log['scores'].append(r['target'])

    return optimizer.max

# Model Training

These need to be defined outside the Bayesian Optimization for scope reasons. Namely, 'stupid'. Try not to change their values anywhere as they are global variables.

In [None]:
metric = "accuracy"
folds_numb = 5
#X = train_features
#y = train_labels

## Feature Gathering

In [None]:
# Before progressing we must first concatenate all of our feature objects into one DataFrame
features_added = [
    start_dist
]

# Create the DataFrames
#train_features_df = pd.read_csv('../storage/datasets/q2b/train.csv', usecols=['IsDuplicate'])  # The train features will have as their last column the labels
#test_features_df = pd.DataFrame()

train_features_df = pd.read_csv('train_features.csv') 
test_features_df = pd.read_csv('test_features.csv')

for feature in features_added: #features_used
    train_features_df.insert(0, column=feature.name, value=feature.train_arr)
    test_features_df.insert(0, column=feature.name, value=feature.test_arr)

In [None]:
# Save the files
train_features_df.to_csv(f'train_features.csv', index=False)
test_features_df.to_csv(f'test_features.csv', index=False)

In [None]:
# Read the files
train_features_df = pd.read_csv('train_features.csv')
test_features_df = pd.read_csv('test_features.csv')

print(train_features_df.shape)

(283004, 40)


## Linear SVM

In [None]:
def train_LSVC(C):
    #Due to scope issues, the following parameters must be set outside this function:
    #X, y, folds_numb, metric, method_name

    model = LinearSVC(C=C,dual=False,fit_intercept=True,verbose=0)
    train_avg_metrics, validation_avg_metrics = run_cross_val(model, folds_numb, X, y)

    return validation_avg_metrics[metric]

In [None]:
params = {'C': (0.1,10)}

best = run_bayesian_optimization(params,train_LSVC) #contains 'target' and 'params'

C = best['params']['C']

model = LinearSVC(C=C,dual=False,fit_intercept=True,verbose=0)
model.fit(X,y)

# Predictions and Results

In [None]:
train_preds = model.predict(train_features)

print(metrics_calculating(train_labels, train_preds))

test_preds = model.predict(test_features)

best

## Xgboost - With tuning

In [10]:
hall_of_fame_cols = [
    #'divided_4_partial_ratio_dist',
    #'digits_diff',
    'length_q1_quora',
    'length_q2_quora',
    'length_q1_original',
    'length_q2_original',
    'length_q1_clean',
    'length_q2_clean',
    'edit_token_sort_ratio',
    'edit_token_set_ratio',
    'edit_partial_ratio',
    'edit_ratio',
    'TfIdfCosSimilarity',
    'AvgEmbeddingsCosine',
    #'q1_num_names',
    #'q2_num_names',
    #'names_cosine',
    #'stopwords_diff',
    #'nouns_diff',
    #'start_distances'
]

In [11]:
df_train = pd.read_csv('train_features.csv', usecols=hall_of_fame_cols + ['IsDuplicate'])
df_test = pd.read_csv('test_features.csv', usecols=hall_of_fame_cols)

X_train = np.array(df_train)[:, :-1]
y_train = np.array(df_train)[:, -1]

X_test = np.array(df_test)

metric="accuracy"

In [12]:
# Normalize the train and test sets
normalizer = preprocessing.Normalizer().fit(X_train)
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)

In [None]:
pca = PCA(n_components=9)
pca.fit_transform(X_train)
print(sum(pca.explained_variance_ratio_))

pca.fit_transform(X_test)
print(sum(pca.explained_variance_ratio_))

0.9377779299928577
0.9378507492203685


In [None]:
xgboost_clf = xgboost.XGBClassifier(max_depth=10, n_estimators=10000, objective='binary:logistic',
                                    eval_metric='logloss', learning_rate=0.01,
                                    use_label_encoder=False, tree_method='gpu_hist'
                                    )
xgboost_clf.fit(X_train, y_train)
#run_cross_val(xgboost_clf, splits_numb=5, X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=10000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, tree_method='gpu_hist',
              use_label_encoder=False, verbosity=1)

In [13]:
def trainXGB(learning_rate,max_depth,n_estimators):
    xgboost_clf = xgboost.XGBClassifier(learning_rate=learning_rate,max_depth=int(max_depth),n_estimators=int(n_estimators),objective='binary:logistic',tree_method='gpu_hist')

    train_avg_metrics, validation_avg_metrics = run_cross_val(xgboost_clf, splits_numb=5, X=X_train, y=y_train)

    return validation_avg_metrics[metric]


In [None]:
param_dict = {#'objective':'binary:logistic', 
              #'use_label_encoder': False, 
              'learning_rate': (0.0001,0.1),
              'max_depth': (5,15),
              #'tree_method': 'gpu_hist',
              'n_estimators': (100,1000)}

#logs = {}
LOG = logs
best = run_bayesian_optimization(param_dict,obj_function=trainXGB,log=logs,iters=18,inits=2)

## Ensembler

We insert a manual callback in our implementation of bayesian optimization and cross-validation. Every time it is called, it records the hyperparameters it received along with the model's validation predictions. Then, our ensembler reads all of these predictions and finds the models whose predictions, when combined, maximize the validation performance.

The class may seem daunting, but its interface is like any sklearn model's. Feel free to omit the long cell below.

In [21]:
class ensembler:
    
    def __init__(self,log,val_labels,max_members,method="greedy"):
        #log is a dictionary containing:
        #'hyperparameters': a list of hyperparameter combinations.
        #'predictions': a list of model predictions on the validation set.
        #'scores': a list of accuracy scores on the validation set.

        if method=="greedy":
            self.greedyEnsembling(log,val_labels,max_members)
        else:
            self.bruteEnsembling(log,val_labels,max_members)

    def greedyEnsembling(self,log,val_labels,max_members):
        ens_indexes = []
        ens_preds = []
        self.models = []

        #find the best model
        best_index = log['scores'].index(max(log['scores']))
        ens_indexes.append(best_index)
        ens_preds.append(np.array(log['predictions'][best_index]))

        #iteratively add the ensemble member from the pool (with repetition)
        #that maximizes the validation performance. Continue until you reach max_members
        #or until no member is suitable.
        valAcc = log['scores'][best_index]
        for i in range(1,max_members):
            best_p = -1
            for p in range(len(log['predictions'])):
                possAcc = self.checkAccuracy(ens_preds,log['predictions'][p],val_labels)
                if possAcc>valAcc:
                    best_p = p
                    valAcc = possAcc
            if best_p>-1:
                ens_preds.append(log['predictions'][best_p])
                ens_indexes.append(best_p)
            else:
                break

        for i in ens_indexes:
            self.models.append(self.genericModelBuilder(logs['hyperparameters'][i]))
        self.valAcc = valAcc

    def bruteEnsembling(self,log,val_labels,num_of_members):
        #this is painfully slow

        ens_indexes = []
        ens_preds = []
        self.models = []

        #find the best model
        all_indexes = range(len(log['predictions']))
        maxAcc = -1
        bestC = None
        for c in itertools.combinations(all_indexes,num_of_members):
            temp_preds = []
            for j in range(len(log['predictions'][0])):
                this_res = np.zeros(4)
                for i in c:
                    this_res+=log['predictions'][i][j]
                temp_preds.append(list(this_res).index(max(this_res)))
            acc = accuracy_score(temp_preds,val_labels)
            if acc>maxAcc:
                maxAcc = acc
                bestC = c

        self.valAcc = maxAcc
        for i in bestC:
            self.models.append(self.genericModelBuilder(log['hyperparameters'][i]))

    def elections(self, preds):
        res = []
        for j in range(len(preds[0])):
            this_res = np.zeros(len(preds[0][1]))
            for p in preds:
                this_res+=p[j]
            res.append(list(this_res).index(max(this_res)))
        return res

    def checkAccuracy(self, preds,new_preds,labels):
        temp_preds = np.concatenate((preds,[new_preds]))
        total_preds = self.elections(temp_preds)
        return accuracy_score(labels,total_preds)

    def genericModelBuilder(self,params):
        model = None
        if 'C' in params:
            model = LinearSVC(C=params['C'],tol=params['tol'],max_iter=int(params['max_iter']),dual=True,fit_intercept=True,verbose=0)
        elif 'alpha' in params:
            model = SGDClassifier(tol=params['tol'], loss='hinge', max_iter=int(params['max_iter']), alpha=params['alpha'], penalty='elasticnet',early_stopping=True,l1_ratio=params['l1_ratio'])

        else:
            model = xgboost.XGBClassifier(learning_rate=params['learning_rate'],max_depth=int(params['max_depth']),n_estimators=int(params['n_estimators']),objective='binary:logistic',tree_method='gpu_hist')    
        return model

    def fit(self,X,y):
        for m in self.models:
            m.fit(X,y)

    def predict(self,X):
        preds = []
        for m in self.models:
            try:
                preds.append(m.decision_function(X))
            except:
                preds.append(m.predict_proba(X))
        return self.elections(preds)

While the class may seem daunting, using it is as simple as can be:

In [None]:
ensemble = ensembler(logs_2,y_train,9,method="greedy")

print(ensemble.valAcc)
print(len(ensemble.models))

In [24]:
ensemble.fit(X_train, y_train)
#preds = ensemble.predict(X_test)
model = ensemble

## Create Output File

In [25]:
# Set the object model and the path that the output will be created
file_path = 'final_hope.csv'

# Get the Ids from the test file
test_ids_df = pd.read_csv('test_without_labels.csv', usecols=['Id'])

y_predicted = model.predict(np.array(X_test))
y_predicted = np.array([int(x) for x in y_predicted])

results = {
    "Id": list(test_ids_df.Id),
    "Predicted": y_predicted
}
    
results_df = pd.DataFrame.from_dict(results)

results_df.to_csv(file_path, index=False)

# Goodbye

*~ That's all, folks! ~*