In [1]:
import re
import nltk
import distance
import numpy as np
import pandas as pd
import networkx as nx
import tensorflow as K

from fuzzywuzzy import fuzz
from keras.models import Model
from nltk.corpus import stopwords
from collections import defaultdict
from keras.layers.core import Lambda
from keras.layers.noise import GaussianNoise
from keras.preprocessing.text import Tokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from keras.layers.advanced_activations import PReLU
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.sequence import pad_sequences
from keras.layers.merge import concatenate, add, multiply
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Conv1D, GlobalAveragePooling1D, Bidirectional

np.random.seed(0)
SAFE_DIV = 0.0001
STOP_WORDS = stopwords.words("english")
NB_CORES = 10
FREQ_UPPER_BOUND = 100
NEIGHBOR_UPPER_BOUND = 5
WNL = WordNetLemmatizer()
MAX_SEQUENCE_LENGTH = 32
MIN_WORD_OCCURRENCE = 100
REPLACE_WORD = "memento"
EMBEDDING_DIM = 300
NUM_FOLDS = 12
BATCH_SIZE = 256
EMBEDDING_FILE = "glove.840B.300d.txt"

Using TensorFlow backend.


In [2]:
def preprocess_nlp(string):
    string = string.lower().replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'") \
        .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not") \
        .replace("n't", " not").replace("what's", "what is").replace("it's", "it is") \
        .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are") \
        .replace("he's", "he is").replace("she's", "she is").replace("'s", " own") \
        .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ") \
        .replace("€", " euro ").replace("'ll", " will").replace("=", " equal ").replace("+", " plus ")
    string = re.sub('[“”\(\'…\)\!\^\"\.;:,\-\?？\{\}\[\]\\/\*@]', ' ', string)
    string = re.sub(r"([0-9]+)000000", r"\1m", string)
    string = re.sub(r"([0-9]+)000", r"\1k", string)
    return string

def preprocess_model(string):
    string = string.lower().replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'") \
        .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not") \
        .replace("n't", " not").replace("what's", "what is").replace("it's", "it is") \
        .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are") \
        .replace("he's", "he is").replace("she's", "she is").replace("'s", " own") \
        .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ") \
        .replace("€", " euro ").replace("'ll", " will").replace("=", " equal ").replace("+", " plus ")
    string = re.sub('[“”\(\'…\)\!\^\"\.;:,\-\?？\{\}\[\]\\/\*@]', ' ', string)
    string = re.sub(r"([0-9]+)000000", r"\1m", string)
    string = re.sub(r"([0-9]+)000", r"\1k", string)
    string = ' '.join([cutter(w) for w in string.split()])
    return string

def get_token_features(q1, q2):
    token_features = [0.0] * 10

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])

    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    common_word_count = len(q1_words.intersection(q2_words))
    common_stop_count = len(q1_stops.intersection(q2_stops))
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

    token_features[0] = common_word_count / (
        min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (
        max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (
        min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (
        max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (
        min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (
        max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
    token_features[9] = (len(q1_tokens) + len(q2_tokens)) / 2
    return token_features

def get_longest_substr_ratio(a, b):
    strs = list(distance.lcsubstrings(a, b))
    if len(strs) == 0:
        return 0
    else:
        return len(strs[0]) / (min(len(a), len(b)) + 1)
    
def extract_features_nlp(df):
    df = df.copy()
    df["question1"] = df["question1"].fillna("").apply(preprocess_nlp)
    df["question2"] = df["question2"].fillna("").apply(preprocess_nlp)

    print("token features...")
    token_features = df.apply(
        lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    df["cwc_min"] = list(map(lambda x: x[0], token_features))
    df["cwc_max"] = list(map(lambda x: x[1], token_features))
    df["csc_min"] = list(map(lambda x: x[2], token_features))
    df["csc_max"] = list(map(lambda x: x[3], token_features))
    df["ctc_min"] = list(map(lambda x: x[4], token_features))
    df["ctc_max"] = list(map(lambda x: x[5], token_features))
    df["last_word_eq"] = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"] = list(map(lambda x: x[8], token_features))
    df["mean_len"] = list(map(lambda x: x[9], token_features))

    print("fuzzy features..")
    df["token_set_ratio"] = df.apply(
        lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"] = df.apply(
        lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]),
        axis=1)
    df["fuzz_ratio"] = df.apply(
        lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"] = df.apply(
        lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"] = df.apply(
        lambda x: get_longest_substr_ratio(x["question1"], x["question2"]),
        axis=1)
    return df

def create_question_hash(train_df, test_df):
    train_qs = np.dstack([train_df["question1"],
                          train_df["question2"]]).flatten()
    test_qs = np.dstack([test_df["question1"], test_df["question2"]]).flatten()
    all_qs = np.append(train_qs, test_qs)
    all_qs = pd.DataFrame(all_qs)[0].drop_duplicates()
    all_qs.reset_index(inplace=True, drop=True)
    question_dict = pd.Series(
        all_qs.index.values, index=all_qs.values).to_dict()
    return question_dict

def get_hash(df, hash_dict):
    df = df.copy()
    df["qid1"] = df["question1"].map(hash_dict)
    df["qid2"] = df["question2"].map(hash_dict)
    return df.drop(["question1", "question2"], axis=1)

def get_kcore_dict(df):
    df = df.copy()
    g = nx.Graph()
    g.add_nodes_from(df.qid1)
    edges = list(df[["qid1", "qid2"]].to_records(index=False))
    g.add_edges_from(edges)
    g.remove_edges_from(g.selfloop_edges())
    df_output = pd.DataFrame(data=g.nodes(), columns=["qid"])
    df_output["kcore"] = 0
    for k in range(2, NB_CORES + 1):
        ck = nx.k_core(g, k=k).nodes()
        print("kcore", k)
        df_output.ix[df_output.qid.isin(ck), "kcore"] = k

    return df_output.to_dict()["kcore"]

def get_kcore_features(df, kcore_dict):
    df = df.copy()
    df["kcore1"] = df["qid1"].apply(lambda x: kcore_dict[x])
    df["kcore2"] = df["qid2"].apply(lambda x: kcore_dict[x])
    return df


def convert_to_minmax(df, col):
    df = df.copy()
    sorted_features = np.sort(np.vstack([df[col + "1"], df[col + "2"]]).T)
    df["min_" + col] = sorted_features[:, 0]
    df["max_" + col] = sorted_features[:, 1]
    return df.drop([col + "1", col + "2"], axis=1)


def get_neighbors(train_df, test_df):
    neighbors = defaultdict(set)
    for df in [train_df, test_df]:
        for q1, q2 in zip(df["qid1"], df["qid2"]):
            neighbors[q1].add(q2)
            neighbors[q2].add(q1)
    return neighbors


def get_neighbor_features(df, neighbors):
    df = df.copy()
    common_nc = df.apply(
        lambda x: len(neighbors[x.qid1].intersection(neighbors[x.qid2])),
        axis=1)
    min_nc = df.apply(
        lambda x: min(len(neighbors[x.qid1]), len(neighbors[x.qid2])), axis=1)
    df["common_neighbor_ratio"] = common_nc / min_nc
    df["common_neighbor_count"] = common_nc.apply(
        lambda x: min(x, NEIGHBOR_UPPER_BOUND))
    return df


def get_freq_features(df, frequency_map):
    df = df.copy()
    df["freq1"] = df["qid1"].map(
        lambda x: min(frequency_map[x], FREQ_UPPER_BOUND))
    df["freq2"] = df["qid2"].map(
        lambda x: min(frequency_map[x], FREQ_UPPER_BOUND))
    return df



def cutter(word):
    if len(word) < 4:
        return word
    return WNL.lemmatize(WNL.lemmatize(word, "n"), "v")


def preprocess(string, final=True):
    string = string.lower().replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'") \
        .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not") \
        .replace("n't", " not").replace("what's", "what is").replace("it's", "it is") \
        .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are") \
        .replace("he's", "he is").replace("she's", "she is").replace("'s", " own") \
        .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ") \
        .replace("€", " euro ").replace("'ll", " will").replace("=", " equal ").replace("+", " plus ")
    string = re.sub('[“”\(\'…\)\!\^\"\.;:,\-\?？\{\}\[\]\\/\*@]', ' ', string)
    string = re.sub(r"([0-9]+)000000", r"\1m", string)
    string = re.sub(r"([0-9]+)000", r"\1k", string)
    if final:
        string = ' '.join([cutter(w) for w in string.split()])
    return string


def get_embedding():
    embeddings_index = {}
    f = open(EMBEDDING_FILE, encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        if len(values) == EMBEDDING_DIM + 1 and word in top_words:
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    f.close()
    return embeddings_index


def is_numeric(s):
    return any(i.isdigit() for i in s)


def prepare(q):
    new_q = []
    surplus_q = []
    numbers_q = []
    new_memento = True
    for w in q.split()[::-1]:
        if w in top_words:
            new_q = [w] + new_q
            new_memento = True
        elif w not in STOP_WORDS:
            if new_memento:
                new_q = ["memento"] + new_q
                new_memento = False
            if is_numeric(w):
                numbers_q = [w] + numbers_q
            else:
                surplus_q = [w] + surplus_q
        else:
            new_memento = True
        if len(new_q) == MAX_SEQUENCE_LENGTH:
            break
    new_q = " ".join(new_q)
    return new_q, set(surplus_q), set(numbers_q)


def extract_features_model(df):
    q1s = np.array([""] * len(df), dtype=object)
    q2s = np.array([""] * len(df), dtype=object)
    features = np.zeros((len(df), 4))

    for i, (q1, q2) in enumerate(list(zip(df["question1"], df["question2"]))):
        q1s[i], surplus1, numbers1 = prepare(q1)
        q2s[i], surplus2, numbers2 = prepare(q2)
        features[i, 0] = len(surplus1.intersection(surplus2))
        features[i, 1] = len(surplus1.union(surplus2))
        features[i, 2] = len(numbers1.intersection(numbers2))
        features[i, 3] = len(numbers1.union(numbers2))

    return q1s, q2s, features

# JUST FOR THE FIRST RUN, PROCESS FEATURES

You can jump to [the later section](#START-HERE-IF-YOU-ALREADY-HAVE-THE-PROCESSED-FEATURES).

In [None]:
print("Loading raw datasets...")
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
print("Loaded.")

In [None]:
print("Extracting features for train:")
train_df_nlp = extract_features_nlp(train_df)
train_df_nlp.drop(
    ["id", "qid1", "qid2", "question1", "question2", "is_duplicate"],
    axis=1,
    inplace=True)
# train_df_nlp.to_csv("data/nlp_features_train.csv", index=False)

In [None]:
print("Extracting features for test:")
test_df_nlp = extract_features_nlp(test_df)
test_df_nlp.drop(["test_id", "question1", "question2"], axis=1, inplace=True)
# test_df_nlp.to_csv("data/nlp_features_test.csv", index=False)

In [None]:
train_df.head()

In [None]:
train_df_nlp.head()

In [None]:
test_df.head()

In [None]:
test_df_nlp.head()

In [None]:
print("Hashing the questions...")
question_dict = create_question_hash(train_df, test_df)
train_df_non = get_hash(train_df, question_dict)
test_df_non = get_hash(test_df, question_dict)
print("Number of unique questions:", len(question_dict))

In [None]:
print("Calculating kcore features...")
all_df = pd.concat([train_df_non, test_df_non])
kcore_dict = get_kcore_dict(all_df)
train_df_non = get_kcore_features(train_df_non, kcore_dict)
test_df_non = get_kcore_features(test_df_non, kcore_dict)
train_df_non = convert_to_minmax(train_df_non, "kcore")
test_df_non = convert_to_minmax(test_df_non, "kcore")

In [None]:
print("Calculating common neighbor features...")
neighbors = get_neighbors(train_df_non, test_df_non)
train_df_non = get_neighbor_features(train_df_non, neighbors)
test_df_non = get_neighbor_features(test_df_non, neighbors)

In [None]:
print("Calculating frequency features...")
frequency_map = dict(
    zip(*np.unique(
        np.vstack((all_df["qid1"], all_df["qid2"])), return_counts=True)))
train_df_non = get_freq_features(train_df_non, frequency_map)
test_df_non = get_freq_features(test_df_non, frequency_map)
train_df_non = convert_to_minmax(train_df_non, "freq")
test_df_non = convert_to_minmax(test_df_non, "freq")

In [None]:
cols = [
    "min_kcore", "max_kcore", "common_neighbor_count", "common_neighbor_ratio",
    "min_freq", "max_freq"
]
train_df_non = train_df_non[cols] # .to_csv("data/non_nlp_features_train.csv", index=False)
test_df_non = test_df_non[cols] # .to_csv("data/non_nlp_features_test.csv", index=False)

In [None]:
train_df_non.head()

In [None]:
test_df_non.head()

In [None]:
features_train.shape, features_test.shape

In [None]:
np.save('_train', features_train)
np.save('_test', features_test)

# START HERE IF YOU ALREADY HAVE THE PROCESSED FEATURES

In [3]:
print("Loading raw datasets...")
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
print("Loaded.")

train = train_df
test = test_df

train["question1"] = train["question1"].fillna("").apply(preprocess_model)
train["question2"] = train["question2"].fillna("").apply(preprocess_model)

print("Creating the vocabulary of words occurred more than",
      MIN_WORD_OCCURRENCE)
all_questions = pd.Series(
    train["question1"].tolist() + train["question2"].tolist()).unique()
vectorizer = CountVectorizer(
    lowercase=False, token_pattern="\S+", min_df=MIN_WORD_OCCURRENCE)
vectorizer.fit(all_questions)
top_words = set(vectorizer.vocabulary_.keys())
top_words.add(REPLACE_WORD)

embeddings_index = get_embedding()
print("Words are not found in the embedding:",
      top_words - embeddings_index.keys())
top_words = embeddings_index.keys()

print("Train questions are being prepared for LSTM...")
q1s_train, q2s_train, train_q_features = extract_features_model(train)

tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(np.append(q1s_train, q2s_train))
word_index = tokenizer.word_index

data_1 = pad_sequences(
    tokenizer.texts_to_sequences(q1s_train), maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(
    tokenizer.texts_to_sequences(q2s_train), maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(train["is_duplicate"])

nb_words = len(word_index) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


print("Same steps are being applied for test...")
test["question1"] = test["question1"].fillna("").apply(preprocess)
test["question2"] = test["question2"].fillna("").apply(preprocess)
q1s_test, q2s_test, test_q_features = extract_features_model(test)
test_data_1 = pad_sequences(
    tokenizer.texts_to_sequences(q1s_test), maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(
    tokenizer.texts_to_sequences(q2s_test), maxlen=MAX_SEQUENCE_LENGTH)

features_train = np.load("_train.npy")
features_test = np.load("_test.npy")

Loading raw datasets...
Loaded.
Creating the vocabulary of words occurred more than 100
Words are not found in the embedding: {'quorans', 'demonetisation', 'iisc', 'c#', 'redmi', 'paytm', '\\sqrt', 'brexit', '\\frac', 'kvpy', 'oneplus'}
Train questions are being prepared for LSTM...
Same steps are being applied for test...


In [4]:
skf = StratifiedKFold(n_splits=5, shuffle=True)

In [5]:
model_count = 0
for idx_train, idx_val in skf.split(train["is_duplicate"], train["is_duplicate"]):
    print("MODEL:", model_count)
    data_1_train = data_1[idx_train]
    data_2_train = data_2[idx_train]
    labels_train = labels[idx_train]
    f_train = features_train[idx_train]

    data_1_val = data_1[idx_val]
    data_2_val = data_2[idx_val]
    labels_val = labels[idx_val]
    f_val = features_train[idx_val]

    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    lstm_layer = LSTM(96, recurrent_dropout=0.2)

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    y1 = lstm_layer(embedded_sequences_2)

    features_input = Input(shape=(f_train.shape[1],), dtype="float32")
    features_dense = BatchNormalization()(features_input)
    features_dense = Dense(200, activation="relu")(features_dense)
    features_dense = Dropout(0.25)(features_dense)
    

    addition = add([x1, y1])
    minus_y1 = Lambda(lambda x: -x)(y1)
    merged = add([x1, minus_y1])
    merged = multiply([merged, merged])
    merged = concatenate([merged, addition])
    merged = Dropout(0.4)(merged)

    merged = concatenate([merged, features_dense])
    merged = BatchNormalization()(merged)
    merged = GaussianNoise(0.1)(merged)

    merged = Dense(150, activation="relu")(merged)
    merged = Dropout(0.2)(merged)
    merged = BatchNormalization()(merged)

    out = Dense(1, activation="sigmoid")(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, features_input], outputs=out)
    model.compile(loss="binary_crossentropy",
                  optimizer="nadam")
    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    best_model_path = "__best_model" + str(model_count) + ".h5"
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

    hist = model.fit([data_1_train, data_2_train, f_train], labels_train,
                     validation_data=([data_1_val, data_2_val, f_val], labels_val),
                     epochs=15, batch_size=BATCH_SIZE, shuffle=True,
                     callbacks=[early_stopping, model_checkpoint], verbose=1)

    model.load_weights(best_model_path)
    print(model_count, "validation loss:", min(hist.history["val_loss"]))

    preds = model.predict([test_data_1, test_data_2, features_test], batch_size=BATCH_SIZE, verbose=1)

    submission = pd.DataFrame({"test_id": test["test_id"], "is_duplicate": preds.ravel()})
    submission.to_csv("../predictions/____preds" + str(model_count) + ".csv", index=False)

    model_count += 1

MODEL: 0
Train on 323431 samples, validate on 80859 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
0 validation loss: 0.220745312345


FileNotFoundError: [Errno 2] No such file or directory: '../predictions/____preds0.csv'

In [None]:
model_count = 0
for idx_train, idx_val in skf.split(train["is_duplicate"], train["is_duplicate"]):
    print("MODEL:", model_count)
    data_1_train = data_1[idx_train]
    data_2_train = data_2[idx_train]
    labels_train = labels[idx_train]
    f_train = features_train_1[idx_train]

    data_1_val = data_1[idx_val]
    data_2_val = data_2[idx_val]
    labels_val = labels[idx_val]
    f_val = features_train_1[idx_val]

    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    lstm_layer = LSTM(96, recurrent_dropout=0.2)

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    y1 = lstm_layer(embedded_sequences_2)

    features_input = Input(shape=(f_train.shape[1],), dtype="float32")
    features_dense = BatchNormalization()(features_input)
    features_dense = Dense(200, activation="relu")(features_dense)
    features_dense = Dropout(0.25)(features_dense)
    
    conv1 = Conv1D(filters=128, kernel_size=1, padding='same', activation='relu')
    conv2 = Conv1D(filters=128, kernel_size=2, padding='same', activation='relu')
    conv3 = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')
    conv4 = Conv1D(filters=128, kernel_size=4, padding='same', activation='relu')
    conv5 = Conv1D(filters=32, kernel_size=5, padding='same', activation='relu')
    conv6 = Conv1D(filters=32, kernel_size=6, padding='same', activation='relu')

    # Run through CONV + GAP layers
    conv1a = conv1(embedded_sequences_1)
    glob1a = GlobalAveragePooling1D()(conv1a)
    conv1b = conv1(embedded_sequences_2)
    glob1b = GlobalAveragePooling1D()(conv1b)

    conv2a = conv2(embedded_sequences_1)
    glob2a = GlobalAveragePooling1D()(conv2a)
    conv2b = conv2(embedded_sequences_2)
    glob2b = GlobalAveragePooling1D()(conv2b)

    conv3a = conv3(embedded_sequences_1)
    glob3a = GlobalAveragePooling1D()(conv3a)
    conv3b = conv3(embedded_sequences_2)
    glob3b = GlobalAveragePooling1D()(conv3b)

    conv4a = conv4(embedded_sequences_1)
    glob4a = GlobalAveragePooling1D()(conv4a)
    conv4b = conv4(embedded_sequences_2)
    glob4b = GlobalAveragePooling1D()(conv4b)

    conv5a = conv5(embedded_sequences_1)
    glob5a = GlobalAveragePooling1D()(conv5a)
    conv5b = conv5(embedded_sequences_2)
    glob5b = GlobalAveragePooling1D()(conv5b)

    conv6a = conv6(embedded_sequences_1)
    glob6a = GlobalAveragePooling1D()(conv6a)
    conv6b = conv6(embedded_sequences_2)
    glob6b = GlobalAveragePooling1D()(conv6b)

    mergea = concatenate([glob1a, glob2a, glob3a, glob4a, glob5a, glob6a])
    mergeb = concatenate([glob1b, glob2b, glob3b, glob4b, glob5b, glob6b])
    
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(4 * 128 + 2*32,))([mergea, mergeb])
    mul = Lambda(lambda x: x[0] * x[1], output_shape=(4 * 128 + 2*32,))([mergea, mergeb])
    
    addition = add([x1, y1])
    minus_y1 = Lambda(lambda x: -x)(y1)
    merged = add([x1, minus_y1])
    merged = multiply([merged, merged])
    merged = concatenate([merged, addition])
    merged = Dropout(0.4)(merged)

    merged = concatenate([merged, features_dense])
    merged = BatchNormalization()(merged)
    merged = GaussianNoise(0.1)(merged)

    merged = Dense(150, activation="relu")(merged)
    merged = Dropout(0.3)(merged)
    merged = BatchNormalization()(merged)
    
    merged_temp = concatenate([diff, mul])
    merged_temp = Dense(150, activation="relu")(merged_temp)
    merged_temp = Dropout(0.3)(merged_temp)
    merged_temp = BatchNormalization()(merged_temp)
    
    merged = concatenate([merged_temp, merged])

    out = Dense(1, activation="sigmoid")(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, features_input], outputs=out)
    model.compile(loss="binary_crossentropy",
                  optimizer="nadam")
    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    best_model_path = "best_model" + str(model_count) + ".h5"
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

    hist = model.fit([data_1_train, data_2_train, f_train], labels_train,
                     validation_data=([data_1_val, data_2_val, f_val], labels_val),
                     epochs=15, batch_size=BATCH_SIZE, shuffle=True,
                     callbacks=[early_stopping, model_checkpoint], verbose=1)

    model.load_weights(best_model_path)
    print(model_count, "validation loss:", min(hist.history["val_loss"]))

    preds = model.predict([test_data_1, test_data_2, features_test_1], batch_size=BATCH_SIZE, verbose=1)

    submission = pd.DataFrame({"test_id": test["test_id"], "is_duplicate": preds.ravel()})
    submission.to_csv("predictions/preds" + str(model_count) + ".csv", index=False)

    model_count += 1