In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import tensorflow as tf
import time

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [2]:
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout, Input, GRU, dot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report

In [5]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
cd ..

/home/harsh/projects/sembly-round2-harsh


In [7]:
df=pd.read_csv("task-1/data/cleaned_data.csv")
y_true=df['is_duplicate']

In [8]:
y_true=np.array(y_true.values.tolist())
y_true=y_true.reshape(len(y_true),1)

In [9]:
question1=df["question1"]
question2=df["question2"]

In [10]:
all_questions = list(df['question1']) + list(df['question2'])
len(all_questions)

808532

In [11]:
all_questions[1]

'what is the story of kohinor  koh i nor  diamond'

In [12]:
def cleaning(question):
    words = []
    #clean the questions of all punctuations
    for word in tqdm(question):
        clean = re.sub(r"[^a-z A-Z 0-9]", " ", word)
        clean = word_tokenize(clean)
        words.append([i.lower() for i in clean])

    return words

In [13]:
cleaned_words = cleaning(all_questions)

100%|██████████████████████████████████████████████| 808532/808532 [00:57<00:00, 14046.00it/s]


In [14]:
# cleaned_words

In [15]:
def create_tokenizer(cleaned_words, filters='!"#$%&*+,-./:;<=>?@[\]^`{|}~'):
    #tokenize the cleaned words in questions upto word level 
    token = Tokenizer(filters=filters)
    token.fit_on_texts(cleaned_words)
    return token

In [16]:
def max_length(cleaned_words):
    #get the number of words in longest question
    return len(max(cleaned_words, key=len))

In [17]:
word_tokenizer = create_tokenizer(all_questions)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

In [18]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [19]:
word_tokenizer

<keras.preprocessing.text.Tokenizer at 0x7f8d9d8987c0>

In [20]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [21]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [22]:
padded_doc = padding_doc(encoded_doc, max_length)

In [23]:
type(len(padded_doc))

int

In [61]:
question1_vectors=padded_doc[:int(len(padded_doc)/2)]
question2_vectors=padded_doc[int(len(padded_doc)/2):]

In [62]:
# np.random.seed(0)
# np.random.shuffle(question1_vectors)
q1_training, q1_test = question1_vectors[:int(len(question1_vectors)*0.8),:], question1_vectors[int(len(question1_vectors)*0.8):,:]

In [63]:
# np.random.seed(0)
# np.random.shuffle(question2_vectors)
q2_training, q2_test = question2_vectors[:int(len(question2_vectors)*0.8),:], question2_vectors[int(len(question2_vectors)*0.8):,:]

In [64]:
# np.random.seed(0)
# np.random.shuffle(y_true)
y_training, y_test = y_true[:int(len(y_true)*0.8),:], y_true[int(len(y_true)*0.8):,:]

In [117]:
def feature_extractor():
    inputs = Input(max_length)
    x=inputs
    x = Embedding(vocab_size, 128, trainable = False)(x)
#     x = LSTM(128,activation="relu")(x)
    x = Dense(64, activation = "relu")(x)  
    x = Dropout(0.2)(x)
    x = Dense(32, activation = "relu")(x)  
    x = Dense(16, activation = "relu")(x)  
#     x= Dense(1)(x)
#     x=tf.nn.l2_normalize(x,axis=-1)
    outputs=x
    outputs = keras.layers.Dense(1)(x)
    model = keras.Model(inputs, outputs)
    return model

In [118]:
model=feature_extractor()
model.summary()

Model: "model_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_25 (InputLayer)       [(None, 247)]             0         
                                                                 
 embedding_12 (Embedding)    (None, 247, 128)          10616320  
                                                                 
 dense_52 (Dense)            (None, 247, 64)           8256      
                                                                 
 dropout_12 (Dropout)        (None, 247, 64)           0         
                                                                 
 dense_53 (Dense)            (None, 247, 32)           2080      
                                                                 
 dense_54 (Dense)            (None, 247, 16)           528       
                                                                 
 dense_55 (Dense)            (None, 247, 1)            17 

In [119]:
input_q1=Input(max_length)
input_q2=Input(max_length)

In [120]:
FE=feature_extractor()
q1_layer=FE(input_q1)
q2_compare_layer=FE(input_q2)

In [121]:
dot_product = dot([q1_layer, q2_compare_layer], axes=1, normalize=True)

In [122]:
# import tensorflow.keras.backend as K

In [123]:
# L1_layer = tf.keras.layers.Lambda(lambda tensors: K.sqrt(K.maximum(K.sum(K.square(tensors[0] - tensors[1]),axis=1,
#         keepdims=True),K.epsilon())))


In [124]:
# L1_distance = L1_layer([q1_layer, q2_compare_layer])

In [125]:
outputs = Dense(1, activation="sigmoid")(dot_product)
model = Model(inputs=[input_q1,input_q2], outputs=outputs)

In [126]:
model.summary()

Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_26 (InputLayer)          [(None, 247)]        0           []                               
                                                                                                  
 input_27 (InputLayer)          [(None, 247)]        0           []                               
                                                                                                  
 model_19 (Functional)          (None, 247, 1)       10627201    ['input_26[0][0]',               
                                                                  'input_27[0][0]']               
                                                                                                  
 dot_6 (Dot)                    (None, 1, 1)         0           ['model_19[0][0]',        

In [127]:
callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            min_delta=1e-5,
            patience=10,
            verbose=1
        ),
        tf.keras.callbacks.ModelCheckpoint(
            filepath="task-1/saved_models/tf_model/siamese",
            monitor='val_loss', 
            mode='min', 
            save_best_only=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', 
            factor=0.2,
            patience=4, 
            min_lr=0.001)
    
]

# optimizer = tf.keras.optimizers.Adam(1e-5)
# loss = tf.keras.losses.BinaryCrossentropy()
# acc = tf.keras.metrics.Accuracy()

In [128]:
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [129]:
history = model.fit([q1_training,q2_training], y_training,
                    epochs = 30, batch_size = 128, validation_split=0.2, 
                    callbacks=callbacks, verbose=1)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.65967, saving model to task-1/saved_models/tf_model/siamese
INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/siamese/assets
Epoch 2/30
Epoch 2: val_loss improved from 0.65967 to 0.64938, saving model to task-1/saved_models/tf_model/siamese
INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/siamese/assets
Epoch 3/30
Epoch 3: val_loss improved from 0.64938 to 0.64610, saving model to task-1/saved_models/tf_model/siamese
INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/siamese/assets
Epoch 4/30
Epoch 4: val_loss did not improve from 0.64610
Epoch 5/30
Epoch 5: val_loss improved from 0.64610 to 0.64241, saving model to task-1/saved_models/tf_model/siamese
INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/siamese/assets
Epoch 6/30
Epoch 6: val_loss did not improve from 0.64241
Epoch 7/30
Epoch 7: val_loss improved from 0.64241 to 0.64039, saving model to task-1/saved_models/tf_model/

Epoch 23/30
Epoch 23: val_loss improved from 0.63366 to 0.63294, saving model to task-1/saved_models/tf_model/siamese
INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/siamese/assets
Epoch 24/30
Epoch 24: val_loss did not improve from 0.63294
Epoch 25/30
Epoch 25: val_loss did not improve from 0.63294
Epoch 26/30
Epoch 26: val_loss did not improve from 0.63294
Epoch 27/30
Epoch 27: val_loss did not improve from 0.63294
Epoch 28/30
Epoch 28: val_loss improved from 0.63294 to 0.63279, saving model to task-1/saved_models/tf_model/siamese
INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/siamese/assets
Epoch 29/30
Epoch 29: val_loss improved from 0.63279 to 0.63243, saving model to task-1/saved_models/tf_model/siamese
INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/siamese/assets
Epoch 30/30
Epoch 30: val_loss did not improve from 0.63243


In [130]:
y_pred=model.predict([q1_test,q2_test])



In [131]:
y_pred.shape

(80854, 1, 1)

In [132]:
y_pred=np.round(y_pred).flatten()
# y_pred[0]
score = classification_report(y_test,y_pred)

In [133]:
print(score)

              precision    recall  f1-score   support

           0       0.68      0.81      0.74     50989
           1       0.52      0.35      0.42     29865

    accuracy                           0.64     80854
   macro avg       0.60      0.58      0.58     80854
weighted avg       0.62      0.64      0.62     80854

