In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import tensorflow as tf
import time

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [2]:
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint

In [34]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report

In [5]:
from tqdm import tqdm

In [6]:
cd ..

/home/harsh/projects/sembly-round2-harsh


In [7]:
df=pd.read_csv("task-1/data/cleaned_data.csv")
y_true=df['is_duplicate']

In [8]:
question1=df["question1"]
question2=df["question2"]

In [9]:
all_questions = list(df['question1']) + list(df['question2'])
len(all_questions)

808532

In [10]:
all_questions[1]

'what is the story of kohinor  koh i nor  diamond'

In [11]:
def cleaning(question):
    words = []
    #clean the questions of all punctuations
    for word in tqdm(question):
        clean = re.sub(r"[^a-z A-Z 0-9]", " ", word)
        clean = word_tokenize(clean)
        words.append([i.lower() for i in clean])

    return words

In [12]:
cleaned_words = cleaning(all_questions)

100%|██████████████████████████████████████████████| 808532/808532 [01:00<00:00, 13363.80it/s]


In [13]:
def create_tokenizer(cleaned_words, filters='!"#$%&*+,-./:;<=>?@[\]^`{|}~'):
    #tokenize the cleaned words in questions upto word level 
    token = Tokenizer(filters=filters)
    token.fit_on_texts(cleaned_words)
    return token

In [14]:
def max_length(cleaned_words):
    #get the number of words in longest question
    return len(max(cleaned_words, key=len))

In [15]:
word_tokenizer = create_tokenizer(all_questions)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

In [16]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [17]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [18]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [19]:
padded_doc = padding_doc(encoded_doc, max_length)

In [20]:
type(len(padded_doc))

int

In [21]:
question1_vectors=padded_doc[:int(len(padded_doc)/2)]
question2_vectors=padded_doc[int(len(padded_doc)/2):]

In [22]:
x=np.hstack((question1_vectors[:],question2_vectors[:]))

In [23]:
x.shape

(404266, 494)

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
x_train, x_val, y_train, y_val = train_test_split(x, y_true, shuffle = True, test_size = 0.25, random_state=23)

In [26]:
print("Shape of train_X = %s and train_Y = %s" % (x_train.shape, y_train.shape))
print("Shape of val_X = %s and val_Y = %s" % (x_val.shape, y_val.shape))

Shape of train_X = (303199, 494) and train_Y = (303199,)
Shape of val_X = (101067, 494) and val_Y = (101067,)


In [27]:
model=Sequential()
model.add(Embedding(vocab_size, 128, input_length = max_length*2, trainable = False))
#model.add(Bidirectional(LSTM(128,return_sequences=True)))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(64, activation = "relu"))    
model.add(Dropout(0.2))
model.add(Dense(32, activation = "relu"))    
model.add(Dense(16, activation = "relu")) 
model.add(Dense(1, activation = "sigmoid"))

2022-12-08 13:30:04.003157: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-08 13:30:04.456154: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4632 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5


In [28]:
callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            min_delta=1e-5,
            patience=10,
            verbose=1
        ),
        tf.keras.callbacks.ModelCheckpoint(
            filepath="task-1/saved_models/tf_model/lstm",
            monitor='val_loss', 
            mode='min', 
            save_best_only=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', 
            factor=0.2,
            patience=4, 
            min_lr=0.001)
    
]

# optimizer = tf.keras.optimizers.Adam(1e-5)
# loss = tf.keras.losses.BinaryCrossentropy()
# acc = tf.keras.metrics.Accuracy()

In [29]:
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [30]:
history = model.fit(x_train, y_train, epochs = 30, batch_size = 256, validation_data= (x_val, y_val), callbacks=callbacks, verbose=1)

Epoch 1/30


2022-12-08 13:30:04.838852: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 599121224 exceeds 10% of free system memory.
2022-12-08 13:30:07.613924: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8401


Epoch 1: val_loss improved from inf to 0.59860, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 2/30
Epoch 2: val_loss improved from 0.59860 to 0.58600, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 3/30
Epoch 3: val_loss improved from 0.58600 to 0.57459, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 4/30
Epoch 4: val_loss improved from 0.57459 to 0.57064, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 5/30
Epoch 5: val_loss improved from 0.57064 to 0.56133, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 6/30
Epoch 6: val_loss improved from 0.56133 to 0.55334, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 7/30
Epoch 7: val_loss improved from 0.55334 to 0.54888, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 8/30
Epoch 8: val_loss improved from 0.54888 to 0.54759, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 9/30
Epoch 9: val_loss improved from 0.54759 to 0.54227, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 10/30
Epoch 10: val_loss improved from 0.54227 to 0.54114, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 11/30
Epoch 11: val_loss improved from 0.54114 to 0.53672, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 12/30
Epoch 12: val_loss improved from 0.53672 to 0.53470, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 13/30
Epoch 13: val_loss did not improve from 0.53470
Epoch 14/30
Epoch 14: val_loss improved from 0.53470 to 0.53146, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 15/30
Epoch 15: val_loss improved from 0.53146 to 0.52809, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 16/30
Epoch 16: val_loss did not improve from 0.52809
Epoch 17/30
Epoch 17: val_loss improved from 0.52809 to 0.52684, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 18/30
Epoch 18: val_loss improved from 0.52684 to 0.52514, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 19/30
Epoch 19: val_loss did not improve from 0.52514
Epoch 20/30
Epoch 20: val_loss improved from 0.52514 to 0.52474, saving model to task-1/saved_models/tf_model/lstm




INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


INFO:tensorflow:Assets written to: task-1/saved_models/tf_model/lstm/assets


Epoch 21/30
Epoch 21: val_loss did not improve from 0.52474
Epoch 22/30
Epoch 22: val_loss did not improve from 0.52474
Epoch 23/30
Epoch 23: val_loss did not improve from 0.52474
Epoch 24/30
Epoch 24: val_loss did not improve from 0.52474
Epoch 25/30
Epoch 25: val_loss did not improve from 0.52474
Epoch 26/30
Epoch 26: val_loss did not improve from 0.52474
Epoch 27/30
Epoch 27: val_loss did not improve from 0.52474
Epoch 28/30
Epoch 28: val_loss did not improve from 0.52474
Epoch 29/30
Epoch 29: val_loss did not improve from 0.52474
Epoch 30/30
Epoch 30: val_loss did not improve from 0.52474


In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 494, 128)          10616320  
                                                                 
 bidirectional (Bidirectiona  (None, 256)              263168    
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                        

In [35]:
y_pred=model.predict(x_val)



In [37]:
y_pred=np.round(y_pred)

In [38]:
score = classification_report(y_val, y_pred)

In [40]:
print(score)

              precision    recall  f1-score   support

           0       0.76      0.88      0.82     63770
           1       0.72      0.54      0.61     37297

    accuracy                           0.75    101067
   macro avg       0.74      0.71      0.71    101067
weighted avg       0.75      0.75      0.74    101067

