In [1]:
import re
import math
import time

In [2]:
import tensorflow as tf
from tensorflow.keras import layers

In [3]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
import gensim.models

from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import GRU
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.optimizers import *
from tensorflow.keras.callbacks import *

from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt


import nltk
from nltk.tokenize import word_tokenize

# nltk.download('punkt')
# from nltk.corpus import stopwords 
# nltk.download('stopwords')



In [4]:
# Location of the dataset CSVs and the pre-trained embedding files.
# Set the CLICKBAIT_DATA_DIR environment variable to point at another folder;
# when it is unset, the original hard-coded directory is used, so every path
# below keeps the value it had before.
import os

DATA_DIR = os.environ.get(
    'CLICKBAIT_DATA_DIR',
    '/home/mahesh/Documents/SF-ClickBait/Bulgarian Dataset and Model',
)

train_path = os.path.join(DATA_DIR, 'Train_Cleaned.csv')
test_path = os.path.join(DATA_DIR, 'Test_Cleaned.csv')
# Word2vec-format binary vectors (GoogleNews file).
w2v_path = os.path.join(DATA_DIR, 'GoogleNews-vectors-negative300.bin')
# fastText binary model (cc.bg.300.bin).
ft_path = os.path.join(DATA_DIR, 'cc.bg.300.bin')

# Training Data

In [5]:
# Read the training CSV, discard its 'Unnamed: 0' column and switch to the
# short column names ('Clickbait', 'Text') that the later cells refer to.
df = (
    pd.read_csv(train_path)
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'click_bait_score': 'Clickbait', 'Content Title': 'Text'})
)
df.head()

Unnamed: 0,Clickbait,Text
0,0,аЊбббб аЕбббб: аЄа аЅаИ б Та б а­б...
1,0,аЗаЅаВаЈаАаЈаВаЅ аВаЈаЏа аІаЅа­аЈ аБаЏаЎаАаЅа...
2,0,а тб - аЋбббббб аЊбб бббббб б...
3,1,аЈббббб ббб бб ! аЈббббб бб б...
4,0,"аЂбббббб бб аЗбтбб аЗббббб, ..."


# Testing / Validation Data

In [6]:
# Read the second CSV the same way as the training one: discard the
# 'Unnamed: 0' column and rename the label/title columns to 'Clickbait'/'Text'.
test_df = (
    pd.read_csv(test_path)
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'click_bait_score': 'Clickbait', 'Content Title': 'Text'})
)

In [7]:
# Preview the first rows of test_df to check the renamed columns.
test_df.head()

Unnamed: 0,Clickbait,Text
0,1,"аЊбббб аЏбтб бтбббббб, ббб..."
1,0,"""аАбббббб бб аЂбббб бб аЋбббб..."
2,1,аВбббббб ббббббббббббб бб...
3,0,аБббббббб бббббббб бб бббб...
4,1,аЏтбтббббббббб бб аДббббб...


# Preprocessing

In [8]:
from stop_words import STOP_WORDS

In [9]:
# Alternative kept for reference: load word2vec-format binary vectors from w2v_path.
# w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
# Load the vectors stored at ft_path with gensim's Facebook-fastText loader.
# The variable keeps the name `w2v` because the embedding cell indexes it by token.
w2v = gensim.models.fasttext.load_facebook_vectors(ft_path)


AttributeError: type object 'Word2VecKeyedVectors' has no attribute 'load_fasttext_format'

In [None]:
# Length of the longest training title, measured in non-stop-word tokens.
# The count uses the same preprocessing as the embedding cell (hyphens turned
# into spaces, then nltk.word_tokenize). Counting with str.split() gives a
# smaller number because the tokenizer emits punctuation as separate tokens,
# so the longest titles no longer fit into the padded arrays.
L = [
    sum(
        1
        for w in nltk.word_tokenize(line.replace('-', ' '))
        if w not in STOP_WORDS
    )
    for line in df['Text']
]

# Every padded sequence built later has exactly this many positions.
sequence_size = max(L)

In [None]:
# Inputs are the titles, targets the 'Clickbait' column of the training frame.
X, y = df['Text'], df['Clickbait']

# Carve 10% off the training frame with a fixed seed; that split is stored in
# X_test / y_test. The separately loaded test_df becomes X_val / y_val.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)
X_val = test_df['Text']
y_val = test_df['Clickbait']

# Turn each label Series into a NumPy array.
y_train, y_test, y_val = (
    np.array(labels) for labels in (y_train, y_test, y_val)
)

In [None]:
def _embed_sentences(sentences, vectors, stop_words, max_len, embedding_dim=300):
    """Convert titles into a zero-padded array of word vectors.

    Each sentence has its hyphens replaced by spaces and is tokenized with
    ``nltk.word_tokenize``. Tokens found in ``stop_words`` are skipped, and
    tokens for which ``vectors[token]`` raises ``KeyError`` are skipped as
    well. The remaining vectors are written left-aligned into the row that
    belongs to the sentence; unused positions stay zero.

    Args:
        sentences: sized iterable of strings (e.g. a pandas Series).
        vectors: mapping-like object indexed by token that returns a vector
            of length ``embedding_dim``.
        stop_words: container of tokens to ignore.
        max_len: number of positions per sentence; tokens beyond it are
            dropped.
        embedding_dim: length of each word vector.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sentences), max_len, embedding_dim)``.
    """
    embedded = np.zeros((len(sentences), max_len, embedding_dim))

    for row, sentence in enumerate(sentences):
        col = 0
        for token in nltk.word_tokenize(sentence.replace('-', ' ')):
            if token in stop_words:
                continue
            if col >= max_len:
                # Row is full: truncate explicitly instead of relying on a
                # swallowed IndexError.
                break
            try:
                embedded[row, col] = vectors[token]
            except KeyError:
                # No vector for this token; skip it without using a slot.
                continue
            col += 1

    return embedded


train_data = _embed_sentences(X_train, w2v, STOP_WORDS, sequence_size)
val_data = _embed_sentences(X_val, w2v, STOP_WORDS, sequence_size)
test_data = _embed_sentences(X_test, w2v, STOP_WORDS, sequence_size)

print(train_data.shape)
print(test_data.shape)
print(val_data.shape)

In [None]:
class PositionalEncoding(layers.Layer):
    """Layer that adds a fixed sine/cosine position table to its input."""

    def __init__(self):
        super().__init__()

    def get_angles(self, pos, i, d_model):
        """Return ``pos * 1 / 10000 ** (2 * (i // 2) / d_model)``.

        ``pos`` and ``i`` are combined with ordinary NumPy broadcasting.
        """
        exponent = (2 * (i // 2)) / np.float32(d_model)
        rates = 1 / np.power(10000., exponent)
        return pos * rates

    def call(self, inputs):
        """Add the position table to ``inputs`` and return the sum.

        The table size is taken from the last two static dimensions of
        ``inputs`` (sequence length, then model width).
        """
        seq_length, d_model = inputs.shape.as_list()[-2:]
        positions = np.arange(seq_length)[:, np.newaxis]
        channels = np.arange(d_model)[np.newaxis, :]
        angles = self.get_angles(positions, channels, d_model)

        # Even channel indices take the sine of the angle, odd ones the cosine.
        table = np.where(channels % 2 == 0, np.sin(angles), np.cos(angles))
        pos_encoding = table[np.newaxis, ...]

        return inputs + tf.cast(pos_encoding, tf.float32)

In [None]:
def scaled_dot_product_attention(queries, keys, values, mask):
    """Return ``softmax(Q K^T / sqrt(d_k)) V``.

    ``d_k`` is the size of the last axis of ``keys``; the softmax runs over
    the last axis of the scaled product.

    NOTE(review): ``mask`` is accepted but never read, so no position is
    excluded from attention. Confirm whether masking was intended.
    """
    depth = tf.cast(tf.shape(keys)[-1], tf.float32)
    logits = tf.matmul(queries, keys, transpose_b=True) / tf.math.sqrt(depth)
    weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(weights, values)

In [None]:
class MultiHeadAttention(layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    ``nb_proj`` is the number of heads; the model width (last input
    dimension) must be divisible by it.
    """

    def __init__(self, nb_proj):
        super().__init__()
        self.nb_proj = nb_proj

    def build(self, input_shape):
        """Create the four ``d_model``-wide linear projections."""
        self.d_model = input_shape[-1]
        assert self.d_model % self.nb_proj == 0
        # Width handled by each head.
        self.d_proj = self.d_model // self.nb_proj
        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.value_lin = layers.Dense(units=self.d_model)
        self.final_lin = layers.Dense(units=self.d_model)

    def split_proj(self, inputs, batch_size):
        """Reshape (batch, seq, d_model) into (batch, nb_proj, seq, d_proj)."""
        per_head = tf.reshape(
            inputs, shape=(batch_size, -1, self.nb_proj, self.d_proj)
        )
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, queries, keys, values, mask):
        """Project, split into heads, attend, merge heads, project again."""
        n_batch = tf.shape(queries)[0]

        q_heads = self.split_proj(self.query_lin(queries), n_batch)
        k_heads = self.split_proj(self.key_lin(keys), n_batch)
        v_heads = self.split_proj(self.value_lin(values), n_batch)

        per_head_out = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

        # Back to (batch, seq, nb_proj, d_proj), then fold the heads together.
        merged = tf.reshape(
            tf.transpose(per_head_out, perm=[0, 2, 1, 3]),
            shape=(n_batch, -1, self.d_model),
        )
        return self.final_lin(merged)

In [None]:
class EncoderLayer(layers.Layer):
    """One encoder block: self-attention followed by a two-layer feed-forward.

    Each of the two sub-blocks is followed by dropout, a residual addition
    and layer normalisation.
    """

    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super().__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        """Create the sub-layers; the model width is the last input dimension."""
        self.d_model = input_shape[-1]

        # Attention sub-block.
        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        # Feed-forward sub-block: widen to FFN_units with ReLU, project back.
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, mask, training):
        """Run ``inputs`` through the block and return the normalised result."""
        attended = self.multi_head_attention(inputs, inputs, inputs, mask)
        attended = self.dropout_1(attended, training=training)
        residual_1 = self.norm_1(attended + inputs)

        hidden = self.dense_2(self.dense_1(residual_1))
        hidden = self.dropout_2(hidden, training=training)
        return self.norm_2(hidden + residual_1)

In [None]:
class Encoder(layers.Layer):
    """Stack of ``nb_layers`` ``EncoderLayer`` blocks on position-encoded input."""

    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 d_model,
                 name="encoder"):
        super().__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.enc_layers = [
            EncoderLayer(FFN_units, nb_proj, dropout_rate)
            for _ in range(nb_layers)
        ]

    def call(self, inputs, mask, training):
        """Scale by sqrt(d_model), add positions, apply dropout, run the stack."""
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(inputs * scale)
        outputs = self.dropout(outputs, training=training)

        for enc_layer in self.enc_layers:
            outputs = enc_layer(outputs, mask, training)

        return outputs

In [None]:

class Transformer(tf.keras.Model):
    """Encoder-only model: builds a padding mask and runs the ``Encoder``.

    Args:
        d_model: model width passed to the encoder.
        nb_layers: number of encoder blocks.
        FFN_units: hidden width of each block's feed-forward part.
        nb_proj: number of attention heads.
        dropout_rate: dropout rate used inside the encoder.
        name: Keras model name.
    """

    def __init__(self,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 name="transformer"):
        super(Transformer, self).__init__(name=name)

        self.encoder = Encoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout_rate,
                               d_model)

    def create_padding_mask(self, seq):
        """Return a float mask that is 1.0 where ``seq`` is padding.

        For a rank-3 input (batch, seq_len, features) a position counts as
        padding only when its whole feature vector is zero, so the result has
        shape (batch, 1, 1, seq_len). Previously every feature was compared
        separately, which produced a rank-5 per-feature mask. Inputs of any
        other rank are compared element-wise with 0, exactly as before.
        """
        is_pad = tf.math.equal(seq, 0)
        if len(seq.shape) == 3:
            # Collapse the feature axis: padding rows are all-zero vectors.
            is_pad = tf.reduce_all(is_pad, axis=-1)
        mask = tf.cast(is_pad, tf.float32)

        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        """Return a (seq_len, seq_len) matrix with ones strictly above the diagonal.

        ``seq_len`` is the size of axis 1 of ``seq``. This method is not
        called anywhere inside this class.
        """
        seq_len = tf.shape(seq)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)

        return look_ahead_mask

    def call(self, enc_inputs, training = True):
        """Build the padding mask for ``enc_inputs`` and return the encoder output.

        NOTE(review): ``training`` defaults to True, so a direct call that
        omits it requests training-mode dropout. Confirm this is intended.
        """
        enc_mask = self.create_padding_mask(enc_inputs)
        enc_outputs = self.encoder(enc_inputs, enc_mask, training)

        return enc_outputs

<h1 style="color:blue"> Model Definition</h1>

In [None]:
# Hyper-parameters
D_MODEL = 300          # width of each word vector / encoder model width
NB_LAYERS = 2          # encoder blocks
FFN_UNITS = 512        # feed-forward hidden width inside each block
NB_PROJ = 4            # attention heads
DROPOUT_RATE = 0.02    # dropout inside the encoder
LEARNING_RATE = 1e-3   # Adam learning rate

transformer = Transformer(
    d_model=D_MODEL,
    nb_layers=NB_LAYERS,
    FFN_units=FFN_UNITS,
    nb_proj=NB_PROJ,
    dropout_rate=DROPOUT_RATE,
)

# Encoder output is flattened and fed to a ReLU dense head ending in a single
# sigmoid unit.
inputs = layers.Input(shape=(sequence_size, D_MODEL))
x = transformer(inputs)
x = layers.Flatten()(x)

# (units, whether a Dropout(0.1) follows) for each hidden dense layer, in order.
for units, then_dropout in ((1024, False), (1024, True),
                            (512, False), (512, True),
                            (256, True)):
    x = layers.Dense(units, activation="relu")(x)
    if then_dropout:
        x = layers.Dropout(0.1)(x)

outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'],
)

<h3 style="color:green;">Callbacks to Reduce the Learning Rate and Stop Training</h3>

In [None]:
# Multiply the learning rate by 0.67 after 10 epochs without improvement in
# validation accuracy, never going below 1e-5.
LR_reduce = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.67,
    patience=10,
    min_lr=1e-5,
    verbose=1,
)

# Stop once validation loss has not improved for 15 epochs.
ES_monitor = EarlyStopping(monitor='val_loss', patience=15)

class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as the training accuracy exceeds 0.995."""

    def on_epoch_end(self, _, logs=None):
        """Check the 'accuracy' entry of ``logs`` at the end of an epoch.

        Args:
            _: epoch index (unused).
            logs: dict of metric values for the epoch; may be None or lack
                the 'accuracy' key, in which case nothing happens.
        """
        # `logs=None` avoids a shared mutable default; a missing metric must
        # not raise a TypeError from comparing None with a float.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > 0.995:
            self.model.stop_training = True

In [None]:
# Train for up to 300 epochs. validation_data is the 10% split carved out of
# the training frame (test_data / y_test); the callbacks may end training early.
history = model.fit(
    train_data,
    y_train,
    batch_size=64,
    epochs=300,
    validation_data=(test_data, y_test),
    verbose=1,
    callbacks=[LR_reduce, ES_monitor, myCallback()],
)

<h3>Plotting Graphs</h3>

In [None]:
# Training vs. validation accuracy per epoch. The earlier 'Model accuracy'
# title was overwritten immediately, so only the effective title is set.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Training')
ax.plot(history.history['val_accuracy'], label='Validation')
ax.set_title('Accuracy vs Epoch')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Training vs. validation loss per epoch. The earlier 'Loss' title was
# overwritten immediately, so only the effective title is set.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training data')
ax.plot(history.history['val_loss'], label='Validation data')
ax.set_title('Loss vs Epoch')
ax.set_xlabel('No. epoch')
ax.set_ylabel('Loss value')
ax.legend(loc="upper left")
plt.show()