# Spam filter for Quora questions

## Import Libraries

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization, GRU ,concatenate
from keras.models import Model
from sklearn.utils import class_weight
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras import models
from keras.models import load_model
from sklearn.metrics import roc_auc_score, accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Load Dataset

In [3]:
questions_data = pd.read_csv('/content/gdrive/MyDrive/Deep Learning/Quora_data/train.csv',encoding='iso-8859-1')
questions_data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


## Cleaning and preprocessing of Questions

In [4]:
questions_data.isnull().sum()

qid              0
question_text    0
target           0
dtype: int64

In [5]:
questions_data.shape

(1306122, 3)

In [6]:
questions_data.drop(['qid'],1,inplace=True)
questions_data.head()

Unnamed: 0,question_text,target
0,How did Quebec nationalists see their province...,0
1,"Do you have an adopted dog, how would you enco...",0
2,Why does velocity affect time? Does velocity a...,0
3,How did Otto von Guericke used the Magdeburg h...,0
4,Can I convert montra helicon D to a mountain b...,0


In [7]:
!pip install beautifulsoup4



In [8]:
from bs4 import BeautifulSoup

questions_data['question_text'] = questions_data['question_text'].apply(lambda x: BeautifulSoup(x).get_text())
questions_data['question_text'].replace('[^a-zA-Z]',' ', regex=True, inplace=True)
questions_data['question_text'] = questions_data['question_text'].str.lower()
questions_data['question_text'] = questions_data['question_text'].replace('\s+', ' ', regex=True)

In [9]:
stop_words = set(stopwords.words('english')) 
stop_words.remove('not')
len(stop_words)
questions_data['question_text'] = questions_data['question_text'].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [10]:
lemmatizer = WordNetLemmatizer()
questions_data['question_text'] = questions_data['question_text'].apply(lambda x: ' '.join(lemmatizer.lemmatize(term) for term in x.split()))

In [11]:
questions_data.head()

Unnamed: 0,question_text,target
0,quebec nationalist see province nation,0
1,adopted dog would encourage people adopt not shop,0
2,velocity affect time velocity affect space geo...,0
3,otto von guericke used magdeburg hemisphere,0
4,convert montra helicon mountain bike changing ...,0


## Glove Embedding

In [1]:
!wget https://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip -q glove.42B.300d.zip

--2021-07-13 05:01:11--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2021-07-13 05:01:12--  http://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]
Saving to: ‘glove.42B.300d.zip’


2021-07-13 05:07:05 (5.06 MB/s) - ‘glove.42B.300d.zip’ saved [1877800501/1877800501]



In [3]:
!rm glove.42B.300d.zip

In [4]:
embeddings_index={}

f=open('glove.42B.300d.txt',encoding='utf-8')

for line in f:
    values=line.split()
    word=values[0]
    coefs=np.asarray(values[1:],dtype='float32')
    embeddings_index[word]=coefs
f.close()

In [6]:
embeddings_index['products']

array([ 1.1528e-01,  2.5675e-01, -2.3314e-03, -3.3708e-01,  1.2065e+00,
        1.9529e-01, -2.8586e+00,  1.6321e-02, -1.6521e-02, -5.7973e-01,
       -1.6601e-02, -2.7687e-01,  1.6605e-01, -1.7086e-01,  2.4364e-01,
        3.7666e-01, -3.8832e-01, -1.0947e-01,  1.5988e-01, -2.5081e-01,
       -4.9140e-01,  5.4832e-01, -3.7264e-01,  4.8865e-01, -1.0455e-01,
       -7.2800e-01,  1.1575e-01, -2.4190e-01, -4.8795e-02, -4.3294e-01,
        1.0510e-01,  1.0442e-01, -2.6265e-01, -4.4382e-01, -1.5374e-01,
       -1.6880e-02, -3.2482e-02, -2.8060e-01,  1.2353e-01, -5.3766e-02,
       -3.7838e-01, -1.4031e-01,  1.4021e-01,  2.5882e-01, -2.2439e-01,
        6.4650e-01, -2.3916e-01, -3.1649e-01, -2.6933e-01, -2.3774e-01,
       -1.1238e-01, -5.1685e-02,  5.0766e-01,  4.1459e-02, -6.4502e-02,
        3.0197e-02, -2.8650e-02,  7.9970e-02, -1.2940e-01,  2.5734e-01,
       -3.1493e-01, -1.6216e-02, -3.7095e-01,  1.6928e-01, -1.8740e-01,
        2.2701e-01,  1.0125e-01, -2.1701e-01, -2.0954e-02, -5.49

## Split into Train and Test Set

In [16]:
train,test = train_test_split(questions_data,test_size=0.2,random_state=21)

In [17]:
train.shape, test.shape

((1044897, 2), (261225, 2))

In [18]:
x_train = train['question_text']
y_train = train['target']
x_test = test['question_text']
y_test = test['target']

In [19]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1044897,), (261225,), (1044897,), (261225,))

In [20]:
class_weight = class_weight.compute_class_weight('balanced',np.unique(y_train),y_train)
class_weight_dict = dict(enumerate(class_weight))
class_weight_dict

{0: 0.5330438118481048, 1: 8.065713094760243}

In [21]:
sent_lens=[]
for sent in train['question_text']:
    sent_lens.append(len(word_tokenize(sent)))
max(sent_lens)

110

In [22]:
np.quantile(sent_lens,0.98)

17.0

In [23]:
max_len = 17
tok = Tokenizer(char_level=False,split=' ')
tok.fit_on_texts(x_train)
sequences_train = tok.texts_to_sequences(x_train)

In [24]:
vocab_len = len(tok.index_word.keys())
vocab_len

147772

In [25]:
tok.word_index

{'get': 1,
 'best': 2,
 'would': 3,
 'people': 4,
 'like': 5,
 'not': 6,
 'good': 7,
 'one': 8,
 'make': 9,
 'india': 10,
 'year': 11,
 'way': 12,
 'time': 13,
 'think': 14,
 'many': 15,
 'life': 16,
 'much': 17,
 'someone': 18,
 'u': 19,
 'use': 20,
 'want': 21,
 'know': 22,
 'work': 23,
 'country': 24,
 'take': 25,
 'thing': 26,
 'job': 27,
 'woman': 28,
 'indian': 29,
 'ever': 30,
 'find': 31,
 'world': 32,
 'person': 33,
 'become': 34,
 'feel': 35,
 'without': 36,
 'book': 37,
 'go': 38,
 'could': 39,
 'student': 40,
 'day': 41,
 'better': 42,
 'quora': 43,
 'girl': 44,
 'mean': 45,
 'difference': 46,
 'trump': 47,
 'company': 48,
 'need': 49,
 'new': 50,
 'possible': 51,
 'school': 52,
 'college': 53,
 'start': 54,
 'used': 55,
 'friend': 56,
 'american': 57,
 'first': 58,
 'state': 59,
 'question': 60,
 'say': 61,
 'money': 62,
 'still': 63,
 'business': 64,
 'love': 65,
 'different': 66,
 'long': 67,
 'old': 68,
 'really': 69,
 'university': 70,
 'give': 71,
 'learn': 72,
 'help

In [26]:
sequences_matrix_train = sequence.pad_sequences(sequences_train, maxlen=max_len)
sequences_matrix_train

array([[    0,     0,     0, ...,     3,    19,   268],
       [    0,     0,     0, ...,  1327,  1914,   505],
       [    0,     0,     0, ...,  7302,  6804, 70102],
       ...,
       [    0,     0,     0, ...,    13,   179,  9390],
       [    0,     0,     0, ...,  1094,    94,   226],
       [    0,     0,     0, ...,   159,    55,   420]], dtype=int32)

In [27]:
sequences_test = tok.texts_to_sequences(x_test)
sequences_matrix_test = sequence.pad_sequences(sequences_test, maxlen=max_len)

In [28]:
sequences_matrix_train.shape, sequences_matrix_test.shape, y_train.shape, y_test.shape

((1044897, 17), (261225, 17), (1044897,), (261225,))

In [29]:
embedding_matrix=np.zeros((vocab_len+1,300))

for word,i in tok.word_index.items():
    embed_vector=embeddings_index.get(word)
    if embed_vector is not None:
        if (embed_vector.shape != 0):  
            embedding_matrix[i]=embed_vector

In [30]:
embedding_matrix.shape

(147773, 300)

## Build the Model

In [32]:
inputs = Input(name='text_input',shape=[max_len])
embed=Embedding(vocab_len+1,300,input_length=max_len,mask_zero=True,
                weights=[embedding_matrix],trainable=False)(inputs)
layer = LSTM(512)(embed)
layer = Dropout(0.2)(layer)
layer = Dense(256,activation='relu')(layer)
layer = Dropout(0.2)(layer)
layer = Dense(128,activation='relu')(layer)
layer = Dropout(0.2)(layer)
layer = Dense(25,activation='relu')(layer)
layer = Dropout(0.2)(layer)
layer = Dense(1,activation='sigmoid')(layer)

model = Model(inputs=inputs,outputs=layer)

In [33]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_input (InputLayer)      [(None, 17)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 17, 300)           44331900  
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               1665024   
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)               3289

In [34]:
filepath='/content/gdrive/MyDrive/Deep Learning/Quora_data/weights-{epoch:02d}-{val_loss:.4f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, 
                             save_best_only=True)

In [35]:
earlystop = EarlyStopping(monitor='val_loss', patience=3,
                          verbose=1)

In [36]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

## Train the Model

In [37]:
model.fit(sequences_matrix_train,y_train,
              epochs=20,
              class_weight={0: 0.5330438118481048, 1: 8.065713094760243}, 
              batch_size=1024,
              validation_data=(sequences_matrix_test,y_test),
              callbacks = [earlystop, checkpoint])

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.26074, saving model to /content/gdrive/MyDrive/Deep Learning/Quora_data/weights-01-0.2607.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.26074 to 0.24596, saving model to /content/gdrive/MyDrive/Deep Learning/Quora_data/weights-02-0.2460.h5
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.24596
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.24596
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.24596
Epoch 00005: early stopping


<keras.callbacks.History at 0x7fe982c119d0>

In [38]:
best_model = load_model('/content/gdrive/MyDrive/Deep Learning/Quora_data/weights-02-0.2460.h5')

In [39]:
best_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_input (InputLayer)      [(None, 17)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 17, 300)           44331900  
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               1665024   
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)               3289

## Test the Model

In [40]:
predictions = best_model.predict(sequences_matrix_test)

## Evaluate with ROC-AUC Score

In [41]:
roc_auc_score(y_test,predictions)

0.957771798377729

## Make a Prediction

In [42]:
index = 216
test.iloc[index]

question_text    best way test water moving friendship forward
target                                                       0
Name: 480825, dtype: object

In [43]:
pred = best_model.predict(sequences_matrix_test[index].reshape(1,17))[0][0]
print(pred)
print("Predicted Target - ",pred.round().astype(int))

0.0036403239
Predicted Target -  0


In [44]:
index = 261224
test.iloc[index]

question_text    anyone contradict say real teaching islam musl...
target                                                           1
Name: 1238502, dtype: object

In [45]:
pred = best_model.predict(sequences_matrix_test[index].reshape(1,17))[0][0]
print(pred)
print("Predicted Target - ",pred.round().astype(int))

0.96939087
Predicted Target -  1


## Save Model

In [46]:
best_model.save('/content/gdrive/MyDrive/Deep Learning/Quora_data/quora_spam_filter_model.h5')