Toxic Comments Classification using 1D Convolution with Keras

1) Import Packages and *Functions*

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow .keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPool1D
from sklearn.model_selection import train_test_split
from google.colab import files
import io 
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
# Report the installed TensorFlow release so results can be tied to a version
print(tf.version.VERSION)

2) Load Dataset

In [None]:
# Load the training split; every missing cell is replaced by a single space
# so the text column never contains NaN.
train_df = pd.read_csv('/content/sample_data/train.csv').fillna(value=' ')
# Preview five rows, sampled with a fixed seed so the display is stable
train_df.sample(n=5, random_state=1)

In [None]:
# Pull the raw comment strings out of the frame as an array and show them
X = train_df.loc[:, 'comment_text'].values
print(X)

3) Explore Data

In [None]:
# Show five rows whose 'toxic' label equals 1 (fixed seed for a stable sample)
train_df[train_df['toxic'].eq(1)].sample(n=5, random_state=1)

In [None]:
# Create wordcloud for toxic word collection
comments = train_df['comment_text'].loc[train_df['toxic']==1].values

# Join the comments into one string before handing them to WordCloud.
# str(comments) would yield NumPy's printed array representation, which is
# abbreviated with an ellipsis for large arrays and contains quote/escape
# characters, so the cloud would be built from only a few comments.
wordcloud = WordCloud(
    width = 640,
    height = 640,
    background_color='black',
    stopwords = STOPWORDS).generate(' '.join(comments))

fig = plt.figure(
    figsize=(5,5),
    facecolor='w',
    edgecolor='w'
)

# Render the cloud as an image without axes or padding
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Target labels: the 'toxic' column as an array, printed for inspection
y = train_df.loc[:, 'toxic'].values
print(y)

In [None]:
# Histogram of the 'toxic' label values
train_df['toxic'].plot.hist(title='Distribution of Toxic Comments')

In [None]:
# Tally how many rows carry each distinct value of the 'toxic' label
train_df.loc[:, 'toxic'].value_counts()

4) Data Preparation: Tokenize and Pad Text Data

In [None]:
# Vocabulary cap handed to the tokenizer; also the row count of the embedding matrix
max_features = 20_000
# Length every tokenized comment is padded/truncated to
max_text_length = 400

In [None]:
# Fit a tokenizer on the training comments, capped at max_features words
x_tokenizer = text.Tokenizer(num_words=max_features)
x_tokenizer.fit_on_texts(list(X))

# Encode each comment as a sequence of integer ids ...
x_tokenized = x_tokenizer.texts_to_sequences(X)
# ... and bring every sequence to the same fixed length
x_train_val = sequence.pad_sequences(
    x_tokenized,
    maxlen=max_text_length,
)

5) Prepare Embedding Matrix with Pre-trained GloVe Embeddings

In [None]:
# Dimensionality of the GloVe vectors being loaded (the 100d file)
embedding_dim = 100
# Maps each word in the GloVe file to its float32 coefficient vector
embeddings_index = dict()

# Open with an explicit encoding so parsing does not depend on the platform
# default, and use a context manager so the handle is closed even if a line
# fails to parse.
with open('/content/sample_data/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        # First field is the word, the remaining fields are its coefficients
        word = values[0]
        coef = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coef

print(f'Found {len(embeddings_index)} word vectors')

In [None]:
# One row per tokenizer index; rows stay all-zero for words that have no
# pre-trained vector in embeddings_index.
embedding_matrix = np.zeros((max_features, embedding_dim))
for token, row in x_tokenizer.word_index.items():
    # Stop at the first index that does not fit inside the matrix
    if row >= max_features:
        break
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

6) Create the Embedding Layer

In [None]:
# Start the network with an embedding layer initialised from embedding_matrix
# and excluded from training, followed by dropout.
model = Sequential()
model.add(
    Embedding(
        input_dim=max_features,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )
)
model.add(Dropout(rate=0.2))

7) Build the Model

In [None]:
# Number of output channels used by the Conv1D layers
filters = 250
# Window width of the first convolution
kernel_size = 3
# Width of the dense layer that precedes the output unit
hidden_dims = 250

In [None]:
# First convolution over the embedded sequence. The activation is set
# explicitly: Conv1D applies no activation by default, which would leave this
# layer linear and inconsistent with the later relu convolution.
model.add(Conv1D(filters,
                 kernel_size,
                 padding = 'valid',
                 activation = 'relu'))

In [None]:
# Max-pool, apply a second (width-5, relu) convolution, then global max pooling
model.add(MaxPool1D())
model.add(
    Conv1D(
        filters=filters,
        kernel_size=5,
        padding='valid',
        activation='relu',
    )
)
model.add(GlobalMaxPooling1D())

# Dense relu layer with dropout
model.add(Dense(units=hidden_dims, activation='relu'))
model.add(Dropout(rate=0.2))

# Single sigmoid output unit, then print the layer summary
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

8) Train Model

In [None]:
# Hold out 15% of the padded sequences (and their labels) for validation,
# using a fixed seed so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y,
    test_size=0.15,
    random_state=1,
)

In [None]:
# Training hyperparameters
batch_size = 32
epochs = 3

# Train on the training split and report metrics on the validation split
# after each epoch. The epoch count comes from the `epochs` variable above
# rather than a duplicated literal, so changing it there takes effect.
model.fit(x_train, y_train,
          batch_size = batch_size,
          epochs = epochs,
          validation_data = (x_val, y_val))

9) Evaluate Model

In [None]:
# Load the test split. Missing cells are replaced by a single space, matching
# how the training data is loaded, so a NaN comment never reaches the
# tokenizer as a float.
test_df = pd.read_csv('/content/sample_data/test.csv').fillna(' ')
test_df.head()

In [None]:
# Raw test comments as an array
x_test = test_df.loc[:, 'comment_text'].values

In [None]:
# Encode the test comments with the tokenizer fitted on the training text,
# then pad/truncate them to the same fixed length as the training sequences.
x_text_tokenized = x_tokenizer.texts_to_sequences(x_test)
x_testing = sequence.pad_sequences(
    x_text_tokenized,
    maxlen=max_text_length,
)

In [None]:
# Score the padded test sequences in batches of 32, showing progress
y_testing = model.predict(
    x_testing,
    batch_size=32,
    verbose=1,
)

In [None]:
# Inspect the shape of the prediction array returned by model.predict
y_testing.shape

In [None]:
# Look at the first prediction entry
y_testing[0]

In [None]:
# Turn the predicted scores into labels with a 0.5 threshold. The predictions
# are flattened first so each score is compared as a scalar in one vectorised
# step, instead of looping in Python and relying on the truthiness of
# one-element arrays.
test_df['Toxic'] = np.where(np.ravel(y_testing) < .5, 'not toxic', 'toxic')
# Show the first ten comments next to their predicted label
test_df[['comment_text', 'Toxic']].head(10)