# Data processing

In [1]:
import re

import numpy as np
import pandas as pd

In [None]:
# Relative location of the training CSV that is read into `df` in the next cell.
path = 'data/train.csv'

In [2]:
# Read the training CSV (first row is the header) and discard the `id`
# column straight away; only the comment text and the label columns remain.
df = pd.read_csv(path, header=0).drop(columns=['id'])
df.head(10)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,alignment on this subject and which are contra...,0,0,0,0,0,0


### Clean the data

In [3]:
def clean_comment(text):
    """Normalise a single raw comment string.

    Steps: trim surrounding whitespace and lower-case the text, delete every
    character that is not an ASCII letter, a digit or whitespace, then
    collapse each run of whitespace (including newlines) into one space.

    Args:
        text: the raw comment (str).

    Returns:
        The cleaned comment (str).
    """
    text = text.strip().lower()
    # Raw string so that `\s` reaches the regex engine untouched. The text is
    # already lower-case, so `a-z` covers every letter; the previous `A-z`
    # range also spanned the ASCII characters between 'Z' and 'a'
    # ([ \ ] ^ _ `) and therefore left them in the comments.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return ' '.join(text.split())


df['comment_text'] = df['comment_text'].apply(clean_comment)

df.head(3)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation why the edits made under my userna...,0,0,0,0,0,0
1,daww he matches this background colour im seem...,0,0,0,0,0,0
2,hey man im really not trying to edit war its j...,0,0,0,0,0,0


### Labels

Instead of separating between different forms of toxicity, we aim to detect general misbehaviour. Hence, a comment containing any form of toxicity is labeled positive.

In [4]:
# Model input: the cleaned comment strings.
data = df['comment_text'].to_numpy()

# Every column other than the text is a toxicity flag; a comment is labelled
# toxic as soon as at least one of those flags is set.
label = df.drop(columns=['comment_text']).to_numpy().any(axis=1)

data.shape, label.shape

((159571,), (159571,))

In [5]:
# Class balance: number of comments per label value (True = some form of toxicity).
np.unique(label, return_counts=True)

(array([False,  True]), array([143346,  16225]))

The dataset is highly imbalanced, so we randomly undersample the negative (non-toxic) examples to match the number of positive ones.

In [6]:
# Split the comments by class. Boolean indexing returns copies, so shuffling
# `good` below does not reorder `data`.
bad = data[label]
good = data[~label]

# Undersample the non-toxic comments down to the number of toxic ones. The
# target size is derived from the data rather than hard-coded, so the cell
# stays correct if the input file or the cleaning step changes.
np.random.seed(7)  # fixed seed -> the same subset is drawn on every run
np.random.shuffle(good)
good = good[:len(bad)]

y_good = np.zeros(len(good))
y_bad = np.ones(len(bad))

# Non-toxic comments first, toxic comments second; X and y are aligned.
X = np.hstack((good, bad))
y = np.hstack((y_good, y_bad))

X.shape, y.shape

((32450,), (32450,))

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced comments for evaluation. The fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

for _features, _targets in ((X_train, y_train), (X_test, y_test)):
    print(_features.shape, _targets.shape)

(25960,) (25960,)
(6490,) (6490,)


### Tokenization of comments - replace words with integer indices

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
# Tokenisation settings
words_in_sentence = 40  # every comment is padded / truncated to this many tokens
max_words = 10000  # vocabulary limit handed to the Tokenizer (`num_words`) and the Embedding layer

In [10]:
# Build the word index from the training comments only.
tokenizer = Tokenizer(num_words=max_words, split=' ')
tokenizer.fit_on_texts(X_train)

# Turn every comment into a fixed-length row of word indices: comments longer
# than `words_in_sentence` are cut at the end, shorter ones are padded at the
# end.
# NOTE: X_train / X_test are overwritten here, so the split cell has to be
# re-run before this cell can be executed a second time.
_pad_options = dict(maxlen=words_in_sentence, padding='post', truncating='post')
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), **_pad_options)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), **_pad_options)

for _features, _targets in ((X_train, y_train), (X_test, y_test)):
    print(_features.shape, _targets.shape)

(25960, 40) (25960,)
(6490, 40) (6490,)


# Models

**References**:

Georgakopoulos, S. V., Tasoulis, S. K., Vrahatis, A. G., & Plagianakos, V. P. (2018, July). Convolutional neural networks for toxic comment classification. In Proceedings of the 10th Hellenic Conference on Artificial Intelligence (p. 35). ACM.

Kim, Y. (2014). Convolutional neural networks for sentence classification. arXiv preprint arXiv:1408.5882.

### CNN

In [11]:
from keras.constraints import max_norm
from keras.layers import (
    Input, Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
)
from keras.models import Model
from keras.optimizers import Adam

In [12]:
# -- architecture, see Georgakopoulos et al. (2018) --
embedding_dims = 100  # length of each word-embedding vector
n_filters = 128  # filters per convolution branch
fc_dim = 300  # units in the fully-connected hidden layer
dropout_rate = 0.5
# bound used for the max-norm weight constraint, see Kim (2014)
s = 3.

# -- optimisation --
learning_rate = 0.005
batch_size = 64
epochs = 1

In [13]:
def conv_block(inputs, kernel_size):
    """Build one convolution branch followed by max-over-time pooling.

    Args:
        inputs: embedded comments, shape (batch, words_in_sentence, embedding_dims).
        kernel_size: number of consecutive words each filter spans.

    Returns:
        Tensor of shape (batch, n_filters) holding, per filter, its maximum
        activation over all positions in the comment.
    """
    features = Conv1D(
        n_filters,
        kernel_size=kernel_size,
        padding='valid',
        activation='relu',
        strides=1,
        kernel_constraint=max_norm(s)
    )(inputs)  # (batch, words_in_sentence - kernel_size + 1, n_filters)
    return GlobalMaxPooling1D()(features)


# the network receives integer word indices: (batch, words_in_sentence)
input_tensor = Input(shape=(words_in_sentence,))

embedding_tensor = Embedding(
    max_words,  # vocabulary size
    embedding_dims,  # dimension of dense embedding
    input_length=words_in_sentence
)(input_tensor)  # outputs (batch, words_in_sentence, embedding_dims) tensor

# -- convolution block --
# three parallel branches that differ only in the filter width
block_1 = conv_block(embedding_tensor, kernel_size=3)
block_2 = conv_block(embedding_tensor, kernel_size=4)
block_3 = conv_block(embedding_tensor, kernel_size=5)


# -- fully-connected block --
# concatenate the branches into a tensor of shape (batch, 3 * n_filters)
concat = Concatenate()([block_1, block_2, block_3])
concat = Dropout(dropout_rate)(concat)

fc = Dense(
    fc_dim,
    activation='relu',
    kernel_constraint=max_norm(s)
)(concat)
fc = Dropout(dropout_rate)(fc)

# single sigmoid unit (binary toxic / non-toxic output) with a max-norm
# constraint on its weights
predictions = Dense(
    1,
    activation='sigmoid',
    kernel_constraint=max_norm(s)
)(fc)

In [14]:
# Wrap the graph that runs from the word-index input to the sigmoid output
# into a trainable Model.
model = Model(inputs=input_tensor, outputs=predictions)

# Binary target -> binary cross-entropy loss; accuracy is tracked as metric.
adam = Adam(lr=learning_rate)
model.compile(
    loss='binary_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)
model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 40, 100)      1000000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 38, 128)      38528       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 37, 128)      51328       embedding_1[0][0]               

In [15]:
# Train the CNN on the tokenised training comments; the held-out split is
# passed as validation data.
model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
)


Train on 25960 samples, validate on 6490 samples
Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f7c2ddc28d0>

# Results

In [16]:
# Loss and accuracy on the held-out comments. Reuses the `batch_size`
# constant from the parameter cell instead of repeating the literal 64.
model.evaluate(X_test, y_test, batch_size=batch_size)



[0.26893135874039953, 0.8904468417167664]

# Comparison with SVC

## Bag of Words approach

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sanity check: X and y still hold the balanced comment strings and labels
# (the tokenised arrays were stored under X_train / X_test, not X).
X.shape, y.shape

((32450,), (32450,))

In [18]:
# Redo the split on the comment strings with the same arguments and
# random_state as for the CNN, replacing the tokenised X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7)

# TF-IDF bag of words limited to 300 features; fitted on the training
# comments only and then applied to the test comments.
vectorizer = TfidfVectorizer(max_features=300)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

for _features, _targets in ((X_train, y_train), (X_test, y_test)):
    print(_features.shape, _targets.shape)

(25960, 300) (25960,)
(6490, 300) (6490,)


## Fitting

In [19]:
from sklearn.svm import SVC

In [20]:
# Fit a support vector classifier on the TF-IDF features and report its
# score on the held-out comments.
# NOTE(review): gamma='auto' may be a poor fit for TF-IDF inputs; consider
# comparing against gamma='scale' — TODO confirm.
clf = SVC(gamma='auto').fit(X_train, y_train)
clf.score(X_test, y_test)

0.7996918335901386