In [20]:
import numpy as np
import pandas as pd
import vocab as vocabulary
import collections
import utils

In [2]:
train = pd.read_csv('/data/ToxicityData/train.csv')
test = pd.read_csv('/data/ToxicityData/test.csv')

In [3]:
test.head()

Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump's Orwell...
1,7000001,I actually inspected the infrastructure on Gra...
2,7000002,No it won't . That's just wishful thinking on ...
3,7000003,Instead of wringing our hands and nibbling the...
4,7000004,how many of you commenters have garbage piled ...


In [4]:
train.iloc[0]

id                                                                                 59848
target                                                                                 0
comment_text                           This is so cool. It's like, 'would you want yo...
severe_toxicity                                                                        0
obscene                                                                                0
identity_attack                                                                        0
insult                                                                                 0
threat                                                                                 0
asian                                                                                NaN
atheist                                                                              NaN
bisexual                                                                             NaN
black                

### Tokenization
This can be as simple as calling string.split() - good enough for English and many European languages - but we could also do something more sophisticated here. There are various types of tokenizers:  
  
1. nltk.tokenize.treebank import TreebankWordTokenizer
2. nltk.tokenize import WhitespaceTokenizer

In [5]:
from nltk.tokenize import WhitespaceTokenizer
white_token = WhitespaceTokenizer()

In [6]:
from keras.preprocessing.text import Tokenizer
keras_token = Tokenizer()

Using TensorFlow backend.


In [28]:
V = 30000
SEED = 23
VAL_SPLIT = 0.3

#### first, tokenize everything to build vocab
Only use vocabs from train data.

In [7]:
tokenize_all_one_list = white_token.tokenize(' '.join(train['comment_text'].tolist()))

In [8]:
len(set(tokenize_all_one_list))

1670966

There are 1.67 million tokens, do not have to use all of them as tokens

In [9]:
collections.Counter(tokenize_all_one_list).most_common(20)

[('the', 4261263),
 ('to', 2611234),
 ('and', 2096691),
 ('of', 2021781),
 ('a', 1880032),
 ('is', 1454734),
 ('in', 1294522),
 ('that', 1163635),
 ('for', 911179),
 ('I', 861783),
 ('you', 734810),
 ('are', 714218),
 ('be', 618319),
 ('not', 613791),
 ('have', 598834),
 ('it', 598509),
 ('on', 577471),
 ('with', 556536),
 ('as', 471924),
 ('they', 464629)]

In [10]:
collections.Counter(tokenize_all_one_list).most_common(V)[-1]

('scant', 141)

#### The 30kth token has 140 appearances, not too bad, we will use top 30k covab, and leave the rest as unknown
This step takes a long time.

In [12]:
vocab = vocabulary.Vocabulary(tokenize_all_one_list, size=30000)

### Conversion to IDs
While there are a few ML models that operate directly on strings, in most cases (and always for neural networks) you'll need to convert the tokens to integer IDs that can index into a feature vector. To do this, we'll need to keep track of a vocabulary, which in its simplest form is just a dictionary.  

And unlike before, we are now tokenizing every row and then turn them into IDs using our vocab created


In [14]:
x_train = [vocab.words_to_ids(white_token.tokenize(train_row)) for train_row in train['comment_text'].tolist()]
x_test = [vocab.words_to_ids(white_token.tokenize(test_row)) for test_row in test['comment_text'].tolist()]

In [15]:
x_train[0]

[82,
 8,
 52,
 12488,
 118,
 3554,
 2,
 13,
 102,
 35,
 1646,
 4,
 218,
 2,
 3494,
 224,
 4717,
 164,
 11879]

In [17]:
train.iloc[0,2]

"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!"

### Create Sparse input matrix
For many language models, we need to convert inputs into a sparse matrix.   
For example, for simple Naive Bayes BOW, we need to convert each sentence into an array containing the entire vocabulary. Each sentence would only have few words out of the entire vocab, so we will end up with a very sparse matrix, with each sentence being a row, and each row has V entries corresponding to the vocabulary.   
  
We have a function in utils to convert sentences into sparse matrix. In this representation, instead of printing V for each row, we only print the words that have a count > 0. 

In [36]:
y_train = np.array(train['target'] > 0.5)
y_train = [1 if i else 0 for i in y_train]
y_train[:5]

[0, 0, 0, 0, 1]

In [37]:
rng = np.random.RandomState(SEED)
rng.shuffle(x_train)  # in-place
rng.shuffle(y_train)


split_idx = int(VAL_SPLIT * len(x_train))
val_x = x_train[:split_idx]
val_y = y_train[:split_idx]

train_x = x_train[split_idx:]
train_y  = y_train[split_idx:]



In [38]:
train_x_sb = utils.id_lists_to_sparse_bow(train_x, V)
val_x_sb = utils.id_lists_to_sparse_bow(val_x, V)

In [39]:
print(train_x_sb[0])

  (0, 2)	15
  (0, 3)	5
  (0, 4)	6
  (0, 5)	2
  (0, 7)	2
  (0, 8)	4
  (0, 9)	6
  (0, 10)	2
  (0, 12)	1
  (0, 13)	2
  (0, 14)	2
  (0, 18)	3
  (0, 20)	3
  (0, 21)	1
  (0, 22)	1
  (0, 26)	2
  (0, 30)	1
  (0, 33)	1
  (0, 35)	2
  (0, 36)	1
  (0, 37)	1
  (0, 43)	3
  (0, 47)	1
  (0, 51)	1
  (0, 55)	1
  :	:
  (0, 2392)	1
  (0, 3519)	1
  (0, 3643)	1
  (0, 3768)	2
  (0, 3839)	1
  (0, 3981)	1
  (0, 4346)	1
  (0, 4492)	1
  (0, 4668)	1
  (0, 4698)	1
  (0, 6773)	1
  (0, 8519)	1
  (0, 8528)	1
  (0, 9587)	1
  (0, 9857)	2
  (0, 10001)	1
  (0, 10309)	4
  (0, 10467)	1
  (0, 11152)	2
  (0, 16333)	1
  (0, 16840)	1
  (0, 17317)	1
  (0, 28213)	1
  (0, 28518)	2
  (0, 29701)	1


In [40]:
print("Training set: x = {:s} sparse, y = {:s}".format(str(train_x_sb.shape), str(len(train_x))))
print("Test set:     x = {:s} sparse, y = {:s}".format(str(val_x_sb.shape),  str(len(val_x))))

Training set: x = (1263412, 30000) sparse, y = 1263412
Test set:     x = (541462, 30000) sparse, y = 541462


## Naive Bayes
NB is used for classification, we we are going to turn the target variable into binary variable. 
This is only for testing purpose. For final model we do need a predicted probability so NB is out of the question.

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb = MultinomialNB()

nb.fit(train_x_sb, train_y)
y_pred_val = nb.predict(val_x_sb)


acc = accuracy_score(val_y, y_pred_val)
print("Accuracy on test set: {:.02%}".format(acc))

Accuracy on test set: 93.94%


In [43]:
from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(val_y, y_pred_val)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.94099549 0.04766949]
recall: [0.99823561 0.00140898]
fscore: [0.96877077 0.00273706]
support: [509524  31938]


In [47]:
total = 0
for ind, val in enumerate(y_pred_val):
    if val != val_y[ind]:
        print(vocab.ids_to_words(val_x[ind]))
        print("pred", val, "actual", val_y[ind])
        total += 1
    if total > 5:
        break
    
    

['A', 'very', 'slippery', 'slope', 'here.', '--', 'The', 'popularity', 'of', '<unk>', 'oil', 'is', 'on', 'the', 'rise.', '<unk>', 'a', '<unk>', '<unk>', 'is', 'often', 'used', 'to', 'make', 'the', 'potent', 'marijuana', '<unk>', 'The', 'report', 'cites', 'a', 'recent', 'investigation', 'by', 'The', 'Oregonian', 'into', '<unk>', '<unk>', 'oil,', 'which', 'identified', 'nine', 'major', '<unk>', '<unk>', 'since', '2011,', 'including', 'one', 'that', 'killed', 'a', 'Portland', 'man.', '<unk>', 'THC', 'production', '-', '-', 'and', 'incidence', 'of', '<unk>', '<unk>', 'oil', 'lab', '<unk>', '--', 'is', 'expected', 'to', 'rise', 'as', 'the', 'market', 'expands', 'for', 'marijuana', '<unk>', 'and', 'demand', 'increases', 'for', 'product', 'that', 'has', 'a', 'strong', '<unk>', '<unk>', 'the', 'report', '<unk>', '<unk>']
pred 0 actual 1
['Because', 'the', 'Speaker', '<unk>', 'a', 'repeal', 'of', 'the', 'disaster', '<unk>', 'and', '<unk>', 'of', 'the', 'ACA,', 'you', 'claim', 'he', 'has', 'not'

### Canonicalization
Depending on the application, we might want to do some pre-processing to remove spurious variation in the text. For example, we might want to lowercase words to avoid storing separate features for "I" and "i", and we might want to replace numbers with a special token rather than keep track of every possible value.

utils have a basic transformation in utils.canonicalize_word. It's important to write different one for different tasks since the use of language can be quite different