In [1]:
import pandas as pd
import numpy as np
import re
import csv
import tensorflow as tf
import nltk
from gensim.models import Word2Vec
from keras.preprocessing import text, sequence

Using TensorFlow backend.


# Read Data in DataFrame

In [2]:
df = pd.read_csv('train.csv')
raw_input = df['comment_text']

# Word2Vec word embedding

In [5]:
sentences = []
for num in range(len(raw_input)):
    temp = nltk.sent_tokenize(raw_input[num])
    for j in range(len(temp)):
        txt = re.sub('[^\w\s\']|\d+','',temp[j])
        temp[j] = re.sub(r'\n|\s{2,}',' ',txt)
        sentences.append(temp[j].lower().split())

In [None]:
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model = Word2Vec(sentences, iter=20, min_count=10, size=300, workers=4)

In [None]:
#model.save('./word2vec.txt')

In [None]:
model.wv.most_similar("fuck")

In [3]:
model = Word2Vec.load('./word2vec.txt')

In [4]:
vocab = model.wv.vocab

# Develope word series

In [5]:
words = []
freq = []
for w in vocab:
    words.append(w)
    freq.append(vocab[w].count)
    
wordseries = pd.DataFrame({'word': words, 'freq': freq})

In [6]:
wordseries = wordseries.sort_values(['freq'], ascending = [0])
wordseries['id'] = range(1,wordseries.shape[0]+1)

In [7]:
wordsequence = dict(zip(wordseries['word'],wordseries['id']))

In [8]:
W = np.zeros((1,300))
W = np.append(W, model[wordsequence.keys()],axis=0)

  


In [9]:
W = W.astype(np.float32, copy=False)

# Batch Generator

In [10]:
def generate_batch(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            print(end_index)
            yield shuffled_data[start_index:end_index]

# Placeholder Setting

In [11]:
filter_sizes = [2,3,4,5]
num_filters = 2
batch_size = 200
embedding_size = 300
num_filters_total = num_filters * len(filter_sizes)
sequence_length = 1403
num_epochs = 10

In [12]:
input_x = tf.placeholder(tf.int32, [None, sequence_length], name = "input_x")
input_y = tf.placeholder(tf.float32, [None,6], name = "input_y")
dropout_keep_prob = 0.5

In [13]:
embedded_chars = tf.nn.embedding_lookup(W, input_x)
embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

In [14]:
pooled_outputs = []
    
for i, filter_size in enumerate(filter_sizes):
        
    filter_shape = [filter_size, embedding_size, 1, num_filters]
        
    w = tf.Variable(tf.truncated_normal(filter_shape,stddev = 0.1), name = "w")
    b = tf.Variable(tf.truncated_normal([num_filters]), name = "b")
            
    conv = tf.nn.conv2d(
        embedded_chars_expanded,
        w,
        strides = [1,1,1,1],
        padding = "VALID",
        name = "conv"
    )
    h = tf.nn.relu(tf.nn.bias_add(conv, b), name = "relu")
    pooled = tf.nn.max_pool(
        h,
        ksize = [1,sequence_length - filter_size + 1, 1, 1],
        strides = [1,1,1,1],
        padding = "VALID",
        name = "pool"
    )
    pooled_outputs.append(pooled)
    
h_pool = tf.concat(pooled_outputs, 3)
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

In [15]:
h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

In [16]:
wd1 = tf.Variable(tf.truncated_normal([num_filters_total, int(num_filters_total/2)], stddev=0.1), name = "wd1")
bd1 = tf.Variable(tf.truncated_normal([int(num_filters_total/2)]), name = "bd1")
layer1 = tf.nn.xw_plus_b(h_drop, wd1, bd1, name = 'layer1')
layer1 = tf.nn.relu(layer1)

In [17]:
wd2 = tf.Variable(tf.truncated_normal([int(num_filters_total/2),6]), name = 'wd2')
bd2 = tf.Variable(tf.truncated_normal([6]), name = "bd2")
layer2 = tf.nn.xw_plus_b(layer1, wd2, bd2, name = 'layer2')
prediction = tf.nn.softmax(layer2)

In [18]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = layer2, labels = input_y))

In [19]:
optimizer = tf.train.AdamOptimizer(learning_rate = 0.5).minimize(loss)
#correct_prediction = tf.equal(tf.argmax(input_y, 1), tf.argmax(prediction, 1))
#accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Pre-process Test Data

In [20]:
df_test = pd.read_csv('test.csv')

In [21]:
test_input = df_test['comment_text']
for i in range(len(test_input)):
    txt = re.sub('[^\w\s\']|\d+','',test_input[i])
    txt = re.sub(r'\n|\s{2,}',' ',txt.lower())
    lst = txt.split()
    temp = []
    for word in lst:
        if word not in vocab:
            temp.append(0)
        else:
            temp.append(wordsequence[word])

    test_input[i] = temp
test_input = sequence.pad_sequences(test_input, maxlen = sequence_length)    

# Test data split to blocks

In [24]:
test_input.shape

(153164, 1403)

In [22]:
def blocks(data, block_size):
    data = np.array(data)
    data_size = len(data)
    nums = int((data_size-1)/block_size) + 1
    for block_num in range(nums):
        if block_num == 0:
            print("prediction start!")
        start_index = block_num * block_size
        end_index = min((block_num + 1) * block_size, data_size)
        print(end_index)
        yield data[start_index:end_index]

In [23]:
test_blocks = blocks(test_input,300)

# Train

In [24]:
init_op = tf.global_variables_initializer()

In [25]:
batches = generate_batch(list(zip(raw_input, df['toxic'], df['severe_toxic'], df['obscene'], df['threat'], df['insult'], df['identity_hate'])), batch_size, num_epochs)

In [None]:
with tf.Session() as sess:
    
    sess.run(init_op)
    
    for batch in batches:
        batch = pd.DataFrame(batch, columns = ['a','b','c','d','e','f','g'])
        x_batch = batch['a']
        y_batch = batch.loc[:, batch.columns != 'a']
        for i in range(len(x_batch)):
            txt = re.sub('[^\w\s\']|\d+','',x_batch[i])
            txt = re.sub(r'\n|\s{2,}',' ',txt.lower())
            lst = txt.split()
            temp = []
            for word in lst:
                if word not in vocab:
                    temp.append(0)
                else:
                    temp.append(wordsequence[word])
            x_batch[i] = temp
        x_batch = sequence.pad_sequences(x_batch, maxlen=sequence_length)
        #y_batch = np.array(y_batch).reshape(batch_size,6)
        _,c = sess.run([optimizer, loss],feed_dict = {input_x: x_batch, input_y: y_batch})
        
        with open('csvfile.csv', "w") as output:
            writer = csv.writer(output, lineterminator='\n')
            writer.writerow(['toxic','severe_toxic','obscene','threat','insult','identity_hate'])
            for block in test_blocks:
                pred = sess.run(prediction, feed_dict={input_x: block})
                writer.writerows(pred)

200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
10000
10200
10400
10600
10800
11000
11200
11400
11600
11800
12000
12200
12400
12600
12800
13000
13200
13400
13600
13800
14000
14200
14400
14600
14800
15000
15200
15400
15600
15800
16000
16200
16400
16600
16800
17000
17200
17400
17600
17800
18000
18200
18400
18600
18800
19000
19200
19400
19600
19800
20000
20200
20400
20600
20800
21000
21200
21400
21600
21800
22000
22200
22400
22600
22800
23000
23200
23400
23600
23800
24000
24200
24400
24600
24800
25000
25200
25400
25600
25800
26000
26200
26400
26600
26800
27000
27200
27400
27600
27800
28000
28200
28400
28600
28800
29000
29200
29400
29600
29800
30000
30200
30400
30600
30800
31000
31200
31400
31600
31800
32000
32200
32400
32600
32800
33000
33200
33400
33600
33800
34000
34200
34400
34600
34800
35000
352

In [3]:
df.head

<bound method NDFrame.head of                       id                                       comment_text  \
0       0000997932d777bf  Explanation\nWhy the edits made under my usern...   
1       000103f0d9cfb60f  D'aww! He matches this background colour I'm s...   
2       000113f07ec002fd  Hey man, I'm really not trying to edit war. It...   
3       0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...   
4       0001d958c54c6e35  You, sir, are my hero. Any chance you remember...   
5       00025465d4725e87  "\n\nCongratulations from me as well, use the ...   
6       0002bcb3da6cb337       COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK   
7       00031b1e95af7921  Your vandalism to the Matt Shirvington article...   
8       00037261f536c51d  Sorry if the word 'nonsense' was offensive to ...   
9       00040093b2687caa  alignment on this subject and which are contra...   
10      0005300084f90edc  "\nFair use rationale for Image:Wonju.jpg\n\nT...   
11      00054a5e18b50d

In [None]:
for num in range(len(raw_input)):
    txt = re.sub('[^\w\s\']|\d+','',raw_input[num])
    txt = re.sub(r'\n|\s{2,}',' ',txt)
    raw_input[num] = txt.lower().split()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
