In [1]:
import numpy as np
import pandas as pd
import random
import tensorflow as tf
from tqdm import tqdm

In [2]:
train_df=pd.read_pickle('./data/train_df.pickle')
test_df=pd.read_pickle('./data/test_df.pickle')


In [3]:
comment_lengths=list()
comment_list=train_df['text'].tolist()
for i in tqdm(range(len(comment_list))):
    comment=comment_list[i]
    temp1=len(comment.split())
    comment_lengths.append(temp1)

    

100%|████████████████████████████| 3600000/3600000 [00:25<00:00, 142602.81it/s]


In [4]:
comment_lengths_test=list()
comment_list_test=test_df['text'].tolist()
for i in tqdm(range(len(comment_list_test))):
    comment=comment_list_test[i]
    temp1=len(comment.split())
    comment_lengths_test.append(temp1)

100%|██████████████████████████████| 400000/400000 [00:02<00:00, 140844.08it/s]


In [5]:
#word2vec_simple_from_tf=np.load('./data/word2vec_simple.npz')
word2vec_simple_from_tf=np.load('./data/word2vec_1M_FT.npz')

In [6]:
embeddings=word2vec_simple_from_tf['arr_0']
word_dict=word2vec_simple_from_tf['arr_1'].tolist()
reverse_dict=word2vec_simple_from_tf['arr_2'].tolist()
embedding_size=embeddings.shape[1]
comment_vector_size=embedding_size*max(comment_lengths)
num_comments=len(comment_list)
train_labels=(train_df['label']).tolist()

In [7]:
comments_with_word_indices=np.zeros(shape=(num_comments,max(comment_lengths)),dtype='int32')
for i in tqdm(range(len(comment_list))):
    comment=comment_list[i]
    comment_words=comment.split()
    for j,word in enumerate(comment_words):
        comments_with_word_indices[i,j]=word_dict.get(word,0)

100%|█████████████████████████████| 3600000/3600000 [03:17<00:00, 18239.20it/s]


In [8]:
def generate_batch(batch_size):
    batch_inputs=np.zeros((batch_size,comment_vector_size))
    batch_labels=np.ndarray(shape=(batch_size,1),dtype=np.float64)
    batch_indices=random.sample(range(num_comments),batch_size)
    for i in range(batch_size):
        batch_labels[i]=train_labels[batch_indices[i]]-1
        comment_with_word_indices=comments_with_word_indices[batch_indices[i],:]
        for j,word_index in enumerate(comment_with_word_indices):
            if word_index==0:
                word_vector=np.zeros(embedding_size)
            else:
                word_vector=embeddings[word_index,:]
            batch_inputs[i,j*embedding_size:(j+1)*embedding_size]=word_vector
    return batch_inputs, batch_labels, batch_indices

In [10]:
#sanity check
a=generate_batch(2) 
b=a[0][0]
index1=a[2][0]
c=[o for o in b if o!=0]
print(len(c))
print(len(comment_list[index1].split()))
label1=a[1][0]
print(label1)
print(train_df['label'][index1])

38076
139
[ 1.]
2


In [17]:
import time
num_steps=100001
batch_size=64
graph=tf.Graph()
batch_timing=0
run_timing=0
restore_sess=1
#model_name="./ckpt/LR_word2vec.ckpt"
model_name="./ckpt/LR_word2vec_1M_FT.ckpt"
steps_display=min(int((num_steps-1)/20),1000)
with graph.as_default():
    inputs=tf.placeholder(tf.float64, shape=(batch_size,comment_vector_size))
    labels=tf.placeholder(tf.float64, shape=(batch_size,1))
    W = tf.Variable(tf.random_uniform([comment_vector_size, 1],-1.0,1.0,dtype=tf.float64))
    b = tf.Variable(tf.ones([1],dtype=tf.float64))
    y_pred=tf.matmul(inputs,W)+b
    x_entropy=tf.nn.sigmoid_cross_entropy_with_logits(logits=y_pred,labels=labels)
    loss =tf.reduce_mean(x_entropy)#+0.01*(tf.reduce_sum(tf.multiply(W,W))+tf.reduce_sum(tf.multiply(b,b)))
    optimizer = tf.train.AdamOptimizer().minimize(loss)
    init = tf.global_variables_initializer()
    saver=tf.train.Saver()    

with tf.Session(graph=graph) as sess:
    init.run()
    if restore_sess==1:
        try:
            saver.restore(sess,model_name)
        except:
            print('Unexpected Error: model cannot be restored')
    print('initialized')
    loss_val_sum=0
    for i in range(num_steps):
        time1=time.time()
        batch_inputs,batch_labels,_=generate_batch(batch_size)
        time2=time.time()
        feed_dict={inputs:batch_inputs,labels:batch_labels}
        _,loss_val,y_preds=sess.run([optimizer,loss,y_pred],feed_dict=feed_dict)
        time3=time.time()
        batch_timing += time2-time1
        run_timing += time3-time2
        loss_val_sum += loss_val
        if i % steps_display==0 and i!=0:
            print('Step:',i,',Average loss in ',steps_display, 'steps: ',loss_val_sum/steps_display,' ,batch time: ',batch_timing,' ,Run time: ',run_timing)
            batch_timing=0
            run_timing=0
            loss_val_sum=0
    Weights=W.eval()
    bias=b.eval()
    try:
        saver.save(sess,model_name)
        print('Model saved!')
    except:
        print('Model could not be saved')

INFO:tensorflow:Restoring parameters from ./ckpt/LR_word2vec_1M_FT.ckpt
initialized
Step: 1000 ,Average loss in  1000 steps:  0.366232348491  ,batch time:  65.36192917823792  ,Run time:  22.020044326782227
Step: 2000 ,Average loss in  1000 steps:  0.361558607394  ,batch time:  65.96992564201355  ,Run time:  21.88983392715454
Step: 3000 ,Average loss in  1000 steps:  0.372287089663  ,batch time:  65.52271389961243  ,Run time:  22.043843507766724
Step: 4000 ,Average loss in  1000 steps:  0.368673681609  ,batch time:  65.43512105941772  ,Run time:  22.08903741836548
Step: 5000 ,Average loss in  1000 steps:  0.361723040212  ,batch time:  65.74311780929565  ,Run time:  21.900840520858765
Step: 6000 ,Average loss in  1000 steps:  0.361959425925  ,batch time:  65.63792252540588  ,Run time:  21.87463617324829
Step: 7000 ,Average loss in  1000 steps:  0.363373453813  ,batch time:  65.25931644439697  ,Run time:  22.16904354095459
Step: 8000 ,Average loss in  1000 steps:  0.360461611408  ,batch t

In [18]:
num_comments_test=len(comment_list_test)
test_labels=(test_df['label']-1).tolist()
predictions=list()
for i in tqdm(range(num_comments_test)):
    comment_vector=np.zeros(shape=(comment_vector_size))
    comment=comment_list_test[i]
    comment_as_word=comment.split()
    for j,word in enumerate(comment_as_word):
        word_index=word_dict.get(word,0)
        if word_index==0:
            word_vector=np.zeros(embedding_size)
        else:
            word_vector=embeddings[word_index,:]
        comment_vector[j*embedding_size:(j+1)*embedding_size]=word_vector
    pred_logit=1 / (1 + np.exp(-(comment_vector.dot(Weights)+bias)))
    if pred_logit>0.5:
        pred=1
    else:
        pred=0
    predictions.append(pred)

100%|████████████████████████████████| 400000/400000 [02:40<00:00, 2486.74it/s]


In [19]:
y_true=test_labels
y_pred=predictions
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
accuracy=accuracy_score(y_true,y_pred)
precision=average_precision_score(y_true,y_pred)
recall=recall_score(y_true,y_pred)
f1_score=f1_score(y_true,y_pred)
print('Accuracy: {:4.2f}%, Precision: {:4.2f}%, Recall: {:4.2f}%, F1_score: {:4.2f}%'.format(accuracy*100,precision*100,recall*100,f1_score*100))

Accuracy: 84.42%, Precision: 78.94%, Recall: 84.90%, F1_score: 84.49%
