In [43]:
from dataset import *

data_files = ['D:/senior/aiCourse/dataSource/comment_classification/output/train.json']
vocab_file = 'D:/senior/aiCourse/dataSource/comment_classification/output/vocab.txt'
label_file = 'D:/senior/aiCourse/dataSource/comment_classification/labels.txt'
enb_file = 'D:/senior/aiCourse/dataSource/comment_classification/embedding/embedding.txt'
batch_size = 128
reverse = False
split_word = True
max_len = 1200

dataset1 = DataSet(  data_files,  vocab_file,   label_file,   batch_size, reverse=  reverse, split_word=  split_word, max_len=  max_len)

# vocab size:  50000
# vocab size:  20
# Start to preprocessing data...
# load data from D:/senior/aiCourse/dataSource/comment_classification/output/train.json ...
# Got 105000 data items with 820 batches


In [44]:
for i,(source, lengths, targets, _) in enumerate(dataset1.get_next()):
    print(len(source[9]))
    break

931


In [45]:
def dig_lists(l):
    output = []
    for e in l:
        if isinstance(e, list):
            output += dig_lists(e)
        else:
            output.append(e)
    return(output)

In [46]:
def pad_sequences(comment_to_id,maxlen,padding,truncating):
    features = np.zeros((len(comment_to_id), maxlen), dtype=int)
    for i,comment in enumerate(comment_to_id):
        if len(comment) <= maxlen and padding == 'pre':
            features[i, -len(comment):] = np.array(comment)[:maxlen]
        if len(comment) <= maxlen and padding == 'post':
            features[i, :len(comment)] = np.array(comment)[:maxlen]
        if len(comment) > maxlen and truncating == 'post':
            features[i, :] = np.array(comment)[:maxlen]
        if len(comment) > maxlen and truncating == 'pre':
            features[i, :] = np.array(comment)[len(comment)-maxlen:]           
    return features

def split_dataset(pad_comments,labels,split_frac):
    split_index = int(len(pad_comments)*split_frac)
    data_list = list(zip(pad_comments, labels))
    random.shuffle(data_list)
    pad_comments, labels = zip(*data_list)
    x_train, x_test = pad_comments[:split_index], pad_comments[split_index:]
    y_train, y_test = labels[:split_index], labels[split_index:]
    return x_train,y_train,x_test,y_test 

In [52]:
comment_to_id = [x[0] for x in dataset1._raw_data]
pad_comments = pad_sequences(comment_to_id,maxlen=1200,padding='post',truncating='post')

In [53]:
#emb_dict,emb_size = load_embed_file(enb_file)

In [54]:
y_test = [x[2].flatten().tolist() for x in dataset1._raw_data]

In [55]:
import random
x_train,y_train,x_test,y_test = split_dataset(pad_comments[:10000],y_test,0.8)

In [56]:
len(x_train[0])

1200

# Model

In [57]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback
from keras.layers import concatenate,GlobalMaxPooling1D,GlobalAveragePooling1D,SpatialDropout1D,Embedding, Input, Dense, LSTM, GRU, Bidirectional, TimeDistributed
from keras import backend as K
from keras.models import Model
import keras.layers as layers
from sklearn.metrics import roc_auc_score

In [58]:
batch_size = 64
epochs = 100
latent_dim = 256 # LSTM 的单元个数
num_samples = 10000 # 训练样本的大小

In [59]:
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test) 

In [60]:
def get_model():
    embed_size = 256
    inp = Input(shape=(1200, ))
    x = Embedding(input_dim=len(dataset1.w2i), output_dim=embed_size,)(inp)
    print(x)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(120, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(80, activation="relu")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [61]:
model = get_model()

Tensor("embedding_1/embedding_lookup/Identity:0", shape=(?, 1200, 256), dtype=float32)


In [62]:
print(model.summary())
history = model.fit(x_train, y_train,validation_data=(x_test, y_test), epochs=5,verbose=1, batch_size=100)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_449 (InputLayer)          (None, 1200)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1200, 256)    12800000    input_449[0][0]                  
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 1200, 256)    0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 1200, 240)    271440      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
global_ave

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

In [None]:

print(history.history.keys())
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')

plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

preds = model.predict(x_train)
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])
print(accuracy(preds,y_train))

In [None]:
def convert_2_labels(answer):
    labels_input = []
    for item in answer:
        answer_temp = []
        for i in range(0,80,4):
            answer_temp.append(np.argmax(item[i:i+4]))
        labels_input.append(answer_temp)
    return labels_input



In [None]:
convert_2_labels(preds)

In [None]:
coun=0
all1=0
for line in convert_2_labels(y_train):
    for item in line:
        if item == 3:
            coun+=1
        all1+=1

In [None]:
coun/all1