In [12]:
from __future__ import print_function # Use a function definition from future version (say 3.x from 2.7 interpreter)
import pandas as pd
import math
import numpy as np
import os
import time 

import cntk as C
import cntk.tests.test_utils
import pickle
import random
from cntk import sequence
from cntk import load_model
from cntk.device import try_set_default_device, gpu,cpu
from scipy.sparse import csr_matrix


from gensim.models import Word2Vec
cntk.tests.test_utils.set_device_from_pytest_env() # (only needed for our build system)
C.cntk_py.set_fixed_random_seed(1) # fix a random seed for CNTK components
try_set_default_device(gpu(0))


vocab_size = 80000
num_labels = 19#
title_size = 52000
body_size  = 210000
input_dim  = vocab_size
label_dim  = num_labels
emb_dim    = 300
hidden_dim = 200

max_length_title = 30
max_length_body  = 100
dropout_rate=0.5


suffix = "180days_all_body_shuffled"
prefix = "/home/t-haohu/IndustryClassifier/Data/"

data_token_body        = "{}/middle/{}_token_body.txt".format(prefix,suffix)
data_train_sample_body = "{}/middle/train_{}.txt".format(prefix,suffix)
data_test_sample_body  = "{}/middle/test_{}.txt".format(prefix,suffix)
data_test_sample_body_editor  = "{}/middle/test_{}_editor.txt".format(prefix,suffix)

data_title_sample    = "{}/ready/title_{}.wl".format(prefix,suffix)
#suffix = "180days_editor_body"
data_body_sample     = "{}/ready/body_{}.wl".format(prefix,suffix)
suffix = "180days_all_body_shuffled"
data_industry_sample = "{}/ready/industry_{}.wl".format(prefix,suffix)
suffix = "180days_all_body_shuffled"

def load_data_body(input_file,title_dict,body_dict,industry_dict):
    data = open(input_file, encoding = "utf-8").readlines()
    
    data_title = np.zeros((len(data),max_length_title),dtype = np.float32)
    data_body  = np.zeros((len(data),max_length_body),dtype = np.float32)
    data_label = np.zeros((len(data),1),dtype = np.float32)
    
    
    for index,line in enumerate(data):
        row = line.strip("\n").split("\t")
        
        title    =  row[0]
        body     =  row[1]
        industry =  row[2]
        
        for jndex,token in enumerate(title.split(" ")):
            if jndex>=max_length_title:
                break
            data_title[index,jndex]=title_dict.get(token,len(title_dict)-1)
            
        for jndex,token in enumerate(body.split(" ")):
            if jndex>=max_length_body:
                break
            data_body[index,jndex]=body_dict.get(token,len(title_dict)-1)
            
        data_label[index] = industry_dict.get(industry,len(industry_dict))
    return data_title,data_body,data_label
def load_embedding(dict_file,embedding_model_file):
    model = Word2Vec.load(embedding_model_file)
    dict_list = [x.strip("\n") for x in open(dict_file,encoding = 'utf-8').readlines()]
    embedding = np.zeros((len(dict_list),emb_dim))
    count = 0
    for i,w in enumerate(dict_list):
        try:
            vec = model.wv[w]
        except:
            vec=model.wv["UNK"]
            count+=1
        embedding[i] =vec
    print(count)
    return embedding

def get_context_left(current,previous,w_l,w_ls):
    left_c = current@w_l
    left_e = previous@w_ls
    left_h=left_c+left_e
    return C.relu(left_h)
def get_context_right(current,after,w_r,w_rs):
    right_c = current@w_r
    right_e = after@w_rs
    right_h =right_c+right_e
    return C.relu(right_h)

def create_model_rcnn(input_one_hot,max_length,embed = False,embedding=None):
    first_word = C.parameter(shape=(emb_dim))
    last_word = C.parameter(shape=(emb_dim))
    w_l,w_ls,w_r,w_rs = C.parameter(shape=(emb_dim,emb_dim)),C.parameter(shape=(emb_dim,emb_dim)),C.parameter(shape=(emb_dim,emb_dim)),C.parameter(shape=(emb_dim,emb_dim))
    #version 2 : 1 dense layer version3: sigmoid activation in dense
    if embed:
        h1= C.layers.Embedding(weights=embedding,name='embed_1')(input_one_hot)#
    else:
        h1= C.layers.Embedding(emb_dim,name='embed_2')(input_one_hot)#init=embedding,
    previous = first_word
    # h1 [batch*sentence_length*emb_dim]
    context_left_list = []
    for i in range(max_length):
        current = C.squeeze(h1[i])
        context_left_list.append(get_context_left(current,previous,w_l,w_ls))
        previous = current
        
    context_right_list = []
    after = last_word
    for i in reversed(range(max_length)):
        current = C.squeeze(h1[i])
        context_right_list.append(get_context_right(current,after,w_r,w_rs))
        after = current
    total_list = []
    for i in range(max_length_title):
        total_list.append(C.splice(h1[i],context_left_list[i],context_right_list[i]))
    h3=C.element_max(*total_list)
    return h3
def create_model_rcnn_body_2fold():
    h3_static_title = create_model_rcnn(input_xt_one_hot,max_length_title,embed=True,embedding=embedding_title)
    h3_dynamic_title = create_model_rcnn(input_xt_one_hot,max_length_title,embed=False)
    h3_static_body = create_model_rcnn(input_xb_one_hot,max_length_body,embed=True,embedding=embedding_body)
    h3_dynamic_body = create_model_rcnn(input_xb_one_hot,max_length_body,embed=False)
    h3 = C.splice(h3_static_title,h3_dynamic_title,h3_static_body,h3_dynamic_body)
    drop1 = C.layers.Dropout(dropout_rate)(h3)
    h4=C.layers.Dense(num_labels,name='hidden')(drop1)

    return h4
def create_model_rcnn_body():
    #h3_static_title = create_model_rcnn(input_xt_one_hot,max_length_title,embed=True,embedding=embedding_title)
    h3_dynamic_title = create_model_rcnn(input_xt_one_hot,max_length_title,embed=False)
    #h3_static_body = create_model_rcnn(input_xb_one_hot,max_length_body,embed=True,embedding=embedding_body)
    h3_dynamic_body = create_model_rcnn(input_xb_one_hot,max_length_body,embed=False)
    h3 = C.splice(h3_dynamic_title,h3_dynamic_body)
    drop1 = C.layers.Dropout(dropout_rate)(h3)
    h4=C.layers.Dense(num_labels,name='hidden')(drop1)

    return h4
def batch_iter(data,batch_size, num_epochs, shuffle=True):
    # Generates a batch iterator for a dataset.
    data_size = len(data)
    num_batches_per_epoch = int((data_size-1)/batch_size) + 1
    print('data_size: ', data_size, 'batch_size: ', batch_size, 'num_batches_per_epoch: ', num_batches_per_epoch)
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            random.shuffle(data)
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield data[start_index:end_index]
            

def fast_hist(a, b, n):
    k = (a >= 0) & (a < n)
    return np.bincount(n * a[k].astype(int) + b[k], minlength=n**2).reshape(n, n)

title_dict =     { x:i for i,x in enumerate([x.strip("\n") for x in open(data_title_sample).readlines()])}
body_dict  =     { x:i for i,x in enumerate([x.strip("\n") for x in open(data_body_sample ).readlines()])}
industry_dict =  { x:i for i,x in enumerate([x.strip("\n") for x in open(data_industry_sample).readlines()])}




input_xt = C.input_variable(shape=(max_length_title),  dtype=np.float32)
input_xb = C.input_variable(shape=(max_length_body) ,  dtype=np.float32)
input_y  = C.input_variable(shape=(1)               ,  dtype=np.int)

input_xt_one_hot = C.one_hot(input_xt, num_classes=len(title_dict)   ,  sparse_output=True)
input_xb_one_hot = C.one_hot(input_xb, num_classes=len(body_dict)+1    ,  sparse_output=True) 
input_y_one_hot = C.one_hot(input_y  , num_classes=len(industry_dict) ,  sparse_output=True)

#test_data_title, test_data_body, test_data_label  = load_data_body(data_test_sample_body,title_dict,body_dict,industry_dict)
#train_data_title,train_data_body,train_data_label = load_data_body(data_train_sample_body,title_dict,body_dict,industry_dict)

test_data  = load_data_body(data_test_sample_body,title_dict,body_dict,industry_dict)
train_data = load_data_body(data_train_sample_body,title_dict,body_dict,industry_dict)
#test_data_editor  = load_data_body(data_test_sample_body_editor,title_dict,body_dict,industry_dict)
#embedding_title = load_embedding(data_title_sample,"word2vec_title.model")
#embedding_body = load_embedding(data_body_sample,"word2vec_body.model")

def test_body(batch_size,model,data):
    scores = model(input_xt,input_xb)
    predict = C.argmax(scores,axis = 0)
    confuse = np.zeros((num_labels,num_labels))
    #C.element_add(input_y,C.element_times(predict,C.Constant([nums_labels])))
    test_data_title,test_data_body,test_data_label = data
    batches = batch_iter(list(zip(test_data_title,test_data_body,test_data_label)), batch_size, 1)
    
    for batch in batches:
        batch_data_title,batch_data_body,batch_data_label = zip(*batch) 
        output = np.array(predict.eval({input_xb: np.array(batch_data_body),input_xt: np.array(batch_data_title)}),dtype=np.int)
        gt = np.array(batch_data_label,dtype=np.int)
        confuse+=fast_hist(output,gt,num_labels)
    precision=np.diag(confuse)/np.sum(confuse,axis=0)
    recall = np.diag(confuse)/np.sum(confuse,axis=1)
    accuarcy = np.diag(confuse).sum() / confuse.sum()
    aver_precision=np.nanmean(precision)
    aver_recall = np.nanmean(recall)
   
    print("Precision:{} Recall:{} Acc:{}".format(aver_precision,aver_recall,accuarcy))
    return accuarcy


def train_body(train_data,num_epochs,learning_rate,batch_size,l2_weight=0,tag = "cnn"):
    
    #learning_rate *= batch_size
    model = create_model_rcnn_body()
    print(C.logging.get_node_outputs(model))
    scores = model(input_xt,input_xb)

    loss =C.reduce_mean(C.losses.cross_entropy_with_softmax(scores, input_y_one_hot))
    
    # Training
    lr_schedule = C.learning_parameter_schedule(learning_rate)
    #learner = C.adam(scores.parameters, lr=lr_schedule, momentum=0.9,l2_regularization_weight=0)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=num_epochs)
    momentums = C.momentum_schedule(0.99, minibatch_size=batch_size)
    learner = C.adam(parameters=scores.parameters,#model.parameters,
                     lr=lr_schedule,
                     momentum=momentums,
                     gradient_clipping_threshold_per_sample=15,
                     gradient_clipping_with_truncation=True,
                     l2_regularization_weight=l2_weight)
    trainer = C.Trainer(scores, (loss), [learner], progress_printer)
    
    train_data_title,train_data_body,train_data_label = train_data
    batches = batch_iter(list(zip(train_data_title,train_data_body,train_data_label)), batch_size, num_epochs)

    # training loop
    count = 0
    t = time.time()
    for batch in batches:
        count += 1
        batch_data_title,batch_data_body,batch_data_label = zip(*batch)
        #print(np.array(batch_data_body).shape)
        #batch_data_title,batch_data_body,batch_data_label = transfer(batch_data_title),transfer(batch_data_body),trainsfer(batch_data_label)
        trainer.train_minibatch({input_xb: np.array(batch_data_body),input_xt: np.array(batch_data_title), input_y: np.array(batch_data_label)})
        if count%1000== 0:
            print(count,time.time()-t)
            t=time.time()
            acc1=test_body(batch_size,model,test_data)
            #acc2=test_body(batch_size,model,test_data_editor)
            #
            # save model
            model.save('/home/t-haohu/IndustryClassifier/model/{}/{}_acc{:.3f}.dnn'.format(suffix,tag,acc1))
            #model.save('model/{}/{}_acc{:.3f}.dnn'.format(suffix,tag,acc1))
            #print(a)
            #model.save('./model/{}/{}_acc1{:.3f}_acc2{:.3f}.dnn'.format(suffix,tag,acc1,acc2))
    

In [None]:
train_body(train_data,num_epochs=20,learning_rate=[5e-4*30]*2+[1e-4*30],batch_size = 30,tag = "rcnn_body")

[Output('hidden', [#], [19]), Output('Block1122615_Output_0', [#], [1 x 1800]), Output('Splice1122595_Output_0', [#], [1 x 1800]), Output('Block1118309_Output_0', [#], [1 x 900]), Output('Block1118069_Output_0', [#], [1 x 900]), Output('Block1117941_Output_0', [#], [1 x 900]), Output('Block1117877_Output_0', [#], [1 x 900]), Output('Splice1117676_Output_0', [#], [1 x 900]), Output('Slice1117673_Output_0', [#], [1 x 300]), Output('embed_2', [#], [30 x 300]), Output('OneHotOp1116550_Output_0', [#], [30 x 56957]), Output('ReLU1116608_Output_0', [#], [300]), Output('Plus1116605_Output_0', [#], [300]), Output('Times1116599_Output_0', [#], [300]), Output('Squeeze1116596_Output_0', [#], [300]), Output('Slice1116593_Output_0', [#], [1 x 300]), Output('Times1116602_Output_0', [], [300]), Output('ReLU1117148_Output_0', [#], [300]), Output('Plus1117145_Output_0', [#], [300]), Output('Times1117139_Output_0', [#], [300]), Output('Squeeze1117136_Output_0', [#], [300]), Output('Slice1117133_Output_0'

data_size:  794568 batch_size:  30 num_batches_per_epoch:  26486
Learning rate per minibatch: 0.015
1000 90.61475992202759
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351




Precision:0.1480516268911817 Recall:0.6392925085355141 Acc:0.39573605849704874
2000 325.41649293899536
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.30949392649789914 Recall:0.7453396819986032 Acc:0.5738642704020204
3000 333.11036586761475
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.46907916577292413 Recall:0.7269570188077118 Acc:0.6831820984935248
4000 324.4453990459442
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.5375137421417711 Recall:0.7346452242951582 Acc:0.7281032508149061
5000 319.44288969039917
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.5741525843031858 Recall:0.7605196454896347 Acc:0.7539277009367751
6000 317.80192017555237
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.6036669767946661 Recall:0.7790596530280056 Acc:0.7715179279358647
7000 325.2304320335388
data_size:  340530 batch_size:  30 num_batches_per_epoch:  113

51000 340.2499358654022
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.8128950562768685 Recall:0.845240775189688 Acc:0.8688603059935982
52000 329.8313558101654
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.8143014470235848 Recall:0.8454443409295869 Acc:0.870504801339089
53000 323.29934525489807
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.8116983009816912 Recall:0.8482838727922591 Acc:0.8698851789856987
54000 329.6091811656952
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.8109537306454949 Recall:0.8466054353281097 Acc:0.8685284703256688
55000 334.68888783454895
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.8108598403875221 Recall:0.8480546137852762 Acc:0.8697559686371245
56000 324.22154808044434
data_size:  340530 batch_size:  30 num_batches_per_epoch:  11351
Precision:0.8142209336267088 Recall:0.8464125583295666 Acc:0.87024050744427