In [1]:
from __future__ import print_function # Use a function definition from future version (say 3.x from 2.7 interpreter)
import pandas as pd
import math
import numpy as np
import os
import time 

import cntk as C
import cntk.tests.test_utils
from cntk.layers import *
from cntk.layers.typing import *
import pickle
import random
from cntk import sequence
from cntk import load_model
from cntk.device import try_set_default_device, gpu,cpu
from scipy.sparse import csr_matrix

from gensim.models import Word2Vec
cntk.tests.test_utils.set_device_from_pytest_env() # (only needed for our build system)
C.cntk_py.set_fixed_random_seed(1) # fix a random seed for CNTK components
try_set_default_device(gpu(0))


vocab_size = 80000
num_labels = 19#19
title_size = 52000
body_size  = 210000
input_dim  = vocab_size
label_dim  = num_labels
emb_dim    = 300
hidden_dim = 200

max_length_title = 53
max_length_body  = 200

suffix = "180days_all_shuffled"
#suffix = "linkedin_only"
prefix = "/home/t-haohu/IndustryClassifier/Data/"

#data_token_body        = "{}/middle/{}_token_body.txt".format(prefix,suffix)
data_train_sample = "{}/middle/train_{}.txt".format(prefix,suffix)
#data_train_sample = "{}/middle/train_{}_with_linkedin_all.txt".format(prefix,suffix)
data_test_sample  = "{}/middle/test_{}.txt".format(prefix,suffix)
#data_test_sample_editor  = "{}/middle/test_{}_editor.txt".format(prefix,suffix)

data_title_sample    = "{}/ready/title_{}.wl".format(prefix,suffix)
data_body_sample     = "{}/ready/body_{}.wl".format(prefix,suffix)
suffix = "180days_all_shuffled"
data_industry_sample = "{}/ready/industry_{}.wl".format(prefix,suffix)
filter_num=200 
dropout_rate = 0.5
emb_dim =300

def load_data(input_file,title_dict,industry_dict):
    data = open(input_file, encoding = "utf-8").readlines()
    
    data_title =np.zeros((len(data),max_length_title),dtype = np.float32)
    data_label = np.zeros((len(data),1),dtype = np.float32)
       
    for index,line in enumerate(data):
        row = line.strip("\n").split("\t")       
        title    =  row[0]
        industry =  row[1]
        
        for jndex,token in enumerate(title.split(" ")):
            if jndex>=max_length_title:
                break
            data_title[index,jndex]=title_dict.get(token,len(title_dict)-1)    
        data_label[index] = industry_dict.get(industry,len(industry_dict))
    return data_title,data_label


def load_embedding(title_file,embedding_model_file):
    model = Word2Vec.load(embedding_model_file)
    title_list = [x.strip("\n") for x in open(title_file,encoding = 'utf-8').readlines()]
    embedding = np.zeros((len(title_list),emb_dim))
    count = 0
    for i,w in enumerate(title_list):
        try:
            vec = model.wv[w]
        except:
            vec=model.wv["UNK"]
            count+=1
        embedding[i] =vec
    print(count)
    return embedding


def create_model_cnn(embed = False):
    #version 2 : 1 dense layer version3: sigmoid activation in dense
    if embed:
        h1= C.layers.Embedding(weights=embedding,name='embed_1')(input_xt_one_hot)#
    else:
        h1= C.layers.Embedding(emb_dim,name='embed_2')(input_xt_one_hot)#init=embedding,
    print(h1)
    h2=C.layers.Convolution((3,emb_dim),num_filters=filter_num,reduction_rank=0,activation=C.relu)(h1)
    h3=C.squeeze(C.layers.MaxPooling((3,1),strides =3,name='pooling_1')(h2))
    print(h3)
    #h2=BiRecurrence(C.layers.LSTM(hidden_dim), C.layers.LSTM(hidden_dim))(h1)
    h4 = C.layers.Convolution((filter_num,3),num_filters=filter_num,reduction_rank=0,activation=C.relu)(h3)
    print(h4)
    h5 = C.squeeze(C.layers.MaxPooling((1,3),strides =3,name='pooling_2')(h4))
    print(h5)
    h6 = C.layers.Convolution((filter_num,3),num_filters=filter_num,reduction_rank=0,activation=C.relu)(h5)
    print(h6)
    h7 = C.squeeze(C.layers.MaxPooling((1,3),strides =3,name='pooling_3')(h6))
    print(h7)
    drop1 = C.layers.Dropout(dropout_rate)(h7)
    #h8=C.layers.Dense(hidden_dim,name='hidden')(drop1)
    logits = C.layers.Dense(num_labels,name='hidden')(drop1)
    return logits

def create_model_cnn_2fold():
    #version 2 : 1 dense layer version3: sigmoid activation in dense
    #
    with C.layers.default_options(initial_state=0.1):


        h1_1= C.layers.Embedding(weights=embedding,name='embed_1')(input_xt_one_hot)#
        h1_2= C.layers.Embedding(300,name='embed_2')(input_xt_one_hot)#init=embedding,
        
        h1_1_expand = C.expand_dims(h1_1,-3)
        h1_2_expand = C.expand_dims(h1_2,-3)
        
        h1 = C.splice(h1_1_expand,h1_2_expand,axis = -3)
   #value,valid = to_static(h1)

        filter_num=100

        h2_1=C.layers.Convolution((3,emb_dim),num_filters=filter_num,reduction_rank=1,activation=C.relu)(h1)
        h2_2=C.layers.Convolution((4,emb_dim),num_filters=filter_num,reduction_rank=1,activation=C.relu)(h1)
        h2_3=C.layers.Convolution((5,emb_dim),num_filters=filter_num,reduction_rank=1,activation=C.relu)(h1)

        h3_1=C.layers.MaxPooling((max_length_title-2,1),name='pooling_1')(h2_1)
        h3_2=C.layers.MaxPooling((max_length_title-3,1),name='pooling_2')(h2_2)
        h3_3=C.layers.MaxPooling((max_length_title-4,1),name='pooling_3')(h2_3)

        h3=C.splice(h3_2,h3_1,h3_3,axis=0)
        drop1 =C.layers.Dropout(0.5)(h3)
        h4=C.layers.Dense(num_labels,name='hidden')(drop1)
    return h4


def batch_iter(data,batch_size, num_epochs, shuffle=True):
    # Generates a batch iterator for a dataset.
    data_size = len(data)
    num_batches_per_epoch = int((data_size-1)/batch_size) + 1
    print('data_size: ', data_size, 'batch_size: ', batch_size, 'num_batches_per_epoch: ', num_batches_per_epoch)
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            random.shuffle(data)
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield data[start_index:end_index]
            

def fast_hist(a, b, n):
    k = (a >= 0) & (a < n)
    return np.bincount(n * a[k].astype(int) + b[k], minlength=n**2).reshape(n, n)

title_dict =     { x:i for i,x in enumerate([x.strip("\n") for x in open(data_title_sample).readlines()])}
industry_dict =  { x:i for i,x in enumerate([x.strip("\n") for x in open(data_industry_sample).readlines()])}




input_xt = C.input_variable(shape=(max_length_title),  dtype=np.float32)
#input_xt = C.input_variable(**Sequence[Tensor[1]])
input_y  = C.input_variable(shape=(1)               ,  dtype=np.int)

input_xt_one_hot = C.one_hot(input_xt, num_classes=len(title_dict)   ,  sparse_output=True)
input_y_one_hot = C.one_hot(input_y  , num_classes=len(industry_dict) ,  sparse_output=True)


test_data  = load_data(data_test_sample,title_dict,industry_dict)
train_data = load_data(data_train_sample,title_dict,industry_dict)
#test_data_editor  = load_data(data_test_sample_editor,title_dict,industry_dict)
embedding = load_embedding(data_title_sample,"word2vec.model")
def test(batch_size,model,data):
    scores = model(input_xt)
    predict = C.argmax(scores,axis = 0)
    confuse = np.zeros((num_labels,num_labels))

    test_data_title,test_data_label = data
    batches = batch_iter(list(zip(test_data_title,test_data_label)), batch_size, 1)
    
    for batch in batches:
        batch_data_title,batch_data_label = zip(*batch) 
        output = np.array(predict.eval({input_xt: np.array(batch_data_title)}),dtype=np.int)
        gt = np.array(batch_data_label,dtype=np.int)
        confuse+=fast_hist(output,gt,num_labels)
        
    precision=np.diag(confuse)/np.sum(confuse,axis=0)
    recall = np.diag(confuse)/np.sum(confuse,axis=1)
    accuarcy = np.diag(confuse).sum() / confuse.sum()
    aver_precision=np.nanmean(precision)
    aver_recall = np.nanmean(recall)
   
    print("Precision:{} Recall:{} Acc:{}".format(aver_precision,aver_recall,accuarcy))
    return accuarcy
def train(train_data,num_epochs,learning_rate,batch_size,tag="CNN",l2_weight=0):
    global model
    #learning_rate *= batch_size
    model = create_model_cnn()
    print(C.logging.get_node_outputs(model))
    scores = model(input_xt)

    loss =C.reduce_mean(C.losses.cross_entropy_with_softmax(scores, input_y_one_hot))
    
    # Training
    lr_schedule = C.learning_parameter_schedule(learning_rate)
    #learner = C.adam(scores.parameters, lr=lr_schedule, momentum=0.9,l2_regularization_weight=0)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=num_epochs)
    momentums = C.momentum_schedule(0.99, minibatch_size=batch_size)
    learner = C.adam(parameters=scores.parameters,#model.parameters,
                     lr=lr_schedule,
                     momentum=momentums,
                     gradient_clipping_threshold_per_sample=15,
                     gradient_clipping_with_truncation=True,
                     l2_regularization_weight=l2_weight)
    trainer = C.Trainer(scores, (loss), [learner], progress_printer)
    
    train_data_title,train_data_label = train_data
    batches = batch_iter(list(zip(train_data_title,train_data_label)), batch_size, num_epochs)

    # training loop
    count = 0
    t = time.time()
    for batch in batches:
        count += 1
        batch_data_title,batch_data_label = zip(*batch)
        batch_data_title = list(batch_data_title)
        #print(type(batch_data_title),type(batch_data_title[0]),batch_data_title[0])
        trainer.train_minibatch({input_xt: np.array(batch_data_title), input_y: np.array(batch_data_label)})
        if count%1000== 0:
            print(count,time.time()-t)
            t=time.time()
            acc1=test(batch_size,model,test_data)
            #acc2=test(batch_size,model,test_data_editor)
            
            # save model
            model.save('./model/{}/{}_acc{:.3f}.dnn'.format(suffix,tag,acc1))
            #model.save('./model/{}/{}_acc1{:.3f}_acc2{:.3f}.dnn'.format(suffix,tag,acc1,acc2))
    

    # save model
    


0


In [2]:
train(train_data,num_epochs=20,learning_rate=[5e-4*150]*2+[1e-4*150],batch_size = 150,tag = "DeepCNN")

embed_2: Composite(Tensor[53]) -> Tensor[53,300]
Composite(Tensor[53]) -> Tensor[200,17]
Composite(Tensor[53]) -> Tensor[200,1,15]
Composite(Tensor[53]) -> Tensor[200,5]
Composite(Tensor[53]) -> Tensor[200,1,3]
Composite(Tensor[53]) -> Tensor[200]
[Output('hidden', [#], [19]), Output('Block467_Output_0', [#], [200]), Output('Squeeze447_Output_0', [#], [200]), Output('pooling_3', [#], [200 x 1 x 1]), Output('Block323_Output_0', [#], [200 x 1 x 3]), Output('Squeeze280_Output_0', [#], [200 x 5]), Output('pooling_2', [#], [200 x 1 x 5]), Output('Block186_Output_0', [#], [200 x 1 x 15]), Output('Squeeze143_Output_0', [#], [200 x 17]), Output('pooling_1', [#], [200 x 17 x 1]), Output('Block79_Output_0', [#], [200 x 51 x 1]), Output('embed_2', [#], [53 x 300]), Output('OneHotOp5_Output_0', [#], [53 x 56178])]
data_size:  768505 batch_size:  150 num_batches_per_epoch:  5124
Learning rate per minibatch: 0.075
1000 395.11466217041016
data_size:  329360 batch_size:  150 num_batches_per_epoch:  21



Precision:0.11568188416023845 Recall:0.31713140950901714 Acc:0.33024653874180226
2000 464.9960551261902
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.32333796125933867 Recall:0.4414429827028993 Acc:0.5739768034976925
3000 459.36325573921204
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.5446737076081135 Recall:0.6495704613583734 Acc:0.7347886810784552
4000 469.9437847137451
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.6457276577045178 Recall:0.7437508973532624 Acc:0.7842755647316006
5000 458.44051790237427
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.6993789015861225 Recall:0.7785658015061736 Acc:0.8093727228564489
6000 375.83518743515015
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.714438190337679 Recall:0.7747835829005495 Acc:0.8141729414622297
7000 462.87565302848816
data_size:  329360 batch_size:  150 num_batches_per_epoch:  21

51000 458.2557575702667
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.762288361148806 Recall:0.7794040497761026 Acc:0.8268581491377216
52000 445.0088653564453
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.7606745998669724 Recall:0.7860599951952074 Acc:0.8271374787466602
53000 482.72023367881775
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.7602552760038028 Recall:0.7819526123841675 Acc:0.826782244352684
54000 474.8390634059906
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.7622353934004938 Recall:0.7844927443874538 Acc:0.8296969880981298
55000 412.5068564414978
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.76443925729589 Recall:0.7819506313879447 Acc:0.8267154481418508
56000 424.58861541748047
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.7621224687021954 Recall:0.7786533098083885 Acc:0.8247844304104931


Precision:0.7576715000502651 Recall:0.772451496320058 Acc:0.821657153266942
101000 455.4182951450348
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.755478167334676 Recall:0.7775382514180663 Acc:0.8225801554529998
102000 473.5091173648834
data_size:  329360 batch_size:  150 num_batches_per_epoch:  2196
Precision:0.7614014442276719 Recall:0.7697638715350259 Acc:0.8184630799125577
