In [3]:
from __future__ import print_function # Use a function definition from future version (say 3.x from 2.7 interpreter)
import pandas as pd
import math
import numpy as np
import os
import time 

import cntk as C
import cntk.tests.test_utils
from cntk.layers import *
from cntk.layers.typing import *
import pickle
import random
from cntk import sequence
from cntk import load_model
from cntk.device import try_set_default_device, gpu,cpu
from scipy.sparse import csr_matrix

from gensim.models import Word2Vec
cntk.tests.test_utils.set_device_from_pytest_env() # (only needed for our build system)
C.cntk_py.set_fixed_random_seed(1) # fix a random seed for CNTK components
try_set_default_device(gpu(0))


vocab_size = 80000
num_labels = 19#19
title_size = 52000
body_size  = 210000
input_dim  = vocab_size
label_dim  = num_labels
emb_dim    = 300
hidden_dim = 200

max_length_title = 30
max_length_body  = 100



suffix = "180days_all_shuffled_with_linkedin"
#suffix = "linkedin_only"
prefix = "/home/t-haohu/IndustryClassifier/Data/"


data_train_sample = "{}/middle/train_{}.txt".format(prefix,suffix)
#data_train_sample = "{}/middle/train_{}_with_linkedin_all.txt".format(prefix,suffix)
suffix = "180days_all_shuffled_with_linkedin"
data_test_sample  = "{}/middle/test_{}.txt".format(prefix,suffix)
#data_test_sample_editor  = "{}/middle/test_{}_editor.txt".format(prefix,suffix)

suffix = "180days_all_shuffled_with_linkedin"
data_title_sample    = "{}/ready/title_{}.wl".format(prefix,suffix)
data_body_sample     = "{}/ready/body_{}.wl".format(prefix,suffix)
suffix = "180days_all_shuffled"
data_industry_sample = "{}/ready/industry_{}.wl".format(prefix,suffix)
suffix = "180days_all_shuffled_with_linkedin"
filter_num=200 
dropout_rate = 0.5
emb_dim =300

def load_data_dynamic(input_file,title_dict,industry_dict):
    data = open(input_file, encoding = "utf-8").readlines()
    
    data_title =[ [] for x in range(len(data))]#np.zeros((len(data),max_length_title),dtype = np.float32)
    data_label = np.zeros((len(data),1),dtype = np.float32)
    
    
    for index,line in enumerate(data):
        row = line.strip("\n").split("\t")       
        title    =  row[0]
        industry =  row[1]
        
        for jndex,token in enumerate(title.split(" ")):
            if jndex>=max_length_title:
                break
            data_title[index].append(title_dict.get(token,len(title_dict)-1))
        while len(data_title[index])<5:
            data_title[index].append(len(title_dict)-1)
        data_label[index] = industry_dict.get(industry,len(industry_dict))
    data_title = [ np.array(x) for x in data_title]
    return data_title,data_label

def load_data_static(input_file,title_dict,industry_dict):
    data = open(input_file, encoding = "utf-8").readlines()
    
    data_title =np.zeros((len(data),max_length_title),dtype = np.float32)
    data_label = np.zeros((len(data),1),dtype = np.float32)
    
    
    for index,line in enumerate(data):
        row = line.strip("\n").split("\t")       
        title    =  row[0]
        industry =  row[1]
        
        for jndex,token in enumerate(title.split(" ")):
            if jndex>=max_length_title:
                break
            data_title[index,jndex]=title_dict.get(token,len(title_dict)-1)    
        data_label[index] = industry_dict.get(industry,len(industry_dict))
    return data_title,data_label

def load_embedding(title_file,embedding_model_file):
    model = Word2Vec.load(embedding_model_file)
    title_list = [x.strip("\n") for x in open(title_file,encoding = 'utf-8').readlines()]
    embedding = np.zeros((len(title_list),emb_dim))
    count = 0
    for i,w in enumerate(title_list):
        try:
            vec = model.wv[w]
        except:
            vec=model.wv["UNK"]
            count+=1
        embedding[i] =vec
    print(count)
    return embedding

def create_model_cnn(embed = False):
    #version 2 : 1 dense layer version3: sigmoid activation in dense
    if embed:
        h1= C.layers.Embedding(weights=embedding,name='embed_1')(input_xt_one_hot)#
    else:
        h1= C.layers.Embedding(emb_dim,name='embed_2')(input_xt_one_hot)#init=embedding,

    

    h2_1=C.layers.Convolution((1,emb_dim),num_filters=filter_num,reduction_rank=0,activation=C.relu)(h1)
    h2_2=C.layers.Convolution((2,emb_dim),num_filters=filter_num,reduction_rank=0,activation=C.relu)(h1)
    h2_3=C.layers.Convolution((3,emb_dim),num_filters=filter_num,reduction_rank=0,activation=C.relu)(h1)
    
    h3_1=C.layers.MaxPooling((max_length_title-0,1),name='pooling_1')(h2_1)
    h3_2=C.layers.MaxPooling((max_length_title-1,1),name='pooling_2')(h2_2)
    h3_3=C.layers.MaxPooling((max_length_title-2,1),name='pooling_3')(h2_3)
    #h2=BiRecurrence(C.layers.LSTM(hidden_dim), C.layers.LSTM(hidden_dim))(h1)
    h3=C.splice(h3_2,h3_1,h3_3,axis=0)
    drop1 = C.layers.Dropout(dropout_rate)(h3)
    h4=C.layers.Dense(300,name='hidden')(drop1)
    h5=C.layers.Dense(num_labels,name='hidden')(h4)
 
    return h5

def create_model_cnn_dynamic(embed = False):
    #version 2 : 1 dense layer version3: sigmoid activation in dense
    if embed:
        h1= C.layers.Embedding(weights=embedding,name='embed_1')(input_xt_one_hot)#
    else:
        h1= C.layers.Embedding(emb_dim,name='embed_2')(input_xt_one_hot)#init=embedding,

    h1 = C.squeeze(h1)
    print(h1)
    h2_1=C.layers.Convolution((1,emb_dim),num_filters=filter_num,reduction_rank=0,activation=C.relu,sequential=True)(h1)
    h2_2=C.layers.Convolution((2,emb_dim),num_filters=filter_num,reduction_rank=0,activation=C.relu,sequential=True)(h1)
    h2_3=C.layers.Convolution((3,emb_dim),num_filters=filter_num,reduction_rank=0,activation=C.relu,sequential=True)(h1)
    seq_MaxPooling = C.layers.Fold(C.element_max)
    h3_1=seq_MaxPooling(h2_1)
    h3_2=seq_MaxPooling(h2_2)
    h3_3=seq_MaxPooling(h2_3)
    #h2=BiRecurrence(C.layers.LSTM(hidden_dim), C.layers.LSTM(hidden_dim))(h1)
    h3=C.splice(h3_2,h3_1,h3_3,axis=0)
    drop1 = C.layers.Dropout(dropout_rate)(h3)
    h4=C.layers.Dense(num_labels,name='hidden')(drop1)

    return h4

def create_model_cnn_2fold(dynamic = False):
    #version 2 : 1 dense layer version3: sigmoid activation in dense
    #
    with C.layers.default_options(initial_state=0.1):


        h1_1= C.layers.Embedding(weights=embedding,name='embed_1')(input_xt_one_hot)#
        h1_2= C.layers.Embedding(300,name='embed_2')(input_xt_one_hot)#init=embedding,
        
        
        
        h1_1_expand = C.expand_dims(h1_1,-3)
        h1_2_expand = C.expand_dims(h1_2,-3)
        
        h1 = C.splice(h1_1_expand,h1_2_expand,axis = -3)
        
        #bn = C.layers.BatchNormalization(name='bn')(h1)
        

        #value,valid = to_static(h1)

        filter_num=100

        h2_1=C.layers.Convolution((3,emb_dim),num_filters=filter_num,reduction_rank=1,activation=C.relu)(h1)
        h2_2=C.layers.Convolution((4,emb_dim),num_filters=filter_num,reduction_rank=1,activation=C.relu)(h1)
        h2_3=C.layers.Convolution((5,emb_dim),num_filters=filter_num,reduction_rank=1,activation=C.relu)(h1)
        if dynamic:
            seq_MaxPooling = C.layers.Fold(C.element_max)
            h3_1=seq_MaxPooling(h2_1)
            h3_2=seq_MaxPooling(h2_2)
            h3_3=seq_MaxPooling(h2_3)
        else:
            h3_1=C.layers.MaxPooling((max_length_title-2,1),name='pooling_1')(h2_1)
            h3_2=C.layers.MaxPooling((max_length_title-3,1),name='pooling_2')(h2_2)
            h3_3=C.layers.MaxPooling((max_length_title-4,1),name='pooling_3')(h2_3)
        
        h3=C.splice(h3_2,h3_1,h3_3,axis=0)
        drop1 =C.layers.Dropout(0.5)(h3)
        h4=C.layers.Dense(num_labels,name='hidden')(drop1)

    return h4

def batch_iter(data,batch_size, num_epochs, shuffle=True):
    # Generates a batch iterator for a dataset.
    data_size = len(data)
    num_batches_per_epoch = int((data_size-1)/batch_size) + 1
    print('data_size: ', data_size, 'batch_size: ', batch_size, 'num_batches_per_epoch: ', num_batches_per_epoch)
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            random.shuffle(data)
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield data[start_index:end_index]
            

def fast_hist(a, b, n):
    k = (a >= 0) & (a < n)
    return np.bincount(n * a[k].astype(int) + b[k], minlength=n**2).reshape(n, n)

title_dict =     { x:i for i,x in enumerate([x.strip("\n") for x in open(data_title_sample).readlines()])}
industry_dict =  { x:i for i,x in enumerate([x.strip("\n") for x in open(data_industry_sample).readlines()])}




#input_xt = C.input_variable(shape=(max_length_title),  dtype=np.float32)
input_xt = C.input_variable(**Sequence[Tensor[1]])
input_y  = C.input_variable(shape=(1))

input_xt_one_hot = C.one_hot(input_xt, num_classes=len(title_dict)   ,  sparse_output=True)
input_y_one_hot = C.one_hot(input_y  , num_classes=len(industry_dict) ,  sparse_output=True)


test_data  = load_data_dynamic(data_test_sample,title_dict,industry_dict)
train_data = load_data_dynamic(data_train_sample,title_dict,industry_dict)
#test_data_editor  = load_data(data_test_sample_editor,title_dict,industry_dict)
embedding = load_embedding(data_title_sample,"word2vec.model")


def test(batch_size,model,data):
    scores = model(input_xt)
    predict = C.argmax(scores,axis = 0)
    confuse = np.zeros((num_labels,num_labels))

    test_data_title,test_data_label = data
    batches = batch_iter(list(zip(test_data_title,test_data_label)), batch_size, 1)
    
    for batch in batches:
        batch_data_title,batch_data_label = zip(*batch) 
        batch_data_title = [[[x] for x in y] for y in list(batch_data_title)] 
        batch_data_label = [x.tolist() for x in batch_data_label]
        output = np.array(predict.eval({input_xt: batch_data_title}),dtype=np.int)
        gt = np.array(batch_data_label,dtype=np.int)
        confuse+=fast_hist(output,gt,num_labels)
        
    precision=np.diag(confuse)/np.sum(confuse,axis=0)
    recall = np.diag(confuse)/np.sum(confuse,axis=1)
    accuarcy = np.diag(confuse).sum() / confuse.sum()
    aver_precision=np.nanmean(precision)
    aver_recall = np.nanmean(recall)
   
    print("Precision:{} Recall:{} Acc:{}".format(aver_precision,aver_recall,accuarcy))
    return accuarcy



def train(train_data,num_epochs,learning_rate,batch_size,tag="CNN",l2_weight=0):
    global model
    #learning_rate *= batch_size
    model = create_model_cnn_dynamic()
    print(C.logging.get_node_outputs(model))
    scores = model(input_xt)
    loss =C.reduce_mean(C.losses.cross_entropy_with_softmax(scores, input_y_one_hot))
    
    # Training
    lr_schedule = C.learning_parameter_schedule(learning_rate)
    #learner = C.adam(scores.parameters, lr=lr_schedule, momentum=0.9,l2_regularization_weight=0)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=num_epochs)
    momentums = C.momentum_schedule(0.99, minibatch_size=batch_size)
    learner = C.adam(parameters=scores.parameters,#model.parameters,
                     lr=lr_schedule,
                     momentum=momentums,
                     gradient_clipping_threshold_per_sample=15,
                     gradient_clipping_with_truncation=True,
                     l2_regularization_weight=l2_weight)
    trainer = C.Trainer(scores, (loss), [learner], progress_printer)
    
    train_data_title,train_data_label = train_data
    batches = batch_iter(list(zip(train_data_title,train_data_label)), batch_size, num_epochs)

    # training loop
    count = 0
    t = time.time()
    for batch in batches:
        count += 1
        batch_data_title,batch_data_label = zip(*batch)
        batch_data_title = [[[x] for x in y] for y in list(batch_data_title)]
        batch_data_label = [x.tolist() for x in batch_data_label]
        #print(batch_data_label[0])
        trainer.train_minibatch({input_xt: batch_data_title, input_y: batch_data_label})
        if count%200== 0:
            print(count,time.time()-t)
            t=time.time()
            acc1=test(batch_size,model,test_data)
            #acc2=test(batch_size,model,test_data_editor)

            model.save('./model/{}/{}_acc{:.3f}.dnn'.format(suffix,tag,acc1))
            #model.save('./model/{}/{}_acc1{:.3f}_acc2{:.3f}.dnn'.format(suffix,tag,acc1,acc2))


8633


In [4]:
train(train_data,num_epochs=20,learning_rate=[5e-4*150]*2+[1e-4*150],batch_size = 150,tag = "cnn")

Composite(Sequence[Tensor[1]]) -> Sequence[Tensor[300]]
[Output('hidden', [#], [19]), Output('Block3465_Output_0', [#], [600 x 1]), Output('Splice3445_Output_0', [#], [600 x 1]), Output('Block3219_Output_0', [#], [200 x 1]), Output('Block3192_Output_0', [#, defaultDynamicAxis_times_1_minus_1], [200 x 1]), Output('PastValue3176_Output_0', [#, defaultDynamicAxis_times_1_minus_1], [200 x 1]), Output('Block2562_Output_0', [#, defaultDynamicAxis_times_1_minus_1], [200 x 1]), Output('Squeeze2328_Output_0', [#, *], [300]), Output('embed_2', [#, *], [1 x 300]), Output('OneHotOp2294_Output_0', [#, *], [1 x 83110]), Output('Block3125_Output_0', [#], [200 x 1]), Output('Block3098_Output_0', [#, *], [200 x 1]), Output('PastValue3082_Output_0', [#, *], [200 x 1]), Output('Block2381_Output_0', [#, *], [200 x 1]), Output('Block3345_Output_0', [#], [200 x 1]), Output('Block3318_Output_0', [#, defaultDynamicAxis_times_1_minus_2], [200 x 1]), Output('PastValue3302_Output_0', [#, defaultDynamicAxis_times

  'training.' % var.uid)


Learning rate per minibatch: 0.075
200 61.502437114715576
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934




Precision:0.1361307187925286 Recall:0.5643236335532786 Acc:0.38771719895508555
400 272.73400807380676
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.35812596154097986 Recall:0.65881970251418 Acc:0.5608381787028455
600 248.49844074249268
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.4907402781370913 Recall:0.7019090642211341 Acc:0.6516828178983737
800 239.59740138053894
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.5583798503807208 Recall:0.7273635547544806 Acc:0.6836100109547485
1000 296.1381106376648
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.5944641200932337 Recall:0.7275689259357517 Acc:0.6988668857616359
1200 241.86806511878967
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.6164044229189977 Recall:0.7562862659970515 Acc:0.7092194039493273
1400 295.40894532203674
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Pr

10200 252.5757839679718
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.7309455854856599 Recall:0.7671872454851915 Acc:0.7609134574871492
10400 270.07917761802673
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.7278627556655896 Recall:0.7641673689165552 Acc:0.7583202719024746
10600 250.7258596420288
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.7276227353263903 Recall:0.7670061374417999 Acc:0.7584258869132889
10800 249.7187876701355
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.7295713928723506 Recall:0.7670106246670241 Acc:0.7609100867953147
11000 237.6287899017334
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.7297974492781508 Recall:0.7655966735101581 Acc:0.759475295637763
11200 277.68195104599
data_size:  890025 batch_size:  150 num_batches_per_epoch:  5934
Precision:0.7284239324396786 Recall:0.7663271521522507 Acc:0.7612909749726131


KeyboardInterrupt: 

RuntimeError: SWIG director method error.

In [5]:
from cntk import load_model
from data_processor import *
process_setting(low=False,old = True,stop = False)
input_xt = C.input_variable(**Sequence[Tensor[1]])
batch_size = 20
def inference(model,val_doc_file,output_file,title_dict,industry_file):
    
    scores = model(input_xt)
    predict = C.argmax(scores,axis = 0)
    probability = C.reduce_max(C.softmax(scores),axis = 0)
    
    industry = [x.strip("\n") for x in open(industry_file,encoding ="utf-8").readlines()]
    val_doc = open(val_doc_file,encoding = "utf-8")
    output = open(output_file,"w",encoding = "utf-8")
    val_doc = [tokenize(x.strip("\n").split("\t")[0]) for x in val_doc.readlines()]
    print(val_doc[0:5])
    data_title = [[] for x  in range(len(val_doc))]
    
    for index,title in enumerate(val_doc):       
            
        for jndex,token in enumerate(title.split(" ")):
            if jndex>=max_length_title:
                break
            data_title[index].append(title_dict.get(token,len(title_dict)-1))
        while len(data_title[index])<5:
            data_title[index].append(len(title_dict)-1)        
    batches = batch_iter(data_title, batch_size, 1,shuffle =False)
    for batch in batches:
        batch_data_title = batch
        pred = np.array(predict.eval({input_xt: batch_data_title}),dtype=np.int)
        prob = np.array(probability.eval({input_xt: batch_data_title}),dtype=np.float32)
        #gt = np.array(batch_data_label,dtype=np.int)
        #confuse+=fast_hist(output,gt,num_labels)
        for pre,pro in list(zip(pred,prob)):
            output.write("\t".join([str(industry[int(pre)]),str(pro[0])])+"\n")
    output.close()
    
model_list =["model/180days_all_shuffled_with_linkedin/cnn_acc0.764.dnn","model/180days_editor_shuffled/cnn_editor_acc0.617.dnn","model/linkedin_only/linkedin_acc0.770.dnn"]
suffix_list = ["180days_all_shuffled_with_linkedin","180days_editor_shuffled","linkedin_only"]
industry_list = ["180days_all_shuffled","180days_all_shuffled","180days_all_shuffled"]
model = 0
prefix = "/home/t-haohu/IndustryClassifier/Data/"
for suffix,model_name,industry in list(zip(suffix_list,model_list,industry_list)):
    #model_file = "{}/{}".format(prefix,industry)

    #if model_name  =="model/180days_editor_shuffled/":
    #    continue
    model = load_model(model_name)
    data_industry_sample = "{}/ready/industry_{}.wl".format(prefix,industry)
    data_title_sample    = "{}/ready/title_{}.wl".format(prefix,suffix)
    data_body_sample     = "{}/ready/body_{}.wl".format(prefix,suffix)
    title_dict =     { x:i for i,x in enumerate([x.strip("\n") for x in open(data_title_sample).readlines()])}
    inference(model,"Data/middle/1day_measure_sample_valid.txt","val/cnn_1day_measure_{}.txt".format(suffix),title_dict,data_industry_sample)



['Pro Russian Rebel Leader Assassinated in East Ukraine', 'EU to recommend abolishing daylight saving time', 'Trump cancels pay raises for federal employees', 'Contradictions Mount As Lawyer For Colorado Climate Suits Undermines Own Case', 'These Sassy Pyrex Storage Containers Are Perfect For Cooks With A Sense Of Humor']
data_size:  361 batch_size:  20 num_batches_per_epoch:  19
['Pro Russian Rebel Leader Assassinated in East Ukraine', 'EU to recommend abolishing daylight saving time', 'Trump cancels pay raises for federal employees', 'Contradictions Mount As Lawyer For Colorado Climate Suits Undermines Own Case', 'These Sassy Pyrex Storage Containers Are Perfect For Cooks With A Sense Of Humor']
data_size:  361 batch_size:  20 num_batches_per_epoch:  19
['Pro Russian Rebel Leader Assassinated in East Ukraine', 'EU to recommend abolishing daylight saving time', 'Trump cancels pay raises for federal employees', 'Contradictions Mount As Lawyer For Colorado Climate Suits Undermines Own C

array([list([1]), list([1, 2])], dtype=object)