In [4]:
from __future__ import print_function # Use a function definition from future version (say 3.x from 2.7 interpreter)
import pandas as pd
import math
import numpy as np
import os
import time 

import cntk as C
import cntk.tests.test_utils
import pickle
import random
from cntk import sequence
from cntk import load_model
from cntk.device import try_set_default_device, gpu,cpu
from scipy.sparse import csr_matrix

from gensim.models import Word2Vec
cntk.tests.test_utils.set_device_from_pytest_env() # (only needed for our build system)
C.cntk_py.set_fixed_random_seed(1) # fix a random seed for CNTK components
try_set_default_device(gpu(0))


vocab_size = 80000
num_labels = 19#
title_size = 52000
body_size  = 210000
input_dim  = vocab_size
label_dim  = num_labels
emb_dim    = 300
hidden_dim = 200

max_length_title = 30
max_length_body  = 100



suffix = "180days_all_body_with_linkedin"
prefix = "/home/t-haohu/IndustryClassifier/Data/"

data_token_body        = "{}/middle/{}_token_body.txt".format(prefix,suffix)
data_train_sample_body = "{}/middle/train_{}.txt".format(prefix,suffix)
data_test_sample_body  = "{}/middle/test_{}.txt".format(prefix,suffix)
data_test_sample_body_editor  = "{}/middle/test_{}_editor.txt".format(prefix,suffix)

suffix = "180days_all_body_shuffled"
data_title_sample    = "{}/ready/title_{}.wl".format(prefix,suffix)
#suffix = "180days_editor_body"
data_body_sample     = "{}/ready/body_{}.wl".format(prefix,suffix)
suffix = "180days_all_body_shuffled"
data_industry_sample = "{}/ready/industry_{}.wl".format(prefix,suffix)
suffix = "180days_all_body_shuffled"

def load_data_body(input_file,title_dict,body_dict,industry_dict):
    data = open(input_file, encoding = "utf-8").readlines()
    
    data_title = np.zeros((len(data),max_length_title),dtype = np.float32)
    data_body  = np.zeros((len(data),max_length_body),dtype = np.float32)
    data_label = np.zeros((len(data),1),dtype = np.float32)
    
    
    for index,line in enumerate(data):
        row = line.strip("\n").split("\t")
        
        title    =  row[0]
        body     =  row[1]
        industry =  row[2]
        
        for jndex,token in enumerate(title.split(" ")):
            if jndex>=max_length_title:
                break
            data_title[index,jndex]=title_dict.get(token,len(title_dict)-1)
            
        for jndex,token in enumerate(body.split(" ")):
            if jndex>=max_length_body:
                break
            data_body[index,jndex]=body_dict.get(token,len(title_dict)-1)
            
        data_label[index] = industry_dict.get(industry,len(industry_dict))
    return data_title,data_body,data_label



def batch_iter(data,batch_size, num_epochs, shuffle=True):
    # Generates a batch iterator for a dataset.
    data_size = len(data)
    num_batches_per_epoch = int((data_size-1)/batch_size) + 1
    print('data_size: ', data_size, 'batch_size: ', batch_size, 'num_batches_per_epoch: ', num_batches_per_epoch)
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            random.shuffle(data)
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield data[start_index:end_index]
            

def fast_hist(a, b, n):
    k = (a >= 0) & (a < n)
    return np.bincount(n * a[k].astype(int) + b[k], minlength=n**2).reshape(n, n)

title_dict =     { x:i for i,x in enumerate([x.strip("\n") for x in open(data_title_sample).readlines()])}
body_dict  =     { x:i for i,x in enumerate([x.strip("\n") for x in open(data_body_sample ).readlines()])}
industry_dict =  { x:i for i,x in enumerate([x.strip("\n") for x in open(data_industry_sample).readlines()])}




input_xt = C.input_variable(shape=(max_length_title),  dtype=np.float32)
input_xb = C.input_variable(shape=(max_length_body) ,  dtype=np.float32)
input_y  = C.input_variable(shape=(1)               ,  dtype=np.int)

input_xt_one_hot = C.one_hot(input_xt, num_classes=len(title_dict)   ,  sparse_output=True)
input_xb_one_hot = C.one_hot(input_xb, num_classes=len(body_dict)    ,  sparse_output=True) 
input_y_one_hot = C.one_hot(input_y  , num_classes=len(industry_dict) ,  sparse_output=True)

#test_data_title, test_data_body, test_data_label  = load_data_body(data_test_sample_body,title_dict,body_dict,industry_dict)
#train_data_title,train_data_body,train_data_label = load_data_body(data_train_sample_body,title_dict,body_dict,industry_dict)

test_data  = load_data_body(data_test_sample_body,title_dict,body_dict,industry_dict)
train_data = load_data_body(data_train_sample_body,title_dict,body_dict,industry_dict)
#test_data_editor  = load_data_body(data_test_sample_body_editor,title_dict,body_dict,industry_dict)


def test_body(batch_size,model,data):
    scores = model(input_xt,input_xb)
    predict = C.argmax(scores,axis = 0)
    confuse = np.zeros((num_labels,num_labels))
    #C.element_add(input_y,C.element_times(predict,C.Constant([nums_labels])))
    test_data_title,test_data_body,test_data_label = data
    batches = batch_iter(list(zip(test_data_title,test_data_body,test_data_label)), batch_size, 1)
    
    for batch in batches:
        batch_data_title,batch_data_body,batch_data_label = zip(*batch) 
        output = np.array(predict.eval({input_xb: np.array(batch_data_body),input_xt: np.array(batch_data_title)}),dtype=np.int)
        gt = np.array(batch_data_label,dtype=np.int)
        confuse+=fast_hist(output,gt,num_labels)
    precision=np.diag(confuse)/np.sum(confuse,axis=0)
    recall = np.diag(confuse)/np.sum(confuse,axis=1)
    accuarcy = np.diag(confuse).sum() / confuse.sum()
    aver_precision=np.nanmean(precision)
    aver_recall = np.nanmean(recall)
   
    print("Precision:{} Recall:{} Acc:{}".format(aver_precision,aver_recall,accuarcy))
    return accuarcy


def train_body(train_data,num_epochs,learning_rate,batch_size,l2_weight=0,tag = "cnn"):
    
    #learning_rate *= batch_size
    model = create_model_cnn_with_body()
    print(C.logging.get_node_outputs(model))
    scores = model(input_xt,input_xb)

    loss =C.reduce_mean(C.losses.cross_entropy_with_softmax(scores, input_y_one_hot))
    
    # Training
    lr_schedule = C.learning_parameter_schedule(learning_rate)
    #learner = C.adam(scores.parameters, lr=lr_schedule, momentum=0.9,l2_regularization_weight=0)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=num_epochs)
    momentums = C.momentum_schedule(0.99, minibatch_size=batch_size)
    learner = C.adam(parameters=scores.parameters,#model.parameters,
                     lr=lr_schedule,
                     momentum=momentums,
                     gradient_clipping_threshold_per_sample=15,
                     gradient_clipping_with_truncation=True,
                     l2_regularization_weight=l2_weight)
    trainer = C.Trainer(scores, (loss), [learner], progress_printer)
    
    train_data_title,train_data_body,train_data_label = train_data
    batches = batch_iter(list(zip(train_data_title,train_data_body,train_data_label)), batch_size, num_epochs)

    # training loop
    count = 0
    t = time.time()
    for batch in batches:
        count += 1
        batch_data_title,batch_data_body,batch_data_label = zip(*batch)
        #print(np.array(batch_data_body).shape)
        #batch_data_title,batch_data_body,batch_data_label = transfer(batch_data_title),transfer(batch_data_body),trainsfer(batch_data_label)
        trainer.train_minibatch({input_xb: np.array(batch_data_body),input_xt: np.array(batch_data_title), input_y: np.array(batch_data_label)})
        if count%1000== 0:
            print(count,time.time()-t)
            t=time.time()
            acc1=test_body(batch_size,model,test_data)
            #acc2=test_body(batch_size,model,test_data_editor)
            #
            # save model
            model.save('/home/t-haohu/IndustryClassifier/model/{}/{}_acc{:.3f}.dnn'.format(suffix,tag,acc1))
            #model.save('model/{}/{}_acc{:.3f}.dnn'.format(suffix,tag,acc1))
            #print(a)
            #model.save('./model/{}/{}_acc1{:.3f}_acc2{:.3f}.dnn'.format(suffix,tag,acc1,acc2))
    

    


FileNotFoundError: [Errno 2] No such file or directory: '/home/t-haohu/IndustryClassifier/Data//ready/title_180days_all_body_with_linkedin.wl'

In [12]:
train_body(train_data,num_epochs=20,learning_rate=[5e-4*30]*2+[1e-4*30],batch_size = 42,tag = "")


[Output('classify', [#], [19]), Output('Splice47135_Output_0', [#], [300 x 1 x 1]), Output('pooling_t_1', [#], [50 x 1 x 1]), Output('Block46614_Output_0', [#], [50 x 29 x 1]), Output('embed', [#], [30 x 300]), Output('OneHotOp46446_Output_0', [#], [30 x 56957]), Output('pooling_t_2', [#], [50 x 1 x 1]), Output('Block46551_Output_0', [#], [50 x 30 x 1]), Output('pooling_t_3', [#], [50 x 1 x 1]), Output('Block46677_Output_0', [#], [50 x 28 x 1]), Output('pooling_b_1', [#], [50 x 1 x 1]), Output('Block46803_Output_0', [#], [50 x 99 x 1]), Output('embed', [#], [100 x 300]), Output('OneHotOp46449_Output_0', [#], [100 x 216866]), Output('pooling_b_2', [#], [50 x 1 x 1]), Output('Block46740_Output_0', [#], [50 x 100 x 1]), Output('pooling_b_3', [#], [50 x 1 x 1]), Output('Block46866_Output_0', [#], [50 x 98 x 1])]
data_size:  794568 batch_size:  42 num_batches_per_epoch:  18919
Learning rate per minibatch: 0.015
1000 55.935343742370605
data_size:  340530 batch_size:  42 num_batches_per_epoch



Precision:0.46769792427961443 Recall:0.6848300143872516 Acc:0.6712741902328723
2000 113.71530413627625
data_size:  340530 batch_size:  42 num_batches_per_epoch:  8108
Precision:0.6246191369057823 Recall:0.742052583594128 Acc:0.7608639473761489
3000 114.26779747009277
data_size:  340530 batch_size:  42 num_batches_per_epoch:  8108
Precision:0.6855723735573762 Recall:0.7667544825291299 Acc:0.7932193932986815
4000 96.80689883232117
data_size:  340530 batch_size:  42 num_batches_per_epoch:  8108
Precision:0.7214493131652675 Recall:0.777839452546766 Acc:0.811285349308431
5000 114.94631123542786
data_size:  340530 batch_size:  42 num_batches_per_epoch:  8108
Precision:0.7401092350936481 Recall:0.7895852851295894 Acc:0.82180424632191
6000 116.0528290271759
data_size:  340530 batch_size:  42 num_batches_per_epoch:  8108
Precision:0.7572837940655637 Recall:0.7952196519083278 Acc:0.8310104836578275
7000 102.12716937065125
data_size:  340530 batch_size:  42 num_batches_per_epoch:  8108
Precision:

RuntimeError: CUDA failure 2: out of memory ; GPU=0 ; hostname=stcagl-120 ; expr=cudaMalloc((void**) &deviceBufferPtr, sizeof(AllocatedElemType) * AsMultipleOf(numElements, 2))

[CALL STACK]
[0x7fb1e543b569]                                                       + 0x712569
[0x7fb1dd1bb6cf]                                                       + 0xdca6cf
[0x7fb1dd20c057]    float* Microsoft::MSR::CNTK::TracingGPUMemoryAllocator::  Allocate  <float>(int,  unsigned long,  unsigned long) + 0x57
[0x7fb1dd20c386]    Microsoft::MSR::CNTK::GPUMatrix<float>::  Resize  (unsigned long,  unsigned long,  bool) + 0xf6
[0x7fb1dd115cb9]    Microsoft::MSR::CNTK::Matrix<float>::  Resize  (unsigned long,  unsigned long,  unsigned long,  bool,  bool) + 0xc9
[0x7fb1e5884c37]    Microsoft::MSR::CNTK::ComputationNode<float>::  BeginForwardProp  () + 0xb7
[0x7fb1e593fb6e]    Microsoft::MSR::CNTK::ComputationNetwork::PARTraversalFlowControlNode::  ForwardProp  (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&,  Microsoft::MSR::CNTK::FrameRange const&) + 0x6e
[0x7fb1e565cd93]    std::_Function_handler<void (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&),void Microsoft::MSR::CNTK::ComputationNetwork::ForwardProp<std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>>>(std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>> const&)::{lambda(std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&)#1}>::  _M_invoke  (std::_Any_data const&,  std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&) + 0x63
[0x7fb1e56a89b9]    void Microsoft::MSR::CNTK::ComputationNetwork::  TravserseInSortedGlobalEvalOrder  <std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>>>(std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>> const&,  std::function<void (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&)> const&) + 0x5b9
[0x7fb1e564d15a]    CNTK::CompositeFunction::  Forward  (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&,  std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&,  CNTK::DeviceDescriptor const&,  std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&,  std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&) + 0x158a
[0x7fb1e55ef473]    CNTK::Function::  Forward  (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&,  std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&,  CNTK::DeviceDescriptor const&,  std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&,  std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&) + 0x93
[0x7fb1e621a719]                                                       + 0x68719
[0x55594e4d24bf]    PyCFunction_Call                                   + 0x4f
[0x55594e530b2e]    PyEval_EvalFrameEx                                 + 0x775e
[0x55594e52e490]    PyEval_EvalFrameEx                                 + 0x50c0
[0x55594e5344c3]    PyEval_EvalCodeEx                                  + 0x693
[0x55594e53563f]                                                       + 0x1ae63f
[0x55594e48789a]    PyObject_Call                                      + 0x3a
[0x55594e52bbd8]    PyEval_EvalFrameEx                                 + 0x2808
[0x55594e52ee08]    PyEval_EvalFrameEx                                 + 0x5a38
[0x55594e52e490]    PyEval_EvalFrameEx                                 + 0x50c0
[0x55594e529e90]    PyEval_EvalFrameEx                                 + 0xac0
[0x55594e52e490]    PyEval_EvalFrameEx                                 + 0x50c0
[0x55594e53403d]    PyEval_EvalCodeEx                                  + 0x20d
[0x55594e534f5c]    PyEval_EvalCode                                    + 0x1c
[0x55594e5927bb]                                                       + 0x20b7bb
[0x55594e4d24bf]    PyCFunction_Call                                   + 0x4f
[0x55594e5297cc]    PyEval_EvalFrameEx                                 + 0x3fc
[0x55594e52e490]    PyEval_EvalFrameEx                                 + 0x50c0 (x2)
[0x55594e52ee08]    PyEval_EvalFrameEx                                 + 0x5a38
[0x55594e53403d]    PyEval_EvalCodeEx                                  + 0x20d
[0x55594e53563f]                                                       + 0x1ae63f
[0x55594e48789a]    PyObject_Call                                      + 0x3a
[0x55594e52bbd8]    PyEval_EvalFrameEx                                 + 0x2808
[0x55594e52ee08]    PyEval_EvalFrameEx                                 + 0x5a38
[0x55594e52e490]    PyEval_EvalFrameEx                                 + 0x50c0
[0x55594e529e90]    PyEval_EvalFrameEx                                 + 0xac0 (x2)
[0x55594e534989]    PyEval_EvalCodeEx                                  + 0xb59
[0x55594e53563f]                                                       + 0x1ae63f
[0x55594e48789a]    PyObject_Call                                      + 0x3a
[0x55594e52bbd8]    PyEval_EvalFrameEx                                 + 0x2808
[0x55594e534989]    PyEval_EvalCodeEx                                  + 0xb59
[0x55594e53563f]                                                       + 0x1ae63f
[0x55594e48789a]    PyObject_Call                                      + 0x3a
[0x55594e52bbd8]    PyEval_EvalFrameEx                                 + 0x2808
[0x55594e52e490]    PyEval_EvalFrameEx                                 + 0x50c0
[0x55594e529e90]    PyEval_EvalFrameEx                                 + 0xac0 (x2)
[0x55594e5344c3]    PyEval_EvalCodeEx                                  + 0x693
[0x55594e53563f]                                                       + 0x1ae63f
[0x55594e48789a]    PyObject_Call                                      + 0x3a
[0x55594e52bbd8]    PyEval_EvalFrameEx                                 + 0x2808
[0x55594e534989]    PyEval_EvalCodeEx                                  + 0xb59
[0x55594e535464]                                                       + 0x1ae464
[0x55594e48789a]    PyObject_Call                                      + 0x3a
[0x55594e5637a2]                                                       + 0x1dc7a2
[0x55594e48789a]    PyObject_Call                                      + 0x3a
[0x55594e52e041]    PyEval_EvalFrameEx                                 + 0x4c71
[0x55594e53403d]    PyEval_EvalCodeEx                                  + 0x20d
[0x55594e535464]                                                       + 0x1ae464
[0x55594e48789a]    PyObject_Call                                      + 0x3a
[0x55594e52bbd8]    PyEval_EvalFrameEx                                 + 0x2808
[0x55594e529e90]    PyEval_EvalFrameEx                                 + 0xac0 (x5)
[0x55594e52e490]    PyEval_EvalFrameEx                                 + 0x50c0
[0x55594e53403d]    PyEval_EvalCodeEx                                  + 0x20d
[0x55594e534f5c]    PyEval_EvalCode                                    + 0x1c
[0x55594e5927bb]                                                       + 0x20b7bb
[0x55594e4d24bf]    PyCFunction_Call                                   + 0x4f
[0x55594e5297cc]    PyEval_EvalFrameEx                                 + 0x3fc
[0x55594e52e490]    PyEval_EvalFrameEx                                 + 0x50c0
[0x55594e53403d]    PyEval_EvalCodeEx                                  + 0x20d
[0x55594e5354d9]                                                       + 0x1ae4d9
[0x55594e48789a]    PyObject_Call                                      + 0x3a
[0x55594e582f37]                                                       + 0x1fbf37
[0x55594e5930f4]    Py_Main                                            + 0x334
[0x55594e45e871]    main                                               + 0xe1
[0x7fb222f18830]    __libc_start_main                                  + 0xf0
[0x55594e536808]                                                       + 0x1af808


In [13]:
from cntk import load_model
from data_processor import *
process_setting(low=False,old = True,stop = False)
batch_size  = 32

    
model_list =["model/180days_all_body_shuffled/rcnn_2fold_body_acc0.870.dnn"]#,"180days_editor_shuffled"]
suffix_list = ["180days_all_body_shuffled"]
industry_list = ["180days_all_body_shuffled"]#,"180days_all_shuffled"]
prefix = "/home/t-haohu/IndustryClassifier/Data/"
for suffix,model_name,industry in list(zip(suffix_list,model_list,industry_list)):
    model = load_model(model_name)
    data_industry_sample = "{}/ready/industry_{}.wl".format(prefix,industry)
    data_title_sample    = "{}/ready/title_{}.wl".format(prefix,suffix)
    data_body_sample     = "{}/ready/body_{}.wl".format(prefix,suffix)
    title_dict =     { x:i for i,x in enumerate([x.strip("\n") for x in open(data_title_sample).readlines()])}
    body_dict =     { x:i for i,x in enumerate([x.strip("\n") for x in open(data_body_sample).readlines()])}
    inference_body(model,"Data/middle/1day_measure_sample_valid_body.txt","val/1day_measure_rcnn_{}.txt".format(suffix),title_dict,body_dict,data_industry_sample)



361 361
30 100
[56415. 55945. 42479. 54306. 56956. 56953. 56566. 49771.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.] [213428. 216865. 216865. 216024. 216861. 215294. 214429. 179513. 216861.
 216865. 215749. 203252. 185580. 215449. 216860. 212890. 216851. 215706.
 216856. 216639. 216859. 216862. 211545. 216852. 216862. 205199. 216859.
 214417. 185580. 216854. 216140. 216465. 214594. 216861. 216865. 166641.
 194662. 215723. 216865. 216860. 215909. 216467. 216855. 215559. 216864.
 216852. 216516. 216823. 216790. 202565. 216601. 216140. 216859. 216837.
 211050. 216853. 207806. 216862. 209269. 215765. 216854. 198014. 216865.
 216317. 216859. 216862. 216637. 216762. 212872. 214913. 215294. 202910.
 214764. 216859. 213495. 210014. 194425. 162844. 216596. 192315. 212409.
 216677. 215952. 216863. 210014. 216803. 216865. 210977. 216861. 216862.
 215899. 216857. 216842. 213208. 216850. 