In [1]:
import warnings
warnings.filterwarnings("ignore")

import json
import random
import re

import jieba
import numpy as np
from gensim.models.doc2vec import Doc2Vec
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.svm import SVC

# Doc2Vec models, one per case domain, loaded once when this cell runs.
divorce_model=Doc2Vec.load("../divorce/divorce.model") # Doc2Vec model for the divorce domain
loan_model=Doc2Vec.load("../loan/loan.model")# Doc2Vec model for the loan domain
labor_model=Doc2Vec.load("../labor/labor.model")# Doc2Vec model for the labor domain

# A label is emitted by CCPredictor.predict when the mean of the chains'
# predictions for that label is strictly greater than this value.
threshold=0.2
# Passed as test_size to train_test_split in splitDataSet; also the default of
# that function's `ratio` parameter.
test_ratio = 0.3
# Characters that content_process replaces with a space before word
# segmentation (interpolated into a regex character class).
punction = "！？。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."



def content_process(content):
    """Tokenise a raw sentence into a flat list of jieba word segments.

    Every character listed in the module-level ``punction`` string is replaced
    by a space, the text is split on single spaces, empty fragments are
    dropped, and each remaining fragment is segmented with ``jieba.cut``.

    Args:
        content: the raw sentence string.

    Returns:
        list of tokens, in their original order.
    """
    separator_pattern = r'[{}]'.format(punction)
    fragments = re.sub(separator_pattern, ' ', content).split(' ')
    return [
        token
        for fragment in fragments
        if fragment != ''
        for token in jieba.cut(fragment)
    ]

def splitDataSet(X,Y,modify=False,ratio=test_ratio): # build the training and test sets
    """Split samples into train/test partitions, optionally down-sampling negatives.

    Args:
        X: indexable sequence of feature vectors.
        Y: indexable sequence of labels aligned with ``X``; a label equal to 1
            is treated as positive, anything else as negative.
        modify: when True, rebalance before splitting: keep every positive
            sample plus roughly ``len(positives) * ratio`` negatives picked at
            an even stride through the negatives.
        ratio: negatives kept per positive when ``modify`` is True.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)`` from ``train_test_split``.
        The held-out fraction is always the module-level ``test_ratio``.

    Raises:
        ValueError: propagated from ``train_test_split`` when the (possibly
            rebalanced) sample set is too small to split.
    """
    if modify: # adjust the positive/negative sample ratio
        pos_X, pos_Y = [], []
        neg_X, neg_Y = [], []
        for i in range(len(Y)):
            if Y[i]==1:
                pos_X.append(X[i])
                pos_Y.append(Y[i])
            else:
                neg_X.append(X[i])
                neg_Y.append(Y[i])

        sampled_X = list(pos_X)
        sampled_Y = list(pos_Y)
        wanted_negatives = len(pos_X)*ratio
        # Skip sampling when there is nothing to sample or nothing is wanted;
        # this also avoids dividing by zero when there are no positives.
        if neg_X and wanted_negatives > 0:
            # Clamp the stride to 1: asking for more negatives than exist must
            # keep all of them rather than produce an invalid range() step of 0.
            step = max(1, int(len(neg_X)/wanted_negatives))
            sampled_X.extend(neg_X[::step])
            sampled_Y.extend(neg_Y[::step])
        X_train,X_test,Y_train,Y_test = train_test_split(sampled_X,sampled_Y,test_size=test_ratio,shuffle=True)
    else:
        X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=test_ratio,shuffle=True)
    return X_train,X_test,Y_train,Y_test


def transferTagVec(tag_dict,tags):
    """Encode a collection of tags as a 0/1 indicator list.

    Args:
        tag_dict: mapping whose keys are the known tag names; its iteration
            order fixes the position of each tag in the result.
        tags: container of the tag names present on a sample.

    Returns:
        list with one entry per key of ``tag_dict``: 1 if that key is in
        ``tags``, otherwise 0.
    """
    return [1 if name in tags else 0 for name in tag_dict]

def reverseTagVec(tag_dict,tagVec):
    """Decode a 0/1 indicator vector back into tag names.

    Args:
        tag_dict: mapping whose keys are the known tag names, in the same
            order that was used to build ``tagVec``.
        tagVec: indexable sequence with at least one entry per key of
            ``tag_dict``.

    Returns:
        list of the keys whose corresponding ``tagVec`` entry equals 1.
    """
    return [name for position, name in enumerate(tag_dict.keys()) if tagVec[position]==1]

def trainCC(d2vModel,modelPath,tag_path,input_path):
    """Train an ensemble of classifier chains and save each one to disk.

    Every sentence in the training file is tokenised, embedded with
    ``d2vModel.infer_vector`` and paired with its 0/1 tag vector. One
    ``ClassifierChain`` (logistic-regression base, random label order) is
    built per tag, seeded 0..tag_cnt-1. All chains are fitted first and then
    dumped to ``modelPath + "<k>.model"`` with k starting at 1.

    Args:
        d2vModel: model exposing ``infer_vector(tokens)``.
        modelPath: string prefix for the saved chain files.
        tag_path: path of the tag list file, one tag per line.
        input_path: path of the training data, one JSON document per line.
    """
    print("Initializing data loader")
    loader = DataLoader(input_path,tag_path)
    label_size = loader.tag_cnt

    features = []
    targets = []
    print("Loading data into data loader")
    for document in loader.data:
        for item in document:
            sentence = item['sentence']
            labels = item['labels']
            tokens = content_process(sentence)
            features.append(d2vModel.infer_vector(tokens))
            targets.append(transferTagVec(loader.tag_dict,labels))
    X = np.array(features)
    Y = np.array(targets)

    base_classifier = LogisticRegression(solver='lbfgs')
    chains = [
        ClassifierChain(base_classifier,order='random',random_state=seed)
        for seed in range(label_size)
    ]

    # Fit every chain before saving any of them.
    for number, chain in enumerate(chains, start=1):
        chain.fit(X,Y)
        print("ClassifierChain "+str(number)+" train finish")

    for number, chain in enumerate(chains, start=1):
        joblib.dump(chain,modelPath+str(number)+'.model')
    print("Train process finish")
    


class DataLoader:
    """Load a JSON-lines data file and, optionally, a tag list.

    Attributes set by the constructor:
        data: list with one parsed JSON value per line of ``file_path``.
        tag_dict: maps each stripped line of the tag file to its 1-based line
            number; empty when no ``tag_path`` is given.
        tag_cnt: number of lines read from the tag file (0 without one).
    """
    def __init__(self,file_path,tag_path=None):
        # Training or testing data: one JSON document per line.
        with open(file_path,'r',encoding='utf-8') as data_file:
            self.data = [json.loads(raw_line) for raw_line in data_file]

        self.tag_dict = {}
        self.tag_cnt = 0
        if tag_path:
            # Tag file: one tag per line, numbered from 1 in file order.
            with open(tag_path,'r',encoding='utf-8') as tag_file:
                for line_number, raw_line in enumerate(tag_file, start=1):
                    self.tag_cnt = line_number
                    self.tag_dict[raw_line.strip()] = line_number
                    
class DataWriter:
    """Buffer records in memory and write them out as JSON lines.

    Attributes:
        out_content: list of records queued by ``writeContent``.
        out_path: destination file path used by ``writeJson``.
    """
    def __init__(self,out_path):
        self.out_content=[]
        self.out_path=out_path

    def writeJson(self):
        """Write every queued record to ``out_path``, one JSON value per line.

        The file is opened in 'w' mode, so existing content is replaced.
        Non-ASCII characters are written as-is (``ensure_ascii=False``).
        """
        # The with-block closes the file; no explicit close() is needed.
        with open(self.out_path,'w',encoding='utf-8') as f:
            for data in self.out_content:
                json.dump(data,f,ensure_ascii=False)
                f.write("\n")

    def writeContent(self,content):
        """Queue one JSON-serialisable record for the next ``writeJson`` call."""
        self.out_content.append(content)
        
        
class CCPredictor:
    """Predict tags for sentences with a saved ensemble of classifier chains.

    Attributes set by the constructor:
        tag_dict: maps each stripped line of the tag file to its 1-based line
            number.
        tag_cnt: number of lines read from the tag file.
        models: chains loaded from ``ccmodel_path + "<k>.model"`` for
            k = 1..tag_cnt.
        d2v: Doc2Vec model loaded from ``d2vmodel_path``.
    """
    def __init__(self,ccmodel_path,d2vmodel_path,tag_path):
        self.models = []
        self.tag_cnt = 0
        self.tag_dict = {}
        # The tag file is always read here: one tag per line, numbered from 1.
        with open(tag_path,'r',encoding='utf-8') as tag_file:
            for raw_line in tag_file:
                self.tag_cnt += 1
                self.tag_dict[raw_line.strip()] = self.tag_cnt

        # One saved chain is expected per tag line.
        self.models = [
            joblib.load(ccmodel_path+str(number)+'.model')
            for number in range(1,self.tag_cnt+1)
        ]

        self.d2v = Doc2Vec.load(d2vmodel_path)

    def predict(self,content):
        """Return the list of tag names predicted for one raw sentence.

        The sentence is tokenised and embedded, every chain predicts a label
        row, the rows are averaged across chains, and a tag is kept when its
        average is strictly greater than the module-level ``threshold``.
        """
        tokens = content_process(content)
        vector = self.d2v.infer_vector(tokens)
        votes = np.array([chain.predict([vector]) for chain in self.models])
        averaged = votes.mean(axis=0)
        decided = np.where(averaged[0] > threshold, 1, 0)
        return reverseTagVec(self.tag_dict,decided)

    def predictData(self,data_writer,data_loader):
        """Label every sentence in ``data_loader.data`` and write the result.

        Each item's 'labels' entry is overwritten in place with the predicted
        tags; each document is queued on ``data_writer``, which is flushed
        once with ``writeJson`` after all documents are processed.
        """
        for document in data_loader.data:
            labelled = []
            for item in document:
                item['labels'] = self.predict(item['sentence'])
                labelled.append(item)
            data_writer.writeContent(labelled)
        data_writer.writeJson()

In [2]:
if __name__=="__main__":
    train = True  # whether to (re)train the classifier chains before predicting

    # Each domain is paired with its own Doc2Vec model. Training must embed
    # sentences with the same model that CCPredictor loads from
    # "../<domain>/<domain>.model"; previously loan and divorce were trained
    # on labor_model vectors and then asked to predict on vectors from a
    # different embedding model.
    domains = [
        ("labor", labor_model),
        ("loan", loan_model),
        ("divorce", divorce_model),
    ]

    for domain, d2v_model in domains:
        print("-----------------{0}------------------".format(domain))
        chain_prefix = "../model/ClassifierChain/{0}/".format(domain)
        tag_path = "../{0}/tags.txt".format(domain)
        if train:
            trainCC(d2v_model,chain_prefix,tag_path,"../{0}/train_selected.json".format(domain))
        predictor = CCPredictor(chain_prefix,"../{0}/{0}.model".format(domain),tag_path)
        writer = DataWriter("../output/ClassifierChain/{0}/output.json".format(domain))
        loader = DataLoader("../{0}/data_small_selected.json".format(domain))
        predictor.predictData(writer,loader)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\RPJ\AppData\Local\Temp\jieba.cache


-----------------labor------------------
Initializing data loader
Loading data into data loader


Loading model cost 1.072 seconds.
Prefix dict has been built succesfully.


ClassifierChain 1 train finish
ClassifierChain 2 train finish
ClassifierChain 3 train finish
ClassifierChain 4 train finish
ClassifierChain 5 train finish
ClassifierChain 6 train finish
ClassifierChain 7 train finish
ClassifierChain 8 train finish
ClassifierChain 9 train finish
ClassifierChain 10 train finish
ClassifierChain 11 train finish
ClassifierChain 12 train finish
ClassifierChain 13 train finish
ClassifierChain 14 train finish
ClassifierChain 15 train finish
ClassifierChain 16 train finish
ClassifierChain 17 train finish
ClassifierChain 18 train finish
ClassifierChain 19 train finish
ClassifierChain 20 train finish
Train process finish
-----------------loan------------------
Initializing data loader
Loading data into data loader
ClassifierChain 1 train finish
ClassifierChain 2 train finish
ClassifierChain 3 train finish
ClassifierChain 4 train finish
ClassifierChain 5 train finish
ClassifierChain 6 train finish
ClassifierChain 7 train finish
ClassifierChain 8 train finish
Class