In [29]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.svm import SVC
from gensim.models.doc2vec import Doc2Vec
import re
import jieba
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib 
from sklearn.metrics import f1_score
import random

In [30]:
divorce_model=Doc2Vec.load("divorce/divorce.model") #divorce的Doc2Vec模型
loan_model=Doc2Vec.load("loan/loan.model")#loan的Doc2Vec模型
labor_model=Doc2Vec.load("labor/labor.model")#labor的Doc2Vec模型

In [31]:
data_input = {1: "divorce/data.txt", 2: "labor/data.txt", 3: "loan/data.txt"}
models={1:divorce_model,2:loan_model,3:labor_model}
data_type = {1: "divorce", 2: "labor", 3: "loan"}
label_size = 20
test_ratio = 0.3
punction = "！？。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."

In [32]:
def line_processing(line):  # 提取每行数据的文本内容
    line = line.strip().split('\t')
    sentence = line[1]
    sentence = re.sub(r'[{}]'.format(punction),' ',sentence).split(' ')
    sent=[]
    for sub_sentence in sentence:
        if sub_sentence!='':
            sent.extend(list(jieba.cut(sub_sentence)))
    return line[0], sent, line[2]

In [33]:
def constructDataSet(data_path,model_tag): #构建X，Y的数据集
    data_file = open(data_path,'r',encoding='utf-8')
    lines = data_file.read().splitlines()
    X=[]
    Y=[]
    d2v = models[model_tag]
    for line in lines:
        _,x,y = line_processing(line)
        x=d2v.infer_vector(x)
        X.append(x)
        y = list(map(int,y.split()))
        Y.append(y)
    Y = np.array(Y).transpose()
    data_file.close()
    return X,Y[:10]

In [34]:
def splitDataSet(X,Y,modify=False,ratio=.2): #构建训练集和测试集
    if modify: #调整正负样本比例
        X_true=[]
        X_false=[]
        Y_true=[]
        Y_false=[]
        for i in range(len(Y)):
            if Y[i]==1:
                X_true.append(X[i])
                Y_true.append(Y[i])
            else:
                X_false.append(X[i])
                Y_false.append(Y[i])
        true_num = len(X_true)
        false_num = true_num*ration
        for i in range(0,len(X_false),int(len(X_false)/false_num)):
            X_true.append(X_false[i])
            Y_true.append(Y_false[i])
        X_train,X_test,Y_train,Y_test =  train_test_split(X_true,Y_true,test_size=test_ratio,shuffle=True)
    else:
        X_train,X_test,Y_train,Y_test =  train_test_split(X,Y,test_size=test_ratio,shuffle=True)
    return X_train,X_test,Y_train,Y_test

In [35]:
def trainSVM(X,Y,model_path): #训练单个分类器
    X_train,X_test,Y_train,Y_test = splitDataSet(X,Y)
    classifier = SVC(gamma='auto')
    classifier.fit(X_train,Y_train)
    #accuracy = classifier.score(X_test,Y_test)
    Y_predict = classifier.predict(X_test)
    f1score =  f1_score(Y_test,Y_predict,average='macro')
    joblib.dump(classifier,model_path)
    print(model_path,f1score)

In [36]:
def beginTrain():
    for i in range(1,4):
        print(data_input[i].split("/")[0])
        X,Y = constructDataSet(data_input[i],i)
        tag = list(range(len(Y)))
        #random.shuffle(tag)
        for j in tag:
            model_path = data_input[i].split("/")[0]+'/label'+str(j+1)+".model"
            trainSVM(X,Y[j],model_path)

In [37]:
def predictSingleLabel(x,model_type,label_tag):
    model_path = model_type+'/'+'label'+str(label_tag+1)+'.model'
    model = joblib.load(model_path)
    return model.predict([x])

In [38]:
def predict(text,model_type):
    text = re.sub(r'[{}]'.format(punction),' ',text).split(' ')
    words=[]
    for word in text:
        if word!='':
            words.extend(list(jieba.cut(word)))
    d2v = models[model_type]
    x = d2v.infer_vector(words)
    res=[]
    for i in range(label_size):
        res.append(predictSingleLabel(x,data_type[model_type],i)[0])
    return res

In [39]:
def evaluateModel(model_type):
    data_file = open(data_input[model_type],'r',encoding='utf-8')
    lines = data_file.read().splitlines()
    f1score=0
    gmean=0
    for line in lines:
        line = line.strip().split('\t')
        x = line[1]
        y = list(map(int,line[2].split()))
        predict_label = predict(x,model_type)
        f1score+=f1_score(y,predict_label,average='macro')
        gmean+=gmean(y,predict_label)
    

In [40]:
def evaluateLabel(model_type):
    X,Y = constructDataSet(data_input[model_type],model_type)
    for i in range(len(Y)):
        model_path = data_type[model_type]+'/'+'label'+str(i+1)+'.model'
        model = joblib.load(model_path)
        y = []
        for vec in X:
            y.append(model.predict([vec]))
        f1score = f1_score(Y[i],y,average='macro')
        print(data_type[model_type],'label ',i+1,f1score)

In [59]:
beginTrain()

divorce
divorce/label1.model 0.43950775131852327
divorce/label2.model 0.46313211205796806
divorce/label3.model 0.473634180508305
divorce/label4.model 0.48632946001367056
divorce/label5.model 0.4888748542557326
divorce/label6.model 0.4908536585365854
divorce/label7.model 0.4908536585365854
divorce/label8.model 0.49225423483422615
divorce/label9.model 0.4916408967916505
divorce/label10.model 0.4940367413677022
divorce/label11.model 0.49754047471225943
divorce/label12.model 0.49616894933435496
divorce/label13.model 0.49689173680183624
divorce/label14.model 0.49763644177051997
divorce/label15.model 0.4987851936544233
divorce/label16.model 0.4984984984984985
divorce/label17.model 0.4990954103980194
divorce/label18.model 0.4988806858775899
divorce/label19.model 0.4982114751752754
divorce/label20.model 0.4984984984984985
labor
labor/label1.model 0.4803614671312976
labor/label2.model 0.4867277288372907
labor/label3.model 0.488012104283054
labor/label4.model 0.4906501476292479
labor/label5.mode

In [14]:
for j in range(1,4):
    X,Y = constructDataSet(data_input[j],j)
    print(data_type[j])
    for i in range(len(Y)):
        print("label"+str(i+1),np.sum(Y[i]),len(Y[i]),np.sum(Y[i])/len(Y[i]))
    count=0
    Y = Y.transpose()
    for i in range(len(Y)):
        if np.sum(Y[i])>0:
            count+=1
    print(count,len(Y),count/len(Y))

divorce
label1 7524 35067 0.21456069809222345
label2 4842 35067 0.13807853537513903
label3 3671 35067 0.10468531667949925
label4 1885 35067 0.053754241879830036
label5 1563 35067 0.04457181965950894
label6 1283 35067 0.03658710468531668
label7 1161 35067 0.03310805030370434
label8 1117 35067 0.03185330937918841
label9 1103 35067 0.0314540736304788
label10 916 35067 0.02612142470128611
label11 382 35067 0.010893432571933728
label12 549 35067 0.01565574471725554
label13 476 35067 0.013574015456126843
label14 323 35067 0.009210939059514643
label15 199 35067 0.005674850999515214
label16 213 35067 0.006074086748224827
label17 130 35067 0.0037071890951606923
label18 179 35067 0.0051045142156443376
label19 220 35067 0.006273704622579633
label20 208 35067 0.005931502552257108
15950 35067 0.4548435851370234
labor
label1 2104 29326 0.0717452090295301
label2 1495 29326 0.05097865375434768
label3 1471 29326 0.050160267339562165
label4 1046 29326 0.03566800791106867
label5 965 29326 0.0329059537611

In [54]:
for i in range(1,4):
    evaluateLabel(i)

divorce label  1 0.43991375179683756
divorce label  2 0.46292041904061754
divorce label  3 0.4723831304635662
divorce label  4 0.48619027385016633
divorce label  5 0.4886030537690861
divorce label  6 0.49068277875412125
divorce label  7 0.49158366317254576
divorce label  8 0.4919077908341423
divorce label  9 0.4920108357114919
divorce label  10 0.4933832240168742
divorce label  11 0.49726172726230067
divorce label  12 0.49605518430696277
divorce label  13 0.4965833070142697
divorce label  14 0.49768661099253697
divorce label  15 0.4985772503038536
divorce label  16 0.4984768524477625
divorce label  17 0.4990714816296212
divorce label  18 0.498720606103924
divorce label  19 0.49842663844151386
divorce label  20 0.49851271343992215
labor label  1 0.48139633585626374
labor label  2 0.486921986808265
labor label  3 0.4871373358283346
labor label  4 0.49092108460924205
labor label  5 0.49163589716920625
labor label  6 0.49183850285912317
labor label  7 0.4926648674832192
labor label  8 0.49