In [1]:
from sklearn.svm import SVC
from gensim.models.doc2vec import Doc2Vec
import re
import jieba
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib 
import random



In [2]:
# Load the previously saved Doc2Vec models, one per category directory
# (divorce, loan, labor). Later cells look them up through the `models` dict
# and call infer_vector() on them to turn tokenized text into feature vectors.
divorce_model=Doc2Vec.load("divorce/divorce.model")
loan_model=Doc2Vec.load("loan/loan.model")
labor_model=Doc2Vec.load("labor/labor.model")

In [3]:
# Category ids used throughout the notebook: 1 = divorce, 2 = labor, 3 = loan.
# Every dict below is keyed by the same id, so they must all agree on that order.

# Path of the tab-separated data file for each category.
data_input = {1: "divorce/data.txt", 2: "labor/data.txt", 3: "loan/data.txt"}
# Doc2Vec model used to embed text of each category.
# Fixed: ids 2 and 3 used to point at loan_model and labor_model respectively,
# so labor text was embedded with the loan model and vice versa.
# NOTE(review): labor/loan classifiers trained with the old mapping were fitted
# on the swapped embeddings and should be retrained after this change.
models={1:divorce_model,2:labor_model,3:loan_model}
# Directory name of each category; predict() uses it to locate the classifiers.
data_type = {1: "divorce", 2: "labor", 3: "loan"}
# Number of per-label classifiers predict() queries for one text.
label_size = 20
# Passed as test_size to train_test_split in splitDataSet().
test_ratio = 0.3
# Characters that are replaced by a space before jieba segmentation.
punction = "！？。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."

In [11]:
def line_processing(line):
    """Split one tab-separated data line and tokenize its text field.

    The line is stripped and split on tabs. The second field is the text: every
    character listed in `punction` is replaced by a space, the result is split
    on spaces, and each non-empty piece is segmented with jieba.

    Returns:
        (first field, list of tokens from the second field, third field).
        The first field is presumably a sample id and the third the raw label
        string -- confirm against the data files.

    Raises:
        IndexError: if the line has fewer than three tab-separated fields.
    """
    fields = line.strip().split('\t')
    cleaned = re.sub(r'[{}]'.format(punction), ' ', fields[1])
    tokens = [
        token
        for piece in cleaned.split(' ')
        if piece != ''
        for token in jieba.cut(piece)
    ]
    return fields[0], tokens, fields[2]

In [5]:
def constructDataSet(data_path,model_tag):
    """Build the feature list X and the label matrix Y from one data file.

    Args:
        data_path: path of a UTF-8 text file with one tab-separated sample per
            line, in the format expected by line_processing().
        model_tag: key into `models` selecting the Doc2Vec model used to embed
            the tokenized text.

    Returns:
        X: list with one inferred vector per line of the file.
        Y: numpy array built from the whitespace-separated integer labels of
            every line and then transposed, so Y[k] holds the k-th label value
            of all samples.
    """
    # Use a context manager so the handle is released even if parsing or
    # vector inference below raises; the file is closed as soon as it is read.
    with open(data_path,'r',encoding='utf-8') as data_file:
        lines = data_file.read().splitlines()

    d2v = models[model_tag]
    X=[]
    Y=[]
    for line in lines:
        _,tokens,raw_labels = line_processing(line)
        X.append(d2v.infer_vector(tokens))
        Y.append(list(map(int,raw_labels.split())))
    # Rows become labels, columns become samples.
    Y = np.array(Y).transpose()
    return X,Y

In [6]:
def splitDataSet(X,Y):
    """Shuffle and split X and Y into training and test parts.

    The test share is taken from the module-level `test_ratio`.

    Returns:
        (X_train, X_test, Y_train, Y_test)
    """
    return tuple(train_test_split(X, Y, test_size=test_ratio, shuffle=True))

In [7]:
def trainSVM(X,Y,model_path):
    """Train one SVC on a random split of (X, Y), save it, and print its score.

    Args:
        X: feature vectors.
        Y: target values for a single label, aligned with X.
        model_path: file the fitted classifier is dumped to with joblib.

    The classifier is fitted on the training part only; the value printed next
    to the path is SVC.score() on the held-out part.
    """
    train_x, test_x, train_y, test_y = splitDataSet(X, Y)
    svm = SVC(gamma='auto').fit(train_x, train_y)
    # NOTE(review): with rare labels a high accuracy can come from predicting
    # the majority class only -- consider also reporting recall/F1.
    held_out_accuracy = svm.score(test_x, test_y)
    joblib.dump(svm, model_path)
    print(model_path, held_out_accuracy)

In [8]:
def beginTrain():
    """Train and save one classifier per label for each of the three categories.

    For every category id 1..3 the data set is built once, the label indices
    are visited in random order, and the classifier for label index j is
    written to "<category dir>/<j+1>.model". Progress is printed along the way.
    """
    for category_id in range(1, 4):
        # The directory part of the data path doubles as the output directory.
        category_dir = data_input[category_id].split("/")[0]
        print(category_dir)
        X, Y = constructDataSet(data_input[category_id], category_id)

        label_order = list(range(len(Y)))
        random.shuffle(label_order)
        for label_idx in label_order:
            model_path = '{}/{}.model'.format(category_dir, label_idx + 1)
            print(model_path)
            trainSVM(X, Y[label_idx], model_path)

In [9]:
def predictSingleLabel(x,model_type,label_tag):
    """Predict one label for a single feature vector.

    Args:
        x: feature vector of one document.
        model_type: category directory name (e.g. "divorce") holding the
            saved classifiers.
        label_tag: zero-based label index; classifier files are numbered
            from 1, so index k maps to file number k+1.

    Returns:
        The result of model.predict([x]) (an array-like with one prediction).
    """
    import os  # local import: only needed to pick the classifier file

    label_number = str(label_tag+1)
    # beginTrain() saves classifiers as "<category>/<n>.model". This function
    # used to look for "<category>/label<n>.model", a name training never
    # writes, so prediction failed on freshly trained models.
    model_path = model_type+'/'+label_number+'.model'
    # Keep working with files saved under the old "label" prefix, if any.
    legacy_path = model_type+'/'+'label'+label_number+'.model'
    if not os.path.exists(model_path) and os.path.exists(legacy_path):
        model_path = legacy_path
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model = joblib.load(model_path)
    return model.predict([x])

In [10]:
def predict(text,model_type):
    """Predict all `label_size` labels for a raw text.

    Args:
        text: raw input string.
        model_type: category id used as key into both `models` and `data_type`.

    The text is cleaned of the characters in `punction`, segmented with jieba,
    embedded with the category's Doc2Vec model, and passed to each per-label
    classifier in turn.

    Returns:
        A list with one prediction per label index 0..label_size-1.
    """
    pieces = re.sub(r'[{}]'.format(punction), ' ', text).split(' ')
    tokens = [token for piece in pieces for token in jieba.cut(piece)]
    vector = models[model_type].infer_vector(tokens)
    return [
        predictSingleLabel(vector, data_type[model_type], label_idx)[0]
        for label_idx in range(label_size)
    ]

In [13]:
# For every category, report per label how many samples carry it and which
# share of the samples that is; then report how many samples carry at least
# one label.
for category_id in range(1, 4):
    print(data_input[category_id].split('/')[0])
    _, label_rows = constructDataSet(data_input[category_id], category_id)

    # One row per label: index, positive count, sample count, positive share.
    for label_idx, row in enumerate(label_rows):
        print(label_idx, sum(row), len(row), sum(row/len(row)))
    print()

    # Transpose back to one row per sample to count samples with any label set.
    per_sample = np.array(label_rows).transpose()
    labelled = sum(1 for sample_labels in per_sample if np.sum(sample_labels) > 0)
    print(labelled, len(per_sample), labelled/len(per_sample))

divorce
0 7524 35067 0.21456069809225228
1 4842 35067 0.13807853537513995
2 3671 35067 0.10468531667949789
3 1885 35067 0.05375424187983277
4 1563 35067 0.044571819659510556
5 1283 35067 0.036587104685317326
6 1161 35067 0.03310805030370456
7 1117 35067 0.03185330937918848
8 1103 35067 0.03145407363047882
9 916 35067 0.026121424701286106
10 382 35067 0.010893432571933726
11 549 35067 0.01565574471725554
12 476 35067 0.013574015456126841
13 323 35067 0.009210939059514643
14 199 35067 0.005674850999515213
15 213 35067 0.006074086748224826
16 130 35067 0.003707189095160692
17 179 35067 0.0051045142156443376
18 220 35067 0.0062737046225796326
19 208 35067 0.005931502552257107

15950 35067 0.4548435851370234
labor
0 2104 29326 0.07174520902952798
1 1495 29326 0.050978653754345744
2 1471 29326 0.05016026733956031
3 1046 29326 0.03566800791106828
4 965 29326 0.03290595376116744
5 942 29326 0.032121666780331404
6 848 29326 0.02891631998908822
7 830 29326 0.028302530177999084
8 590 29326 0.0201

In [12]:
beginTrain()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


divorce


Loading model cost 0.922 seconds.
Prefix dict has been built succesfully.


divorce/6.model
divorce/6.model 0.9622659443018724
divorce/17.model
divorce/17.model 0.9972436080220511
divorce/20.model
divorce/20.model 0.993821880049425
divorce/10.model
divorce/10.model 0.9724360802205113
divorce/5.model
divorce/5.model 0.956943256344454
divorce/16.model
divorce/16.model 0.9937268320501853
divorce/4.model
divorce/4.model 0.9446820644425434
divorce/14.model
divorce/14.model 0.9901150080790799
divorce/9.model
divorce/9.model 0.9703450242372398
divorce/12.model
divorce/12.model 0.9837467921300257
divorce/3.model
divorce/3.model 0.9073282007413744
divorce/8.model
divorce/8.model 0.9686341602509267
divorce/1.model
divorce/1.model 0.8388936412888509
divorce/15.model
divorce/15.model 0.993821880049425
divorce/7.model
divorce/7.model 0.9688242562494059
divorce/11.model
divorce/11.model 0.9906852960745176
divorce/19.model
divorce/19.model 0.9936317840509458
divorce/18.model
divorce/18.model 0.9937268320501853
divorce/2.model
divorce/2.model 0.8928809048569527
divorce/13.mod