In [1]:

"""
1，词到文档
2，数据的分配
3，训练
4，结果
"""

# -*- coding: utf-8 -*-
import os,sys,pymysql
import numpy as np
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.naive_bayes import GaussianNB,ComplementNB,BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [2]:
# 文档向量研究

model = word2vec.Word2Vec.load('Zi_vec.bin')

# 直接横向相加
def wv2dv(t):
    text=""
    for uchar in t:
        if (uchar >= '\u4e00' and uchar <= '\u9fa5'):
            text+=uchar
    lth = len(text)
    dv = np.zeros(100)
    for i in text:
        try:
            dv += model.wv[i]
        except:
            lth-=1
    l = max(1,lth)
    dv = dv/l
    return dv

# 纵向扩展为400维再横向相加
def wv2dv400(t):
    text=""
    for uchar in t:
        if (uchar >= '\u4e00' and uchar <= '\u9fa5'):
            text+=uchar
    lth = len(text)
    sp,md,lp = int(lth/4),int(lth/2),int(lth/4*3)
    ta,tb,tc,td = text[:sp],text[sp:md],text[md:lp],text[lp:]
    dv = np.array([])
    for i in [ta,tb,tc,td]:
        j = wv2dv(i)
        dv = np.hstack((dv,j))
    return dv

x=wv2dv400('饮马渡秋水，水寒风似刀。平沙日未没，黯黯见临洮。昔日长城战，咸言意气高。黄尘足今古，白骨乱蓬蒿。')
y=wv2dv('黑云压城城欲摧，甲光向日金鳞开。角声满天秋色里，塞上燕脂凝夜紫。半卷红旗临易水，霜重鼓寒声不起。报君黄金台上意，提携玉龙为君死。')
y=wv2dv('一半残阳下小楼，朱帘斜控软金钩。倚阑无绪不能愁。有个盈盈骑马过，薄妆浅黛亦风流。见人羞涩却回头。')

def cos_sim(x,y):
    A=np.mat(x)
    B=np.mat(y)
    num = float(A * B.T) 
    denom = np.linalg.norm(A) * np.linalg.norm(B)  
    cos = num / denom #余弦值
    return cos


In [3]:
# database part
CONN = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1234', db='topic')
cur = CONN.cursor()

def fromdb(topic):
    rs=[]
    for i in range(1,900):
        try:
            sql = "select content from %s where id=%s"%(topic,i)
            cur.execute(sql)
            text = cur.fetchall()[0][0]
            rs.append(text)
        except:
            pass
    return rs

In [38]:
# 训练集和测试集

topic_dic = {
    'war':      "战争边塞",
    'scene':    "山水景致",
    'farewell': "离别送别",
    'travel':   "行旅思乡",
    # 'history':  "咏史怀古",
    'love':     "爱情闺怨",
}

x_raw = []
y_raw = []
topics = ['war','scene','farewell','travel','love']
for index in range(5):
    at = fromdb(topics[index])
    for i in at:
        j = np.hstack((wv2dv(i),np.array(index/4)))
        #j = wv2dv(i) # 向量化
        x_raw.append(j)
        y_raw.append(topics[index])
x = np.array(x_raw)
y = np.array(y_raw)
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.25,random_state=6) # 存在切分不均的问题


In [32]:
# 贝叶斯
Bayes = GaussianNB()
Bayes = BernoulliNB()
Bayes.fit(xtrain,ytrain)

ypred = Bayes.predict(xtest)
accuracy = (ytest==ypred).sum()/xtest.shape[0]
print('accuracy:',accuracy)
print(classification_report(ytest,ypred,target_names=['war','scene','farewell','travel','love']))

accuracy: 0.5517826825127334
              precision    recall  f1-score   support

         war       0.56      0.46      0.51       189
       scene       0.59      0.62      0.60       138
    farewell       0.65      0.54      0.59       138
      travel       0.38      0.52      0.44        87
        love       0.59      0.92      0.72        37

    accuracy                           0.55       589
   macro avg       0.55      0.61      0.57       589
weighted avg       0.56      0.55      0.55       589



In [36]:
knn = KNeighborsClassifier(n_neighbors=25, weights='uniform')
knn.fit(xtrain, ytrain)

ypred = knn.predict(xtest)
accuracy = (ytest==ypred).sum()/xtest.shape[0]
print('accuracy:',accuracy)
print(classification_report(ytest,ypred,target_names=['war','scene','farewell','travel','love']))

accuracy: 0.7419354838709677
              precision    recall  f1-score   support

         war       0.58      0.95      0.72       189
       scene       0.93      0.91      0.92       138
    farewell       0.92      0.57      0.71       138
      travel       0.84      0.24      0.38        87
        love       0.89      0.89      0.89        37

    accuracy                           0.74       589
   macro avg       0.83      0.71      0.72       589
weighted avg       0.80      0.74      0.72       589



In [39]:
params = {'kernel':'linear'}#,'class_weight':'balanced'}
params = {'kernel':'rbf'}
SVM = SVC(**params,gamma='scale')
SVM.fit(xtrain, ytrain)

ypred = SVM.predict(xtest)
accuracy = (ytest==ypred).sum()/xtest.shape[0]
print('accuracy:',accuracy)
print(classification_report(ytest,ypred,target_names=['war','scene','farewell','travel','love']))

confusion_matrix(ytest,ypred)

accuracy: 0.8505942275042445
              precision    recall  f1-score   support

         war       0.80      0.88      0.84       189
       scene       0.94      0.95      0.94       138
    farewell       0.89      0.82      0.85       138
      travel       0.75      0.67      0.71        87
        love       0.91      0.86      0.89        37

    accuracy                           0.85       589
   macro avg       0.86      0.84      0.85       589
weighted avg       0.85      0.85      0.85       589



array([[167,   0,  10,  12,   0],
       [  0, 131,   0,   7,   0],
       [ 22,   0, 113,   0,   3],
       [ 20,   9,   0,  58,   0],
       [  1,   0,   4,   0,  32]], dtype=int64)

In [24]:
# 测试加权向量空间准确率

co1 = '战 兵 征 将 驱 匈 酋 俘 羌 胡 烟 刀 剑 枪 弓 虏 鏖 寨 霸 敌 陷 戍 革 慑 烽 堠 垒 寨 燧 柝 塞 \
碛 笳 军 氐 戈 擒 贼 漠 朔 雁 金 铁 马 孤 沙 城'
war = []
t = co1.split(" ")
for i in t:
    war.append(i)

co1 = '咏 史 怀 古 赋 讽 颂 唱 诵 句 雅 写 录 守 典 简 谱 念 晏 功 名 城 桥 淮 讴 宫 庭 祠 兴 亡 悼 \
景 慨 泯 废 存 丧 殁 遗 陈 罪 患 悔 忿 朽 国 吴 楚 商 周 秦 汉 魏 蜀 祚 据 遗 仲' 
history = []
t = co1.split(" ")
for i in t:
    history.append(i)    

co1 = '雨 雪 寒 蝉 蝶 菜 畦 苗 冷 秋 凉 霜 晴 境 色 丽 景 月 晓 梢 皑 雾 花 樱 梢 芳 鸟 峰 山 云 柳 \
岚 霭 峦 层 虹 嶂 霞 雾 杏 绿 树 杨 堤 湾 港 溪 隄'
scene = []
t = co1.split(" ")
for i in t:
    scene.append(i)    

co1 = '别 离 辞 送 去 到 过 逐 恨 向 路 归 话 客 离 暌 怆 亭 钟 驿 酒 旅 愁 怆 悽 伤 道 愁 肠 江 夜 月 \
枫 叶 寄 迹 故 村 晚 渡 江 猿 啸 楚 风 浮 孤 空 尽 流'
bye = []
t = co1.split(" ")
for i in t:
    bye.append(i)    

co1 = '月 书 思 怀 忆 愁 情 期 忘 悽 念 逢 客 家 邦 归 还 乡 行 程 途 停 还 游 蹰 来 寻 徉 俳 驿 旅 枫 \
叶 邮 渡 路 羁 故 旧 寄 迹 村 浮 舟 梦 灯 夜'
trip = []
t = co1.split(" ")
for i in t:
    trip.append(i)    

co1 = '盼 月 残 丝 宵 倚 袂 绡 夫 婿 嫂 嫁 香 鸳 鸯 彩 绣 怜 桃 花 扇 蛾 红 郎 爱 惜 好 怀 心 愁 悽 \
缱 思 意 闺 帏 怨 诉 恨 泣 哀 哭 叹 悲 君 思 姬 妃 婷 伶 裙 衫 绫 襦 裀 绉 绡 鱼 寄 眉 怜 卿 佳 鹊 柔'
love = []
t = co1.split(" ")
for i in t:
    love.append(i)
    
def cutp(poet):
    return [letter for letter in poet if letter not in ['，','。',' ','！','；']]

def count(title,target,topic):
    simi = []
    title_similarity = 0
    if title != '':
        for i in title:
            distance = []
            for t in topic:
                distance.append(min(model.wv.similarity(i,t),0.8))
            dis = 0
            distancelist = sorted(distance)[-10:]
            for wt in range(10):
                dis += (wt/10+1)*distancelist[wt]
            simi.append(dis)
        title_similarity = sum(simi)/len(simi)*3
        
    simi = []
    wid = int(len(target)/4)
    for i in target:
        distance = []
        for t in topic:
            distance.append(min(model.wv.similarity(i,t),0.8))
        dis = 0
        distancelist = sorted(distance)[-10:]
        for wt in range(10):
            dis += (wt/10+1)*distancelist[wt]
        simi.append(dis)
    similarity = sum(sorted(simi)[-wid:])/wid*10
    return round(title_similarity+similarity,2)#,round(title_similarity,3),round(similarity,3)

def classify(title,poet):
    target = cutp(poet)
    re = {#'咏史怀古':count(title,target,history),
    'war':count(title,target,war),
    'scene':count(title,target,scene),
    'farewell':count(title,target,bye),
    'travel':count(title,target,trip),
    'love':count(title,target,love)}
    return re

title= '望岳'

content = '岱宗夫如何？齐鲁青未了。\
造化钟神秀，阴阳割昏晓。\
荡胸生曾云，决眦入归鸟。\
会当凌绝顶，一览众山小。'

classify(title,content)


{'war': 53.58,
 'scene': 67.74,
 'farewell': 59.21,
 'travel': 52.4,
 'love': 46.46}