In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
from SVM_DataProcess import prepare_data, build_word2vec
from SVM_Config import Config

splist = []
# 构建word2id词典
word2id = {}
with open(Config.word2id_path, encoding='utf-8') as f:
    for line in f.readlines():
        sp = line.strip().split()  # 去掉\n \t 等
        splist.append(sp)
    word2id = dict(splist)  # 转成字典

# 转换索引的数据类型为整数
for key in word2id:
    word2id[key] = int(word2id[key])

# 构建id2word
id2word = {}
for key, val in word2id.items():
    id2word[val] = key

# 构建word2vec词向量
w2vec = build_word2vec(Config.pre_word2vec_path, word2id, None)

print("id2word.shape", id2word)
print("w2Vec.shape: ", w2vec.shape) # w2Vec.shape:  (54848, 50)

# 得到数字索引表示的句子和标签
train_array, train_lable, val_array, val_lable, test_array, test_lable = prepare_data(word2id,
                                                                                      train_path=Config.train_path,
                                                                                      val_path=Config.val_path,
                                                                                      test_path=Config.test_path,
                                                                                      seq_lenth=Config.max_sen_len)

print("train_array.shape", train_array.shape)
print("test_array.shape", test_array.shape)
train_lable = train_lable.ravel()
val_lable = val_lable.ravel()
test_lable = test_lable.ravel()

id2word.shape {0: '_PAD_', 1: '死囚', 2: '爱', 3: '刽子手', 4: '女贼', 5: '衙役', 6: '我们', 7: '你们', 8: '难道', 9: '还有', 10: '别的', 11: '选择', 12: '没想到', 13: '胡军', 14: '除了', 15: '蓝宇', 16: '东宫', 17: '西宫', 18: '我', 19: '个', 20: '去', 21: '阿兰', 22: '这样', 23: '真', 24: '他', 25: '恶心', 26: '爱个', 27: '分明', 28: '只是', 29: '欲', 30: '其实', 31: '对', 32: '锦衣卫', 33: '爱情', 34: '很萌', 35: '因为', 36: '很', 37: '言情小说', 38: '可惜', 39: '女主角', 40: '我要', 41: '不是', 42: '被', 43: '乔花', 44: '偷', 45: '令牌', 46: '青龙', 47: '吃醋', 48: '想出', 49: '箭', 50: '那里', 51: '萌到', 52: '让', 53: '想起', 54: '雏菊', 55: '里', 56: '郑', 57: '大叔', 58: '徐子珊', 59: '吴尊', 60: '真是', 61: '可怕', 62: '他们', 63: '完全', 64: '电影', 65: '料', 66: '脱脱', 67: '这个', 68: '名字', 69: '想要', 70: '雷死', 71: '观众', 72: '导演', 73: '到底', 74: '想', 75: '什么', 76: '剧情', 77: '混乱', 78: '老套', 79: '无趣', 80: '对白', 81: '更是', 82: '白痴', 83: '失望', 84: '两星', 85: '半', 86: '小', 87: '明星', 88: '本色', 89: '出演', 90: '老', 91: '演员', 92: '自己', 93: '发挥', 94: '基本上', 95: '王力宏', 96: '表演', 97: '指导', 98: '上', 99: '没有', 100:

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
import joblib

# 创建SVM模型
svm_model = SVC(C=1.0, kernel='rbf')

# 执行交叉验证
scores = cross_val_score(svm_model, train_array, train_lable, cv=5)

# 输出每次交叉验证的准确率
print("Cross-validation scores:", scores)

# 输出平均准确率
print("Average accuracy:", scores.mean())

# 在整个训练集上重新训练模型
svm_model.fit(train_array, train_lable)

# 保存模型
joblib.dump(svm_model, 'SVM_model.pkl')

# 在测试集上进行预测
test_pred = svm_model.predict(test_array)

# 计算预测准确率和f1值
accuracy = accuracy_score(test_lable, test_pred)
f1 = f1_score(test_lable, test_pred, average='micro')

print(f"Test accuracy: {accuracy}")
print(f"Test f1: {f1}")

Cross-validation scores: [0.79175    0.8465     0.84525    0.71567892 0.62565641]
Average accuracy: 0.7649670667666916
Test accuracy: 0.5745257452574526
Test f1: 0.5745257452574526
