# 所有模型的组合

In [7]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import jieba
from tqdm import tqdm
from sklearn.metrics import roc_curve, auc
import joblib
from sklearn.ensemble import RandomForestClassifier
import gensim
from sklearn.svm import SVC
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import shap

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [3]:
# Matplotlib text settings: use the SimHei font family (presumably so that
# Chinese labels render) and turn off the Unicode minus glyph on axes.
plt.rcParams.update({
    "font.sans-serif": ['Simhei'],
    "axes.unicode_minus": False,
})

### 数据的处理
* 过滤掉所有特殊字符，只保留所有中文字符

In [4]:
def data_process(index_path='label/index', max_files=5000):
    """Load message texts and their labels from an index file.

    Every usable index line has the form ``<label> <path>`` (separated by a
    single space). Lines that do not contain both fields, such as blank
    lines, are skipped instead of raising ``IndexError``.

    Args:
        index_path: Path of the index file. Defaults to ``'label/index'``.
        max_files: Maximum number of index lines to read; ``None`` reads all
            of them. Defaults to 5000.

    Returns:
        A ``(data, label_list)`` tuple. ``data[i]`` is the text of the i-th
        message reduced to the characters in the range U+4E00..U+9FA5, and
        ``label_list[i]`` is 0 when the index label is ``'spam'`` and 1
        otherwise.

    Raises:
        OSError: If the index file or one of the listed files cannot be opened.
    """
    label_list = []
    text_file_list = []
    with open(index_path, encoding='utf-8') as index_file:
        index_lines = index_file.read().splitlines()[:max_files]

    for entry in index_lines:
        fields = entry.split(" ")
        if len(fields) < 2:
            # Blank or malformed line: nothing to load.
            continue
        label_list.append(0 if fields[0] == 'spam' else 1)
        # The first three characters of the path field are dropped
        # (presumably a leading '../' -- confirm against the index file).
        text_file_list.append(fields[1][3:])

    data = []
    for file_path in text_file_list:
        # Undecodable bytes are ignored rather than aborting the whole load.
        with open(file_path, errors='ignore', encoding='utf-8') as text_file:
            raw_text = text_file.read()
        data.append("".join(re.findall('[\u4e00-\u9fa5]', raw_text)))
    return data, label_list

# Load the corpus: cleaned message texts and their labels (0 = spam, 1 = anything else).
train_text_data, train_text_data_label = data_process()

### 利用TF-IDF来进行分词处理

### 利用Word2Vec来进行词向量表示（分词由jieba完成）
* 和TF-IDF形成对比

In [5]:
# Build the Word2Vec training corpus: one message per line, tokens separated
# by spaces. The file is opened with "w" so that re-running this cell rewrites
# the corpus instead of appending a duplicate copy of every message.
with open("word2vec_txt.txt", "w", encoding='utf-8') as corpus_file:
    for text in tqdm(train_text_data):
        text = "".join(re.findall('[\u4e00-\u9fa5]', str(text)))
        corpus_file.write(" ".join(jieba.cut(text, cut_all=False)))
        corpus_file.write("\n")

# Passing the path (not an open handle) lets LineSentence manage the file
# itself, so no file object is left unclosed.
model = Word2Vec(LineSentence('word2vec_txt.txt'), sg=0, vector_size=64, window=3,
                 min_count=3, workers=4)
# Persist the trained model.
model.save('test.model')

# Reload the model from disk and inspect its vocabulary.
model = gensim.models.Word2Vec.load('test.model')
dic = model.wv.index_to_key
print(dic)
print(len(dic))

  0%|          | 0/5000 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\94843\AppData\Local\Temp\jieba.cache
Loading model cost 0.559 seconds.
Prefix dict has been built successfully.
100%|██████████| 5000/5000 [00:08<00:00, 567.16it/s]


['的', '我', '是', '了', '在', '公司', '你', '和', '有', '与', '他', '就', '管理', '不', '也', '都', '企业', '可以', '您', '人', '等', '一个', '为', '她', '我们', '或', '发票', '说', '自己', '对', '中国', '可', '很', '还', '有限公司', '及', '到', '服务', '上', '会', '工作', '中', '咨询', '合作', '要', '做', '本', '来', '没有', '如果', '给', '信息', '去', '多', '好', '注册', '优惠', '什么', '请', '元', '如何', '把', '能', '月', '让', '更', '网站', '联系人', '设计', '后', '如', '这', '广告', '贵', '年', '这个', '朋友', '想', '培训', '分析', '将', '就是', '产品', '现在', '提供', '您好', '问题', '项目', '时间', '他们', '左右', '电脑', '需要', '发展', '我司', '客户', '每月', '课程', '经理', '并', '吧', '而', '技术', '深圳市', '点数', '时候', '啊', '不是', '知道', '财务', '个', '以', '深圳', '业务', '方面', '实业', '北京', '电话', '但', '觉得', '代开', '专业', '方法', '生产', '从', '最', '过', '全套', '案例', '看', '代理', '增值税', '软件', '邮件', '又', '所', '国际', '负责人', '上海', '运输', '这样', '手机', '市场', '根据', '希望', '一次', '机会', '因为', '那', '再', '免费', '被', '用', '以上', '网络', '但是', '直接', '承诺', '着', '吗', '还是', '贵司', '向', '系统', '部分', '进行', '大', '上网', '销售', '小时', '呢', '能够', '进项', '新', '此', '方式', '使用', '得', '广

### 构建训练集和测试集

In [None]:
def _mean_word_vector(text, word_model):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of ``text``.

    The text is tokenised with ``jieba.cut``; tokens missing from the model's
    vocabulary are skipped. When no token is in the vocabulary the result is
    an all-zero vector. The vector length is taken from the model itself.
    """
    total = np.zeros(word_model.wv.vector_size)
    count = 0
    for word in jieba.cut(text, cut_all=False):
        try:
            total = total + word_model.wv[word]
        except KeyError:
            continue
        count += 1
    if count != 0:
        total /= count
    return total


# 80/20 train/test split; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train_text_data, train_text_data_label, test_size=0.2, random_state=42)

# Each message is represented by the mean of its word vectors.
train_vec = [_mean_word_vector(line, model) for line in x_train]
train_vec_label = list(y_train)

test_vec = [_mean_word_vector(line, model) for line in x_test]
test_vec_label = list(y_test)

print(train_vec[:10])
print('*************************************************************')
print(test_vec[:10])

### 使用随机森林机器学习算法来进行分类

In [27]:
print('使用随机森林模型')

# Fit a default random forest on the averaged word vectors, then persist it.
random_forest_model = RandomForestClassifier()
random_forest_model.fit(train_vec, y_train)
joblib.dump(random_forest_model, 'random_forest.pkl', compress=3)

# Evaluate on the held-out vectors; precision/recall/F1 are macro-averaged.
y_pred = random_forest_model.predict(test_vec)
random_forest_accuracy = accuracy_score(y_test, y_pred)
random_forest_precision, random_forest_recall, random_forest_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    f'Accuracy: {random_forest_accuracy}',
    f'precision: {random_forest_precision}',
    f'Recall: {random_forest_recall}',
    f'F1 Score: {random_forest_f1}',
    sep='\n',
)
# Per-class probabilities for every test sample.
print(random_forest_model.predict_proba(test_vec))

使用随机森林模型
Accuracy: 0.975
precision: 0.9774816176470589
Recall: 0.9663898552351027
F1 Score: 0.9716223502369534
[[0.05       0.95      ]
 [0.04       0.96      ]
 [0.93816586 0.06183414]
 ...
 [0.98       0.02      ]
 [1.         0.        ]
 [1.         0.        ]]


In [8]:
# pred_probas = clf.predict_proba(test_vec)[:, 1]
# fpr, tpr, _ = roc_curve(y_test, pred_probas)
# roc_auc = auc(fpr, tpr)
# plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
# # 保存到csv文件
# df_tmp = pd.DataFrame({'fpr': fpr, 'tpr': tpr, })
# df_tmp.to_csv('plot.csv', index=False, encoding='utf_8_sig')
# # 绘制折线图
# plt.plot([0, 1], [0, 1], 'k--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.legend(loc='lower right')
# plt.savefig('ROC曲线.png')
# plt.show()

### 使用决策树模型

In [9]:
from sklearn.tree import DecisionTreeClassifier

print('使用决策树模型')

# Fit a default decision tree on the averaged word vectors, then persist it.
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(train_vec, y_train)
joblib.dump(decision_tree_model, 'decision_tree.pkl', compress=3)

# Evaluate on the held-out vectors; precision/recall/F1 are macro-averaged.
y_pred = decision_tree_model.predict(test_vec)
decision_tree_accuracy = accuracy_score(y_test, y_pred)
decision_tree_precision, decision_tree_recall, decision_tree_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    f'Accuracy: {decision_tree_accuracy}',
    f'precision: {decision_tree_precision}',
    f'Recall: {decision_tree_recall}',
    f'F1 Score: {decision_tree_f1}',
    sep='\n',
)

使用决策树模型
Accuracy: 0.966
precision: 0.9667506274115774
Recall: 0.9566603074851308
F1 Score: 0.961436556329849


### 使用支持向量机模型

In [31]:
from sklearn.svm import SVC

print('使用支持向量机模型')

# Fit an SVC with probability estimates enabled, then persist it.
SVC_model = SVC(probability=True)
SVC_model.fit(train_vec, y_train)
joblib.dump(SVC_model, 'SVC.pkl', compress=3)

# Evaluate on the held-out vectors; precision/recall/F1 are macro-averaged.
y_pred = SVC_model.predict(test_vec)
SVC_accuracy = accuracy_score(y_test, y_pred)
SVC_precision, SVC_recall, SVC_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    f'Accuracy: {SVC_accuracy}',
    f'precision: {SVC_precision}',
    f'Recall: {SVC_recall}',
    f'F1 Score: {SVC_f1}',
    sep='\n',
)
# Per-class probabilities for every test sample.
print(SVC_model.predict_proba(test_vec))

使用支持向量机模型
Accuracy: 0.974
precision: 0.9736085172512507
Recall: 0.9678599483784087
F1 Score: 0.9706465437434237
[[1.31393720e-02 9.86860628e-01]
 [1.00328129e-02 9.89967187e-01]
 [9.63717771e-01 3.62822289e-02]
 ...
 [9.74744443e-01 2.52555573e-02]
 [9.87166196e-01 1.28338036e-02]
 [9.99973121e-01 2.68786824e-05]]


### 使用朴素贝叶斯模型

In [11]:
from sklearn.naive_bayes import GaussianNB

print('使用朴素贝叶斯模型')

# Fit a Gaussian naive Bayes model on the averaged word vectors, then persist it.
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(train_vec, y_train)
joblib.dump(naive_bayes_model, 'naive_bayes.pkl', compress=3)

# Evaluate on the held-out vectors; precision/recall/F1 are macro-averaged.
y_pred = naive_bayes_model.predict(test_vec)
naive_bayes_accuracy = accuracy_score(y_test, y_pred)
naive_bayes_precision, naive_bayes_recall, naive_bayes_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    f'Accuracy: {naive_bayes_accuracy}',
    f'precision: {naive_bayes_precision}',
    f'Recall: {naive_bayes_recall}',
    f'F1 Score: {naive_bayes_f1}',
    sep='\n',
)

使用朴素贝叶斯模型
Accuracy: 0.907
precision: 0.892266183904427
Recall: 0.9026708562450902
F1 Score: 0.8970417777895866


### 使用K近邻算法

In [26]:
from sklearn.neighbors import KNeighborsClassifier

print('使用K近邻模型')

# Fit a default k-nearest-neighbours classifier, then persist it.
K_Nearest_model = KNeighborsClassifier()
K_Nearest_model.fit(train_vec, y_train)
joblib.dump(K_Nearest_model, 'K-Nearest.pkl', compress=3)

# Evaluate on the held-out vectors; precision/recall/F1 are macro-averaged.
y_pred = K_Nearest_model.predict(test_vec)
K_Nearest_accuracy = accuracy_score(y_test, y_pred)
K_Nearest_precision, K_Nearest_recall, K_Nearest_f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    f'Accuracy: {K_Nearest_accuracy}',
    f'precision: {K_Nearest_precision}',
    f'Recall: {K_Nearest_recall}',
    f'F1 Score: {K_Nearest_f1}',
    sep='\n',
)
# Per-class probabilities for every test sample.
print(K_Nearest_model.predict_proba(test_vec))

使用K近邻模型
Accuracy: 0.981
precision: 0.9804387155133424
Recall: 0.9768263943440691
F1 Score: 0.9785981808453719
[[0. 1.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


### 集成学习方法进行判断

#### 使用Voting软投票的方式去集成学习

In [22]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib

from sklearn.pipeline import Pipeline

# Standardise the features; the scaler is fitted on the training vectors only.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_vec)
X_test = scaler.transform(test_vec)

# Base estimators.
knn = KNeighborsClassifier(n_neighbors=5)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svc = SVC(probability=True, kernel='linear', random_state=42)

# Soft voting combines the base estimators' predicted probabilities.
voting_clf = VotingClassifier(
    estimators=[('knn', knn), ('rf', rf), ('svc', svc)],
    #estimators=[('knn', knn), ('rf', rf)],
    #estimators=[('knn', knn), ('svc', svc)],
    #estimators=[('rf', rf), ('svc', svc)],
    voting='soft'
)

# Train.
voting_clf.fit(X_train, y_train)

# Predict.
y_pred = voting_clf.predict(X_test)

# Persist the fitted scaler together with the classifier. The classifier was
# trained on standardised features, so a saved model without its scaler would
# receive unscaled vectors when it is loaded and used for prediction.
joblib.dump(
    Pipeline([('scaler', scaler), ('voting', voting_clf)]),
    'Voting_model.pkl',
    compress=3,
)

# Evaluate; precision/recall/F1 are macro-averaged.
Voting_accuracy = accuracy_score(y_test, y_pred)
Voting_precision = precision_score(y_test, y_pred, average='macro')
Voting_recall = recall_score(y_test, y_pred, average='macro')
Voting_f1 = f1_score(y_test, y_pred, average='macro')

print(f'Accuracy: {Voting_accuracy}')
print(f'precision: {Voting_precision}')
print(f'Recall: {Voting_recall}')
print(f'F1 Score: {Voting_f1}')

Accuracy: 0.979
precision: 0.9796699495731007
Recall: 0.9731006621030187
F1 Score: 0.9762733044168939


#### 使用Stacking的方式去集成学习

In [25]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib


from sklearn.pipeline import Pipeline

# Standardise the features; the scaler is fitted on the training vectors only.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_vec)
X_test = scaler.transform(test_vec)

# Base estimators.
estimators = [
    #('knn', KNeighborsClassifier(n_neighbors=5)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svc', SVC(probability=True, kernel='linear', random_state=42))
]

# Stacking: a logistic-regression meta-learner on top of the base estimators.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression()
)

# Train.
stacking_clf.fit(X_train, y_train)

# Predict.
y_pred = stacking_clf.predict(X_test)

# Persist the fitted scaler together with the classifier. The classifier was
# trained on standardised features, so a saved model without its scaler would
# receive unscaled vectors when it is loaded and used for prediction.
joblib.dump(
    Pipeline([('scaler', scaler), ('stacking', stacking_clf)]),
    'Stacking_model.pkl',
    compress=3,
)

# Evaluate; precision/recall/F1 are macro-averaged.
Stacking_accuracy = accuracy_score(y_test, y_pred)
Stacking_precision = precision_score(y_test, y_pred, average='macro')
Stacking_recall = recall_score(y_test, y_pred, average='macro')
Stacking_f1 = f1_score(y_test, y_pred, average='macro')

print(f'Accuracy: {Stacking_accuracy}')
print(f'precision: {Stacking_precision}')
print(f'Recall: {Stacking_recall}')
print(f'F1 Score: {Stacking_f1}')


Accuracy: 0.98
precision: 0.9796746678504613
Recall: 0.9753338570306362
F1 Score: 0.9774547290960248


### 使用模型预测

In [None]:
def class_predict(line, word_model, predict_model_name):
    """Classify one piece of text with a classifier loaded from disk.

    The text is tokenised with ``jieba.cut``, the word vectors of the
    in-vocabulary tokens are averaged into one feature row, and that row is
    passed to the classifier stored in ``predict_model_name``.

    Args:
        line: Text to classify.
        word_model: Word2Vec-style model exposing ``wv`` for vector lookup.
        predict_model_name: Path of a classifier saved with ``joblib.dump``.

    Returns:
        The predicted label for ``line`` (first element of ``clf.predict``).
    """
    # Take the vector size from the model instead of hard-coding 64, so the
    # function also works with embeddings of another dimensionality.
    dim = word_model.wv.vector_size
    vec = np.zeros((1, dim))
    count = 0
    for word in jieba.cut(line, cut_all=False):
        try:
            vec = vec + word_model.wv[word].reshape((1, dim))
            count += 1
        except KeyError:
            # Token is not in the vocabulary; leave it out of the average.
            continue
    if count != 0:
        vec /= count
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load model files from a trusted source.
    clf = joblib.load(filename=predict_model_name)

    result = clf.predict(vec)
    return result[0]

### 附件名检查

In [3]:
import os

def is_executable_or_script(filename):
    """Report whether a file name looks like an executable or a script.

    The name is first checked against lists of known extensions (compared
    case-insensitively). Multi-part extensions such as ``.tar.gz`` are matched
    against the end of the name, because ``os.path.splitext`` only returns the
    last suffix (``.gz``) and would never match them. If no extension matches
    and the platform is not Windows, the file's execute permission is checked.

    Args:
        filename: File name or path to inspect.

    Returns:
        A ``(flag, reason)`` tuple: ``flag`` is True when the file is
        considered executable/script-like, ``reason`` is a message explaining
        the decision.
    """
    # Common Windows extensions.
    windows_extensions = [
        ".exe", ".bat", ".cmd", ".msi", ".com", ".vbs", ".ps1", ".wsf", ".scr", ".cpl"
    ]

    # Common Unix/Linux extensions.
    unix_extensions = [
        ".sh", ".bash", ".bin", ".run", ".out", ".py", ".pl", ".php", ".rb",
        ".js", ".cgi", ".ksh", ".zsh", ".tcl", ".lua", ".groovy", ".r", ".awk"
    ]

    # Cross-platform extensions.
    cross_platform_extensions = [
        ".jar", ".class", ".pyc", ".dll", ".so", ".tar.gz", ".deb", ".rpm", ".pkg", ".dmg"
    ]

    known_extensions = windows_extensions + unix_extensions + cross_platform_extensions

    # Last suffix of the name, e.g. ".gz" for "a.tar.gz".
    _, ext = os.path.splitext(filename)

    matched = None
    if ext.lower() in known_extensions:
        matched = ext
    else:
        # Multi-part extensions cannot be produced by splitext; compare them
        # against the tail of the name, keeping the original casing for the message.
        lowered = filename.lower()
        for candidate in known_extensions:
            if candidate.count(".") > 1 and lowered.endswith(candidate):
                matched = filename[-len(candidate):]
                break

    if matched is not None:
        return True, f"文件名以'{matched}'结尾，这是一个常见的脚本或可执行文件扩展名。"

    # No known extension: outside Windows, fall back to the execute permission bit.
    if os.name != 'nt' and os.access(filename, os.X_OK):
        return True, "文件在Unix/Linux系统中有执行权限，可能是一个可执行文件或脚本。"

    return False, "文件没有已知的可执行或脚本文件扩展名，且在Unix/Linux系统中没有执行权限。"

# Example usage
filename = "example.out"
is_script, reason = is_executable_or_script(filename)
print(is_script)  # True
print(reason)     # message reporting that the name ends with the known extension '.out'

True
文件名以'.out'结尾，这是一个常见的脚本或可执行文件扩展名。


### 附件名相关性检查

In [62]:
from numpy import dot
from numpy.linalg import norm


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (array-like).
        vec2: Second vector (array-like) of the same length.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``, or ``0.0`` when
        either vector has zero norm (the ratio is undefined there and would
        otherwise evaluate to NaN).
    """
    denominator = norm(vec1) * norm(vec2)
    if denominator == 0:
        return 0.0
    return dot(vec1, vec2) / denominator

def attach_text_similarity(text, accachment):
    """Measure how many tokens of an attachment name also occur in a text.

    Both strings are tokenised with ``jieba.cut(..., cut_all=True)``. The
    similarity is the fraction of attachment tokens that also appear among
    the text tokens.

    Args:
        text: Message body.
        accachment: Attachment name (parameter spelling kept for callers).

    Returns:
        A ``(is_related, similarity)`` tuple, where ``is_related`` is True
        when ``similarity >= 0.5``. An attachment name that yields no tokens
        gives ``(False, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    attachment_words = list(jieba.cut(accachment, cut_all=True))
    if not attachment_words:
        return False, 0.0

    # Tokenise the text once; a set makes each membership test constant-time
    # instead of re-running the tokenizer for every attachment token.
    text_words = set(jieba.cut(text, cut_all=True))
    count = sum(1 for word in attachment_words if word in text_words)

    similarity = count / len(attachment_words)

    return similarity >= 0.5, similarity
        

In [73]:
# Reload the saved Word2Vec model from disk (the call that follows does not use it).
model = gensim.models.Word2Vec.load('test.model')
# Token overlap between a message body and an attachment name; prints (is_related, similarity).
print(attach_text_similarity('你好，这些是你的成绩单', '成绩报告'))

(True, 0.5)
