### 支持向量机

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score

# 从feature_vectors.csv中导入数据
data = pd.read_csv('features/feature.csv')
# data = pd.read_csv('feature_vectors.csv')

# 划分训练集和测试集
X = data.drop(columns=['label'])  # 特征向量
y = data['label']  # 标签
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# 使用支持向量机（SVM）模型
model = SVC(kernel='linear')

# 训练模型
model.fit(X_train, y_train)

# 在测试集上评估模型性能
y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred, average='weighted')  # 使用加权平均
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"精确度: {precision:.2f}")
print(f"召回率: {recall:.2f}")
print(f"F1 分数: {f1:.2f}")

精确度: 0.89
召回率: 0.88
F1 分数: 0.87


### 贝叶斯

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# 从feature_vectors.csv中导入数据
data = pd.read_csv('features/feature_NB.csv')

# 划分训练集和测试集
X = data.drop(columns=['label'])  # 特征向量
y = data['label']  # 标签
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = GaussianNB()
# model = MultinomialNB()

# 训练模型
model.fit(X_train, y_train)

# 在测试集上评估模型性能
y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred, average='weighted')  # 使用加权平均
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"精确度: {precision:.2f}")
print(f"召回率: {recall:.2f}")
print(f"F1 分数: {f1:.2f}")

精确度: 0.88
召回率: 0.75
F1 分数: 0.73


### 决策树

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier

# 从feature_vectors.csv中导入数据
data = pd.read_csv('features/feature.csv')

# 划分训练集和测试集
X = data.drop(columns=['label'])  # 特征向量
y = data['label']  # 标签
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = DecisionTreeClassifier()

# 训练模型
model.fit(X_train, y_train)

# 在测试集上评估模型性能
y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred, average='weighted')  # 使用加权平均
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"精确度: {precision:.2f}")
print(f"召回率: {recall:.2f}")
print(f"F1 分数: {f1:.2f}")

精确度: 0.85
召回率: 0.85
F1 分数: 0.85


### 随机森林

In [12]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score

# 从feature_vectors.csv中导入数据
data = pd.read_csv('features/feature_NB.csv')

# 划分训练集和测试集
X = data.drop(columns=['label'])  # 特征向量
y = data['label']  # 标签
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# 建立随机森林模型
rfc = RandomForestClassifier(n_estimators=100, random_state=42)  # 可根据需要调整参数
# 训练模型
rfc.fit(X_train, y_train)
# 预测测试集标签
y_pred = rfc.predict(X_test)

# 计算精确度、召回率和 F1 分数
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"精确度: {precision:.2f}")
print(f"召回率: {recall:.2f}")
print(f"F1 分数: {f1:.2f}")

精确度: 0.83
召回率: 0.82
F1 分数: 0.82


### 梯度提升分类器

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier

# 从feature_vectors.csv中导入数据
data = pd.read_csv('features/feature_NB.csv')
# data = pd.read_csv('feature_vectors.csv')

# 划分训练集和测试集
X = data.drop(columns=['label'])  # 特征向量
y = data['label']  # 标签
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# 创建梯度提升分类器
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
# 在训练集上拟合模型
gbc.fit(X_train, y_train)
# 预测测试集标签
y_pred = gbc.predict(X_test)

# 计算精确度、召回率和 F1 分数
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"精确度: {precision:.2f}")
print(f"召回率: {recall:.2f}")
print(f"F1 分数: {f1:.2f}")

精确度: 0.91
召回率: 0.90
F1 分数: 0.90


### 梯度提升算法 适用于高维特征

#### XGBoost

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# 从feature_vectors.csv中导入数据
data = pd.read_csv('features/feature.csv')
# data = pd.read_csv('feature_vectors.csv')

# 划分训练集和测试集
X = data.drop(columns=['label'])  # 特征向量
y = data['label']  # 标签
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# 创建 LabelEncoder 对象
le = LabelEncoder()

# 将字符串标签编码为整数
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# 创建并训练 XGBoost 模型
model = xgb.XGBClassifier()
model.fit(X_train, y_train_encoded)

# 预测测试集标签
y_pred_encoded = model.predict(X_test)

# 计算精确度、召回率和 F1 分数
precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted')
recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted')
f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted')

print(f"精确度: {precision:.2f}")
print(f"召回率: {recall:.2f}")
print(f"F1 分数: {f1:.2f}")

精确度: 0.92
召回率: 0.93
F1 分数: 0.92


#### LightGBM

- 直方图算法（Histogram-based Algorithm）：
LightGBM 的直方图算法可以帮助减少候选分裂点的数量，从而提高计算效率。
你可以考虑使用直方图算法来替代预排序算法，从而加速模型训练。
- GOSS 算法（Gradient-based One-Side Sampling）：
GOSS 是基于梯度的单边采样算法，可以在不改变数据分布的前提下减少样本数量，从而提高训练速度。
你可以尝试使用 GOSS 算法来加速模型训练，特别是对于大规模数据集。
- EFB 算法（Exclusive Feature Bundling）：
EFB 是互斥特征绑定算法，用于减少特征的数量，从而进一步降低数据规模。
如果你的特征维度很高，可以考虑使用 EFB 来合并互斥的特征，从而提高训练效率。

In [2]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

data = pd.read_csv('features/feature.csv')

# 划分训练集和测试集
X = data.drop(columns=['label'])  # 特征向量
y = data['label']  # 标签
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# 创建模型
model = lgb.LGBMClassifier(verbose=-1)
# 训练模型
model.fit(X_train, y_train)

# 预测
y_pred = model.predict(X_test)

# 计算准确率
# accuracy = (y_pred == y_test).mean()
# print(f"Accuracy: {accuracy:.2f}")
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"精确度: {precision:.2f}")
print(f"召回率: {recall:.2f}")
print(f"F1 分数: {f1:.2f}")

精确度: 0.93
召回率: 0.93
F1 分数: 0.93


### 逻辑回归 适合用于二分类

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

# 从feature_vectors.csv中导入数据
data = pd.read_csv('features/feature.csv')

# 划分训练集和测试集
X = data.drop(columns=['label'])  # 特征向量
y = data['label']  # 标签
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# 创建逻辑回归模型
lr = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=100)
# 在训练集上拟合模型
lr.fit(X_train, y_train)
# 预测测试集标签
y_pred = lr.predict(X_test)

# 计算精确度、召回率和 F1 分数
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"精确度: {precision:.2f}")
print(f"召回率: {recall:.2f}")
print(f"F1 分数: {f1:.2f}")

精确度: 0.80
召回率: 0.78
F1 分数: 0.75


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 保存模型/向量化器

In [3]:
import joblib
# 保存模型到文件
model_filename = 'lgb_model.pkl'
joblib.dump(model, model_filename)
# 保存特征向量化器，如果有的话
# scaler_filename = 'lgb_scaler.pkl'
# joblib.dump(scaler, scaler_filename)

print(f"模型已保存到文件：{model_filename}")
# print(f"向量化器已保存到文件：{scaler_filename}")

模型已保存到文件：lgb_model.pkl


### 加载模型/向量化器

In [None]:
import joblib
# 加载模型/向量化器
loaded_model = joblib.load('lgb_model.pkl')
# scaler = joblib.load('nb_scaler.pkl')

# 对模型进行操作...