In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

# Load the train / test splits.
train_data = pd.read_csv('data/soft-label/train-dataset-code-split.csv')
test_data = pd.read_csv('data/soft-label/test-dataset-code-split.csv')

# Meta-feature columns: 'rta-0'..'rta-7' followed by 'llama-0'..'llama-7'.
feature_cols = ['rta-0','rta-1','rta-2','rta-3','rta-4','rta-5','rta-6','rta-7',
                'llama-0','llama-1','llama-2','llama-3','llama-4','llama-5','llama-6','llama-7']

# Split each frame into its feature matrix and its target column.
X_train, y_train = train_data[feature_cols], train_data['label']
X_test, y_test = test_data[feature_cols], test_data['label']

# Hyper-parameter grid explored exhaustively by the search below.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],      # number of trees in the forest
    max_depth=[None, 10, 20, 30],          # maximum depth of each tree
    min_samples_split=[2, 5, 10],          # minimum samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],            # minimum samples required at a leaf
    class_weight=[None, 'balanced'],       # whether to re-weight classes
)

# Base estimator; the seed keeps the forest reproducible across runs.
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search scored by accuracy, using all cores.
grid_search = GridSearchCV(rf, param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation accuracy.
print("最佳参数: ", grid_search.best_params_)
print("最佳得分: ", grid_search.best_score_)

# Score the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("测试集分类报告：")
print(classification_report(y_test, y_pred,digits=4))
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集准确率: {accuracy:.4f}")

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Load the soft-label train / test splits.
train_data = pd.read_csv('data/soft-label/train-dataset-softlabels.csv')
test_data = pd.read_csv('data/soft-label/test-dataset-softlabels.csv')

# Meta-feature columns: 'rta-0'..'rta-7' followed by 'llama-0'..'llama-7'.
feature_cols = ['rta-0','rta-1','rta-2','rta-3','rta-4','rta-5','rta-6','rta-7',
                'llama-0','llama-1','llama-2','llama-3','llama-4','llama-5','llama-6','llama-7']

# Split each frame into its feature matrix and its target column.
X_train, y_train = train_data[feature_cols], train_data['label']
X_test, y_test = test_data[feature_cols], test_data['label']

from sklearn.preprocessing import StandardScaler

# Standardise with statistics learned on the training split only, then apply
# the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyper-parameter grid explored exhaustively by the search below.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],          # inverse regularisation strength
    solver=['liblinear', 'lbfgs'],      # optimisation algorithm
    class_weight=[None, 'balanced'],    # whether to re-weight classes
)

# Base estimator with a raised iteration cap and a fixed seed.
lr = LogisticRegression(max_iter=500, random_state=42)

# 5-fold cross-validated grid search scored by accuracy, using all cores.
grid_search = GridSearchCV(lr, param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation accuracy.
print("最佳参数: ", grid_search.best_params_)
print("最佳得分: ", grid_search.best_score_)

# Score the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\n测试集分类报告：")
print(classification_report(y_test, y_pred,digits=4))
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集准确率: {accuracy:.4f}")


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# Load the soft-label train / test splits.
train_data = pd.read_csv('data/soft-label/train-dataset-softlabels.csv')
test_data = pd.read_csv('data/soft-label/test-dataset-softlabels.csv')

# Meta-feature columns: 'rta-0'..'rta-7' followed by 'llama-0'..'llama-7'.
feature_cols = ['rta-0','rta-1','rta-2','rta-3','rta-4','rta-5','rta-6','rta-7',
                'llama-0','llama-1','llama-2','llama-3','llama-4','llama-5','llama-6','llama-7']

# Split each frame into its feature matrix and its target column.
X_train, y_train = train_data[feature_cols], train_data['label']
X_test, y_test = test_data[feature_cols], test_data['label']

# Candidate values the randomized search samples from.
param_distributions = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[3, 5, 7, 9, 11],
    learning_rate=np.linspace(0.01, 0.2, 10),
    subsample=np.linspace(0.6, 1.0, 5),
    colsample_bytree=np.linspace(0.6, 1.0, 5),
    gamma=np.linspace(0, 0.5, 5),
    reg_alpha=[0, 0.1, 1, 10],
    reg_lambda=[1, 10, 100],
)

# Base XGBoost estimator with a fixed seed.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# 100 sampled configurations, each scored by 5-fold cross-validated accuracy.
random_search = RandomizedSearchCV(xgb, param_distributions,
                                   n_iter=100, scoring='accuracy', cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation accuracy.
print("最佳参数: ", random_search.best_params_)
print("最佳得分: ", random_search.best_score_)

# Score the refitted best estimator on the held-out test split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print("测试集分类报告：")
print(classification_report(y_test, y_pred, digits=4))
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集准确率: {accuracy:.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Load the soft-label train / test splits.
train_data = pd.read_csv('data/soft-label/train-dataset-softlabels.csv')
test_data = pd.read_csv('data/soft-label/test-dataset-softlabels.csv')

# Meta-feature columns: 'rta-0'..'rta-7' followed by 'llama-0'..'llama-7'.
feature_cols = ['rta-0','rta-1','rta-2','rta-3','rta-4','rta-5','rta-6','rta-7',
                'llama-0','llama-1','llama-2','llama-3','llama-4','llama-5','llama-6','llama-7']

# Split each frame into its feature matrix and its target column.
X_train, y_train = train_data[feature_cols], train_data['label']
X_test, y_test = test_data[feature_cols], test_data['label']

# Standardise with statistics learned on the training split only
# (KNN is distance based, so feature scale matters).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyper-parameter grid.
# `p` is only read by the 'minkowski' metric, so crossing it with the other
# metrics just refits identical models. Minkowski with p=1 / p=2 is the same
# distance as 'manhattan' / 'euclidean', so only p=3 adds a new candidate.
# This covers the same distinct models as the full cross product
# (208 candidates instead of 624).
neighbor_counts = list(range(1, 52, 2))   # odd k from 1 to 51
weight_schemes = ['uniform', 'distance']
param_grid = [
    {
        'n_neighbors': neighbor_counts,
        'weights': weight_schemes,
        'metric': ['euclidean', 'manhattan', 'chebyshev'],
    },
    {
        'n_neighbors': neighbor_counts,
        'weights': weight_schemes,
        'metric': ['minkowski'],
        'p': [3],
    },
]

# Base KNN estimator.
knn = KNeighborsClassifier()

# 3-fold cross-validated grid search scored by accuracy, using all cores.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid,
                           scoring='accuracy', cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation accuracy.
# Note: 'p' appears in best_params_ only when the minkowski metric wins.
print("最佳参数: ", grid_search.best_params_)
print("最佳得分: ", grid_search.best_score_)

# Score the refitted best estimator on the held-out test split.
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test_scaled)

print("\n测试集分类报告：")
print(classification_report(y_test, y_pred, digits=4))
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集准确率: {accuracy:.4f}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load the soft-label train / test splits explicitly. Previously this cell
# silently reused whatever `train_data` / `test_data` the last executed cell
# left in the kernel, which breaks when cells are run out of order.
train_data = pd.read_csv('data/soft-label/train-dataset-softlabels.csv')
test_data = pd.read_csv('data/soft-label/test-dataset-softlabels.csv')

# Meta-feature columns: 'rta-0'..'rta-7' followed by 'llama-0'..'llama-7'.
feature_cols = ['rta-0','rta-1','rta-2','rta-3','rta-4','rta-5','rta-6','rta-7',
                'llama-0','llama-1','llama-2','llama-3','llama-4','llama-5','llama-6','llama-7']

# Split each frame into its feature matrix and its target column.
X_train, y_train = train_data[feature_cols], train_data['label']
X_test, y_test = test_data[feature_cols], test_data['label']

# Standardise with statistics learned on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyper-parameter grid. The linear kernel ignores `gamma`, so it gets its own
# sub-grid instead of being refit once per gamma value (95 candidates, not 120).
c_values = [0.01, 0.1, 1, 10, 100]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'C': c_values,
        'gamma': [0.001, 0.01, 0.1, 1, 'scale', 'auto'],
    },
]

# Estimator used during the search. `probability=True` is left out here:
# it adds an internal cross-validated calibration to every fit and the
# accuracy scorer only calls `predict`, which does not use it.
svm = SVC(class_weight='balanced', random_state=42)

# 5-fold cross-validated grid search scored by accuracy. refit=False because
# the final model is fitted explicitly below with probability estimates on.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1, verbose=2,
                           refit=False)

print("开始执行 GridSearchCV 超参数调优...")
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation accuracy.
print("最佳参数: ", grid_search.best_params_)
print("最佳交叉验证得分: ", grid_search.best_score_)

# Fit the final model once with the best parameters. probability=True is
# enabled here so `best_svm` still exposes predict_proba, as before.
best_svm = SVC(class_weight='balanced', probability=True, random_state=42,
               **grid_search.best_params_)
best_svm.fit(X_train_scaled, y_train)
y_pred = best_svm.predict(X_test_scaled)

# Test-set report; accuracy uses four decimals like the other cells.
print("\n测试集分类报告：")
print(classification_report(y_test, y_pred, digits=4))
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集准确率: {accuracy:.4f}")

In [None]:
import optuna
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from functools import partial

# Load the stacking train / test sets.
train_data = pd.read_csv('data/ensemble/stacking/stacking_train_dataset.csv')
test_data = pd.read_csv('data/ensemble/stacking/stacking_test_dataset.csv')

# The meta-features are the two base-model prediction columns.
feature_cols = ['rta_prediction', 'llama_prediction']
X_train, y_train = train_data[feature_cols], train_data['label']
X_test, y_test = test_data[feature_cols], test_data['label']

# Optuna objective for the XGBoost meta-classifier.
def objective(trial, X_train, y_train, X_test, y_test, n_repeats=5):
    """Return the mean cross-validated accuracy of one sampled XGBoost configuration.

    Candidates are scored by cross-validation on the training data only.
    The previous version scored them on the test set, which makes the final
    test accuracy an optimistic estimate. It also refitted the same model with
    the same seed `n_repeats` times, so the averaging added cost without
    adding information.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_train: Training features.
        y_train: Training labels.
        X_test: Unused; kept so existing callers keep working.
        y_test: Unused; kept so existing callers keep working.
        n_repeats: Number of cross-validation folds (values below 2 are
            raised to 2, the minimum a k-fold split supports).

    Returns:
        Mean accuracy over the cross-validation folds.
    """
    # Local import so the function does not depend on the cell's import list.
    from sklearn.model_selection import cross_val_score

    # Hyper-parameter search space.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 11),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 100),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)
    fold_scores = cross_val_score(model, X_train, y_train,
                                  scoring='accuracy', cv=max(2, n_repeats))

    return float(np.mean(fold_scores))  # mean validation accuracy

# Create the Optuna study. The sampler is seeded so that repeated runs
# propose the same sequence of hyper-parameters.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(partial(objective, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test), n_trials=50)

# Report the best configuration and the objective value it reached.
print("最佳参数: ", study.best_params)
print("最佳平均得分: ", study.best_value)

# Train the final model with the best parameters. The fixed settings are
# passed again because study.best_params only holds the sampled ones.
best_params = study.best_params
best_model = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss', random_state=42)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Test-set evaluation.
print("测试集分类报告：")
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集准确率: {accuracy:.4f}")

In [None]:
import optuna
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from functools import partial

# Load the stacking train / test sets.
train_data = pd.read_csv('data/ensemble/stacking/stacking_train_dataset.csv')
test_data = pd.read_csv('data/ensemble/stacking/stacking_test_dataset.csv')

# The meta-features are the two base-model prediction columns.
feature_cols = ['rta_prediction', 'llama_prediction']
X_train, y_train = train_data[feature_cols], train_data['label']
X_test, y_test = test_data[feature_cols], test_data['label']

# Standardise with statistics learned on the training split only, then apply
# the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optuna objective for the KNN meta-classifier.
def objective(trial, X_train, y_train, X_test, y_test, n_repeats=5):
    """Return the mean cross-validated accuracy of one sampled KNN configuration.

    Candidates are scored by cross-validation on the training data only.
    The previous version scored them on the test set, which makes the final
    test accuracy an optimistic estimate. It also refitted the same
    deterministic KNN model `n_repeats` times, so the averaging added cost
    without adding information.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_train: Training features.
        y_train: Training labels.
        X_test: Unused; kept so existing callers keep working.
        y_test: Unused; kept so existing callers keep working.
        n_repeats: Number of cross-validation folds (values below 2 are
            raised to 2, the minimum a k-fold split supports).

    Returns:
        Mean accuracy over the cross-validation folds.
    """
    # Local import so the function does not depend on the cell's import list.
    from sklearn.model_selection import cross_val_score

    # Hyper-parameter search space.
    n_neighbors = trial.suggest_int('n_neighbors', 1, 101, step=2)  # odd k from 1 to 101
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'chebyshev', 'minkowski'])
    # `p` only influences the 'minkowski' metric. It is still sampled on every
    # trial so that study.best_params always contains a 'p' entry.
    p = trial.suggest_int('p', 1, 5)

    knn = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric,
        p=p
    )
    fold_scores = cross_val_score(knn, X_train, y_train,
                                  scoring='accuracy', cv=max(2, n_repeats))

    return float(np.mean(fold_scores))  # mean validation accuracy

# Run the Optuna search. The sampler is seeded so that repeated runs
# propose the same sequence of hyper-parameters.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(partial(objective, X_train=X_train_scaled, y_train=y_train,
                       X_test=X_test_scaled, y_test=y_test),
               n_trials=100)

# Report the best configuration and the objective value it reached.
print("最佳参数: ", study.best_params)
print("最佳平均得分: ", study.best_value)

# Refit a KNN model on the full (scaled) training set with the best parameters.
best_params = study.best_params
best_knn = KNeighborsClassifier(
    n_neighbors=best_params['n_neighbors'],
    weights=best_params['weights'],
    metric=best_params['metric'],
    p=best_params['p']
)
best_knn.fit(X_train_scaled, y_train)
y_pred = best_knn.predict(X_test_scaled)

# Test-set evaluation.
print("\n测试集分类报告：")
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集准确率: {accuracy:.4f}")


# sklearn库 MLPClassifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import itertools

# 1. Load the data sets.
train_data = pd.read_csv('data/soft-label/train-dataset-softlabels.csv')
test_data = pd.read_csv('data/soft-label/test-dataset-softlabels.csv')

# 2. Pre-processing: every column except the last is a feature, and the last
# column is the target.
# NOTE(review): this assumes the label is the final CSV column and that all
# other columns are numeric features - confirm against the file schema.
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

# Standardise the features (MLPs are sensitive to feature scale), using
# statistics learned on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 3. Base MLP estimator with a generous iteration cap and a fixed seed.
mlp = MLPClassifier(max_iter=4000, random_state=42)

# 4. Hyper-parameter tuning.
# Enumerate every architecture with 1, 2 or 3 hidden layers whose widths are
# drawn from neuron_options.
neuron_options = [16, 32, 64, 128]
hidden_layer_sizes = [
    combo
    for n_layers in range(1, 4)
    for combo in itertools.product(neuron_options, repeat=n_layers)
]

# The `learning_rate` schedule is only used by the 'sgd' solver, so it is
# searched for 'sgd' alone. Crossing it with 'adam' / 'lbfgs' would refit
# identical models twice (3360 candidates instead of 5040).
shared_grid = {
    'hidden_layer_sizes': hidden_layer_sizes,
    'activation': ['relu', 'tanh'],
    'alpha': [1e-5, 0.0001, 0.001, 0.01, 0.1]
}
param_grid = [
    {**shared_grid, 'solver': ['adam', 'lbfgs']},
    {**shared_grid, 'solver': ['sgd'], 'learning_rate': ['constant', 'adaptive']},
]

# 5-fold cross-validated grid search scored by accuracy, using all cores.
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

# Best refitted model.
best_mlp = grid_search.best_estimator_

# Note: 'learning_rate' appears in best_params_ only when the sgd solver wins.
print("最佳超参数：", grid_search.best_params_)

# 5. Evaluate on the held-out test split.
y_pred = best_mlp.predict(X_test)

print("分类报告：\n", classification_report(y_test, y_pred))
print("混淆矩阵：\n", confusion_matrix(y_test, y_pred))
print("准确率：", accuracy_score(y_test, y_pred))
