In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# EMアルゴリズム
def em_algorithm(predicted_labels, num_classes, num_iterations=100, tol=1e-4, random_seed=0):
    num_instances, num_classifiers = predicted_labels.shape
    np.random.seed(random_seed)
    Pr = np.random.dirichlet(np.ones(num_classes))
    Pi = np.random.dirichlet(np.ones(num_classes), size=(num_classifiers, num_classes))
    log_likelihoods = []

    for iteration in range(num_iterations):
        likelihood = np.zeros((num_instances, num_classes))
        for k in range(num_classes):
            prod_terms = np.ones(num_instances)
            for j in range(num_classifiers):
                prod_terms *= Pi[j, k, predicted_labels[:, j].astype(int)]
            likelihood[:, k] = Pr[k] * prod_terms
        likelihood_sum = likelihood.sum(axis=1, keepdims=True)
        likelihood /= likelihood_sum

        log_likelihood = np.sum(np.log(np.maximum(likelihood_sum, 1e-10)))
        log_likelihoods.append(log_likelihood)

        if iteration > 0 and np.abs(log_likelihoods[-1] - log_likelihoods[-2]) < tol:
            print("収束しました。")
            break
        
        Pr = likelihood.mean(axis=0)
        for j in range(num_classifiers):
            for k in range(num_classes):
                Pi[j, k, :] = np.dot(likelihood[:, k], (predicted_labels[:, j] == np.arange(num_classes).reshape(-1, 1)).T)
                Pi[j, k, :] /= np.maximum(likelihood[:, k].sum(), 1e-10)
    
    return likelihood.argmax(axis=1), log_likelihoods

# 第一段階のフィルタリング
def first_stage_filtering(X, integrated_labels, num_folds=7):
    kf = KFold(n_splits=num_folds)
    high_quality_datasets = []
    suspected_datasets = []
    models = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = integrated_labels[train_index], integrated_labels[test_index]

        model = AdaBoostClassifier(n_estimators=10, algorithm='SAMME.R')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        models.append(model)

        high_quality_set = test_index[y_pred == y_test]
        suspected_set = test_index[y_pred != y_test]

        high_quality_datasets.append(high_quality_set)
        suspected_datasets.append(suspected_set)

    high_quality_final = np.concatenate(high_quality_datasets)
    suspected_final = np.concatenate(suspected_datasets)

    return high_quality_final, suspected_final, models

# 第二段階のフィルタリング
def second_stage_filtering(X, integrated_labels, high_quality_final, suspected_final, models, alpha=0.7):
    M = len(models)
    Si = np.array([np.sum([integrated_labels[idx] != integrated_labels[model.predict(X[idx].reshape(1, -1))[0]] for model in models]) for idx in suspected_final])
    prob_product = np.array([np.prod([model.predict_proba(X[idx].reshape(1, -1))[0][integrated_labels[idx]] for model in models]) for idx in suspected_final])
    denominator = np.sum(prob_product)
    Ri = prob_product / np.maximum(denominator, 1e-10) + Si

    sorted_indices = np.argsort(Ri)[::-1]
    top_alpha = int(alpha * len(sorted_indices))
    tough_instances = suspected_final[sorted_indices[:top_alpha]]

    high_quality_final = np.intersect1d(high_quality_final, np.unique(np.concatenate([model.predict(X[high_quality_final]) for model in models])))

    return tough_instances, high_quality_final

# ラベルの更新
def update_labels(X, tough_instances, high_quality_final, integrated_labels):
    high_quality_data = X[high_quality_final]
    high_quality_labels = integrated_labels[high_quality_final]
    correcting_models = []

    for _ in range(10):  # Use 10 models for correction
        model = AdaBoostClassifier(n_estimators=50, algorithm='SAMME.R')
        model.fit(high_quality_data, high_quality_labels)
        correcting_models.append(model)

    corrected_labels = np.copy(integrated_labels)
    for idx in tough_instances:
        votes = np.array([model.predict(X[idx].reshape(1, -1))[0] for model in correcting_models])
        if np.bincount(votes).max() != len(votes):  # すべてのモデルが同じラベルを予測しない場合にのみ修正
            corrected_labels[idx] = np.argmax(np.bincount(votes))

    return corrected_labels

# 再度EMアルゴリズムを適用
def reapply_em_algorithm(X, corrected_labels, tough_instances, models):
    tough_labels = np.zeros((len(tough_instances), len(models)), dtype=int)
    for i, model in enumerate(models):
        tough_labels[:, i] = model.predict(X[tough_instances])
    
    new_labels, _ = em_algorithm(tough_labels, num_classes=2)
    corrected_labels[tough_instances] = new_labels
    
    return corrected_labels

# 5回の7-foldクロスバリデーションを実行
def repeated_kfold_cross_validation(X, integrated_labels, num_repeats=5, num_folds=7):
    all_high_quality = []
    all_suspected = []
    all_models = []

    for _ in range(num_repeats):
        high_quality_final, suspected_final, models = first_stage_filtering(X, integrated_labels, num_folds)
        all_high_quality.append(high_quality_final)
        all_suspected.append(suspected_final)
        all_models.extend(models)

    high_quality_final = np.concatenate(all_high_quality)
    suspected_final = np.concatenate(all_suspected)

    return high_quality_final, suspected_final, all_models

# MNISTデータセットのロードと前処理
mnist = fetch_openml('mnist_784')
X, y = mnist.data, mnist.target.astype(int)

# データの標準化
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 2クラス分類に変換
y = np.where(y % 2 == 0, 0, 1)

# データをトレーニングセットとテストセットに分割
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# 初期ラベルの予測 (DecisionTreeClassifierを使用)
num_classifiers = 3
predicted_labels = np.zeros((X_train.shape[0], num_classifiers), dtype=int)

# 基本モデルとしてDecisionTreeを使用
models = []
for i in range(num_classifiers):
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    predicted_labels[:, i] = dt.predict(X_train)
    models.append(dt)

# テストデータの分類
test_predictions = np.zeros((X_test.shape[0], num_classifiers), dtype=int)
for i in range(num_classifiers):
    test_predictions[:, i] = models[i].predict(X_test)

# テストデータの統合ラベル推定
test_integrated_labels, _ = em_algorithm(test_predictions, num_classes=2, random_seed=2022)

# 複数の初期化方法を試す
best_seed = 2022
alpha = 0.7  # 一つのα値のみを試す

print(f"使用するシード: {best_seed}")
integrated_labels, log_likelihoods = em_algorithm(predicted_labels, num_classes=2, random_seed=best_seed)

high_quality_final, suspected_final, models = repeated_kfold_cross_validation(X_train, integrated_labels)

print(f"\nAlpha: {alpha}")
tough_instances, high_quality_final = second_stage_filtering(X_train, integrated_labels, high_quality_final, suspected_final, models, alpha=alpha)
corrected_labels = update_labels(X_train, tough_instances, high_quality_final, integrated_labels)
corrected_labels = reapply_em_algorithm(X_train, corrected_labels, tough_instances, models)

integrated_accuracy = accuracy_score(y_train, integrated_labels)
corrected_accuracy = accuracy_score(y_train, corrected_labels)

print("統合ラベルの評価:")
print(classification_report(y_train, integrated_labels, digits=4))
print("修正後のラベルの評価:")
print(classification_report(y_train, corrected_labels, digits=4))

# 修正されたインスタンスラベルの数を出力
num_corrected_labels = np.sum(corrected_labels != integrated_labels)
print(f"修正されたインスタンスラベルの数: {num_corrected_labels}")

# 強化された統合ラベルセットの評価
def evaluate_enhanced_labels(X, y, corrected_labels, alpha_values):
    results = []
    for alpha in alpha_values:
        kf = KFold(n_splits=7)
        f1_scores = []
        
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = corrected_labels[train_index], y[test_index]
            
            model = AdaBoostClassifier(n_estimators=50, algorithm='SAMME.R')
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            
            f1 = f1_score(y_test, y_pred, average='weighted')
            f1_scores.append(f1)
        
        results.append({
            'alpha': alpha,
            'f1_score': np.mean(f1_scores)
        })
    
    return pd.DataFrame(results)

# α値の範囲を指定して評価を実行
alpha_values = np.arange(0.1, 1.1, 0.1)
evaluation_results = evaluate_enhanced_labels(X_train, y_train, corrected_labels, alpha_values)

print("強化された統合ラベルセットの評価結果:")
print(evaluation_results)


収束しました。
使用するシード: 2022
収束しました。

Alpha: 0.7
収束しました。
統合ラベルの評価:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     29492
           1     1.0000    1.0000    1.0000     30508

    accuracy                         1.0000     60000
   macro avg     1.0000    1.0000    1.0000     60000
weighted avg     1.0000    1.0000    1.0000     60000

修正後のラベルの評価:
              precision    recall  f1-score   support

           0     0.9804    0.9498    0.9649     29492
           1     0.9529    0.9817    0.9671     30508

    accuracy                         0.9660     60000
   macro avg     0.9667    0.9657    0.9660     60000
weighted avg     0.9664    0.9660    0.9660     60000

修正されたインスタンスラベルの数: 2040
強化された統合ラベルセットの評価結果:
   alpha  f1_score
0    0.1  0.866386
1    0.2  0.866386
2    0.3  0.866386
3    0.4  0.866386
4    0.5  0.866386
5    0.6  0.866386
6    0.7  0.866386
7    0.8  0.866386
8    0.9  0.866386
9    1.0  0.866386
