In [3]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_array, check_X_y

In [18]:
from scipy import stats
import pandas as pd


df = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\オンラインデータ\NHANES_age_prediction.csv"
)
df = df.drop(columns=["SEQN", "age_group"])

obj = "BMXBMI"
features_list = [
    "RIDAGEYR",  # 年齢（連続変数）
    "RIAGENDR",  # 性別（1:Male, 2:Female)
    "PAQ605",  # 運動有無(1:日常的に運動する, 2:運動しない)
    "LBXGLU",  # 断食後の血糖値（連続変数）
    "DIQ010",  # 糖尿病の有無(0:なし、1:あり)
    "LBXGLT",  # 口内の健康状態（連続変数）
    "LBXIN",  # 血中インスリン濃度（連続変数）
]


# 外れ値の除去
def remove_outliers_zscore(data, metric, threshold=2):
    z_scores = np.abs(stats.zscore(data[metric]))
    data = data[(z_scores < threshold)]
    return data


df = remove_outliers_zscore(df, obj)

# process_features
from sklearn.preprocessing import StandardScaler


X = df[features_list]
# 数値列の標準化
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)

X_scaled = pd.DataFrame(scaled_features, columns=features_list)
y = df[obj]  # 目的変数

# 行を詰める
df = df.reset_index(drop=True)

print(df)

      RIDAGEYR  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN
0         61.0       2.0     2.0    35.7   110.0     2.0   150.0  14.91
1         26.0       2.0     2.0    20.3    89.0     2.0    80.0   3.85
2         16.0       1.0     2.0    23.2    89.0     2.0    68.0   6.14
3         32.0       1.0     2.0    28.9   104.0     2.0    84.0  16.15
4         38.0       2.0     1.0    35.9   103.0     2.0    81.0  10.92
...        ...       ...     ...     ...     ...     ...     ...    ...
2165      38.0       2.0     2.0    33.5   100.0     2.0    73.0   6.53
2166      61.0       1.0     2.0    30.0    93.0     2.0   208.0  13.02
2167      34.0       1.0     2.0    23.7   103.0     2.0   124.0  21.41
2168      60.0       2.0     2.0    27.4    90.0     2.0   108.0   4.99
2169      26.0       1.0     2.0    24.5   108.0     2.0   108.0   3.76

[2170 rows x 8 columns]


In [51]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.mixture import GaussianMixture
from sklearn.utils import check_X_y, check_array
import numpy as np


class Wrapper(BaseEstimator, TransformerMixin):
    def __init__(self, n_features_to_select):
        self.n_features_to_select = n_features_to_select  # 選択する特徴量の数

    def FSSEM(self, X, y):
        X, y = check_X_y(X, y)  # データの整合性確認

        self.n_features_ = X.shape[1]  # 特徴量の総数
        self.selected_features_ = []  # 特徴量サブセット初期化

        # 現在選択されている特徴量のインデックスを格納する用
        current_features = []
        # 選択されていない特徴量のインデックスを格納する用
        remaining_features = list(range(X.shape[1]))

        # SFS
        while len(current_features) < self.n_features_to_select:
            best_score = -np.inf  # 最良スコアの初期化（マイナス無限大）
            best_feature = None

            # 追加すべき特徴量を1つ決める
            for feature in remaining_features:
                temp_features = current_features + [
                    feature
                ]  # 現在の特徴量リストに新しい特徴量を加えた仮のリストを作成
                print("X:", X[:, temp_features])
                score = self.evaluate_subset(
                    X[:, temp_features], y
                )  # 仮のリストに基づいて評価
                print(score)
                if score > best_score:  # 評価スコアが最良スコアよりも大きいときは更新
                    best_score = score
                    best_feature = feature

            # current_featuresとremaining_featuresの更新
            if best_feature is not None:
                current_features.append(best_feature)
                remaining_features.remove(best_feature)
                self.selected_features_ = current_features
            else:
                break

        return self

    def transform(self, X):
        check_array(X)  # Xを検証
        return X[:, self.selected_features_]  # 選択された特徴量だけを抽出して返す

    def evaluate_subset(self, X_subset, y):  # EMクラスタリング
        n_clusters = 3

        gmm = GaussianMixture(
            n_components=n_clusters, random_state=42
        )  # n_componensはクラスタ数
        gmm.fit(X_subset)  # ガウスモデルをX_subsetデータで訓練
        labels = gmm.predict(X_subset)  # 各データが属するクラスタのインデックス
        print("labels:", labels)
        responsibilities = gmm.predict_proba(
            X_subset
        )  # (d, k)各データが各クラスタに属する確率
        print("respon:", responsibilities)
        # 各データポイントに対するクラスタ確率の平均を計算

        # 各データポイントに対するクラスタ確率の平均を計算
        mean_responsibilities = responsibilities.mean(axis=0)
        print("mean:", mean_responsibilities)

        covariances = gmm.covariances_  # covariances[j]はクラスタjに対応する共分散行列
        print("covariances", covariances)
        # クラスごとの平均を計算
        class_means = np.array(
            [X_subset[labels == label].mean(axis=0) for label in np.unique(labels)]
        )
        print("class_means", class_means)
        # 全体の平均を計算
        overall_mean = X_subset.mean(axis=0)
        print("overallmean", overall_mean)

        # 各クラスタの共分散行列の計算
        S_W = np.sum(
            [mean_responsibilities[j] * covariances[j] for j in range(n_clusters)],
            axis=0,
        )
        print("Sw:", S_W)

        # クラスタ間散布行列 (S_B) を計算するためのベクトル化
        # (X - class_means[j]) の計算を一度に行うために、np.newaxis を使用
        deviations = [class_means[j] - overall_mean for j in range(n_clusters)]
        responsibility_matrix = responsibilities[
            :, :, np.newaxis
        ]  # 各データポイントに対するクラスタ確率

        # クラスタ間散布行列 (S_B) の計算
        S_B = np.sum(
            [
                responsibility_matrix[:, j, np.newaxis]
                * (deviations[j] @ deviations[j].T)
                for j in range(n_clusters)
            ],
            axis=0,
        )
        print("Sb:", S_B)
        # 散乱識別力
        scatter_discriminability = np.trace(np.linalg.inv(S_W).dot(S_B))
        return scatter_discriminability

    def get_feature_names_out(self):
        # 選択された特徴量のインデックスを返す
        return np.array(self.selected_features_)


In [52]:
# 特徴量選択のためのインスタンスを作成
n_features_to_select = 3  # 選択したい特徴量の数
fssem = Wrapper(n_features_to_select=n_features_to_select)

# FSSEM
fssem.FSSEM(X_scaled, y)  # 選択された特徴量
selected_features = fssem.get_feature_names_out()
print(f"Selected features indices: {selected_features}")


X: [[ 0.94929614]
 [-0.77321245]
 [-1.26535776]
 ...
 [-0.3794962 ]
 [ 0.90008161]
 [-0.77321245]]
labels: [1 2 0 ... 2 1 2]
respon: [[1.79750335e-25 8.50095223e-01 1.49904777e-01]
 [2.75245447e-01 3.17439495e-04 7.24437113e-01]
 [9.47385203e-01 1.21630303e-06 5.26135811e-02]
 ...
 [4.21398360e-04 4.25482820e-03 9.95323773e-01]
 [2.50166461e-24 8.14983510e-01 1.85016490e-01]
 [2.75245447e-01 3.17439495e-04 7.24437113e-01]]
mean: [0.25473293 0.30228542 0.44298165]
covariances [[[0.03979333]]

 [[0.23394824]]

 [[0.26042752]]]
class_means [[-1.20781462]
 [ 1.25019485]
 [-0.11197105]]
overallmean [1.5144056e-16]
Sw: [[0.19622043]]
Sb: [[[1.33056735]]

 [[0.4111113 ]]

 [[1.38272237]]

 ...

 [[0.01974387]]

 [[1.27612841]]

 [[0.4111113 ]]]
[6.78098285]
X: [[ 0.99907877]
 [ 0.99907877]
 [-1.00092208]
 ...
 [-1.00092208]
 [ 0.99907877]
 [-1.00092208]]
labels: [1 1 0 ... 0 1 0]
respon: [[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]
mean: [0.49953917 0.50046083

  return fit_method(estimator, *args, **kwargs)


IndexError: index 2 is out of bounds for axis 0 with size 2