In [1]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_array, check_X_y

import japanize_matplotlib  # noqa
import matplotlib.pyplot as plt
import seaborn as sns  # データ可視化ライブラリ
from lightning.pytorch import seed_everything

# Apply matplotlib's "ggplot" style sheet to every figure drawn after this point.
plt.style.use("ggplot")
# Fix the random seed (8) through Lightning's seed_everything helper so that
# stochastic steps further down are repeatable between runs.
seed_everything(8)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\HaruMomozu\Desktop\momozu\ABtesting\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\HaruMomozu\Desktop\momozu\ABtesting\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\HaruMomozu\Desktop\momozu\ABtesting\.venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in

8

# データの前処理

In [2]:
from scipy import stats
import pandas as pd


# Load the NHANES age-prediction CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# on a machine where this exact file exists.
df = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\オンラインデータ\NHANES_age_prediction.csv"
)
# Drop the two columns that are not used anywhere below.
df = df.drop(columns=["SEQN", "age_group"])

# Target column (outliers are filtered on it, and it is later extracted as y).
obj = "BMXBMI"
# Explanatory columns that get standardized and passed to the feature selector.
features_list = [
    "RIDAGEYR",  # age (continuous)
    "RIAGENDR",  # sex (1: male, 2: female)
    "PAQ605",  # physical activity (1: exercises regularly, 2: does not exercise)
    "LBXGLU",  # fasting blood glucose (continuous)
    # NOTE(review): the original note said "0: no, 1: yes", but the rows printed
    # by this cell contain 2.0 - presumably the NHANES coding 1: yes / 2: no;
    # confirm against the codebook.
    "DIQ010",  # diabetes status
    # NOTE(review): originally described as "oral health status"; in NHANES this
    # code appears to be the two-hour oral glucose tolerance test value - confirm.
    "LBXGLT",  # continuous
    "LBXIN",  # blood insulin level (continuous)
]


# Outlier removal
def remove_outliers_zscore(data, metric, threshold=2):
    """Return the rows of ``data`` whose ``metric`` value is not a z-score outlier.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to filter. It is not modified; a filtered selection is returned.
    metric : str
        Name of the numeric column the z-scores are computed on.
    threshold : float, default 2
        A row is kept only when ``abs(z) < threshold`` (strict inequality).

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels. Rows whose
        ``metric`` value is missing are dropped as well.
    """
    # Work on a plain float array so missing values are real NaNs.
    values = data[metric].to_numpy(dtype=float, na_value=np.nan)
    # nan_policy="omit" leaves missing values out of the mean/std. With the
    # default ("propagate") a single NaN turns every z-score into NaN, every
    # comparison below becomes False and the whole table is silently emptied.
    z_scores = np.abs(stats.zscore(values, nan_policy="omit"))
    # NaN < threshold is False, so rows with a missing metric are removed too.
    return data[z_scores < threshold]


# Remove BMI outliers, then renumber the rows straight away. Resetting the
# index here (rather than after X and y have been extracted) guarantees that
# df, X, X_scaled and y all share the same 0..n-1 index, so any later
# pandas operation that aligns on the index pairs up the right rows.
df = remove_outliers_zscore(df, obj).reset_index(drop=True)

# process_features
from sklearn.preprocessing import StandardScaler


X = df[features_list]
# Standardize the explanatory columns (zero mean, unit variance).
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)

# Carry X's index over explicitly so X_scaled stays row-aligned with y.
X_scaled = pd.DataFrame(scaled_features, columns=features_list, index=X.index)
y = df[obj]  # target variable

print(df)

      RIDAGEYR  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN
0         61.0       2.0     2.0    35.7   110.0     2.0   150.0  14.91
1         26.0       2.0     2.0    20.3    89.0     2.0    80.0   3.85
2         16.0       1.0     2.0    23.2    89.0     2.0    68.0   6.14
3         32.0       1.0     2.0    28.9   104.0     2.0    84.0  16.15
4         38.0       2.0     1.0    35.9   103.0     2.0    81.0  10.92
...        ...       ...     ...     ...     ...     ...     ...    ...
2165      38.0       2.0     2.0    33.5   100.0     2.0    73.0   6.53
2166      61.0       1.0     2.0    30.0    93.0     2.0   208.0  13.02
2167      34.0       1.0     2.0    23.7   103.0     2.0   124.0  21.41
2168      60.0       2.0     2.0    27.4    90.0     2.0   108.0   4.99
2169      26.0       1.0     2.0    24.5   108.0     2.0   108.0   3.76

[2170 rows x 8 columns]


# Wrapperクラス

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.utils import check_X_y, check_array
import numpy as np


class Wrapper(BaseEstimator, TransformerMixin):
    """Wrapper-style feature selector scored by Gaussian-mixture clustering.

    Features are added one at a time (greedy forward selection). Every
    candidate subset is clustered with a ``GaussianMixture`` whose means are
    seeded from ``KMeans`` centres, and the subset is scored with the scatter
    criterion ``trace(S_W^-1 S_B)``; the candidate with the highest score is
    kept at each step.

    Parameters
    ----------
    n_features_to_select : int
        Number of features to keep. If it exceeds the number of columns in the
        data, selection stops once every column has been picked.
    n_clusters : int
        Number of KMeans clusters / mixture components.
    random_state : int or None, default None
        Passed to both ``KMeans`` and ``GaussianMixture``.

    Attributes
    ----------
    n_features_ : int
        Number of columns seen during fitting.
    selected_features_ : list of int
        Column indices of the chosen features, in the order they were picked.
    final_gmm_ : GaussianMixture
        Mixture model fitted on the selected columns.
    final_cluster_assignments_ : ndarray
        Component label predicted by ``final_gmm_`` for each training row.
    """

    def __init__(self, n_features_to_select, n_clusters, random_state=None):
        self.n_features_to_select = n_features_to_select
        self.n_clusters = n_clusters
        self.random_state = random_state

    def initialize_gmm_with_kmeans(self, X_subset, n_clusters, random_state):
        """Build an unfitted GaussianMixture whose means start at KMeans centres.

        KMeans is fitted on ``X_subset`` here; the returned mixture model still
        has to be fitted by the caller.
        """
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        kmeans.fit(X_subset)

        gmm = GaussianMixture(
            n_components=n_clusters,
            random_state=random_state,
            means_init=kmeans.cluster_centers_,
        )

        return gmm

    def fit(self, X, y=None):
        """Standard scikit-learn entry point; delegates to :meth:`FSSEM`.

        Without a ``fit`` method the ``fit_transform`` inherited from
        ``TransformerMixin`` fails with AttributeError.
        """
        return self.FSSEM(X, y)

    def FSSEM(self, X, y=None):
        """Run the forward selection and fit the final mixture model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,), optional
            Only validated against ``X`` when given; the scoring never reads it.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_features_to_select`` is smaller than 1, or if the input
            fails scikit-learn's validation.
        """
        if y is None:
            X = check_array(X)
        else:
            X, y = check_X_y(X, y)

        if self.n_features_to_select < 1:
            raise ValueError(
                "n_features_to_select must be at least 1, got "
                f"{self.n_features_to_select!r}"
            )

        self.n_features_ = X.shape[1]
        self.selected_features_ = []

        current_features = []
        remaining_features = list(range(X.shape[1]))

        while len(current_features) < self.n_features_to_select:
            best_score = -np.inf
            best_feature = None

            # Try each unused column on top of the ones already chosen.
            for feature in remaining_features:
                temp_features = current_features + [feature]
                score = self.evaluate_subset(X[:, temp_features], y)
                if score > best_score:
                    best_score = score
                    best_feature = feature

            # No candidate left (or none produced a comparable score): stop.
            if best_feature is None:
                break

            current_features.append(best_feature)
            remaining_features.remove(best_feature)
            # Store a copy so the public attribute is not an alias of the
            # working list.
            self.selected_features_ = list(current_features)

        final_features = X[:, self.selected_features_]
        self.final_gmm_ = self.initialize_gmm_with_kmeans(
            final_features, self.n_clusters, self.random_state
        )
        self.final_gmm_.fit(final_features)
        self.final_cluster_assignments_ = self.final_gmm_.predict(final_features)

        return self

    def transform(self, X):
        """Return only the selected columns of ``X`` as a NumPy array.

        Raises ``NotFittedError`` (a subclass of both ValueError and
        AttributeError) when called before fitting.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "selected_features_")
        # Keep the validated array: positional slicing with ``[:, idx]`` does
        # not work on a DataFrame, so the converted ndarray must be used.
        X = check_array(X)
        return X[:, self.selected_features_]

    def evaluate_subset(self, X_subset, y=None):
        """Score a feature subset with ``trace(S_W^-1 S_B)``.

        A mixture model is fitted on ``X_subset``; ``S_W`` is the weight-averaged
        component covariance and ``S_B`` the weighted scatter of the component
        means around their weighted mean. ``y`` is accepted for interface
        compatibility and is not used.
        """
        gmm = self.initialize_gmm_with_kmeans(
            X_subset, self.n_clusters, self.random_state
        )
        gmm.fit(X_subset)

        means = gmm.means_
        covariances = gmm.covariances_
        weights = gmm.weights_
        components = range(self.n_clusters)

        # Weighted mean of the component means.
        overall_mean = np.sum([weights[k] * means[k] for k in components], axis=0)

        # Within-cluster scatter.
        S_W = np.sum([weights[k] * covariances[k] for k in components], axis=0)

        # Between-cluster scatter.
        S_B = np.sum(
            [
                weights[k] * np.outer(means[k] - overall_mean, means[k] - overall_mean)
                for k in components
            ],
            axis=0,
        )

        scatter_discriminability = np.trace(np.linalg.inv(S_W).dot(S_B))
        return scatter_discriminability

    def get_feature_index_out(self):
        """Return the selected column indices as a NumPy array."""
        return np.array(self.selected_features_)

    def get_final_cluster_assignments(self):
        """Return the cluster label assigned to each training row."""
        return self.final_cluster_assignments_


# Wrapperクラスの動作確認

In [8]:
# Create the feature-selection instance.
n_features_to_select = 3  # number of features to select
n_clusters = 3
fssem = Wrapper(
    n_features_to_select=n_features_to_select, n_clusters=n_clusters, random_state=0
)

# FSSEM
fssem.FSSEM(X_scaled, y)  # run the selection on the standardized features
# Column indices (positions in X_scaled) of the features that were chosen.
selected_features = fssem.get_feature_index_out()
print(f"Selected features indices: {selected_features}")
# Cluster label assigned to each row by the final mixture model.
cluster_label = fssem.get_final_cluster_assignments()
print(f"cluster_label:{cluster_label}")
print(len(cluster_label))

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Selected features indices: [1 0 3]
cluster_label:[1 1 0 ... 0 1 0]
2170
