In [1]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_array, check_X_y

import japanize_matplotlib  # noqa
import matplotlib.pyplot as plt
import seaborn as sns  # データ可視化ライブラリ
from lightning.pytorch import seed_everything

from sklearn.decomposition import PCA
from scipy import stats
import pandas as pd
from sklearn.cluster import KMeans
from abc import ABCMeta, abstractmethod

from numpy.typing import NDArray

from typing import Optional

plt.style.use("ggplot")
seed_everything(8)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\HaruMomozu\Desktop\momozu\ABtesting\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\HaruMomozu\Desktop\momozu\ABtesting\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\HaruMomozu\Desktop\momozu\ABtesting\.venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in

8

# データの前処理

In [2]:
# 外れ値の除去
def remove_outliers_zscore(data, metric, threshold=2):
    z_scores = np.abs(stats.zscore(data[metric]))
    data = data[(z_scores < threshold)]
    return data

In [3]:
df1 = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\オンラインデータ\NHANES_age_prediction.csv"
)
df1 = df1.drop(columns=["SEQN", "age_group"])

obj1 = "BMXBMI"
features_list1 = [
    "RIDAGEYR",  # 年齢（連続変数）
    "RIAGENDR",  # 性別（1:Male, 2:Female)
    "PAQ605",  # 運動有無(1:日常的に運動する, 2:運動しない)
    "LBXGLU",  # 断食後の血糖値（連続変数）
    "DIQ010",  # 糖尿病の有無(0:なし、1:あり)
    "LBXGLT",  # 口内の健康状態（連続変数）
    "LBXIN",  # 血中インスリン濃度（連続変数）
]
df1 = df1[df1["PAQ605"] != 7.0]

In [68]:
df2 = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\オンラインデータ\OnlineNewsPopularity\OnlinenewsPopularity.csv"
)
df2 = df2.drop(columns=["url"])
df2 = df2.drop(columns=[" timedelta"])

obj2 = " shares"
features_list2 = [col for col in list(df2.columns) if col != " shares"]

In [None]:
df3 = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\オンラインデータ\USCensus1990.data.txt",
    delimiter=",",
)

obj3 = "iFertil"
features_list3 = [col for col in list(df3.columns) if col != obj3]
features_list3_20 = features_list3[:20]
print(features_list3_20)

In [None]:
df4 = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\aug_first_cpn_data_for_ab_test_sensibility_tsukuba.csv"
)

obj4 = "GMV"
features_list4 = [
    "hist_4_day_buy_num",
    "hist_4_day_gmv",
    "his_4_day_is_buy",
    "hist_30_day_buy_days",
    "hist_30_day_buy_num",
    "hist_30_day_gmv",
    "hist_30_day_buy_recency",
    "hist_30_day_pay_days",
    "hist_30_day_atpu",
    "hist_30_day_gpv",
    "hist_30_day_pay_recency",
    "hist_30_day_list_days",
    "hist_30_day_list_num",
    "hist_30_day_list_recency",
    "hist_30_day_like_count",
    "hist_30_day_like_count_not_deleted",
    "hist_30_day_like_recency",
]

In [74]:
df = df1  # 選ぶ
obj = obj1  # 選ぶ
features_list = features_list1  # 選ぶ

df = remove_outliers_zscore(df, obj)

X = df[features_list]
# 数値列の標準化
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_features, columns=features_list)

y = df[obj]  # 目的変数

# 行を詰める
df = df.reset_index(drop=True)

# FSSEM でクラスタリング

Wrapperクラス

In [78]:
class Wrapper(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        n_features_to_select,
        n_clusters,
        criterion="ml",
        clustering_method="gmm",
        random_state=None,
    ):
        self.n_features_to_select = n_features_to_select  # 特徴量数
        self.n_clusters = n_clusters  # クラスタ数
        self.criterion = criterion  # 特徴量選択基準
        self.clustering_method = clustering_method  # クラスタリング手法
        self.random_state = random_state

    def FSS(self, X, y):
        X, y = check_X_y(X, y)

        n_features = X.shape[1]  # 総特徴量数
        self.selected_features_ = []  # ここに選択した特徴量を入れる

        # 選ばれた特徴量と残っている特徴量の初期化
        current_features = []
        remaining_features = list(range(n_features))

        # 特徴量サブセットのスコアを格納するための辞書（余計な計算を避けるため）
        cluster_cache = {}

        while len(current_features) < self.n_features_to_select:
            # print(current_features)
            best_score = -np.inf  # best score初期化（-∞）
            best_feature = None  # 選ぶ特徴量の初期化

            for feature in remaining_features:
                temp_features = tuple(
                    current_features + [feature]
                )  # 特徴量をひとつ加え、score計算

                if temp_features in cluster_cache:
                    score = cluster_cache[temp_features]
                else:
                    score = self.evaluate_subset(X[:, temp_features], y)
                    cluster_cache[temp_features] = score

                if score > best_score:
                    best_score = score
                    best_feature = feature

            if best_feature is not None:
                current_features.append(
                    best_feature
                )  # best feature をcurrent features に追加
                remaining_features.remove(
                    best_feature
                )  # best feature をremaining features から取り除く
                self.selected_features_ = current_features
            else:
                break

        # 選ばれた特徴量サブセットでクラスタリング
        final_features = X[:, self.selected_features_]
        if self.clustering_method == "gmm":
            self.final_model_ = GaussianMixture(
                n_components=self.n_clusters, random_state=self.random_state
            )
        elif self.clustering_method == "kmeans":
            self.final_model_ = KMeans(
                n_clusters=self.n_clusters, random_state=self.random_state
            )
        else:
            raise ValueError(f"Unknown clustering method: {self.clustering_method}")

        self.final_model_.fit(final_features)
        self.final_cluster_assignments_ = self.final_model_.predict(final_features)

        return self

    def evaluate_subset(self, X_subset, y):  # 特徴量サブセットの評価
        if self.clustering_method == "gmm":
            return self.evaluate_gmm(X_subset, y)
        elif self.clustering_method == "kmeans":
            return self.evaluate_kmeans(X_subset, y)
        else:
            raise ValueError(f"Unknown clustering method: {self.clustering_method}")

    def evaluate_gmm(self, X_subset, y):  # EMクラスタリングのときの評価基準
        gmm = GaussianMixture(
            n_components=self.n_clusters, random_state=self.random_state
        )
        gmm.fit(X_subset)

        if self.criterion == "scatter":
            return self.scatter_discriminability_gmm(gmm, X_subset)
        elif self.criterion == "ml":
            return gmm.score(X_subset)
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

    def evaluate_kmeans(self, X_subset, y):  # kmeansのときの評価基準
        kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        kmeans.fit(X_subset)

        if self.criterion == "scatter":
            return self.scatter_discriminability_kmeans(kmeans, X_subset)
        elif self.criterion == "ml":
            return -kmeans.score(X_subset)
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

    def scatter_discriminability_gmm(self, gmm, X_subset):  # EMのときの散乱分離性
        means = gmm.means_  # 平均ベクトル
        covariances = gmm.covariances_  # 共分散行列
        weights = gmm.weights_  # 混合比率
        overall_mean = np.sum(
            weights[:, np.newaxis] * means, axis=0
        )  # 標本平均 #np.newaxisを使って1次元配列から2次元配列にする

        S_W = np.sum(weights[:, np.newaxis, np.newaxis] * covariances, axis=0)
        S_B = np.sum(
            weights[:, np.newaxis, np.newaxis]
            * np.einsum("...i,...j->...ij", means - overall_mean, means - overall_mean),
            axis=0,
        )
        scatter_discriminability = np.trace(np.linalg.solve(S_W, S_B))
        return scatter_discriminability

    def scatter_discriminability_kmeans(
        self, kmeans, X_subset
    ):  # kmeansのときの散乱分離性
        labels = kmeans.labels_
        cluster_centers = kmeans.cluster_centers_

        sw_i_list = []
        for i in range(self.n_clusters):
            cluster_points = X_subset[labels == i]

            if cluster_points.shape[0] == 1:
                # データポイントが1つの場合はゼロ行列を使用
                sw_i = np.zeros((X_subset.shape[1], X_subset.shape[1]))
            else:
                # 共分散行列を計算し、スカラー値ではなく2次元行列になることを保証
                sw_i = np.cov(cluster_points, rowvar=False) * np.sum(labels == i)
                if np.isscalar(sw_i):  # スカラー値のとき
                    sw_i = np.array([[sw_i]])
            sw_i_list.append(sw_i)

        # 全クラスターの S_W を合計
        S_W = np.sum(sw_i_list, axis=0)

        # クラスター間散布行列 S_B を計算
        overall_mean = np.mean(X_subset, axis=0)
        S_B = sum(
            np.sum(labels == i)
            * np.outer(
                cluster_centers[i] - overall_mean, cluster_centers[i] - overall_mean
            )
            for i in range(self.n_clusters)
        )

        # 散乱分離性を計算
        scatter_discriminability = np.trace(np.linalg.solve(S_W, S_B))
        return scatter_discriminability

    def transform(self, X):
        check_array(X)
        return X[:, self.selected_features_]  # 選択された特徴量のデータをかえす

    def get_feature_index_out(self):
        return np.array(self.selected_features_)  # 選択された特徴量のインデックス

    def get_final_cluster_assignments(self):
        return self.final_cluster_assignments_  # 最終的なクラスタリング結果


Wrapperクラス確認

In [79]:
# import warnings
# from sklearn.exceptions import ConvergenceWarning

# warnings.filterwarnings("ignore", category=ConvergenceWarning)

clusters = 5
n_features_to_select = 5  # 選択したい特徴量の数

fssem_tr = Wrapper(
    n_features_to_select=n_features_to_select,
    n_clusters=clusters,
    criterion="scatter",
    clustering_method="gmm",
    random_state=0,
)
fssem_ml = Wrapper(
    n_features_to_select=n_features_to_select,
    n_clusters=clusters,
    criterion="ml",
    clustering_method="gmm",
    random_state=0,
)
fsskmeans_tr = Wrapper(
    n_features_to_select=n_features_to_select,
    n_clusters=clusters,
    criterion="scatter",
    clustering_method="kmeans",
    random_state=0,
)
fsskmeans_ml = Wrapper(
    n_features_to_select=n_features_to_select,
    n_clusters=clusters,
    criterion="ml",
    clustering_method="kmeans",
    random_state=0,
)
instance_list = [fssem_tr, fssem_ml, fsskmeans_tr, fsskmeans_ml]

In [80]:
fsskmeans_tr.FSS(X_scaled, y)  # 選択された特徴量
selected_features = fsskmeans_tr.get_feature_index_out()
print(f"Selected features indices: {selected_features}")

print("選択された特徴量")
for n in selected_features:
    print(X_scaled.columns[n])

FSSEM_cluster = fsskmeans_tr.get_final_cluster_assignments()
cluster_size = np.unique(FSSEM_cluster, return_counts=True)

print(FSSEM_cluster)
print(cluster_size)


[[84.20047893]]
[[nan]]
[[nan]]
[[278.88507961]]
[[nan]]
[[238.39349393]]
[[179.88513473]]
[[4.09910731e+02 8.60299566e-27]
 [8.60299566e-27 1.51269876e-25]]
[[2.76837345e+02 5.11708874e-28]
 [5.11708874e-28 4.81245081e-26]]
[[300.53759787 -19.63865038]
 [-19.63865038 605.02361222]]
[[ 2.76762900e+02 -4.38810170e-28]
 [-4.38810170e-28  1.62208178e-27]]
[[426.92618694 -77.89130051]
 [-77.89130051 634.62192432]]
[[417.10988249 -77.01605036]
 [-77.01605036 610.48616593]]
[[ 5.88679877e+02 -7.88565835e+00 -7.16665205e+00]
 [-7.88565835e+00  5.94985871e+02  2.81420666e-28]
 [-7.16665205e+00  2.81420666e-28  5.17648049e+01]]
[[ 7.52462796e+02 -2.97362384e-28  1.91065355e+00]
 [-2.97362384e-28  1.67427705e-27 -2.18473301e-27]
 [ 1.91065355e+00 -2.18473301e-27  8.27735754e+01]]
[[ 3.20927745e+02 -3.69059418e-01 -7.29271624e+01]
 [-3.69059418e-01  6.11849384e+02 -1.78513370e+01]
 [-7.29271624e+01 -1.78513370e+01  8.85013322e+02]]


  return fit_method(estimator, *args, **kwargs)
  avg = a.mean(axis, **keepdims_kw)
  ret = um.true_divide(
  sw_i = np.cov(cluster_points, rowvar=False) * np.sum(labels == i)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  return fit_method(estimator, *args, **kwargs)
  avg = a.mean(axis, **keepdims_kw)
  ret = um.true_divide(
  sw_i = np.cov(cluster_points, rowvar=False) * np.sum(labels == i)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  return fit_method(estimator, *args, **kwargs)
  avg = a.mean(axis, **keepdims_kw)
  ret = um.true_divide(
  sw_i = np.cov(cluster_points, rowvar=False) * np.sum(labels == i)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


[[ 6.34924784e+02  1.19303716e-27 -4.42777687e+00]
 [ 1.19303716e-27  1.78878283e-27 -5.67384244e-30]
 [-4.42777687e+00 -5.67384244e-30  1.08508409e+03]]
[[393.71492531  26.02393158 -48.66057993]
 [ 26.02393158 587.82426197 -14.15174778]
 [-48.66057993 -14.15174778 974.97434933]]
[[ 1.42108063e+03 -6.36553262e+01  3.29450412e+00 -1.52199698e+01]
 [-6.36553262e+01  5.98866779e+02 -6.91674624e-28 -1.52090180e+01]
 [ 3.29450412e+00 -6.91674624e-28  5.59536748e+01  6.54206657e+00]
 [-1.52199698e+01 -1.52090180e+01  6.54206657e+00  3.84971792e+02]]
[[ 7.69922565e+02 -5.83475261e+01  3.74517216e+00  7.88555533e+01]
 [-5.83475261e+01  6.18146652e+02 -1.27505769e-27 -3.91533714e+01]
 [ 3.74517216e+00 -1.27505769e-27  5.57138734e+01  1.47856406e+00]
 [ 7.88555533e+01 -3.91533714e+01  1.47856406e+00  9.42316540e+02]]
[[ 783.16853399   -4.96776461    2.26324736    6.98846373]
 [  -4.96776461  523.07872318   38.27679289   -4.50107583]
 [   2.26324736   38.27679289  106.46539053   13.06995926]
 [  

FSSEM のクラスタリング結果

In [None]:
X_with_clusterlabelrow = X_scaled.copy()
X_with_clusterlabelrow["FSSEM_cluster"] = FSSEM_cluster


# 目的変数の分布
sns.boxplot(  # 箱ひげ図
    x=X_with_clusterlabelrow["FSSEM_cluster"].astype(
        str
    ),  # x軸、as.type(str)で文字列に変換
    y=y,  # y軸
    hue=X_with_clusterlabelrow["FSSEM_cluster"].astype(
        str
    ),  # クラスタラベルに基づいて色を付ける
    palette="tab10",  # カラーパレット指定
    legend=False,
)
plt.title("クラスタリング結果（FSSEM）", fontdict={"fontsize": "large"})
plt.xlabel("クラスタラベル")
plt.ylabel("y")