In [3]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_array, check_X_y

In [9]:
from scipy import stats
import pandas as pd


df = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\オンラインデータ\NHANES_age_prediction.csv"
)
df = df.drop(columns=["SEQN", "age_group"])

obj = "BMXBMI"
features_list = [
    "RIDAGEYR",  # 年齢（連続変数）
    "RIAGENDR",  # 性別（1:Male, 2:Female)
    "PAQ605",  # 運動有無(1:日常的に運動する, 2:運動しない)
    "LBXGLU",  # 断食後の血糖値（連続変数）
    "DIQ010",  # 糖尿病の有無(0:なし、1:あり)
    "LBXGLT",  # 口内の健康状態（連続変数）
    "LBXIN",  # 血中インスリン濃度（連続変数）
]


# 外れ値の除去
def remove_outliers_zscore(data, metric, threshold=2):
    z_scores = np.abs(stats.zscore(data[metric]))
    data = data[(z_scores < threshold)]
    return data


df = remove_outliers_zscore(df, obj)

# process_features
from sklearn.preprocessing import StandardScaler


X = df[features_list]
# 数値列の標準化
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)

X_scaled = pd.DataFrame(scaled_features, columns=features_list)
y = df[obj]  # 目的変数

# 行を詰める
df = df.reset_index(drop=True)

print(df)


      RIDAGEYR  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN
0         61.0       2.0     2.0    35.7   110.0     2.0   150.0  14.91
1         26.0       2.0     2.0    20.3    89.0     2.0    80.0   3.85
2         16.0       1.0     2.0    23.2    89.0     2.0    68.0   6.14
3         32.0       1.0     2.0    28.9   104.0     2.0    84.0  16.15
4         38.0       2.0     1.0    35.9   103.0     2.0    81.0  10.92
...        ...       ...     ...     ...     ...     ...     ...    ...
2165      38.0       2.0     2.0    33.5   100.0     2.0    73.0   6.53
2166      61.0       1.0     2.0    30.0    93.0     2.0   208.0  13.02
2167      34.0       1.0     2.0    23.7   103.0     2.0   124.0  21.41
2168      60.0       2.0     2.0    27.4    90.0     2.0   108.0   4.99
2169      26.0       1.0     2.0    24.5   108.0     2.0   108.0   3.76

[2170 rows x 8 columns]


In [16]:
class SFS(BaseEstimator, TransformerMixin):
    def __init__(self, n_features_to_select):
        self.n_features_to_select = n_features_to_select  # 選択する特徴量の数

    def fit(self, X, y):
        X, y = check_X_y(X, y)  # データの整合性確認
        self.n_features_ = X.shape[1]  # 特徴量の総数
        self.selected_features_ = []  # 特徴量サブセット初期化

        # Initialize the list of selected features
        current_features = []  # 現在選択されている特徴量を格納する用（ここに良い特徴量を追加していく）
        remaining_features = list(
            range(X.shape[1])
        )  # 選択されていない特徴量を格納する用（ここから良い特徴量を削除していく）

        while len(current_features) < self.n_features_to_select:
            best_score = -np.inf
            best_feature = None

            for feature in remaining_features:
                temp_features = current_features + [feature]
                score = self.evaluate_subset(X[:, temp_features], y)
                if score > best_score:
                    best_score = score
                    best_feature = feature

            if best_feature is not None:
                current_features.append(best_feature)
                remaining_features.remove(best_feature)
                self.selected_features_ = current_features
            else:
                break

        return self

    def transform(self, X):
        check_array(X)  # Xを検証
        return X[:, self.selected_features_]  # 選択された特徴量だけを抽出して返す

    def evaluate_subset(self, X_subset, y):
        # EMクラスタリング
        gmm = GaussianMixture(n_components=len(np.unique(y)), random_state=42)
        gmm.fit(X_subset)
        labels = gmm.predict(X_subset)

        # クラスごとの平均を計算
        class_means = np.array(
            [X_subset[y == label].mean(axis=0) for label in np.unique(y)]
        )
        overall_mean = X_subset.mean(axis=0)

        # クラス内散乱行列 (S_W)
        S_W = np.sum(
            [
                np.cov(X_subset[y == label].T, bias=True) * (np.sum(y == label) - 1)
                for label in np.unique(y)
            ],
            axis=0,
        )

        # クラス間散乱行列 (S_B)
        S_B = np.sum(
            [
                (
                    np.sum(y == label)
                    * np.outer(mean - overall_mean, mean - overall_mean)
                )
                for label, mean in zip(np.unique(y), class_means)
            ],
            axis=0,
        )

        # Debugging: Print shapes
        print(f"Shape of S_W: {S_W.shape}")
        print(f"Shape of S_B: {S_B.shape}")

        # Scatter Discriminability
        scatter_discriminability = np.trace(np.linalg.inv(S_W).dot(S_B))
        print(f"Shape of tr: {scatter_discriminability.shape}")
        return scatter_discriminability

In [17]:
# データの分割
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# SFSで特徴量選択
sfs = SFS(n_features_to_select=2)
sfs.fit(X_train, y_train)

# 訓練データとテストデータを選択した特徴だけで変換
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)

# 選択した特徴量を用いてSVMモデルを訓練
from sklearn.svm import SVC

classifier = SVC()
classifier.fit(X_train_selected, y_train)

# 予測と評価
y_pred = classifier.predict(X_test_selected)
score = f1_score(y_test, y_pred, average="weighted")

print(f"Selected Features: {sfs.selected_features_}")
print(f"F1 Score: {score}")


Shape of S_W: ()
Shape of S_B: (1, 1)


  return fit_method(estimator, *args, **kwargs)


LinAlgError: 0-dimensional array given. Array must be at least two-dimensional