In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.feature_selection import mutual_info_classif
from scipy.spatial.distance import euclidean, hamming

class MFS:
    def __init__(self, num_classifiers=100, subset_size=None, sampling='with_replacement'):
        """
        初始化 MFS 分类器。

        参数：
            num_classifiers (int): 要组合的分类器数量。默认为 100。
            subset_size (int): 每个分类器使用的特征子集的大小。如果为 None，则默认使用原始特征集大小的一半。
            sampling (str):  特征子集的采样方法，可以是 'with_replacement'（有放回采样）或 'without_replacement'（无放回采样）。默认为 'with_replacement'。
        """
        self.num_classifiers = num_classifiers
        self.subset_size = subset_size
        self.sampling = sampling
        self.classifiers = []
        self.feature_subsets = []
        self.discrete_features = []
        self.continuous_features = []

    def fit(self, X, y):
        """
        训练 MFS 分类器，同时处理离散特征和连续特征。

        参数：
            X (array-like, shape (n_samples, n_features)): 训练数据。
            y (array-like, shape (n_samples,)):  训练数据的标签。
        """
        n_samples, n_features = X.shape
        if self.subset_size is None:
            self.subset_size = n_features // 2

        # 区分离散特征和连续特征
        self.discrete_features = [i for i in range(n_features) if isinstance(X[0, i], (int, np.integer))]
        self.continuous_features = list(set(range(n_features)) - set(self.discrete_features))

        for i in range(self.num_classifiers):
            # 随机选择连续特征子集
            if continuous_features:
                if self.sampling == 'with_replacement':
                    continuous_subset = np.random.choice(self.continuous_features, size=min(len(self.continuous_features), self.subset_size), replace=True)
                else:
                    continuous_subset = np.random.choice(self.continuous_features, size=min(len(self.continuous_features), self.subset_size), replace=False)
            else:
                continuous_subset = []

            # 选择离散特征子集 (示例：使用互信息)
            if discrete_features:
                n_discrete_features = min(len(self.discrete_features), self.subset_size - len(continuous_subset))
                discrete_subset = np.argsort(mutual_info_classif(X[:, self.discrete_features], y))[-n_discrete_features:]
            else:
                discrete_subset = []

            # 合并特征子集
            feature_subset = np.concatenate((continuous_subset, discrete_subset)).astype(int)

            self.feature_subsets.append(feature_subset)

            # 使用选定的特征子集训练 NN 分类器 (使用混合距离)
            clf = KNeighborsClassifier(n_neighbors=1, metric=self._mixed_distance)
            clf.fit(X[:, feature_subset], y)
            self.classifiers.append(clf)

    def predict(self, X):
        """
        预测新数据的标签。

        参数：
            X (array-like, shape (n_samples, n_features)): 要预测的数据。

        返回：
            array-like, shape (n_samples,): 预测的标签。
        """
        predictions = []
        for i in range(self.num_classifiers):
            # 使用每个分类器进行预测
            prediction = self.classifiers[i].predict(X[:, self.feature_subsets[i]])
            predictions.append(prediction)

        # 使用简单投票法组合预测结果
        predictions = np.array(predictions).T  # 转置为 (n_samples, n_classifiers)
        final_predictions = []
        for i in range(predictions.shape[0]):
            # 统计每个样本的投票结果
            votes = Counter(predictions[i])
            # 选择票数最多的类别作为最终预测结果
            final_predictions.append(votes.most_common(1)[0][0])
        return np.array(final_predictions)

    def _mixed_distance(self, x1, x2):
        """
        计算混合距离，分别使用汉明距离和欧氏距离。

        参数：
            x1 (array-like, shape (n_features,)):  第一个样本。
            x2 (array-like, shape (n_features,)):  第二个样本。

        返回：
            float: 两个样本之间的距离。
        """
        # 分别提取离散特征和连续特征
        x1_discrete = x1[self.discrete_features]
        x2_discrete = x2[self.discrete_features]
        x1_continuous = x1[self.continuous_features]
        x2_continuous = x2[self.continuous_features]

        # 计算汉明距离和欧氏距离
        if len(self.discrete_features) > 0:
            hamming_dist = hamming(x1_discrete, x2_discrete) 
        else: 
            hamming_dist = 0

        if len(self.continuous_features) > 0:
            euclidean_dist = euclidean(x1_continuous, x2_continuous)
        else:
            euclidean_dist = 0

        # 组合距离 (示例:  简单相加)
        return hamming_dist + euclidean_dist

# 示例用法：
if __name__ == "__main__":
    # 加载数据集
    from sklearn.datasets import load_iris
    iris = load_iris()
    X = iris.data
    y = iris.target

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 创建并训练 MFS 分类器
    mfs = MFS(num_classifiers=100, subset_size=2, sampling='without_replacement')  # 使用 MFS2
    mfs.fit(X_train, y_train)

    # 预测测试集
    y_pred = mfs.predict(X_test)

    # 计算准确率
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

NameError: name 'continuous_features' is not defined

In [7]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import hamming
import random

def custom_distance(x1, x2, selected_features, distance_types):
    distance = 0.0
    for i in selected_features:
        dist_type = distance_types[i]
        if dist_type == 'hamming':
            distance += hamming([x1[i]], [x2[i]]) * 1  # 汉明距离
        elif dist_type == 'euclidean':
            distance += euclidean_distances([[x1[i]]], [[x2[i]]])[0][0]  # 欧式距离
    return distance

class CustomKNN(BaseEstimator, ClassifierMixin):
    def __init__(self, n_neighbors=5, distance_types=None):
        self.n_neighbors = n_neighbors
        self.distance_types = distance_types
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=custom_distance, metric_params={})

    def fit(self, X, y):
        # 随机选择两个特征
        self.selected_features = random.sample(range(X.shape[1]), k=2)
        
        # 设置 KNN 分类器的距离度量参数
        self.knn.metric_params = {'selected_features': self.selected_features, 'distance_types': self.distance_types}
        
        # 使用 KNN 分类器的 fit 方法进行训练
        self.knn.fit(X[:, self.selected_features], y)
        
        # 返回 self
        return self

    def predict(self, X):
        return self.knn.predict(X[:, self.selected_features])

# 示例数据集
X = np.array([
    [0, 1, 1],
    [1, 0, 0],
    [0, 0, 1],
    [1, 1, 0]
])
y = np.array([0, 1, 0, 1])

# 指定每个特征的距离度量类型
distance_types = ['hamming', 'euclidean', 'hamming']

# 创建自定义KNN分类器
custom_knn = CustomKNN(n_neighbors=3, distance_types=distance_types)

# 训练模型
custom_knn.fit(X, y)

# 预测
print(custom_knn.predict(X))

IndexError: index 2 is out of bounds for axis 0 with size 2