In [4]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 创建一个不平衡数据集（示例中正例较少）
X, y = make_classification(n_samples=1000, n_features=20, weights=[0.02, 0.98], random_state=42)

# 划分数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 训练逻辑回归分类器（未经过重要性采样）
model_original = LogisticRegression()
model_original.fit(X_train, y_train)

# 在测试集上进行预测和评估（未经过重要性采样）
y_pred_original = model_original.predict(X_test)
print("Results without Importance Resampling:")
print(classification_report(y_test, y_pred_original))

# 使用重要性 resampling 调整数据集
positive_samples = X_train[y_train == 1]
negative_samples = X_train[y_train == 0]

# 计算正例和负例的样本权重
positive_weight = len(negative_samples) / len(positive_samples)
print(len(negative_samples) , len(positive_samples)) #20, 680
print(positive_weight)
negative_weight = 1.0

# 使用重要性加权调整样本
adjusted_samples = np.vstack((positive_samples, negative_samples))
adjusted_labels = np.hstack(([1] * len(positive_samples), [0] * len(negative_samples)))
sample_weights = np.hstack(([positive_weight] * len(positive_samples), [negative_weight] * len(negative_samples)))

# 训练逻辑回归分类器（经过重要性采样）
model_adjusted = LogisticRegression()
model_adjusted.fit(adjusted_samples, adjusted_labels, sample_weight=sample_weights)

# 在测试集上进行预测和评估（经过重要性采样）
y_pred_adjusted = model_adjusted.predict(X_test)
print("\nResults with Importance Resampling:")
print(classification_report(y_test, y_pred_adjusted))


Results without Importance Resampling:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.98      1.00      0.99       294

    accuracy                           0.98       300
   macro avg       0.49      0.50      0.49       300
weighted avg       0.96      0.98      0.97       300

20 680
0.029411764705882353

Results with Importance Resampling:
              precision    recall  f1-score   support

           0       0.06      0.83      0.11         6
           1       1.00      0.74      0.85       294

    accuracy                           0.74       300
   macro avg       0.53      0.79      0.48       300
weighted avg       0.98      0.74      0.84       300

