In [17]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE

# 归一化预处理
class Normalizer:
    """
    Min-max normaliser that learns per-column bounds from a reference DataFrame.

    Call `get_normalization_params` once to learn the bounds, then `preprocess`
    to rescale either the reference frame or another frame whose columns are in
    the same order (the bounds are applied positionally, column by column).
    """

    def __init__(self, dataframe):
        """
        Store the reference DataFrame; nothing is fitted here.

        :param dataframe: DataFrame whose per-column min/max define the scaling
        """
        self.dataframe = dataframe
        self.scaler = MinMaxScaler()
        # Placeholder attribute; no method of this class writes to it.
        self.normalized_data = None
        # Filled in by `get_normalization_params`; None means "not fitted yet".
        self.scaler_params = None

    def get_normalization_params(self):
        """
        Fit the MinMaxScaler on the reference DataFrame and cache its bounds.

        :return: dict with keys 'data_min' and 'data_max' (one value per column)
        """
        # Fit the scaler to obtain the per-column minimum and maximum.
        self.scaler.fit(self.dataframe)

        # Keep the bounds so `preprocess` can be applied to other frames later.
        self.scaler_params = {
            'data_min': self.scaler.data_min_,
            'data_max': self.scaler.data_max_
        }
        return self.scaler_params

    def preprocess(self, otherdata=None):
        """
        Rescale data with the cached bounds: (x - min) / (max - min).

        Columns whose cached min equals their max are divided by 1 instead of 0,
        so a constant column maps to 0 rather than NaN/inf.

        :param otherdata: optional DataFrame to rescale; defaults to the
                          reference DataFrame given to the constructor
        :return: DataFrame of rescaled values with the columns of `otherdata`
        :raises ValueError: if `get_normalization_params` has not been called
        """
        if self.scaler_params is None:
            raise ValueError("归一化参数为空！请先调用 `get_normalization_params` 方法获取归一化参数。")

        # Fall back to the reference frame when no other data is supplied.
        if otherdata is None:
            otherdata = self.dataframe

        data_min = np.asarray(self.scaler_params['data_min'], dtype=float)
        data_max = np.asarray(self.scaler_params['data_max'], dtype=float)

        # A zero range would produce 0/0 (NaN) or x/0 (inf); use 1 for those columns.
        data_range = data_max - data_min
        data_range = np.where(data_range == 0, 1.0, data_range)

        normalized_data = (otherdata - data_min) / data_range

        # Return the result as a DataFrame carrying the input's column labels.
        normalized_df = pd.DataFrame(normalized_data, columns=otherdata.columns)
        return normalized_df

# Load both CSV files (paths relative to the working directory) as DataFrames.
df_drop, df = (pd.read_csv(csv_path) for csv_path in ('df_drop.csv', 'df.csv'))

In [19]:
###自定义运行逻辑回归的类###

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE

class LogisticRegressionExperiment:
    """
    K-fold cross-validated logistic regression with optional scaling and SMOTE.

    Scaling is fitted on each training fold only and then applied to the
    matching test fold; SMOTE, when enabled, is applied to the training fold only.
    """

    # Values accepted for `preprocess_method`.
    VALID_PREPROCESS_METHODS = ('none', 'normalize', 'log_normalize')

    def __init__(self, df, target_col, continuous_cols, preprocess_method='normalize', use_smote=False,
                 random_state=68, n_splits=5, max_iter=2000):
        """
        Configure the experiment.

        :param df: full dataset (features plus the target column)
        :param target_col: name of the target column
        :param continuous_cols: list of continuous column names to rescale
        :param preprocess_method: one of 'none', 'normalize', 'log_normalize'
        :param use_smote: whether to oversample the training folds with SMOTE
        :param random_state: seed used for the fold shuffle and for SMOTE
        :param n_splits: number of cross-validation folds
        :param max_iter: iteration cap passed to LogisticRegression
        :raises ValueError: if `preprocess_method` is not a supported value
        """
        # Fail fast: a misspelt method used to silently disable preprocessing.
        if preprocess_method not in self.VALID_PREPROCESS_METHODS:
            raise ValueError(
                f"未知的预处理方法: {preprocess_method!r}，可选值为 {self.VALID_PREPROCESS_METHODS}"
            )
        self.df = df
        self.target_col = target_col
        self.continuous_cols = continuous_cols
        self.preprocess_method = preprocess_method
        self.use_smote = use_smote
        self.random_state = random_state
        self.n_splits = n_splits
        self.max_iter = max_iter

    def preprocess_data(self, train_df, test_df):
        """
        Rescale the continuous columns of one train/test fold.

        The scaler is fitted on the training fold and reused for the test fold.
        The inputs are not modified; processed copies are returned.

        :param train_df: training fold
        :param test_df: test fold
        :return: tuple (processed training fold, processed test fold)
        :raises ValueError: if `self.preprocess_method` is not a supported value
        """
        if self.preprocess_method not in self.VALID_PREPROCESS_METHODS:
            raise ValueError(
                f"未知的预处理方法: {self.preprocess_method!r}，可选值为 {self.VALID_PREPROCESS_METHODS}"
            )

        # Work on copies so the caller's frames (slices of self.df) stay untouched.
        train_df = train_df.copy()
        test_df = test_df.copy()
        cols = list(self.continuous_cols)

        # 'none' means no processing; an empty column list leaves nothing to scale.
        if self.preprocess_method == 'none' or not cols:
            return train_df, test_df

        # Cast to float up front: the scaled values are floats, and writing them
        # in place into integer columns is a deprecated implicit upcast in pandas.
        train_values = train_df[cols].astype(float)
        test_values = test_df[cols].astype(float)

        if self.preprocess_method == 'log_normalize':
            # log(1 + x), so that x == 0 maps to 0 instead of -inf.
            train_values = np.log1p(train_values)
            test_values = np.log1p(test_values)

        scaler = MinMaxScaler()
        # Column replacement (not `.loc[:, cols] = ...`) lets the dtype change to float.
        train_df[cols] = scaler.fit_transform(train_values)
        test_df[cols] = scaler.transform(test_values)
        return train_df, test_df

    def run_experiment(self):
        """
        Run the cross-validated experiment, print and return the averaged metrics.

        :return: dict of fold-averaged values with keys 'precision', 'recall',
                 'f1_score' (all three are the weighted averages from
                 classification_report), 'auc', 'accuracy' and 'iterations'
        """
        # Announce the configuration being run.
        print(f"正在运行实验 - 预处理方法: {self.preprocess_method}, 使用 SMOTE: {self.use_smote}")

        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        fold_results = []

        for train_index, test_index in kf.split(self.df):
            train_df, test_df = self.df.iloc[train_index, :], self.df.iloc[test_index, :]
            train_df, test_df = self.preprocess_data(train_df, test_df)

            X_train = train_df.drop(self.target_col, axis=1)
            y_train = train_df[self.target_col]
            X_test = test_df.drop(self.target_col, axis=1)
            y_test = test_df[self.target_col]

            # Oversample the training fold only; the test fold is left as is.
            if self.use_smote:
                smote = SMOTE(random_state=self.random_state)
                X_train, y_train = smote.fit_resample(X_train, y_train)

            model = LogisticRegression(max_iter=self.max_iter)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            # Column 1 of predict_proba is used as the score for roc_auc_score.
            y_pred_proba = model.predict_proba(X_test)[:, 1]

            weighted = classification_report(y_test, y_pred, output_dict=True)['weighted avg']
            fold_results.append({
                'precision': weighted['precision'],
                'recall': weighted['recall'],
                'f1_score': weighted['f1-score'],
                'auc': roc_auc_score(y_test, y_pred_proba),
                'accuracy': accuracy_score(y_test, y_pred),
                # Solver iterations used by this fold's fit.
                'iterations': model.n_iter_[0],
            })

        # Average every metric across folds.
        summary = {
            metric: float(np.mean([fold[metric] for fold in fold_results]))
            for metric in ('precision', 'recall', 'f1_score', 'auc', 'accuracy', 'iterations')
        }

        print(f"平均 Precision: {summary['precision']:.4f}")
        print(f"平均 Recall: {summary['recall']:.4f}")
        print(f"平均 F1 Score: {summary['f1_score']:.4f}")
        print(f"平均 AUC: {summary['auc']:.4f}")
        print(f"平均 Accuracy: {summary['accuracy']:.4f}")
        print(f"平均迭代次数: {summary['iterations']:.1f}")
        print("-" * 50)

        return summary


In [20]:
### Run results ###

# Hands-on run: columns passed to the experiment as `continuous_cols`
# (the only columns the experiment class rescales).
cols_continuous1 = ['Age', 'MonthlyIncome', 'NumCompaniesWorked', 'TotalWorkingYears', 'TrainingTimesLastYear', 
                    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

# Define all experiment configurations.
preprocess_methods = ['none', 'normalize', 'log_normalize']
use_smote_options = [False]

# No resampling: run one experiment per preprocessing method on df_drop,
# with 'Attrition' as the target column.
for preprocess_method in preprocess_methods:
    for use_smote in use_smote_options:
        experiment = LogisticRegressionExperiment(
            df_drop, 
            'Attrition', 
            cols_continuous1, 
            preprocess_method=preprocess_method, 
            use_smote=use_smote
        )
        experiment.run_experiment()

正在运行实验 - 预处理方法: none, 使用 SMOTE: False
平均 Precision: 0.8754
平均 Recall: 0.8836
平均 F1 Score: 0.8676
平均 AUC: 0.8254
平均 Accuracy: 0.8836
平均迭代次数: 1111.8
--------------------------------------------------
正在运行实验 - 预处理方法: normalize, 使用 SMOTE: False
平均 Precision: 0.8722
平均 Recall: 0.8818
平均 F1 Score: 0.8651
平均 AUC: 0.8266
平均 Accuracy: 0.8818
平均迭代次数: 42.4
--------------------------------------------------
正在运行实验 - 预处理方法: log_normalize, 使用 SMOTE: False
平均 Precision: 0.8833
平均 Recall: 0.8900
平均 F1 Score: 0.8771
平均 AUC: 0.8319
平均 Accuracy: 0.8900
平均迭代次数: 42.8
--------------------------------------------------


In [21]:
# Enable SMOTE oversampling. This rebinds `use_smote_options`; the names
# `preprocess_methods`, `cols_continuous1` and `df_drop` are not defined in
# this cell and must already exist from earlier cells.
use_smote_options = [True]

# Iterate over every combination and run the experiment.
for preprocess_method in preprocess_methods:
    for use_smote in use_smote_options:
        experiment = LogisticRegressionExperiment(
            df_drop, 
            'Attrition', 
            cols_continuous1, 
            preprocess_method=preprocess_method, 
            use_smote=use_smote
        )
        experiment.run_experiment()

正在运行实验 - 预处理方法: none, 使用 SMOTE: True
平均 Precision: 0.8537
平均 Recall: 0.7855
平均 F1 Score: 0.8074
平均 AUC: 0.8191
平均 Accuracy: 0.7855
平均迭代次数: 1282.6
--------------------------------------------------
正在运行实验 - 预处理方法: normalize, 使用 SMOTE: True
平均 Precision: 0.8542
平均 Recall: 0.7827
平均 F1 Score: 0.8056
平均 AUC: 0.8194
平均 Accuracy: 0.7827
平均迭代次数: 60.0
--------------------------------------------------
正在运行实验 - 预处理方法: log_normalize, 使用 SMOTE: True
平均 Precision: 0.8551
平均 Recall: 0.7882
平均 F1 Score: 0.8096
平均 AUC: 0.8212
平均 Accuracy: 0.7882
平均迭代次数: 60.6
--------------------------------------------------
