In [11]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import matplotlib


# Min-max normalization preprocessing
class Normalizer:
    """Min-max normalizer that learns column ranges from one DataFrame.

    The column minima/maxima are obtained by fitting a ``MinMaxScaler`` on
    the DataFrame given at construction time; ``preprocess`` then rescales
    either that same DataFrame or another one with the stored parameters.
    """

    def __init__(self, dataframe):
        """
        Store the reference DataFrame and create an unfitted scaler.

        Parameters:
        - dataframe: DataFrame whose columns define the normalization ranges.
        """
        self.dataframe = dataframe
        self.scaler = MinMaxScaler()
        # Filled by `preprocess` when the instance's own dataframe is normalized.
        self.normalized_data = None
        # Filled by `get_normalization_params`.
        self.scaler_params = None

    def get_normalization_params(self):
        """
        Fit the scaler on the stored DataFrame and record the per-column
        minimum and maximum.

        Returns:
        - dict with keys 'data_min' and 'data_max' (the scaler's fitted
          ``data_min_`` / ``data_max_`` attributes).
        """
        # Fit the data to obtain the normalization parameters
        self.scaler.fit(self.dataframe)

        # Keep the parameters so `preprocess` can reuse them on other data
        self.scaler_params = {
            'data_min': self.scaler.data_min_,
            'data_max': self.scaler.data_max_
        }
        return self.scaler_params

    def preprocess(self, otherdata=None):
        """
        Rescale data with the stored min/max parameters.

        Parameters:
        - otherdata: optional DataFrame to normalize. When omitted, the
          DataFrame given at construction time is used. Its columns must be
          in the same order as the data the parameters were fitted on,
          because the parameters are applied positionally.

        Returns:
        - DataFrame with the same columns as the input, where each value is
          ``(x - data_min) / (data_max - data_min)``.

        Raises:
        - ValueError: if `get_normalization_params` has not been called yet.
        """
        if self.scaler_params is None:
            raise ValueError("归一化参数为空！请先调用 `get_normalization_params` 方法获取归一化参数。")

        # Fall back to the DataFrame supplied at construction time
        use_own_data = otherdata is None
        if use_own_data:
            otherdata = self.dataframe

        data_min = self.scaler_params['data_min']
        data_max = self.scaler_params['data_max']

        # A constant column has a zero range; dividing by it would yield
        # NaN/inf. Use a divisor of 1 for such columns so they map to
        # (x - data_min), which is how MinMaxScaler treats zero ranges.
        data_range = np.asarray(data_max - data_min, dtype=float)
        safe_range = np.where(data_range == 0, 1.0, data_range)
        normalized_data = (otherdata - data_min) / safe_range

        # Convert the normalized values back into a DataFrame
        normalized_df = pd.DataFrame(normalized_data, columns=otherdata.columns)
        if use_own_data:
            self.normalized_data = normalized_df
        return normalized_df
    

# One-hot encode unordered (nominal) categorical variables
class OneHotEncoderProcessor:
    """One-hot encode every column of a DataFrame of nominal categories."""

    def __init__(self, dataframe):
        """
        Store the DataFrame to encode.

        Parameters:
        - dataframe: DataFrame containing unordered categorical variables.
        """
        self.dataframe = dataframe
        # Filled by `process`.
        self.encoded_data = None

    def process(self):
        """
        One-hot encode all columns of the stored DataFrame.

        The first category of each column is dropped (``drop='first'``) to
        avoid multicollinearity between the generated indicator columns.

        Returns:
        - DataFrame of indicator columns named by the encoder's
          ``get_feature_names_out``, carrying the same index as the input.
        """
        # Newer scikit-learn releases renamed `sparse` to `sparse_output` and
        # later removed the old keyword; try the new name first and fall back
        # so the code runs on either side of the rename.
        try:
            ohe = OneHotEncoder(sparse_output=False, drop='first')
        except TypeError:
            ohe = OneHotEncoder(sparse=False, drop='first')
        encoded_array = ohe.fit_transform(self.dataframe)
        encoded_columns = ohe.get_feature_names_out(self.dataframe.columns)

        # Keep the source index: without it the result gets a fresh
        # RangeIndex and a later column-wise concat with the other feature
        # frames would misalign rows for any non-default input index.
        self.encoded_data = pd.DataFrame(
            encoded_array,
            columns=encoded_columns,
            index=self.dataframe.index,
        )
        return self.encoded_data
# Load the training data
train = pd.read_csv('pfm_train.csv')
# Show the basic structure of the data (columns, non-null counts, dtypes)
train.info()
# Compute summary statistics of the data
train.describe()
# Count missing values per column and keep only columns that have any
null_counts = train.isnull().sum()
columns_with_null = null_counts.loc[null_counts.gt(0)]
print("包含null值的列名及其null值个数：")
print(columns_with_null)
# Remove columns the author considers irrelevant: EmployeeNumber (an ID that
# would only add noise to the model) and StandardHours / Over18 (noted as
# constant across the whole dataset, hence uninformative)
train = train.drop(columns=['EmployeeNumber', 'StandardHours', 'Over18'])

# BusinessTravel is an ordered categorical variable: map its labels to ranks
travel_mapping = {
    label: rank
    for rank, label in enumerate(
        ('Non-Travel', 'Travel_Rarely', 'Travel_Frequently'), start=1
    )
}
train['BusinessTravel'] = train['BusinessTravel'].map(travel_mapping)
print(train['BusinessTravel'])

# All continuous variables
cols_continuous = [
    'Age', 'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
    'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
]
train_continuous = train[cols_continuous]

# All ordered categorical variables
cols_order_categories = [
    'BusinessTravel', 'DistanceFromHome', 'Education',
    'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
    'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction',
    'StockOptionLevel', 'WorkLifeBalance',
]
train_order_categories = train[cols_order_categories]

# All unordered categorical variables
cols_categories = [
    'Department', 'EducationField', 'Gender',
    'JobRole', 'MaritalStatus', 'OverTime',
]
train_categories = train[cols_categories]

# Min-max normalize the ordered categorical variables
normalize1 = Normalizer(train_order_categories)
normalize1.get_normalization_params()
train_normalized_order_categories = normalize1.preprocess()
# One-hot encode the unordered categorical variables
one1 = OneHotEncoderProcessor(train_categories)
train_onehot_categories = one1.process()
# Assemble the processed pieces column-wise: target first, then features
df = pd.concat(
    [
        train['Attrition'].astype(float),
        train_continuous.astype(float),
        train_normalized_order_categories,
        train_onehot_categories,
    ],
    axis=1,
)
df.head()
# Build a reduced frame without a hand-picked set of columns
cols_drop = [
    'YearsAtCompany',
    'JobLevel',
    'Department_Sales',
    'JobRole_Sales Executive',
    'PercentSalaryHike',
]
df_drop = df.drop(columns=cols_drop)
df_drop.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1100 non-null   int64 
 1   Attrition                 1100 non-null   int64 
 2   BusinessTravel            1100 non-null   object
 3   Department                1100 non-null   object
 4   DistanceFromHome          1100 non-null   int64 
 5   Education                 1100 non-null   int64 
 6   EducationField            1100 non-null   object
 7   EmployeeNumber            1100 non-null   int64 
 8   EnvironmentSatisfaction   1100 non-null   int64 
 9   Gender                    1100 non-null   object
 10  JobInvolvement            1100 non-null   int64 
 11  JobLevel                  1100 non-null   int64 
 12  JobRole                   1100 non-null   object
 13  JobSatisfaction           1100 non-null   int64 
 14  MaritalStatus           

Unnamed: 0,Attrition,Age,MonthlyIncome,NumCompaniesWorked,TotalWorkingYears,TrainingTimesLastYear,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel,...,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,0.0,37.0,5993.0,1.0,7.0,2.0,5.0,0.0,7.0,0.5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,54.0,10502.0,7.0,33.0,2.0,4.0,1.0,4.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,34.0,6074.0,1.0,9.0,3.0,7.0,0.0,6.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.0,39.0,12742.0,1.0,21.0,3.0,6.0,11.0,8.0,0.5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,28.0,2596.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Write df_drop and df to CSV files, omitting the row index
df_drop.to_csv(path_or_buf='df_drop.csv', index=False)
df.to_csv(path_or_buf='df.csv', index=False)