## 数据预处理

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler



In [3]:
# Load the raw table from CSV.
df = pd.read_csv('../data_new/ziguan_full.csv')

# Show the schema and a small sample of the raw data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
print("原始数据信息:")
df.info()
print("\n原始数据样本:")
print(df.head())

原始数据信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54072 entries, 0 to 54071
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CUST_ID    54072 non-null  int64  
 1   CUST_TYPE  54072 non-null  int64  
 2   CONF_DATE  54072 non-null  int64  
 3   BUSI_CODE  54072 non-null  int64  
 4   FUND_CODE  54072 non-null  object 
 5   CONF_AMTS  54072 non-null  float64
 6   GENDER     26561 non-null  float64
 7   BIRTH      26561 non-null  float64
 8   TELL       46888 non-null  object 
 9   NET_CODE   47640 non-null  object 
 10  RISK_LEV   47640 non-null  float64
 11  COUNTY     26744 non-null  float64
 12  Matched    54072 non-null  bool   
dtypes: bool(1), float64(5), int64(4), object(3)
memory usage: 5.0+ MB
None

原始数据样本:
        CUST_ID  CUST_TYPE  CONF_DATE  BUSI_CODE FUND_CODE  CONF_AMTS  GENDER  \
0  369000460487          1   20240102        139    000596      50.00     2.0   
1  369000450364          1   2024010

In [4]:
# 2.1 Missing-value handling
# Columns listed in num_cols are filled with 0; columns listed in cat_cols are
# filled with the string 'missing'.
# NOTE(review): TELL holds text in the raw data but is listed in num_cols, so
# its gaps become the integer 0 — confirm this is intended.
num_cols = ['CONF_AMTS', 'GENDER', 'BIRTH', 'RISK_LEV', 'COUNTY','TELL']
cat_cols = ['CUST_TYPE', 'BUSI_CODE', 'FUND_CODE', 'NET_CODE']

for cols, filler in ((num_cols, 0), (cat_cols, 'missing')):
    df[cols] = df[cols].fillna(filler)

df.head()

Unnamed: 0,CUST_ID,CUST_TYPE,CONF_DATE,BUSI_CODE,FUND_CODE,CONF_AMTS,GENDER,BIRTH,TELL,NET_CODE,RISK_LEV,COUNTY,Matched
0,369000460487,1,20240102,139,596,50.0,2.0,19770726.0,139****9808,etrading,4.0,320102.0,False
1,369000450364,1,20240102,139,1986,10.0,1.0,19900106.0,138****8973,etrading,3.0,310107.0,False
2,369000610014,1,20240102,139,5928,1.0,2.0,19640306.0,136****5770,etrading,5.0,650104.0,False
3,369001110005,1,20240102,122,4399,95.77,2.0,19860711.0,186****7616,etrading,4.0,440305.0,False
4,369001110005,1,20240102,122,4399,23.94,2.0,19860711.0,186****7616,etrading,4.0,440305.0,False


In [5]:
# 2.2 Date features
# Parse the YYYYMMDD confirmation date once, expand it into year / month / day
# columns, and drop the original column.
conf_date = pd.to_datetime(df['CONF_DATE'], format='%Y%m%d')
df = df.assign(
    CONF_YEAR=conf_date.dt.year,
    CONF_MONTH=conf_date.dt.month,
    CONF_DAY=conf_date.dt.day,
).drop(columns='CONF_DATE')

df.head()

Unnamed: 0,CUST_ID,CUST_TYPE,BUSI_CODE,FUND_CODE,CONF_AMTS,GENDER,BIRTH,TELL,NET_CODE,RISK_LEV,COUNTY,Matched,CONF_YEAR,CONF_MONTH,CONF_DAY
0,369000460487,1,139,596,50.0,2.0,19770726.0,139****9808,etrading,4.0,320102.0,False,2024,1,2
1,369000450364,1,139,1986,10.0,1.0,19900106.0,138****8973,etrading,3.0,310107.0,False,2024,1,2
2,369000610014,1,139,5928,1.0,2.0,19640306.0,136****5770,etrading,5.0,650104.0,False,2024,1,2
3,369001110005,1,122,4399,95.77,2.0,19860711.0,186****7616,etrading,4.0,440305.0,False,2024,1,2
4,369001110005,1,122,4399,23.94,2.0,19860711.0,186****7616,etrading,4.0,440305.0,False,2024,1,2


In [6]:
# 2.3 处理出生日期特征

def process_birth_data(df, reference_year=None):
    """Derive an integer ``AGE`` column from ``BIRTH`` and drop ``BIRTH``.

    ``BIRTH`` must hold whole numbers in ``YYYYMMDD`` form (e.g. ``19770726.0``).
    A value is treated as missing when it is not exactly eight digits, equals
    the placeholder ``99999999``, or does not parse as a calendar date. Ages
    outside the open interval (0, 120) are treated as missing as well. Missing
    ages are replaced by the median of the valid ages and the result is cast to
    ``int`` (truncating a fractional median).

    A summary of the resulting ``AGE`` column is printed to stdout.

    Args:
        df: DataFrame with a ``BIRTH`` column. It is not modified.
        reference_year: Year the age is measured against
            (``reference_year - birth year``). Defaults to the current calendar
            year; pass a fixed year to make the result reproducible.

    Returns:
        A new DataFrame without ``BIRTH`` and with ``AGE`` appended as the last
        column.

    Raises:
        ValueError: If the frame has rows but none of them yields a valid age,
            so there is no median to impute with.
    """
    # Work on a copy so the caller's frame is never changed as a side effect.
    df = df.copy()

    if reference_year is None:
        reference_year = pd.Timestamp.now().year

    # 1. Normalise to digit strings; going through Int64 first avoids float
    #    artefacts such as "19770726.0".
    birth_str = df['BIRTH'].astype('Int64').astype(str)

    # 2. Keep only 8-digit values that are not the 99999999 placeholder.
    mask_valid = birth_str.str.match(r'^\d{8}$') & (birth_str != '99999999')

    # 3. Parse to dates. Rejected values are blanked first; 8-digit values that
    #    are not real dates are coerced to NaT.
    birth_date = pd.to_datetime(
        birth_str.where(mask_valid),
        format='%Y%m%d',
        errors='coerce'
    )

    # 4. Age by calendar year (reference year - birth year).
    age = reference_year - birth_date.dt.year

    # 5. Plausibility check: anything outside (0, 120) becomes missing.
    age = age.where((age > 0) & (age < 120))

    # 6. Impute missing ages with the median of the valid ones.
    median_age = age.median()
    if len(age) > 0 and pd.isna(median_age):
        raise ValueError("BIRTH contains no valid birth date; cannot impute AGE")
    df['AGE'] = age.fillna(median_age).astype(int)

    # 7. The raw column is no longer needed.
    df = df.drop(columns='BIRTH')

    # 8. Print a summary so the result can be sanity-checked.
    print("年龄分布统计:")
    print(df['AGE'].describe())

    return df

# Replace BIRTH with the derived AGE column, then preview the result.
df = process_birth_data(df)
df.head()

年龄分布统计:
count    54072.000000
mean        43.496875
std          8.787953
min         20.000000
25%         42.000000
50%         42.000000
75%         42.000000
max         78.000000
Name: AGE, dtype: float64


Unnamed: 0,CUST_ID,CUST_TYPE,BUSI_CODE,FUND_CODE,CONF_AMTS,GENDER,TELL,NET_CODE,RISK_LEV,COUNTY,Matched,CONF_YEAR,CONF_MONTH,CONF_DAY,AGE
0,369000460487,1,139,596,50.0,2.0,139****9808,etrading,4.0,320102.0,False,2024,1,2,48
1,369000450364,1,139,1986,10.0,1.0,138****8973,etrading,3.0,310107.0,False,2024,1,2,35
2,369000610014,1,139,5928,1.0,2.0,136****5770,etrading,5.0,650104.0,False,2024,1,2,61
3,369001110005,1,122,4399,95.77,2.0,186****7616,etrading,4.0,440305.0,False,2024,1,2,39
4,369001110005,1,122,4399,23.94,2.0,186****7616,etrading,4.0,440305.0,False,2024,1,2,39


In [7]:
# 2.4 Phone-number feature
def _tell_digits(x):
    """Return the first 3 and last 4 characters of a phone value, or 0 if unusable."""
    # Missing values and values shorter than 7 characters map to 0.
    if pd.isna(x) or len(str(x)) < 7:
        return 0
    return f"{x[:3]}{x[-4:]}"


df['TELL_PREFIX'] = df['TELL'].apply(_tell_digits)

# The raw phone column is no longer needed.
df = df.drop(columns='TELL')
df.head()


Unnamed: 0,CUST_ID,CUST_TYPE,BUSI_CODE,FUND_CODE,CONF_AMTS,GENDER,NET_CODE,RISK_LEV,COUNTY,Matched,CONF_YEAR,CONF_MONTH,CONF_DAY,AGE,TELL_PREFIX
0,369000460487,1,139,596,50.0,2.0,etrading,4.0,320102.0,False,2024,1,2,48,1399808
1,369000450364,1,139,1986,10.0,1.0,etrading,3.0,310107.0,False,2024,1,2,35,1388973
2,369000610014,1,139,5928,1.0,2.0,etrading,5.0,650104.0,False,2024,1,2,61,1365770
3,369001110005,1,122,4399,95.77,2.0,etrading,4.0,440305.0,False,2024,1,2,39,1867616
4,369001110005,1,122,4399,23.94,2.0,etrading,4.0,440305.0,False,2024,1,2,39,1867616


In [8]:
## 2.5. 改进的COUNTY处理（拆分320102.0为省、市、县）
def split_county(county_float):
    """Split a numeric county code into province / city / district parts.

    The value is converted to an integer, zero-padded to six digits, and cut
    into three 2-character strings (e.g. ``320102.0`` -> ``'32'``, ``'01'``,
    ``'02'``).

    Args:
        county_float: County code as a number (or anything ``int()`` accepts).

    Returns:
        A dict with keys ``COUNTY_PROV``, ``COUNTY_CITY`` and ``COUNTY_DIST``.
        All three are ``'00'`` when the value cannot be converted to an
        integer (``None``, NaN, infinity, non-numeric text).
    """
    try:
        county_str = f"{int(county_float):06d}"
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "unknown"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return {
            'COUNTY_PROV': '00',
            'COUNTY_CITY': '00',
            'COUNTY_DIST': '00'
        }
    return {
        'COUNTY_PROV': county_str[:2],
        'COUNTY_CITY': county_str[2:4],
        'COUNTY_DIST': county_str[4:6]
    }

# Expand every COUNTY code into three columns. The new frame is built on df's
# own index so the column-wise concat pairs rows by label even when df's index
# is not a plain 0..n-1 range (otherwise rows would be misaligned and padded
# with NaN).
county_split = pd.DataFrame(
    df['COUNTY'].apply(split_county).tolist(),
    index=df.index,
)
df = pd.concat([df.drop('COUNTY', axis=1), county_split], axis=1)

In [9]:
# 2.6 Categorical features
# High-cardinality columns: replace each value with its relative frequency,
# stored in "<column>_FREQ", and drop the original column.
for col in ['FUND_CODE']:
    freq = df[col].value_counts(normalize=True)
    df = df.assign(**{f"{col}_FREQ": df[col].map(freq)}).drop(columns=col)

# Low-cardinality columns: integer label encoding. The single encoder is refit
# for every column, so after the loop it only holds the last column's classes.
le = LabelEncoder()
for col in ('NET_CODE', 'BUSI_CODE', 'CUST_TYPE'):
    codes = le.fit_transform(df[col])
    df[col] = codes

df.head()



Unnamed: 0,CUST_ID,CUST_TYPE,BUSI_CODE,CONF_AMTS,GENDER,NET_CODE,RISK_LEV,Matched,CONF_YEAR,CONF_MONTH,CONF_DAY,AGE,TELL_PREFIX,COUNTY_PROV,COUNTY_CITY,COUNTY_DIST,FUND_CODE_FREQ
0,369000460487,1,9,50.0,2.0,1,4.0,False,2024,1,2,48,1399808,32,1,2,0.007361
1,369000450364,1,9,10.0,1.0,1,3.0,False,2024,1,2,35,1388973,31,1,7,0.002756
2,369000610014,1,9,1.0,2.0,1,5.0,False,2024,1,2,61,1365770,65,1,4,0.001406
3,369001110005,1,1,95.77,2.0,1,4.0,False,2024,1,2,39,1867616,44,3,5,0.236352
4,369001110005,1,1,23.94,2.0,1,4.0,False,2024,1,2,39,1867616,44,3,5,0.236352


In [10]:
# 2.7 Numeric features
# Min-max scale the listed numeric columns.
scale_cols = ['CONF_AMTS', 'AGE', 'RISK_LEV']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[scale_cols])
df[scale_cols] = scaled_values

# 2.8 Target variable
# Convert the boolean Matched flag into an integer column named target and
# drop the original flag.
df = df.assign(target=df['Matched'].astype(int)).drop(columns='Matched')

df.head()

Unnamed: 0,CUST_ID,CUST_TYPE,BUSI_CODE,CONF_AMTS,GENDER,NET_CODE,RISK_LEV,CONF_YEAR,CONF_MONTH,CONF_DAY,AGE,TELL_PREFIX,COUNTY_PROV,COUNTY_CITY,COUNTY_DIST,FUND_CODE_FREQ,target
0,369000460487,1,9,5.598032e-08,2.0,1,0.8,2024,1,2,0.482759,1399808,32,1,2,0.007361,0
1,369000450364,1,9,1.119606e-08,1.0,1,0.6,2024,1,2,0.258621,1388973,31,1,7,0.002756,0
2,369000610014,1,9,1.119606e-09,2.0,1,1.0,2024,1,2,0.706897,1365770,65,1,4,0.001406,0
3,369001110005,1,1,1.072247e-07,2.0,1,0.8,2024,1,2,0.327586,1867616,44,3,5,0.236352,0
4,369001110005,1,1,2.680338e-08,2.0,1,0.8,2024,1,2,0.327586,1867616,44,3,5,0.236352,0


In [11]:
# Count the missing values in every column.
null_counts = df.isna().sum()

# Print the header and the per-column counts, one after the other.
print("每列的空值数量:", null_counts, sep="\n")

每列的空值数量:
CUST_ID           0
CUST_TYPE         0
BUSI_CODE         0
CONF_AMTS         0
GENDER            0
NET_CODE          0
RISK_LEV          0
CONF_YEAR         0
CONF_MONTH        0
CONF_DAY          0
AGE               0
TELL_PREFIX       0
COUNTY_PROV       0
COUNTY_CITY       0
COUNTY_DIST       0
FUND_CODE_FREQ    0
target            0
dtype: int64


In [13]:
# Write the preprocessed frame to a new CSV file (row index omitted).
output_path = '../data_new/preprocessed_data_ml_full.csv'
df.to_csv(output_path, index=False)

# Blank line, then the completion message.
print()
print("预处理完成！")


预处理完成！
