# 1.数据预处理

In [1]:
# 导入库包
import pandas as pd

In [2]:
# Load the raw dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the first 5 rows (plain-text rendering via print)
print(df.head(5))

   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    135000.0       6750.0   
3               Y             0          135000.0    312682.5      29686.5   
4               Y             0          121500.0    513000.0      21865.5   

   ...  FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21  \
0  ...                 0             

In [3]:
# Print a concise summary of the dataset: index range, column count,
# dtype breakdown and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB


In [4]:
# Print the dataset's shape as a (number of rows, number of columns) tuple
print(df.shape)

(307511, 122)


In [5]:
# Count the missing values in each column (isna is the same check as isnull)
print(df.isna().sum())

SK_ID_CURR                        0
TARGET                            0
NAME_CONTRACT_TYPE                0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
                              ...  
AMT_REQ_CREDIT_BUREAU_DAY     41519
AMT_REQ_CREDIT_BUREAU_WEEK    41519
AMT_REQ_CREDIT_BUREAU_MON     41519
AMT_REQ_CREDIT_BUREAU_QRT     41519
AMT_REQ_CREDIT_BUREAU_YEAR    41519
Length: 122, dtype: int64


In [6]:
# Descriptive statistics for every column (numeric and non-numeric),
# transposed so that each original column becomes one row of the report
print(df.describe(include='all').transpose())

                               count unique         top    freq  \
SK_ID_CURR                  307511.0    NaN         NaN     NaN   
TARGET                      307511.0    NaN         NaN     NaN   
NAME_CONTRACT_TYPE            307511      2  Cash loans  278232   
CODE_GENDER                   307511      3           F  202448   
FLAG_OWN_CAR                  307511      2           N  202924   
...                              ...    ...         ...     ...   
AMT_REQ_CREDIT_BUREAU_DAY   265992.0    NaN         NaN     NaN   
AMT_REQ_CREDIT_BUREAU_WEEK  265992.0    NaN         NaN     NaN   
AMT_REQ_CREDIT_BUREAU_MON   265992.0    NaN         NaN     NaN   
AMT_REQ_CREDIT_BUREAU_QRT   265992.0    NaN         NaN     NaN   
AMT_REQ_CREDIT_BUREAU_YEAR  265992.0    NaN         NaN     NaN   

                                     mean            std       min       25%  \
SK_ID_CURR                  278180.518577  102790.175348  100002.0  189145.5   
TARGET                           0.

In [8]:
# Columns retained for the cleaned dataset; every other column is discarded
columns_to_keep = [
    "TARGET",
    "NAME_CONTRACT_TYPE", "CODE_GENDER",
    "FLAG_OWN_CAR", "FLAG_OWN_REALTY",
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE",
]

# Label-based selection: all rows, only the listed columns (in list order)
df = df.loc[:, columns_to_keep]

# Preview the first 5 rows of the reduced frame
print(df.head(5))

   TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  \
0       1         Cash loans           M            N               Y   
1       0         Cash loans           F            N               N   
2       0    Revolving loans           M            Y               Y   
3       0         Cash loans           F            N               Y   
4       0         Cash loans           M            N               Y   

   CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  AMT_GOODS_PRICE  
0             0          202500.0    406597.5      24700.5         351000.0  
1             0          270000.0   1293502.5      35698.5        1129500.0  
2             0           67500.0    135000.0       6750.0         135000.0  
3             0          135000.0    312682.5      29686.5         297000.0  
4             0          121500.0    513000.0      21865.5         513000.0  


In [9]:
# Keep only the first 10,000 rows (positional slice from the top of the frame)
df = df.iloc[:10000]

# Report the resulting (rows, columns) shape
print("数据的行数和列数：", df.shape)

数据的行数和列数： (10000, 10)


In [10]:
# Drop every row that contains at least one missing value.
# dropna() returns a new DataFrame which is re-bound to df. This replaces the
# earlier inplace=True call: in-place mutation of a frame obtained by row
# slicing is fragile under pandas' view/copy semantics and brings no benefit.
df = df.dropna()
# Print the shape again to confirm the incomplete rows were removed
print("删除缺失值后的数据形状：", df.shape)

# Double-check that no missing values remain in any column
print("每一列的缺失值数量：")
print(df.isnull().sum())

删除缺失值后的数据形状： (9993, 10)
每一列的缺失值数量：
TARGET                0
NAME_CONTRACT_TYPE    0
CODE_GENDER           0
FLAG_OWN_CAR          0
FLAG_OWN_REALTY       0
CNT_CHILDREN          0
AMT_INCOME_TOTAL      0
AMT_CREDIT            0
AMT_ANNUITY           0
AMT_GOODS_PRICE       0
dtype: int64


In [11]:
# Persist the cleaned frame to a new CSV file.
#   index=False      -> the row index is not written to the file
#   encoding="utf-8" -> the file is written with UTF-8 encoding

df.to_csv(path_or_buf="data_clean.csv", encoding="utf-8", index=False)

print("数据已保存为 data_clean.csv")

数据已保存为 data_clean.csv
