In [1]:
import numpy as np
import pandas as pd
data = pd.read_csv("data/train.csv")
df = pd.DataFrame(data)

In [None]:
1️⃣ 缺失值處理（Missing Values Handling）

In [4]:
# 計算缺失值比例
missing_ratio = df.isnull().sum() / len(df) * 100
print("缺失值比例:\n", missing_ratio)

缺失值比例:
 Id                            0.000000
Home Ownership                0.000000
Annual Income                20.760000
Years in current job          4.946667
Tax Liens                     0.000000
Number of Open Accounts       0.000000
Years of Credit History       0.000000
Maximum Open Credit           0.000000
Number of Credit Problems     0.000000
Bankruptcies                  0.186667
Purpose                       0.000000
Term                          0.000000
Current Loan Amount           0.000000
Current Credit Balance        0.000000
Monthly Debt                  0.000000
Credit Score                 20.760000
Credit Default                0.000000
dtype: float64


In [5]:
# 刪除缺失值 > 50% 的欄位
df = df.drop(columns=missing_ratio[missing_ratio > 50].index)

In [8]:
# 填補數值型欄位（用中位數填補）
# df.fillna(df.median(), inplace=True)
for col in df.select_dtypes(include=['number']).columns:
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [9]:
# 填補類別型欄位（用眾數填補）
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# 確認缺失值已處理
print("缺失值處理後:\n", df.isnull().sum())


缺失值處理後:
 Id                           0
Home Ownership               0
Annual Income                0
Years in current job         0
Tax Liens                    0
Number of Open Accounts      0
Years of Credit History      0
Maximum Open Credit          0
Number of Credit Problems    0
Bankruptcies                 0
Purpose                      0
Term                         0
Current Loan Amount          0
Current Credit Balance       0
Monthly Debt                 0
Credit Score                 0
Credit Default               0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [None]:
2️⃣ 重複數據處理（Duplicate Data Handling）

In [10]:
# 檢查重複值
print("重複數據數量:", df.duplicated().sum())

# 移除重複數據
df = df.drop_duplicates()

# 確認是否還有重複值
print("處理後的重複數據數量:", df.duplicated().sum())


重複數據數量: 0
處理後的重複數據數量: 0


In [None]:
3️⃣ 異常值處理（Outlier Handling）

In [11]:
# 定義 IQR 方法來處理異常值
def remove_outliers_iqr(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

# 只對數值型欄位應用 IQR 異常值處理
num_cols = df.select_dtypes(include=['number']).columns
for col in num_cols:
    df = remove_outliers_iqr(df, col)

print("異常值處理後的數據:", df.shape)


異常值處理後的數據: (3895, 17)


In [None]:
4️⃣ 類別型欄位處理（Categorical Data Cleaning）

In [None]:
# 統一類別型欄位格式（全部轉小寫，去除額外空格）
# df['Job Title'] = df['Job Title'].str.lower().str.strip()

# # 檢查不同類別的分佈
# print(df['Job Title'].value_counts())

# # 若有相似類別，可進行合併（例如縮寫統一）
# df['Job Title'] = df['Job Title'].replace({'mgr': 'manager', 'sr.': 'senior'})


In [None]:
5️⃣ 數據型態轉換（Data Type Conversion）

In [12]:
# 轉換類別型欄位
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')

# 確認數據型態
print(df.dtypes)

Id                              int64
Home Ownership               category
Annual Income                 float64
Years in current job         category
Tax Liens                     float64
Number of Open Accounts       float64
Years of Credit History       float64
Maximum Open Credit           float64
Number of Credit Problems     float64
Bankruptcies                  float64
Purpose                      category
Term                         category
Current Loan Amount           float64
Current Credit Balance        float64
Monthly Debt                  float64
Credit Score                  float64
Credit Default                  int64
dtype: object


In [None]:
df['Credit Score'] = df['Credit Score'].astype('Int64')  # 保留 NaN

In [13]:
print(df.isnull().sum())

Id                           0
Home Ownership               0
Annual Income                0
Years in current job         0
Tax Liens                    0
Number of Open Accounts      0
Years of Credit History      0
Maximum Open Credit          0
Number of Credit Problems    0
Bankruptcies                 0
Purpose                      0
Term                         0
Current Loan Amount          0
Current Credit Balance       0
Monthly Debt                 0
Credit Score                 0
Credit Default               0
dtype: int64


In [14]:
print(df['Credit Default'].unique())

[1 0]


In [16]:
# 將整理的資料存成新的檔案，且避免儲存 Pandas 自動生成的索引欄位
df.to_csv("data/cleaned_data.csv", index=False)