In [None]:
"""
欄位名稱	繁體中文說明
Id	客戶 ID（唯一識別碼）
Home Ownership	房屋持有狀況（Own Home: 自有房產, Rent: 租房, Home Mortgage: 房貸）
Annual Income	年收入（申請人的年度收入）
Years in current job	目前工作的年數（如 10+ years, 5 years, <1 year，需轉換為數值）
Tax Liens	稅務留置權數量（是否因未繳稅款被政府扣押資產）
Number of Open Accounts	開啟的信貸帳戶數量（目前持有的信用卡、貸款數）
Years of Credit History	信用歷史年數（客戶擁有信貸紀錄的總年數）
Maximum Open Credit	最高可用信貸額度（過去曾獲得的最大信貸額度）
Number of Credit Problems	信用問題數量（如遲繳、違約等信用問題的總數）
Months since last delinquent	距離上次逾期（delinquent）的月數（缺失值 NaN 可能代表從未逾期；0 則代表最近一個月內曾逾期——待確認）
Bankruptcies	破產次數（申請人過去申請破產的次數）
Purpose	貸款用途（如 debt consolidation: 債務整合, home improvements: 房屋裝修, other: 其他）
Term	貸款期限（Short Term: 短期貸款, Long Term: 長期貸款）
Current Loan Amount	目前貸款金額（客戶當前貸款的總金額）
Current Credit Balance	當前信用卡/貸款餘額（未償還的信貸餘額）
Monthly Debt	每月負債（申請人每月的貸款或信用卡還款額）
Credit Score	信用評分（數值型變數，影響違約風險）
Credit Default	是否違約（0=未違約，1=違約）（目標變數，分類問題）
"""

In [5]:
import numpy as np
import pandas as pd
# Load the training set. read_csv already returns a DataFrame, so `data` is
# kept as the untouched raw frame and `df` is an independent working copy.
# The previous pd.DataFrame(data) re-wrap did not copy the underlying values,
# so in-place edits to `df` could leak back into `data`.
data = pd.read_csv("data/train.csv")
df = data.copy()

In [7]:
# Step 1: preview the first few rows of the loaded frame
df.head()

Unnamed: 0,Id,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [8]:
# Step 2: inspect the frame's structure (column names, dtypes, non-null counts)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Id                            7500 non-null   int64  
 1   Home Ownership                7500 non-null   object 
 2   Annual Income                 5943 non-null   float64
 3   Years in current job          7129 non-null   object 
 4   Tax Liens                     7500 non-null   float64
 5   Number of Open Accounts       7500 non-null   float64
 6   Years of Credit History       7500 non-null   float64
 7   Maximum Open Credit           7500 non-null   float64
 8   Number of Credit Problems     7500 non-null   float64
 9   Months since last delinquent  3419 non-null   float64
 10  Bankruptcies                  7486 non-null   float64
 11  Purpose                       7500 non-null   object 
 12  Term                          7500 non-null   object 
 13  Cur

In [9]:
# Step 3: summary statistics (describe() reports only the numeric columns here)
# NOTE(review): in the recorded output, Credit Score has max 7510 and
# Current Loan Amount has max ~1e8 (99999999 also appears in head()) — these
# look like outliers / sentinel values; confirm before modeling.
df.describe()

Unnamed: 0,Id,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,7500.0,5943.0,7500.0,7500.0,7500.0,7500.0,7500.0,3419.0,7486.0,7500.0,7500.0,7500.0,5943.0,7500.0
mean,3749.5,1366392.0,0.030133,11.130933,18.317467,945153.7,0.17,34.6926,0.117152,11873180.0,289833.2,18314.454133,1151.087498,0.281733
std,2165.207842,845339.2,0.271604,4.908924,7.041946,16026220.0,0.498598,21.688806,0.347192,31926120.0,317871.4,11926.764673,1604.451418,0.449874
min,0.0,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,1874.75,844341.0,0.0,8.0,13.5,279229.5,0.0,16.0,0.0,180169.0,114256.5,10067.5,711.0,0.0
50%,3749.5,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,5624.25,1640137.0,0.0,14.0,21.8,793501.5,0.0,50.0,0.0,519882.0,360406.2,23818.0,743.0,1.0
max,7499.0,10149340.0,7.0,43.0,57.7,1304726000.0,7.0,118.0,4.0,100000000.0,6506797.0,136679.0,7510.0,1.0


In [10]:
# Step 4: count the missing values in each column
df.isna().sum()

Id                                 0
Home Ownership                     0
Annual Income                   1557
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

In [11]:
# Step 5: split the columns into numeric and categorical (object-dtype) groups
# NOTE(review): per the recorded output, num_cols also contains 'Id' (an
# identifier) and 'Credit Default' (the target) — exclude them before using
# num_cols as model features.
num_cols = df.select_dtypes(include="number").columns
cat_cols = df.select_dtypes(include="object").columns

print("數值型欄位:", list(num_cols))
print("類別型欄位:", list(cat_cols))


數值型欄位: ['Id', 'Annual Income', 'Tax Liens', 'Number of Open Accounts', 'Years of Credit History', 'Maximum Open Credit', 'Number of Credit Problems', 'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt', 'Credit Score', 'Credit Default']
類別型欄位: ['Home Ownership', 'Years in current job', 'Purpose', 'Term']
