# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager, rc
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import f1_score, confusion_matrix
import math

# Suppress all warning messages for the rest of the session.
warnings.filterwarnings(action='ignore')

# Korean font setup: read the family name from the NanumGothic TTF file and
# register it as matplotlib's default font family.
font_path = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
fontprop = font_manager.FontProperties(fname=font_path)
rc('font', **{'family': fontprop.get_name()})

# 1. Load the data
train = pd.read_csv(
    '/apps/study_promptengineerings/dacon/debt_risk/train.csv'
)
test = pd.read_csv(
    '/apps/study_promptengineerings/dacon/debt_risk/test.csv'
)

# Keep the test UIDs (indexed like `test`) for the submission file, then drop
# the identifier column from both frames so it is not used as a feature.
id_mapping = pd.Series(test['UID'].values, index=test.index)
train = train.drop(columns='UID')
test = test.drop(columns='UID')

# Report the number of missing values per column in each frame.
for _title, _frame in (
    ("Train set missing values:", train),
    ("\nTest set missing values:", test),
):
    print(_title)
    print(_frame.isnull().sum())

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix
from catboost import CatBoostClassifier
import seaborn as sns
import matplotlib.pyplot as plt

# Columns handed to CatBoost as categorical features.
cat_features = ['주거 형태', '대출 목적', '대출 상환 기간', '현재 직장 근속 연수']

# 4. Prepare the data: separate the target column from the feature matrix.
y = train['채무 불이행 여부']
X = train.drop(columns='채무 불이행 여부')

# Inverse-frequency class weights: n_samples / (2 * count) per class.
# NOTE: this pairs the i-th sorted label from np.unique with the count of
# integer value i from np.bincount, so the pairing is only correct when the
# labels are consecutive integers starting at 0.
class_weights = {
    label: weight
    for label, weight in zip(np.unique(y), 1 / np.bincount(y) * len(y) / 2)
}

# 5. Model training and prediction: CatBoost hyper-parameters shared by every
# fold.
params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=0.1,
    bootstrap_type='Bernoulli',
    subsample=0.8,
    random_seed=42,
    loss_function='Logloss',
    eval_metric='F1',
    early_stopping_rounds=50,
    verbose=100,
    # Class-imbalance handling: class 0 keeps weight 1 and class 1 gets the
    # ratio of the two balanced weights computed earlier.
    class_weights=[1, class_weights[1] / class_weights[0]],
)

# Cross-validation setup: stratified, shuffled folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Out-of-fold positive-class probability for every training row; each fold
# fills in the rows it held out.
oof_preds = np.zeros(len(X))
# Running mean of the per-fold positive-class probabilities on the test set.
test_preds = np.zeros(len(test))

# Train one model per fold.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # Take the fold count from the splitter instead of hard-coding 5, so the
    # progress message stays consistent with the averaging below if n_splits
    # is ever changed.
    print(f"Training fold {fold + 1}/{skf.n_splits}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(**params)
    model.fit(
        X_train,
        y_train,
        cat_features=cat_features,
        # The held-out fold is the evaluation set used during fitting.
        eval_set=[(X_val, y_val)],
        plot=True
    )

    # Column 1 of predict_proba is the positive-class probability.
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    # Each fold contributes 1/n_splits of the final test prediction.
    test_preds += model.predict_proba(test)[:, 1] / skf.n_splits

# 6. Model evaluation: threshold the out-of-fold probabilities at 0.5 and
# score them against the true labels with macro-averaged F1.
oof_preds_binary = (oof_preds > 0.5).astype(int)
f1 = f1_score(y, oof_preds_binary, average='macro')
print(f"\nOOF F1 Score: {f1:.4f}")

# Confusion matrix of the out-of-fold predictions, drawn as an annotated
# heatmap.
cm = confusion_matrix(y, oof_preds_binary)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.gca().set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# 7. Create the submission file
# Hard 0/1 labels at a 0.5 threshold; kept available for inspection and for
# any later code that refers to this name.
test_preds_binary = (test_preds > 0.5).astype(int)
# The submission column is named '채무 불이행 확률' (default *probability*), so
# write the fold-averaged probabilities. The previous version wrote the
# thresholded 0/1 labels here, discarding all ranking information.
submission = pd.DataFrame({
    'UID': id_mapping,
    '채무 불이행 확률': test_preds
})
submission.to_csv('submission.csv', index=False)
print("Submission file has been created successfully!")

# 8. Feature-importance plot
# NOTE: `model` is whichever classifier was fitted last, i.e. the final
# cross-validation fold only; the importances are not averaged over folds.
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 most important features.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance.head(15), x='importance', y='feature')
plt.title('Top 15 Feature Importance')
plt.show()