In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from tqdm import tqdm

In [2]:
def find_optimal_threshold(y_true, y_pred_proba, thresholds=None, optimize_for='recall'):
    """
    Scan candidate decision thresholds and return the one that maximizes a metric.

    For each threshold, samples with ``y_pred_proba >= threshold`` are labeled
    positive, and precision, recall and F1 of the positive class (label ``1``)
    are computed from the true-positive / false-positive / false-negative counts.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels; entries equal to ``1`` form the positive class.
    y_pred_proba : array-like of shape (n_samples,)
        Predicted scores for the positive class.
    thresholds : iterable of float, optional
        Candidate thresholds. Defaults to ``np.linspace(0.1, 0.9, 81)``
        (0.10 to 0.90 in steps of 0.01).
    optimize_for : {'recall', 'precision', 'f1'}, default 'recall'
        Metric used to pick the best threshold.

    Returns
    -------
    best_threshold : float
        The first threshold (in iteration order) that reaches the highest
        score. Stays at 0.5 if no candidate scores above 0.
    results : dict
        Maps each threshold to ``{'f1', 'recall', 'precision'}``.

    Raises
    ------
    ValueError
        If ``optimize_for`` is not one of the supported metric names.

    Notes
    -----
    Recall cannot increase when the threshold is raised, so with the default
    ``optimize_for='recall'`` the winner is the lowest candidate among those
    tied for the top recall. Pass ``'f1'`` or ``'precision'`` for a
    non-trivial trade-off.
    """
    supported_metrics = ('recall', 'precision', 'f1')
    if optimize_for not in supported_metrics:
        raise ValueError(
            f"optimize_for must be one of {supported_metrics}, got {optimize_for!r}"
        )

    if thresholds is None:
        thresholds = np.linspace(0.1, 0.9, 81)

    # Hoisted out of the loop: neither array depends on the threshold.
    is_positive = np.asarray(y_true) == 1
    scores = np.asarray(y_pred_proba)

    best_score = 0
    best_threshold = 0.5
    results = {}

    for threshold in tqdm(thresholds, desc="Finding optimal threshold"):
        flagged = scores >= threshold
        tp = int(np.count_nonzero(flagged & is_positive))
        fp = int(np.count_nonzero(flagged & ~is_positive))
        fn = int(np.count_nonzero(~flagged & is_positive))

        # An undefined ratio (empty denominator) is reported as 0.0.
        metrics = {
            'f1': 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) else 0.0,
            'recall': tp / (tp + fn) if (tp + fn) else 0.0,
            'precision': tp / (tp + fp) if (tp + fp) else 0.0,
        }
        results[threshold] = metrics

        # Strict '>' keeps the earliest threshold when several tie.
        if metrics[optimize_for] > best_score:
            best_score = metrics[optimize_for]
            best_threshold = threshold

    return best_threshold, results

def _base_param_grid():
    """Return the hyper-parameter grid shared by all three penalized variants."""
    return {
        'C': np.logspace(-4, 4, 20),
        'class_weight': ['balanced', None],
        'random_state': [42],
    }


def _optimize_logistic(model, param_grid, label, X_train, X_test, y_train, y_test, cv):
    """
    Grid-search a logistic regression, tune its decision threshold and report.

    Parameters
    ----------
    model : estimator
        Unfitted LogisticRegression configured with the desired penalty/solver.
    param_grid : dict
        Grid handed to GridSearchCV.
    label : str
        Name used in the printed headings (e.g. "Ridge").
    X_train, y_train
        Data used for the cross-validated grid search.
    X_test, y_test
        Data used for threshold selection and the final report.
    cv : int or CV splitter
        Passed through to GridSearchCV.

    Returns
    -------
    (best_estimator, best_threshold)
        The refitted best estimator and the threshold chosen by
        ``find_optimal_threshold``.
    """
    print(f"\nOptimizing {label} Regression...")

    scoring = ['recall', 'precision', 'f1', 'roc_auc']
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=cv,
        scoring=scoring,
        refit='recall',  # the candidate with the best mean CV recall is refitted
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    print(f"\n{label} Regression Results:")
    print(f"Best parameters: {grid_search.best_params_}")
    print("\nBest scores from CV:")
    for metric in scoring:
        print(f"{metric}: {grid_search.cv_results_[f'mean_test_{metric}'][grid_search.best_index_]:.4f}")

    # Threshold tuning.
    # NOTE(review): the threshold is selected on the same test split that is
    # reported as "Final Test Set Performance" below, so those numbers are
    # optimistically biased. Consider tuning on a validation split or on
    # out-of-fold training predictions instead.
    y_pred_proba = grid_search.predict_proba(X_test)[:, 1]
    best_threshold, threshold_results = find_optimal_threshold(y_test, y_pred_proba)

    # Hard predictions at the selected threshold.
    y_pred = (y_pred_proba >= best_threshold).astype(int)

    print(f"\nOptimal Threshold: {best_threshold:.2f}")
    print("\nThreshold optimization results:")
    for threshold, metrics in sorted(threshold_results.items()):
        print(f"\nThreshold {threshold:.2f}:")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"F1: {metrics['f1']:.4f}")

    print("\nFinal Test Set Performance (with optimal threshold):")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

    return grid_search.best_estimator_, best_threshold


def optimize_ridge_logistic(X_train, X_test, y_train, y_test, cv=5):
    """
    Tune an L2-penalized (Ridge) logistic regression.

    Searches ``C`` and ``class_weight`` with the lbfgs solver, then tunes the
    decision threshold. Returns ``(best_estimator, best_threshold)``.
    """
    model = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
    return _optimize_logistic(
        model, _base_param_grid(), "Ridge", X_train, X_test, y_train, y_test, cv
    )


def optimize_lasso_logistic(X_train, X_test, y_train, y_test, cv=5):
    """
    Tune an L1-penalized (Lasso) logistic regression.

    Searches ``C`` and ``class_weight`` with the liblinear solver, then tunes
    the decision threshold. Returns ``(best_estimator, best_threshold)``.
    """
    model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
    return _optimize_logistic(
        model, _base_param_grid(), "Lasso", X_train, X_test, y_train, y_test, cv
    )


def optimize_elasticnet_logistic(X_train, X_test, y_train, y_test, cv=5):
    """
    Tune an Elastic Net logistic regression.

    Searches ``C``, ``l1_ratio`` (0.1 to 0.9) and ``class_weight`` with the
    saga solver, then tunes the decision threshold.
    Returns ``(best_estimator, best_threshold)``.
    """
    param_grid = _base_param_grid()
    param_grid['l1_ratio'] = np.linspace(0.1, 0.9, 9)
    model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1000)
    return _optimize_logistic(
        model, param_grid, "Elastic Net", X_train, X_test, y_train, y_test, cv
    )

In [3]:
def evaluate_model_detailed(model, X_test, y_test, threshold, model_name="Model"):
    """
    Print metrics and draw diagnostic plots for a fitted binary classifier.

    Steps, in order: classification report at the given threshold, ROC-AUC,
    a confusion-matrix heatmap next to the ROC curve, per-class histograms of
    the predicted probability, and a bar chart of the 20 largest absolute
    coefficients.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba`` and ``coef_``.
    X_test : feature table; its ``columns`` attribute supplies feature names.
    y_test : true labels aligned with ``X_test``.
    threshold : probability cut-off for predicting class 1.
    model_name : heading used in the printed report.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance`` (absolute coefficient), sorted
        by importance in descending order.
    """
    # NOTE(review): plt, sns, confusion_matrix and roc_curve are not imported
    # by the notebook's first import cell -- confirm another cell provides them.

    # Positive-class scores and the hard labels they imply at `threshold`.
    positive_scores = model.predict_proba(X_test)[:, 1]
    predicted_labels = (positive_scores >= threshold).astype(int)

    # 1. Headline metrics
    print(f"\n{model_name} Performance Metrics:")
    print("=" * 50)
    print("\nClassification Report:")
    print(classification_report(y_test, predicted_labels))

    # 2. ROC-AUC (threshold-independent, computed on the raw scores)
    auc_value = roc_auc_score(y_test, positive_scores)
    print(f"\nROC-AUC Score: {auc_value:.4f}")

    # 3. Confusion matrix (left panel)
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    sns.heatmap(
        confusion_matrix(y_test, predicted_labels),
        annot=True,
        fmt='d',
        cmap='Blues',
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    # 4. ROC curve (right panel) with the chance diagonal
    plt.subplot(1, 2, 2)
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    plt.plot(false_pos_rate, true_pos_rate, label=f'ROC curve (AUC = {auc_value:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()

    plt.tight_layout()
    plt.show()

    # 5. Distribution of predicted probabilities, one histogram per true class
    plt.figure(figsize=(10, 4))
    for class_value, class_label in ((0, 'Class 0'), (1, 'Class 1')):
        plt.hist(
            positive_scores[y_test == class_value],
            bins=50,
            alpha=0.5,
            label=class_label,
            density=True,
        )
    plt.axvline(x=threshold, color='r', linestyle='--', label=f'Threshold ({threshold:.2f})')
    plt.title('Prediction Probability Distribution')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

    # 6. Feature importance: absolute coefficients, top 20 plotted
    importance_table = pd.DataFrame(
        {
            'Feature': X_test.columns,
            'Importance': abs(model.coef_[0]),
        }
    )
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    plt.figure(figsize=(12, 6))
    sns.barplot(data=feature_importance.head(20), x='Importance', y='Feature')
    plt.title('Top 20 Feature Importance')
    plt.tight_layout()
    plt.show()

    return feature_importance

# 사용 예시:
# feature_importance = evaluate_model_detailed(best_ridge_model, X_test, y_test, 
#                                           best_ridge_threshold, "Ridge Regression")

In [7]:
# Load the modelling table; the first CSV column is used as the row index.
df = pd.read_csv(
    'final_VIF_Delete.csv',
    index_col=0,
)

In [9]:
# Display the column labels of the loaded frame.
df.columns

Index(['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'NAME_TYPE_SUITE', 'NAME_EDUCATION_TYPE',
       'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
       'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START',
       'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_60_CNT_SOCIAL_CIRCLE',
       'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2',
       'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
       'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
       'FLAG_DOCUMENT_9'

In [10]:
# Feature subset used for modelling.
selected_columns = [
    'EXT_SOURCE_1',
    'EXT_SOURCE_3',
    'EXT_SOURCE_2',
    'OWN_CAR_AGE',
    'DAYS_BIRTH',
    'OCCUPATION_TYPE',
    'FLAG_DOCUMENT_3',
]
X = df[selected_columns]

In [11]:
# Target vector: the TARGET column of df.
y = df['TARGET']

In [12]:
# 3. Train/test split: hold out 20% for testing, stratified on y so both
#    parts keep the same class ratio; the seed makes the split reproducible.
#    (train_test_split is imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
# 4. Run model optimization.
# The call fits the Elastic Net variant, so the results are bound to names
# that say so.
best_elasticnet_model, best_elasticnet_threshold = optimize_elasticnet_logistic(
    X_train, X_test, y_train, y_test
)
# Aliases for the previous (misleading) "ridge" names, kept so that code that
# still refers to them, such as the commented usage example of
# evaluate_model_detailed, keeps working.
best_ridge_model, best_ridge_threshold = best_elasticnet_model, best_elasticnet_threshold


Optimizing Elastic Net Regression...
Fitting 5 folds for each of 360 candidates, totalling 1800 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


Elastic Net Regression Results:
Best parameters: {'C': 0.0001, 'class_weight': 'balanced', 'l1_ratio': 0.9, 'random_state': 42}

Best scores from CV:
recall: 0.8147
precision: 0.2669
f1: 0.4021
roc_auc: 0.8388


Finding optimal threshold: 100%|████████████████| 81/81 [00:03<00:00, 21.06it/s]



Optimal Threshold: 0.10

Threshold optimization results:

Threshold 0.10:
Recall: 0.9921
Precision: 0.0843
F1: 0.1554

Threshold 0.11:
Recall: 0.9895
Precision: 0.0849
F1: 0.1563

Threshold 0.12:
Recall: 0.9863
Precision: 0.0854
F1: 0.1572

Threshold 0.13:
Recall: 0.9849
Precision: 0.0860
F1: 0.1582

Threshold 0.14:
Recall: 0.9823
Precision: 0.0866
F1: 0.1591

Threshold 0.15:
Recall: 0.9784
Precision: 0.0870
F1: 0.1598

Threshold 0.16:
Recall: 0.9752
Precision: 0.0875
F1: 0.1606

Threshold 0.17:
Recall: 0.9726
Precision: 0.0880
F1: 0.1614

Threshold 0.18:
Recall: 0.9700
Precision: 0.0886
F1: 0.1624

Threshold 0.19:
Recall: 0.9666
Precision: 0.0894
F1: 0.1636

Threshold 0.20:
Recall: 0.9639
Precision: 0.0904
F1: 0.1653

Threshold 0.21:
Recall: 0.9603
Precision: 0.0917
F1: 0.1675

Threshold 0.22:
Recall: 0.9575
Precision: 0.0934
F1: 0.1701

Threshold 0.23:
Recall: 0.9539
Precision: 0.0952
F1: 0.1732

Threshold 0.24:
Recall: 0.9496
Precision: 0.0973
F1: 0.1765

Threshold 0.25:
Recall: 0.