In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# 데이터 로드
train_data = pd.read_excel('train_data.xlsx')
train_data = train_data.drop(['SP'], axis=1)
train_data = pd.get_dummies(train_data, columns=['AD_NO', 'LIV_ADD', 'RES_ADD', 'ADD_YN'])

In [3]:
# 입력 (X) / 출력 (Y) 변수 분리
X = train_data.drop('결과값(연체회차)', axis=1)
y = train_data['결과값(연체회차)']

In [4]:
# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
features = X.columns.tolist()

In [5]:
# 특성 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Logistic Regression

In [6]:
# LogisticRegression 모델 학습
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

In [7]:
# 훈련,테스트 데이터에서의 예측
train_pred = lr.predict(X_train)
test_pred = lr.predict(X_test)

In [8]:
# 정확도 계산
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)
print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 0.9771
Test Accuracy: 0.9782


In [9]:
# 피처 중요도 확인
lr_coef = lr.coef_[0]

In [10]:
# 피처 중요도를 데이터프레임으로 변환
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': lr_coef})

In [11]:
# 중요도 순으로 정렬
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [12]:
# 로지스틱 회귀 피처 중요도 확인 및 저장
total_importance = sum(abs(lr_coef))
lr_feature_importance_percentages = (abs(lr_coef) / total_importance) * 100
lr_feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': lr_feature_importance_percentages})
lr_feature_importance_df = lr_feature_importance_df.sort_values(by='Importance', ascending=False)
lr_feature_importance_df.to_excel('Overdue_LogisticRegression_feature_importance.xlsx', index=False)

### Test_data.xlsx CB예측

In [13]:
# test.xlsx 파일 로드
test_data = pd.read_excel('test_data.xlsx')

In [14]:
# test_data에 동일한 전처리 수행
test_data = pd.get_dummies(test_data, columns=['AD_NO', 'LIV_ADD', 'RES_ADD', 'ADD_YN'])

In [15]:
# 훈련 데이터와 동일한 특성을 가지도록 조정
missing_cols = set(X.columns) - set(test_data.columns)
for c in missing_cols:
    test_data[c] = 0
test_data = test_data[X.columns]

In [16]:
# 스케일링
test_data_scaled = scaler.transform(test_data)

In [17]:
# 훈련된 모델을 사용하여 예측 수행
test_pred = lr.predict(test_data_scaled)

In [18]:
# 예측 결과를 데이터프레임으로 변환
test_pred_df = pd.DataFrame(test_pred, columns=['연체 예측'])

In [19]:
# 결과를 엑셀 파일로 저장
test_pred_df.to_excel('test_LogisticRegression_Overdue.xlsx', index=False)

In [21]:
import joblib
joblib.dump(lr, 'Overdue_LogisticRegression_model.pkl')
joblib.dump(scaler, 'Overdue_LogisticRegression_scaler.pkl')

['Overdue_LogisticRegression_scaler.pkl']