# library

In [6]:
import numpy as np
import pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

from data.load import Data
from src.models.classification.models import Model

### Utils

In [118]:
def get_max_score_model_instance(model_info_list):
    """Pick the model instance whose ``model_score`` is highest.

    Each element of ``model_info_list`` is a dict that may carry the keys
    ``model_name``, ``model_instance`` and ``model_score``; a missing score
    counts as 0.  On equal scores the earliest entry wins.  The winner's name
    and score are printed before its instance is returned.

    Raises:
        IndexError: if ``model_info_list`` is empty.
    """
    # Touch the first element up front so an empty list fails with IndexError.
    model_info_list[0]

    # max() keeps the first of several equal maxima, matching a strict ">" scan.
    winner = max(model_info_list, key=lambda info: info.get("model_score", 0))

    max_score = winner.get("model_score", 0)
    max_score_model_name = winner.get("model_name")
    print(f"MAX SCORE model instance: {max_score_model_name} TEST SCORE: {max_score}")
    return winner.get("model_instance")

In [179]:
class ModelProcess:
    """Preprocess the loan data, train/evaluate a classifier and write a submission.

    The income bin edges and the categorical levels learned while
    preprocessing in fit mode are stored on the instance, so that the
    inference frame is encoded with the same mapping as the training frame.
    """

    # Columns removed before any feature engineering.
    _DROPPED_COLUMNS = ['ID', '근로기간', '연체계좌수']
    # Columns converted to integer codes.
    _CATEGORICAL_FEATURES = ['대출목적', '주택소유상태']

    def __init__(self):
        """Read the train / test / submission frames from ``Data``."""
        self._train = Data.train
        self._test = Data.test
        self._submit = Data.submission

        self.result = None
        # Target encoder; set by train() and needed by test().
        self.label_encoder = None
        # Preprocessing state learned in fit mode and reused at inference.
        self._income_bins = None
        self._category_levels = {}

    def preprocessing(self, df, fit=True):
        """Return an engineered copy of ``df``; the input frame is not modified.

        Args:
            df: raw frame containing the columns used below.
            fit: when True (default) the income decile edges and the
                categorical levels are learned from ``df`` and stored on the
                instance.  When False the stored values are reused so that
                ``df`` is encoded exactly like the frame they were learned
                from; if nothing has been stored yet they are learned from
                ``df`` instead.

        Returns:
            A new DataFrame with the engineered features.
        """
        # drop() returns a new frame, so every edit below stays local.
        df = df.drop(columns=self._DROPPED_COLUMNS)

        # Delinquency: turn the 2-year count into a flag (any non-zero -> 1.0).
        # The column is re-added under its new name, which moves it to the end.
        df.loc[df['최근_2년간_연체_횟수'] != 0.0, '최근_2년간_연체_횟수'] = 1.0
        df['최근_2년간_연체'] = df.pop('최근_2년간_연체_횟수')

        # Debt: the loan term is parsed from the first three characters.
        df['대출기간'] = df['대출기간'].astype(str).str[:3].astype(int)
        # Monthly principal is only an intermediate value, never a feature.
        monthly_principal = (df['대출금액'] / df['대출기간']).round()
        # Whole number of monthly principal payments already made.
        df['상환개월'] = np.trunc(df['총상환원금'] / monthly_principal)

        # Annual income -> decile index.
        if fit or self._income_bins is None:
            binned, edges = pd.qcut(df['연간소득'], q=10, labels=False, retbins=True)
            edges = np.array(edges, dtype=float)
            # Open the outer bins so values outside the fitted range still
            # land in the first / last decile instead of becoming NaN.
            edges[0], edges[-1] = -np.inf, np.inf
            self._income_bins = edges
            df['연간소득'] = binned
        else:
            df['연간소득'] = pd.cut(df['연간소득'], bins=self._income_bins, labels=False)

        # Remaining balance = loan amount - (principal repaid + interest repaid).
        df['남은대출금액'] = df['대출금액'] - (df['총상환원금'] + df['총상환이자'])

        # Categorical -> integer codes over the sorted levels (the same order
        # LabelEncoder assigns).  Missing or unseen values are encoded as -1.
        for col in self._CATEGORICAL_FEATURES:
            levels = None if fit else self._category_levels.get(col)
            if levels is None:
                levels = sorted(df[col].dropna().unique())
                self._category_levels[col] = levels
            df[col] = pd.Categorical(df[col], categories=levels).codes.astype(int)
        return df

    def train(self, _model):
        """Fit ``_model`` on a 90/10 stratified split and report macro-F1.

        The raw training frame is left untouched, so this method can be called
        repeatedly (for example once per candidate model).

        Returns:
            dict with ``model_name``, ``model_instance`` (the fitted model)
            and ``model_score`` (hold-out macro-F1 rounded to 2 decimals).
        """
        print(f"MODE: {str(_model)}")
        # Preprocess into a local frame: overwriting self._train would make
        # the next call fail when it tries to drop already-removed columns.
        frame = self.preprocessing(self._train, fit=True)

        # Split features / target.
        X = frame.drop(['대출등급'], axis=1)
        Y = frame['대출등급']

        self.label_encoder = LabelEncoder()
        Y = self.label_encoder.fit_transform(Y)

        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.1, random_state=42, stratify=Y
        )

        # Fit and evaluate.
        _model.fit(x_train, y_train)

        train_score = f1_score(y_train, _model.predict(x_train), average="macro")
        print(f"[Train Score]: {train_score:.2f}", end = " ")

        score = f1_score(y_test, _model.predict(x_test), average="macro")
        print(f"[Test Score]: {score:.5f}")

        model_name = str(_model).split("(")[0]
        return {
            "model_name": model_name,
            "model_instance": _model,
            "model_score": round(score, 2)
        }

    def test(self, _model):
        """Predict on ``Data.test`` and store the decoded labels in ``self.result``.

        Raises:
            RuntimeError: if train() has not been called on this instance.

        Returns:
            The ``_model`` that was passed in.
        """
        if self.label_encoder is None:
            raise RuntimeError("train() must be called before test()")

        # Encode the test frame with the state learned from the training frame.
        self._test = self.preprocessing(Data.test, fit=False)

        real = _model.predict(self._test)
        # Map the integer predictions back to the original grade labels.
        self.result = self.label_encoder.inverse_transform(real)
        return _model

    def submit(self, _model):
        """Write the predictions to ``submit_3.csv`` and pickle ``_model``.

        Raises:
            RuntimeError: if test() has not produced any predictions yet.
        """
        if self.result is None:
            raise RuntimeError("test() must be called before submit()")

        # Save the submission CSV.
        self._submit['대출등급'] = self.result
        self._submit.to_csv("submit_3.csv",index = False)

        # Save the model.
        with open('model.pkl', 'wb') as file:
            pickle.dump(_model, file)

    def auto_test(self):
        """Train every model from ``Model``, then test and submit the best one."""
        model_set = Model()
        models = model_set.get_model_instances()

        model_info_list = [self.train(candidate) for candidate in models]
        model_instance = get_max_score_model_instance(model_info_list)

        self.test(model_instance)
        self.submit(model_instance)
            

In [180]:
# Shared pipeline object used by every cell below; constructing it reads the
# train / test / submission frames from Data.
model_process = ModelProcess()

### 전체 모델을 대상으로 학습 및 테스트 평가

In [181]:
# model_process.auto_test()

### 개별 모델 학습

In [182]:
# model1 = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=1000, random_state=1)
# Build the candidate model instances provided by the project's Model class.
model_set = Model()
models = model_set.get_model_instances()

필요한 모듈이 설치되지 않았습니다: No module named 'catboost'
필요한 모듈이 설치되지 않았습니다: No module named 'lightgbm'


In [183]:
# model_li = list(map(model_process.train, models))

In [184]:
# Load the pickled object that is later unpacked into set_params(**data).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('./data/best_params.pkl', 'rb') as f:
	data = pickle.load(f)

In [185]:
# data

In [186]:
# Apply the loaded parameters to the fourth-from-last candidate (the cell
# output below shows an XGBClassifier) and train it.  `_model` is the dict
# returned by train(), not the estimator itself.
_model = model_process.train(models[-4].set_params(**data))

MODE: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8734150979529018, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.017600210882064204,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.0947263260073529,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=373, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
[Train Score]: 1.00 [Test Score]: 0.81597


In [116]:
# Run inference with the fitted estimator stored under "model_instance";
# the decoded predictions are kept in model_process.result.
model_process.test(_model.get("model_instance"))

In [117]:
# Write submit_3.csv from the stored predictions and pickle the estimator
# to model.pkl.
model_process.submit(_model.get("model_instance"))