# library

In [14]:
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

from data.load import Data
from classification.models import Model

### Utils

In [15]:
def get_max_score_model_instance(model_info_list):
    """Return the model instance from the record with the highest score.

    Args:
        model_info_list: Non-empty sequence of dict-like records. Each record
            is read through ``.get`` for the keys ``"model_score"``
            (treated as 0 when absent), ``"model_instance"`` and
            ``"model_name"``.

    Returns:
        The ``"model_instance"`` value of the best-scoring record. When
        several records share the top score, the earliest one wins.

    Raises:
        IndexError: If ``model_info_list`` is empty.

    Note:
        The summary line is labelled "TEST SCORE", but the value printed is
        simply whatever the caller stored under ``"model_score"``.
    """
    best = model_info_list[0]
    for candidate in model_info_list[1:]:
        # Strict comparison: on a tie the earlier record keeps the lead.
        if candidate.get("model_score", 0) > best.get("model_score", 0):
            best = candidate

    best_name = best.get("model_name")
    best_score = best.get("model_score", 0)
    print(f"MAX SCORE model instance: {best_name} TEST SCORE: {best_score}")
    return best.get("model_instance")

In [16]:
class ModelProcess:
    """Preprocess the loan data, train models, predict grades and write a submission.

    The raw frames come from ``Data`` and are never overwritten by training,
    so ``train`` can be called once per candidate model. Category encodings
    are learned while preprocessing with ``fit=True`` and reused for
    inference with ``fit=False`` so both frames share the same integer codes.
    """

    # Target column; its presence marks a frame as labelled (training) data.
    _TARGET_COLUMN = '대출등급'
    # Columns turned into integer codes with a LabelEncoder.
    _CATEGORICAL_COLUMNS = ('대출기간', '주택소유상태', '대출목적')
    # Loan purpose used as the stand-in for purposes not seen while fitting.
    _FALLBACK_PURPOSE = '기타'

    def __init__(self):
        """Bind the train/test/submission frames from ``Data`` and reset fitted state."""
        self._train = Data.train
        self._test = Data.test
        self._submit = Data.submission

        self.result = None
        # Target encoder fitted by train(); test() needs it to decode predictions.
        self.label_encoder = None
        # Encodings learned by the most recent preprocessing(fit=True) call.
        self._purpose_codes = None
        self._feature_encoders = {}

    def preprocessing(self, df, fit=True):
        """Return a cleaned, numerically encoded copy of ``df``.

        Args:
            df: Frame containing at least the columns 'ID', '근로기간',
                '대출기간', '대출금액', '총상환원금', '주택소유상태' and
                '대출목적'. It is not modified.
            fit: When True (default), learn the category encodings from
                ``df`` and remember them. When False, reuse the encodings
                learned by the previous ``fit=True`` call so that the codes
                match the ones the model was trained on.

        Returns:
            A new frame without 'ID' and '근로기간', with the derived
            '상환개월' column and integer-coded categorical columns.

        Raises:
            RuntimeError: If ``fit`` is False and nothing has been fitted yet.
            ValueError: If a loan purpose is unknown and the fallback purpose
                was not seen while fitting. With ``fit=False``,
                ``LabelEncoder.transform`` also raises ``ValueError`` for
                categories that were not present while fitting.
        """
        if not fit and (
            self._purpose_codes is None
            or any(col not in self._feature_encoders
                   for col in self._CATEGORICAL_COLUMNS)
        ):
            raise RuntimeError(
                "preprocessing(fit=False) requires a previous fit=True call"
            )

        # drop() returns a new frame, so the caller's frame stays untouched.
        df = df.drop(columns=['ID', '근로기간'])

        # Loan term: keep the first three characters and parse them as an integer.
        df['대출기간'] = df['대출기간'].astype(str).str[:3].astype(int)
        # Months repaid = principal repaid / (loan amount / loan term).
        df['월원금상환금'] = df['대출금액'] / df['대출기간']
        df['상환개월'] = df['총상환원금'] / df['월원금상환금']
        df.drop(['월원금상환금'], axis=1, inplace=True)
        df['상환개월'] = df['상환개월'].round().astype(int)

        # Remove the 'ANY' home-ownership rows. The filtered frame has to be
        # assigned back, otherwise the filter has no effect. Only labelled
        # frames are filtered so that an unlabelled (test) frame keeps every
        # row and stays aligned with the submission frame.
        if self._TARGET_COLUMN in df.columns:
            df = df[df["주택소유상태"] != "ANY"].reset_index(drop=True)

        # Loan purpose -> integer code, numbered by order of first appearance
        # in the frame the codes were fitted on.
        if fit:
            self._purpose_codes = {
                purpose: code
                for code, purpose in enumerate(df['대출목적'].unique())
            }
        purpose_codes = self._purpose_codes
        fallback_code = purpose_codes.get(self._FALLBACK_PURPOSE)
        encoded_purpose = df['대출목적'].map(
            lambda purpose: purpose_codes.get(purpose, fallback_code)
        )
        unknown = encoded_purpose.isna()
        if unknown.any():
            raise ValueError(
                "Unknown loan purpose(s) and no fallback category available: "
                f"{list(df.loc[unknown, '대출목적'].unique())}"
            )
        df['대출목적'] = encoded_purpose

        # Integer-code the categorical columns, fitting or reusing one
        # encoder per column.
        for column in self._CATEGORICAL_COLUMNS:
            if fit:
                encoder = LabelEncoder()
                df[column] = encoder.fit_transform(df[column])
                self._feature_encoders[column] = encoder
            else:
                df[column] = self._feature_encoders[column].transform(df[column])
        return df

    def train(self, _model):
        """Fit ``_model`` on the full training frame and score it on that same data.

        Args:
            _model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

        Returns:
            dict with ``"model_name"`` (text of ``str(_model)`` before the
            first "("), ``"model_instance"`` (the fitted ``_model``) and
            ``"model_score"`` (macro F1 on the training data, rounded to two
            decimals).

        NOTE(review): the score is computed on the rows the model was fitted
        on, so it rewards over-fitting; consider a hold-out split before
        using it to rank models.
        """
        print(f"MODE: {str(_model)}")
        # Preprocess into a local frame: overwriting self._train with the
        # processed frame made the next train() call fail when dropping
        # columns that were already gone.
        train_df = self.preprocessing(self._train)

        # Split features and target.
        X = train_df.drop([self._TARGET_COLUMN], axis=1)
        Y = train_df[self._TARGET_COLUMN]

        self.label_encoder = LabelEncoder()
        Y = self.label_encoder.fit_transform(Y)

        # Fit and evaluate.
        _model.fit(X, Y)

        y_pred = _model.predict(X)
        score = f1_score(Y, y_pred, average="macro")
        # End the line here so the next "MODE:" banner starts on its own line.
        print(f"[Train Score]: {score:.2f}")

        model_name = str(_model).split("(")[0]

        return {
            "model_name": model_name,
            "model_instance": _model,
            "model_score": round(score, 2)
        }

    def test(self, _model):
        """Predict grades for ``Data.test`` and store them in ``self.result``.

        Args:
            _model: Fitted estimator exposing ``predict(X)``.

        Returns:
            The same ``_model`` that was passed in.

        Raises:
            RuntimeError: If ``train`` has not been run on this instance yet.
        """
        if self.label_encoder is None:
            raise RuntimeError("test() requires train() to be called first")

        # Encode the test frame with the encodings learned during training.
        self._test = self.preprocessing(Data.test, fit=False)

        # Predict encoded classes, then map them back to the original labels.
        real = _model.predict(self._test)
        self.result = self.label_encoder.inverse_transform(real)
        return _model

    def submit(self, _model):
        """Write the predictions to ``submit_2.csv`` and pickle ``_model`` to ``model.pkl``.

        Args:
            _model: Object to serialise with ``pickle.dump``.

        Both files are written relative to the current working directory and
        the '대출등급' column of the submission frame is set to
        ``self.result``.
        """
        # Save the submission CSV.
        self._submit['대출등급'] = self.result
        self._submit.to_csv("submit_2.csv", index=False)

        # Save the model.
        with open('model.pkl', 'wb') as file:
            pickle.dump(_model, file)

    def auto_test(self):
        """Train every model from ``Model``, then predict and submit with the top scorer."""
        model_set = Model()
        models = model_set.get_model_instances()

        model_info_list = [self.train(model) for model in models]
        model_instance = get_max_score_model_instance(model_info_list)

        self.test(model_instance)
        self.submit(model_instance)
            

In [4]:
# Create the pipeline object; its constructor picks up the train, test and
# submission frames from Data.
model_process = ModelProcess()

### 전체 모델을 대상으로 학습 및 테스트 평가

In [7]:
# Train every model returned by Model().get_model_instances(), keep the one
# with the highest score, then predict with it and write the submission files.
model_process.auto_test()

MODE: LogisticRegression(random_state=42, solver='liblinear')
[Train Score]: 0.23 [Test Score]: 0.22
MODE: GaussianNB()
[Train Score]: 0.20 [Test Score]: 0.20
MODE: DecisionTreeClassifier(random_state=42)
[Train Score]: 1.00 [Test Score]: 0.76
MODE: RandomForestClassifier(random_state=42)
[Train Score]: 1.00 [Test Score]: 0.70
MODE: SVC(random_state=42)
[Train Score]: 0.28 [Test Score]: 0.27
MODE: AdaBoostClassifier(random_state=42)
[Train Score]: 0.35 [Test Score]: 0.36
MODE: GradientBoostingClassifier(random_state=42)
[Train Score]: 0.74 [Test Score]: 0.69
MODE: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
       

### 개별 모델 학습

In [5]:
# model1 = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=1000, random_state=1)
# Fetch the candidate model instances from Model so they can be trained
# individually in the cells below.
model_set = Model()
models = model_set.get_model_instances()

In [23]:
# Train each candidate in turn and collect the info dict returned by train().
model_li = [model_process.train(candidate) for candidate in models]

MODE: LogisticRegression(random_state=42, solver='liblinear')
[Train Score]: 0.23 [Test Score]: 0.22
MODE: GaussianNB()
[Train Score]: 0.20 [Test Score]: 0.20
MODE: DecisionTreeClassifier(random_state=42)
[Train Score]: 1.00 [Test Score]: 0.76
MODE: RandomForestClassifier(random_state=42)
[Train Score]: 1.00 [Test Score]: 0.70
MODE: SVC(random_state=42)


In [6]:
# Train only the second-to-last model in the list (the recorded output below
# shows it as an XGBClassifier). train() returns a dict, so _model holds the
# info record (name, instance, score) rather than the estimator itself.
_model = model_process.train(models[-2])

MODE: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
[Train Score]: 0.91 

In [12]:
# Predict on the test frame with the estimator trained above; the decoded
# predictions are stored on model_process.result.
model_process.test(_model.get("model_instance"))

In [13]:
# Write the predictions to submit_2.csv and pickle the estimator to model.pkl.
model_process.submit(_model.get("model_instance"))