# library

In [12]:
import numpy as np
import pickle

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

from data.load import Data
from classification.models import Model

In [14]:
class ModelProcess:
    def __init__(self):
        """ 생성자 """
        self._train = Data.train
        self._test = Data.test
        self._submit = Data.submission

        self.result = None
    
    def preprocessing(self, df):
        """ 전처리기 """
        #JW
        df['대출기간'] = df['대출기간'].astype(str).str[:3].astype(int)
        df['월원금상환금'] = df['대출금액']/df['대출기간']
        df['상환개월'] = df['총상환원금']/df['월원금상환금']

        df.drop(['월원금상환금'],axis=1, inplace=True)
        df['상환개월'] = df['상환개월'].round().astype(int)


        # ------------------------------------------------------------#
        #sb
        # 'ANY' 컬럼 지우기
        df[df["주택소유상태"] != "ANY"].reset_index(drop = True,inplace = True)
        train_object = df['대출목적'].unique()
        oject_dict = dict()
        for i, v in enumerate(df['대출목적'].unique()):
            oject_dict[v] = i
        df['대출목적'] = df['대출목적'].apply(lambda x: oject_dict.get(x, '기타'))
        # 근로연수 컬럼의 데이터를 문자열로 변환
        df['근로기간'] = df['근로기간'].astype(str)
        # 숫자만 추출
        df['근로기간'] = df['근로기간'].str.extract('(\d+)').astype(float)
        # 'Unknown'은 NaN으로 변환되므로, 이를 -1로 대체
        df['근로기간'].fillna(-1, inplace=True)
        # '10+'와 같은 표현은 10으로 처리되므로, 이를 10 이상의 값으로 대체
        df.loc[df['근로기간'] == 10, '근로기간'] = 11
        # '<1 year'와 같은 표현은 NaN으로 처리되므로, 이를 0으로 대체
        df.loc[df['근로기간'].isna(), '근로기간'] = 0
#label
        categoric_col = df.select_dtypes(include='object').columns # 범주형
        numeric_col = df.select_dtypes(include='int64').columns # 수치형
        
        # 범주형 수치형으로 변환
        ordinal_encoder = OrdinalEncoder()
        df[categoric_col] = ordinal_encoder.fit_transform(df[categoric_col])
        return df
        
    def scaler(self,df):
        # numeric_col = df.select_dtypes(include='int64').columns
        df_col = df.columns
        scaler = MinMaxScaler()
        if '대출등급' in df_col:
            df_col=df_col.drop('대출등급')

        # if "대출등급" in df_col:
        #     df_cols = df_col[:-2]
            # df[df_cols] = scaler.fit_transform(df[df_cols])
        #     df[df_col[-1]] = scaler.fit_transform(df[df_col[-1]])
        # else:
        df[df_col] = scaler.fit_transform(df[df_col])
        
        return df
        
    def train(self, _model):
        """ model 훈련 """
        # train 전처리
        print(f"MODE: {str(_model)}")
        self._train = self.preprocessing(Data.train)
        
        categoric_col = ["ID","대출등급"]
        ordinal_encoder = OrdinalEncoder()
        self._train[categoric_col] = ordinal_encoder.fit_transform(self._train[categoric_col])
        self._train = self.scaler(self._train) #scaler
        # train 분리
        X = self._train.drop(['대출등급'],axis = 1)
        Y = self._train['대출등급']
        x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.1 ,random_state=42,stratify=Y)
        
        # model 훈련 및 평가
        _model.fit(x_train,y_train)
        y_pred = _model.predict(x_test)
        score = f1_score(y_test,y_pred, average="macro")
        print(f"[Test Score]: {score:.2f}", end=' ')

        y_pred = _model.predict(x_train)
        score = f1_score(y_train,y_pred, average="macro")
        print(f"[Train Score]: {score:.2f}")
        return _model
        
    def test(self, _model):
        """ model 추론 """
        # test 전처리
        self._test = self.preprocessing(Data.test) 
        self._test = self.scaler(self._test) #scaler
        
        # model 예측
        real = _model.predict(self._test)
        self.result = np.where(real == 0, 'A', 
                      np.where(real == 1, 'B',
                      np.where(real == 2, 'C',
                      np.where(real == 3, 'D',
                      np.where(real == 4, 'E', 'F')))))
        return _model
        
    def submit(self, _model):
        """ 제출 """
        # csv 저장
        self._submit['대출등급'] = self.result
        self._submit.to_csv("submit_2.csv",index = False)
        
        # 모델 저장
        # model_name = str(_model).split("(")[0]
        with open('model.pkl', 'wb') as file:
            pickle.dump(_model, file)

In [15]:
model_process = ModelProcess()

In [4]:
# model1 = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=1000, random_state=1)
model_set = Model()
models = model_set.get_model_instances()

print(models)

[LogisticRegression(random_state=42, solver='liblinear'), GaussianNB(), DecisionTreeClassifier(random_state=42), RandomForestClassifier(random_state=42), AdaBoostClassifier(random_state=42), GradientBoostingClassifier(random_state=42), XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...), <catboo

In [18]:
model_li = list(map(model_process.train, models[2:5]))

MODE: DecisionTreeClassifier(random_state=42)
[Test Score]: 0.76 [Train Score]: 1.00
MODE: RandomForestClassifier(random_state=42)
[Test Score]: 0.70 [Train Score]: 1.00
MODE: AdaBoostClassifier(random_state=42)
[Test Score]: 0.36 [Train Score]: 0.35


In [7]:
model_process.test(model_li[-1])

MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42)

In [8]:
model_process.submit(model_li[-1])

In [9]:
model_process._test

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,상환개월
0,-1.732024,-0.144071,-0.696452,0.469507,-0.959813,0.336026,0.005108,-1.106004,-1.059225,-0.380443,-0.433649,-0.639095,-0.036031,-0.068717,-0.594944
1,-1.731970,-0.952864,-0.696452,-0.235451,1.174519,-0.040218,-0.089008,-0.024566,-0.407456,-0.380443,-0.833132,-0.966866,-0.036031,-0.068717,-1.167236
2,-1.731916,-0.097854,-0.696452,-0.000465,1.174519,0.497163,-0.273031,-0.440504,0.244313,-0.380443,0.975537,-0.336786,-0.036031,-0.068717,1.121932
3,-1.731862,-0.375155,-0.696452,-0.235451,-0.959813,-0.254809,-0.141515,0.391372,0.244313,0.729102,-0.155988,-0.337000,-0.036031,-0.068717,-0.022652
4,-1.731808,0.895807,-0.696452,-0.235451,1.174519,-0.351491,0.274083,-1.106004,0.244313,-0.380443,0.432092,0.407774,-0.036031,-0.068717,-0.022652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,1.731808,1.126890,-0.696452,-0.705422,-0.959813,-0.147385,0.065541,0.141809,-0.407456,1.838647,0.490270,0.739865,-0.036031,-0.068717,-0.022652
64193,1.731862,1.126890,1.435850,1.174464,-0.959813,0.131919,-0.182629,0.058621,-0.407456,-0.380443,0.139139,1.817212,-0.036031,-0.068717,-0.022652
64194,1.731916,-1.172394,-0.696452,1.174464,1.174519,-0.491143,0.231979,0.640934,-0.407456,-0.380443,-0.700016,-0.786038,-0.036031,-0.068717,-0.594944
64195,1.731970,-0.652455,-0.696452,1.174464,-0.959813,-0.254809,0.148760,1.306434,-0.407456,0.729102,0.522662,0.378772,-0.036031,-0.068717,1.121932


In [10]:
import pandas as pd

model_process._test.to_csv("test.csv",index = False)
model_process._train.to_csv("train.csv",index = False)

In [11]:
model_process._train

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급,상환개월
0,-1.732033,-0.563848,-0.702436,-0.003991,1.180537,-0.220218,-0.014287,-0.852449,-0.755827,-0.376102,-0.800303,-0.972784,-0.038438,-0.072595,2.0,-1.037938
1,-1.731997,-0.377964,1.423617,1.172107,-0.953772,0.370332,0.087890,-0.356109,-0.421718,-0.376102,-0.436814,-0.441082,-0.038438,-0.072595,1.0,-0.024393
2,-1.731961,-0.610319,-0.702436,-0.239211,-0.953772,0.020823,-0.321114,-0.935172,-0.755827,-0.376102,0.103276,-0.627621,-0.038438,-0.072595,0.0,0.482380
3,-1.731925,-0.377964,-0.702436,0.466448,-0.953772,0.382384,-0.127783,-0.852449,-0.755827,-0.376102,-0.483274,-0.624977,-0.038438,-0.072595,2.0,-0.531165
4,-1.731889,-0.029431,1.423617,-1.650530,1.180537,-0.222870,0.179044,-0.521556,-0.087609,-0.376102,-0.577932,-0.634409,-0.038438,-0.072595,1.0,-0.531165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96289,1.731889,-0.377964,-0.702436,1.172107,-0.953772,1.165767,-0.299368,0.636570,1.248826,-0.376102,0.147972,0.145249,-0.038438,-0.072595,2.0,-0.024393
96290,1.731925,1.016166,1.423617,1.172107,-0.953772,0.382384,-0.423588,-0.025216,-0.421718,-0.376102,-0.232331,0.969667,-0.038438,-0.072595,4.0,-0.531165
96291,1.731961,-0.377964,-0.702436,-1.180090,-0.953772,-0.099698,-0.242471,-0.273386,1.248826,-0.376102,0.648632,-0.424781,-0.038438,-0.072595,0.0,0.989152
96292,1.731997,-0.261786,-0.702436,-0.239211,-0.953772,-0.277164,-0.061949,-0.356109,-0.755827,1.799906,0.540861,0.885597,-0.038438,-0.072595,3.0,0.482380
