# library

In [99]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

In [115]:
from sklearn.neural_network import MLPClassifier

In [127]:
class Model:
    def __init__(self):
        """ 생성자 """
        self._train = pd.read_csv('./data/train_label1.csv')
        self._test = pd.read_csv('./data/test.csv')
        self._submit = pd.read_csv('./data/sample_submission.csv')

        self.result = None
    
    def preprocessing(self, df):
        """ 전처리기 """
        #sb
        train_object = df['대출목적'].unique()
        oject_dict = dict()
        for i, v in enumerate(df['대출목적'].unique()):
            oject_dict[v] = i
        df['대출목적'] = df['대출목적'].apply(lambda x: oject_dict.get(x, '기타'))
        # 근로연수 컬럼의 데이터를 문자열로 변환
        df['근로기간'] = df['근로기간'].astype(str)
        # 숫자만 추출
        df['근로기간'] = df['근로기간'].str.extract('(\d+)').astype(float)
        # 'Unknown'은 NaN으로 변환되므로, 이를 -1로 대체
        df['근로기간'].fillna(-1, inplace=True)
        # '10+'와 같은 표현은 10으로 처리되므로, 이를 10 이상의 값으로 대체
        df.loc[df['근로기간'] == 10, '근로기간'] = 11
        # '<1 year'와 같은 표현은 NaN으로 처리되므로, 이를 0으로 대체
        df.loc[df['근로기간'].isna(), '근로기간'] = 0
#label
        categoric_col = df.select_dtypes(include='object').columns # 범주형
        numeric_col = df.select_dtypes(include='int64').columns # 수치형
        
        # 범주형 수치형으로 변환
        ordinal_encoder = OrdinalEncoder()
        df[categoric_col] = ordinal_encoder.fit_transform(df[categoric_col])
        return df
        
    def scaler(self,df):
        # numeric_col = df.select_dtypes(include='int64').columns
        df_col = df.columns
        if "대출등급" in df_col:
            df_col = df_col[:-1]
        scaler = StandardScaler()
        df[df_col] = scaler.fit_transform(df[df_col])
        return df
        
    def train(self, _model):
        """ model 훈련 """
        # train 전처리
        # self._train = self.preprocessing(self._train) 
        categoric_col = ["ID","대출등급"]
        ordinal_encoder = OrdinalEncoder()
        self._train[categoric_col] = ordinal_encoder.fit_transform(self._train[categoric_col])
        self._train = self.scaler(self._train) #scaler
        # train 분리
        X = self._train.drop(['대출등급'],axis = 1)
        Y = self._train['대출등급']
        x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.2 ,random_state=42,stratify=Y)
        
        # model 훈련 및 평가
        _model.fit(x_train,y_train)
        y_pred = _model.predict(x_test)
        score = f1_score(y_test,y_pred, average="macro")
        print(f"[Score]: {score:.2f}")
        
    def test(self):
        """ model 추론 """
        # test 전처리
        self._test = self.preprocessing(self._test) 
        self._test = self.scaler(self._test) #scaler
        
        # model 예측
        real = self.model.predict(self._test)
        self.result = np.where(real == 0, 'A', 
                      np.where(real == 1, 'B',
                      np.where(real == 2, 'C',
                      np.where(real == 3, 'D',
                      np.where(real == 4, 'E', 'F')))))
        
    def submit(self):
        """ 제출 """
        # csv 저장
        self._submit['대출등급'] = self.result
        self._submit.to_csv("submit_2.csv",index = False)
        
        # 모델 저장
        with open('model.pkl', 'wb') as file:
            pickle.dump(self.model, file)
        
        

In [128]:
model = Model()

In [118]:
model.train()

[Score]: 0.77


In [103]:
model.test()

In [104]:
model.submit()

In [124]:
model1 = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=1000, random_state=1)
model2 = XGBClassifier()

models = [model1, model2]

In [126]:
print(models)

[MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=1), XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)]


In [129]:
print(list(map(model.train, models)))

[Score]: 0.77
[Score]: 0.77
[None, None]
