# library

In [1]:
import numpy as np
import pickle

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

from data.load import Data
from classification.models import Model

In [25]:
class ModelProcess:
    """Preprocess the loan data, then train, evaluate, predict and export.

    Fitted state produced by ``train()`` (the feature scaler and the grade
    labels) is kept on the instance so that ``test()`` can reuse it.

    NOTE(review): the '대출목적' codes and the ``OrdinalEncoder`` in
    ``preprocessing`` are still fitted independently on every frame, so the
    same category may receive different codes in the train and test frames.
    'ID' is also encoded and used as a feature. Both look unintended - confirm.
    """

    def __init__(self):
        """Bind the shared data frames and initialise the fitted state."""
        self._train = Data.train
        self._test = Data.test
        self._submit = Data.submission

        self.result = None

        # Filled in by train()/scaler() and reused by test().
        self._scaler = None
        self._grade_labels = None

    @staticmethod
    def _parse_work_years(raw):
        """Convert free-text employment length into a numeric Series.

        * text containing ``<`` (e.g. ``'<1 year'``, ``'< 1 year'``) -> 0
        * text whose first number is 10 (e.g. ``'10+ years'``)       -> 11
        * text without any digit (e.g. ``'Unknown'``)                -> -1
        * anything else -> the first number found in the text
        """
        text = raw.astype(str)
        years = text.str.extract(r'(\d+)', expand=False).astype(float)
        years = years.fillna(-1)
        years = years.mask(years == 10, 11)
        # "<1 year" contains the digit 1, so it has to be detected on the raw
        # text; otherwise it is indistinguishable from "1 year".
        years = years.mask(text.str.contains('<', regex=False, na=False), 0)
        return years

    @staticmethod
    def _decode_grades(real, labels=None):
        """Map predicted class codes back to grade labels.

        With ``labels`` (ordered so that ``labels[i]`` is the label of code
        ``i``) every code is looked up directly. Without it the legacy mapping
        is used: 0-4 -> 'A'-'E', anything else -> 'F'.
        """
        real = np.asarray(real)
        if labels is not None:
            return np.asarray(labels)[real.astype(int)]
        return np.where(real == 0, 'A',
               np.where(real == 1, 'B',
               np.where(real == 2, 'C',
               np.where(real == 3, 'D',
               np.where(real == 4, 'E', 'F')))))

    def preprocessing(self, df):
        """Engineer features and integer-encode the categorical columns.

        The input frame is left untouched; a processed copy is returned.

        Steps:
          * '대출기간': first three characters parsed as an integer.
          * '상환개월': '총상환원금' divided by the monthly principal
            ('대출금액' / '대출기간'), rounded to an integer.
          * '대출목적': replaced by the index of first appearance in ``df``.
          * '근로기간': parsed by ``_parse_work_years``.
          * every remaining object-dtype column: ``OrdinalEncoder`` fitted on
            ``df`` itself.

        Rows are never dropped here, so the result stays row-aligned with the
        input. The 'ANY' home-ownership filter lives in ``train()``.
        """
        # Work on a copy: callers pass the shared frames held by ``Data`` and
        # in-place edits would make a second call re-process processed data.
        df = df.copy()

        df['대출기간'] = df['대출기간'].astype(str).str[:3].astype(int)
        monthly_principal = df['대출금액'] / df['대출기간']
        df['상환개월'] = (df['총상환원금'] / monthly_principal).round().astype(int)

        purpose_codes = {v: i for i, v in enumerate(df['대출목적'].unique())}
        df['대출목적'] = df['대출목적'].apply(lambda x: purpose_codes.get(x, '기타'))

        df['근로기간'] = self._parse_work_years(df['근로기간'])

        # Encode whatever is still categorical.
        categoric_col = df.select_dtypes(include='object').columns
        if len(categoric_col) > 0:
            ordinal_encoder = OrdinalEncoder()
            df[categoric_col] = ordinal_encoder.fit_transform(df[categoric_col])
        return df

    def scaler(self, df, fit=True):
        """Standardise every column of ``df`` except '대출등급'.

        Args:
            df: frame to scale; it is modified in place and also returned.
            fit: when true (the default) a new ``StandardScaler`` is fitted on
                ``df`` and remembered on the instance; when false the
                remembered scaler is applied without refitting.

        Raises:
            RuntimeError: if ``fit`` is false and no scaler has been fitted.
        """
        feature_cols = df.columns
        if '대출등급' in feature_cols:
            feature_cols = feature_cols.drop('대출등급')

        if fit:
            self._scaler = StandardScaler()
            df[feature_cols] = self._scaler.fit_transform(df[feature_cols])
        else:
            if self._scaler is None:
                raise RuntimeError("scaler(fit=False) requires a previously fitted scaler")
            df[feature_cols] = self._scaler.transform(df[feature_cols])

        return df

    def train(self, _model):
        """Fit ``_model`` on 90% of the training data and print macro-F1 scores.

        Returns:
            The fitted ``_model``.
        """
        print(f"MODE: {str(_model)}")

        # Drop the 'ANY' home-ownership rows. This is done here rather than in
        # preprocessing() so the test frame never loses rows.
        raw = Data.train
        raw = raw[raw["주택소유상태"] != "ANY"].reset_index(drop=True)

        # OrdinalEncoder orders categories by sorted value, so code i maps to
        # the i-th sorted label. Only remember real string labels.
        grades = raw['대출등급'].dropna().unique()
        if all(isinstance(g, str) for g in grades):
            self._grade_labels = sorted(grades)
        else:
            self._grade_labels = None

        self._train = self.preprocessing(raw)

        categoric_col = ["ID", "대출등급"]
        ordinal_encoder = OrdinalEncoder()
        self._train[categoric_col] = ordinal_encoder.fit_transform(self._train[categoric_col])
        self._train = self.scaler(self._train)

        # Stratified 90/10 hold-out split.
        X = self._train.drop(['대출등급'], axis=1)
        Y = self._train['대출등급']
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.1, random_state=42, stratify=Y)

        # Fit, then report hold-out and training scores.
        _model.fit(x_train, y_train)
        y_pred = _model.predict(x_test)
        score = f1_score(y_test, y_pred, average="macro")
        print(f"[Test Score]: {score:.2f}", end=' ')

        y_pred = _model.predict(x_train)
        score = f1_score(y_train, y_pred, average="macro")
        print(f"[Train Score]: {score:.2f}")
        return _model

    def test(self, _model):
        """Predict grades for the test frame and store them in ``self.result``.

        The scaler fitted during ``train()`` is reused when available; if this
        instance has not been trained, a scaler is fitted on the test frame as
        before.

        Returns:
            ``_model`` unchanged.
        """
        self._test = self.preprocessing(Data.test)
        self._test = self.scaler(self._test, fit=self._scaler is None)

        real = _model.predict(self._test)
        self.result = self._decode_grades(real, self._grade_labels)
        return _model

    def submit(self, _model):
        """Write the predictions to ``submit_2.csv`` and pickle ``_model``.

        Raises:
            RuntimeError: if ``test()`` has not produced predictions yet.
        """
        if self.result is None:
            raise RuntimeError("test() must be called before submit()")

        # Save the submission CSV.
        self._submit['대출등급'] = self.result
        self._submit.to_csv("submit_2.csv", index=False)

        # Save the model.
        with open('model.pkl', 'wb') as file:
            pickle.dump(_model, file)

In [26]:
# Pipeline object shared by the cells below.
model_process = ModelProcess()

In [16]:
# Candidate estimators come from the project's Model helper.
# model1 = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=1000, random_state=1)
model_set = Model()
models = model_set.get_model_instances()

# Show which estimators will be trained.
print(models)

[LogisticRegression(random_state=42, solver='liblinear'), GaussianNB(), DecisionTreeClassifier(random_state=42), RandomForestClassifier(random_state=42), SVC(random_state=42), AdaBoostClassifier(random_state=42), GradientBoostingClassifier(random_state=42), XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_s

In [28]:
# Train every candidate in order and keep the fitted estimators.
model_li = [model_process.train(candidate) for candidate in models]

MODE: LogisticRegression(random_state=42, solver='liblinear')
[Score]: 0.27
MODE: GaussianNB()
[Score]: 0.20
MODE: DecisionTreeClassifier(random_state=42)
[Score]: 0.76
MODE: RandomForestClassifier(random_state=42)
[Score]: 0.71
MODE: SVC(random_state=42)


In [None]:
# Predict with the first trained estimator; the pipeline instance is
# `model_process` (no `model` name is defined in the visible cells).
model_process.test(model_li[0])

In [150]:
# Export the predictions and pickle the first trained estimator; the pipeline
# instance is `model_process` (no `model` name is defined in the visible cells).
model_process.submit(model_li[0])

In [None]:
m