In [1]:
import pandas as pd
import numpy as np

# Read the training, test and sample-submission CSVs from the working
# directory into train_df, test_df and sub respectively.
train_df, test_df, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Show the training frame as this cell's output.
train_df

In [None]:
# Show the test frame as this cell's output.
test_df

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
train_df.info()
test_df.info()

In [None]:
# Print the number of missing values per column, first for the training
# frame and then for the test frame.
for frame in (train_df, test_df):
    print(frame.isna().sum())

In [None]:
# Collect the labels of every numeric-dtype column of the training frame and
# show them as the cell output.
num_cols = train_df.select_dtypes(include='number').columns
num_cols

In [None]:
# Collect the labels of every object-dtype column of the training frame and
# show them as the cell output.
cat_cols = train_df.select_dtypes(include='object').columns
cat_cols

In [2]:
# Numeric feature columns, listed explicitly. Assigning here replaces any
# value num_cols held before this cell ran.
num_cols = [
    '대출금액',
    '연간소득',
    '부채_대비_소득_비율',
    '총계좌수',
    '최근_2년간_연체_횟수',
    '총상환원금',
    '총상환이자',
    '총연체금액',
    '연체계좌수',
]

In [3]:
# Categorical feature columns, listed explicitly. Assigning here replaces any
# value cat_cols held before this cell ran.
cat_cols = [
    '대출기간',
    '근로기간',
    '주택소유상태',
    '대출목적',
]

## 등급 라벨은 정렬한 뒤에 번호를 매기는 게 좋습니다

In [4]:
# Label -> integer code. The unique grade labels are sorted before being
# numbered, so the codes follow sorted label order.
lton = {
    grade: code
    for code, grade in enumerate(sorted(train_df['대출등급'].unique()))
}
lton

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}

## 순서가 바뀔 위험성이 있으므로 lton을 이용해서 바꾸는 게 좋습니다


In [5]:
# Integer code -> label: the inverse of lton, built by pairing each of its
# values with the key it belongs to.
ntol = dict(zip(lton.values(), lton.keys()))
ntol

{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G'}

In [None]:
# Disabled alternative, kept for reference only: it numbers the grades in the
# order unique() returns them instead of inverting lton.
# ntol = {idx:val for idx,val in enumerate(train_df['대출등급'].unique())}
# ntol

In [None]:
# Replace every grade label with its integer code from lton. A label that is
# not a key of lton raises KeyError, so this cell cannot be run twice on the
# same frame.
train_df['대출등급'] = [lton[grade] for grade in train_df['대출등급']]

In [None]:
# Apply log(1 + x) to the amount columns of the training frame. np.log1p is
# the dedicated routine for this expression and stays accurate for x near 0.
# NOTE: the columns are overwritten in place, so running this cell again
# applies the transform a second time.
for amount_col in ('대출금액', '연간소득', '총상환원금', '총상환이자'):
    train_df[amount_col] = np.log1p(train_df[amount_col])

In [None]:
# Apply log(1 + x) to the amount columns of the test frame. np.log1p is the
# dedicated routine for this expression and stays accurate for x near 0.
# NOTE: the columns are overwritten in place, so running this cell again
# applies the transform a second time.
for amount_col in ('대출금액', '연간소득', '총상환원금', '총상환이자'):
    test_df[amount_col] = np.log1p(test_df[amount_col])

In [None]:
from sklearn.metrics import f1_score

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a validation set. The split is stratified on the
# grade column and uses a fixed random_state so it is repeatable.
train, valid = train_test_split(
    train_df,
    test_size=0.3,
    stratify=train_df['대출등급'],
    random_state=1000,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the numeric columns. The scaler is fitted on the training split
# only; the validation and test rows are transformed with those statistics.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[num_cols])
valid_scaled = scaler.transform(valid[num_cols])
test_scaled = scaler.transform(test_df[num_cols])

# One shape per line, in the order train / test / valid.
print(train_scaled.shape, test_scaled.shape, valid_scaled.shape, sep='\n')

# Wrap the scaled arrays in frames labelled with the numeric column names.
train_scaled_df = pd.DataFrame(train_scaled, columns=train[num_cols].columns)
valid_scaled_df = pd.DataFrame(valid_scaled, columns=train[num_cols].columns)
test_scaled_df = pd.DataFrame(test_scaled, columns=test_df[num_cols].columns)
train_scaled_df

In [None]:
# Show the scaled test features as this cell's output.
test_scaled_df

In [None]:
# Show the scaled validation features as this cell's output.
valid_scaled_df

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical columns as dense arrays. Categories unseen
# during fit are ignored (encoded as all zeros) rather than raising.
# scikit-learn renamed the `sparse` argument to `sparse_output` (added in 1.2,
# old name removed in 1.4), so try the new spelling first and fall back to the
# old one on releases that do not know it.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit on the training split only, then encode all three splits.
encoder.fit(train[cat_cols])
train_encoded = encoder.transform(train[cat_cols])
test_encoded = encoder.transform(test_df[cat_cols])
valid_encoded = encoder.transform(valid[cat_cols])

# One shape per line, in the order train / test / valid.
print(train_encoded.shape)
print(test_encoded.shape)
print(valid_encoded.shape)

In [None]:
# Wrap the one-hot arrays in frames whose columns carry the encoder's
# generated feature names, then show the training one.
encoded_names = encoder.get_feature_names_out()
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_names)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_names)
valid_encoded_df = pd.DataFrame(valid_encoded, columns=encoded_names)
train_encoded_df

In [None]:
# reset_index() returns a new frame and leaves its receiver untouched, so the
# result has to be assigned back for the reset to take effect. Each pair of
# frames then shares the same 0..n-1 row index.
train_encoded_df = train_encoded_df.reset_index(drop=True)
train_scaled_df = train_scaled_df.reset_index(drop=True)
test_encoded_df = test_encoded_df.reset_index(drop=True)
test_scaled_df = test_scaled_df.reset_index(drop=True)
valid_encoded_df = valid_encoded_df.reset_index(drop=True)
valid_scaled_df = valid_scaled_df.reset_index(drop=True)
valid_scaled_df

In [None]:
# Build the model inputs by placing the one-hot columns first and the scaled
# numeric columns after them, side by side, as plain NumPy arrays.
x_train = np.hstack((train_encoded_df, train_scaled_df))
x_test = np.hstack((test_encoded_df, test_scaled_df))
x_valid = np.hstack((valid_encoded_df, valid_scaled_df))

# One shape per line, in the order train / test / valid.
print(x_train.shape, x_test.shape, x_valid.shape, sep='\n')

In [None]:
# Targets as column vectors of shape (n_rows, 1), copied out of the frames.
y_train = train['대출등급'].to_numpy(copy=True).reshape(-1, 1)
y_valid = valid['대출등급'].to_numpy(copy=True).reshape(-1, 1)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.models import Sequential

## input_shape 은 첫번째 레이어에서만 선언해주시면 됩니다

## 입력 레이어의 모양이 42가 아니라 43입니다. 

In [None]:
num_classes = len(ntol)
# Take the input width from the feature matrix instead of hard-coding it, so
# the model keeps matching x_train when the encoded column count changes.
n_features = x_train.shape[1]

# Three ReLU hidden layers, each followed by 50% dropout, and a softmax output
# with one unit per class. input_shape is only declared on the first layer.
model = Sequential([
    Dense(1024, activation='relu', input_shape=(n_features,)),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

## y_pred의 모양이 (None, 7)이므로 argmax로 (None,) 모양의 클래스 번호로 바꿔줘야 합니다.

In [None]:
def macro_f1_score(y_true, y_pred):
    """Return the unweighted mean of the per-class F1 scores.

    y_pred holds one score per class in each row and is reduced to a class
    index with argmax along axis 1. For every class index below the global
    ``num_classes`` a one-vs-rest F1 is computed; a class with no true and no
    predicted samples contributes 0 (``zero_division=0``).
    """
    predicted = np.argmax(y_pred, 1)
    per_class = [
        f1_score(y_true == label, predicted == label, zero_division=0)
        for label in range(num_classes)
    ]
    return np.mean(per_class)
        
def custom_metric(y_true,y_pred):
    """Keras metric that evaluates macro_f1_score through tf.py_function.

    The Python-level scorer is wrapped so it can be called on the tensors
    Keras passes in; the result is returned as a float32 tensor.
    """
    return tf.py_function(func=macro_f1_score, inp=[y_true, y_pred], Tout=tf.float32)

In [None]:
# Adam with learning rate 0.001; sparse categorical cross-entropy on
# probabilities (the output layer already applies softmax, hence
# from_logits=False); accuracy plus the custom macro-F1 as tracked metrics.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
tracked_metrics = [tf.keras.metrics.SparseCategoricalAccuracy(), custom_metric]

model.compile(optimizer=optimizer, loss=loss_fn, metrics=tracked_metrics)

In [None]:
# Train for 10 epochs in mini-batches of 128, evaluating on the validation
# split after every epoch; the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_valid, y_valid),
)

## predict할 때는 y값을 넣지 않습니다

In [None]:
# Class scores for the validation rows; only the features are passed in.
y_pred_val = model.predict(x=x_valid)

In [None]:
def macro_f1_score(y_true, y_pred, num_classes=None):
    """Return the unweighted mean of the per-class F1 scores.

    Parameters
    ----------
    y_true : array-like of class indices, shape (n,) or (n, 1).
        Flattened before scoring so a column vector lines up with the 1-D
        predictions.
    y_pred : array-like of shape (n, n_classes) with one score per class;
        reduced to a class index with argmax along axis 1.
    num_classes : int, optional
        Number of classes to average over. Defaults to the number of columns
        of ``y_pred``. It is optional because this definition replaces an
        earlier two-argument ``macro_f1_score`` that ``custom_metric`` calls
        with only ``(y_true, y_pred)``.

    Returns
    -------
    The mean of the one-vs-rest F1 scores over classes
    ``0 .. num_classes - 1``. A class with no true and no predicted samples
    contributes 0 (``zero_division=0``), which also silences the
    undefined-metric warning.
    """
    scores = np.asarray(y_pred)
    if num_classes is None:
        num_classes = scores.shape[1]
    predicted = np.argmax(scores, 1)
    actual = np.asarray(y_true).reshape(-1)
    f1_scores = [
        f1_score(actual == label, predicted == label, zero_division=0)
        for label in range(num_classes)
    ]
    return np.mean(f1_scores)

In [None]:
# Score the validation predictions against the validation labels. The
# arguments used before (y_true, y_pred) are not defined anywhere in the
# notebook; y_valid and y_pred_val are the names created by earlier cells.
macro_f1 = macro_f1_score(y_valid, y_pred_val, num_classes)
print("Macro F1 Score:", macro_f1)

In [None]:
# Predict class scores for the test features and keep the best class per row.
# The feature matrix is called x_test in this notebook (test_x is never
# defined).
pred = np.argmax(model.predict(x_test), 1)

# Convert the class indices back to grade labels and store them in the
# submission template, which was loaded as `sub` (sample_submission is never
# defined).
sub["대출등급"] = [ntol[i] for i in pred]

# NOTE(review): the output path has no ".csv" extension; confirm that is
# intended before uploading.
sub.to_csv('3rd_try', index=False)

# Last expression of the cell, so the filled-in table is displayed.
sub