In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier,Pool
from itertools import product
pd.set_option('display.max_columns', None)

## 데이터 불러오기

In [2]:
train_csv = pd.read_csv('./data/train.csv')
test_csv = pd.read_csv('./data/test.csv')
sample_submission_csv = pd.read_csv('./data/sample_submission.csv')

train_data = train_csv.copy()
test_data = test_csv.copy()
train_data

Unnamed: 0,index,gender,Annual_income,income_type,Education,family_type,house_type,DAYS_BIRTH,working_day,FLAG_MOBIL,work_phone,phone,email,occyp_type,begin_month,car_reality,credit
0,0,F,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,-6.0,0,1.0
1,1,F,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,-5.0,1,1.0
2,2,M,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,-22.0,2,2.0
3,3,F,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,-37.0,1,0.0
4,4,F,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,-26.0,2,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,26452,F,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,-2.0,0,1.0
26453,26453,F,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,,-47.0,1,2.0
26454,26454,F,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,-25.0,1,2.0
26455,26455,M,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,-59.0,1,2.0


In [3]:
numermic_col = ['Annual_income','working_day','begin_month','DAYS_BIRTH']
default_cate_col = ['gender','income_type','Education','family_type','house_type','work_phone','phone','email','occyp_type','car_reality']
cate_col = ['gender','income_type','Education','family_type','house_type','work_phone','phone','email','occyp_type','car_reality']

## 데이터 전처리
### 필요 없는 변수 제거
모두 1인 **FLAG_MOBIL** 제거.  
데이터 순서를 나타내는 **index** 제거.

In [4]:
# index와 FLAG_MOBIL drop
train_data.drop(['index','FLAG_MOBIL'],inplace=True,axis=1)
test_data.drop(['index','FLAG_MOBIL'],inplace=True,axis=1)

### 결측치 채워넣기
직업 종류를 나타내는 **occyp_type**만 결측치가 있는것으로 확인 됨.

In [5]:
# 결측치 확인
train_data.isnull().mean()

gender           0.000000
Annual_income    0.000000
income_type      0.000000
Education        0.000000
family_type      0.000000
house_type       0.000000
DAYS_BIRTH       0.000000
working_day      0.000000
work_phone       0.000000
phone            0.000000
email            0.000000
occyp_type       0.308841
begin_month      0.000000
car_reality      0.000000
credit           0.000000
dtype: float64

fillna로 직업이 없는 칸에 **None**으로 설정.

In [6]:
# null을 모두 None으로 변경
train_data.fillna("None",inplace=True)
test_data.fillna("None",inplace=True)

working_day가 0이고 occuyp_type이 None이면 **무직자**라고 판단.  
working_day가 0이 아니고 ocuuup_type이 None이면 **결측치**라고 판단.

In [7]:
# working_day == 0 && occyp_type == None은 무직자
train_data.loc[((train_data['working_day']== 0) & (train_data['occyp_type'] == "None")),'occyp_type'] = 'No_Job'
test_data.loc[((test_data['working_day']== 0) & (test_data['occyp_type'] == "None")),'occyp_type'] = 'No_Job'

### Numerical 데이터 전처리
최대한 여러 종류의 데이터를 만들려고 시도. 
  
음수를 모두 양수로 변환.

In [8]:
# DAYS_BIRTH 모두 양수로 변환
train_data['DAYS_BIRTH'] = np.abs(train_data['DAYS_BIRTH'])
test_data['DAYS_BIRTH'] = np.abs(test_data['DAYS_BIRTH'])

# working_day 모두 양수로 변환
train_data['working_day'] = np.abs(train_data['working_day'])
test_data['working_day'] = np.abs(test_data['working_day'])

# begin_month 모두 양수로 변환
train_data['begin_month'] = np.abs(train_data['begin_month'])
test_data['begin_month'] = np.abs(test_data['begin_month'])

무직자로 지낸 기간 추가.

In [9]:
df = train_data

# 무직자로 지낸 기간 not_working_day로 추가
df['not_working_day'] = df['DAYS_BIRTH'] - df['working_day']
numermic_col.append('not_working_day')

나이 데이터 추가.

In [10]:
# 산 나이 (년, 월, 주) 각각 age_y, age_m, age_w 추가
df['age_y'] = df['DAYS_BIRTH'] // 365
numermic_col.append('age_y')
df['age_m'] = df['DAYS_BIRTH'] % 365 // 30
numermic_col.append('age_m')
df['age_w'] = df['DAYS_BIRTH'] % 365 % 30 // 7
numermic_col.append('age_w')

일한 기간 연, 월, 주 추가.

In [11]:
# 일한 기간 (년, 월, 주) 각각 working_y, working_m, working_w 추가
df['working_y'] = df['working_day'] // 365
numermic_col.append('working_y')
df['working_m'] = df['working_day'] % 365 // 30
numermic_col.append('working_m')
df['working_w'] = df['working_day'] % 365 % 30 // 7
numermic_col.append('working_w')

카드를 만든 기간 연, 월 추가.

In [12]:
# 카드를 만들고 경과한 시간 (년, 월) 각각 begin_y, begin_m 추가
df['begin_y'] = df['begin_month'] // 12
numermic_col.append('begin_y')
df['begin_m'] = df['begin_month'] % 12
numermic_col.append('begin_m')

나머지 전처리.

In [13]:
df['begin_prop_income'] = np.floor(df['Annual_income']  / df['begin_month'])
numermic_col.append('begin_prop_income')

# inf 데이터 모두 0으로 치환
df.replace(-np.inf,0,inplace=True)
df.replace(np.inf,0,inplace=True)

# 혹시 모를 결측치 모두 0으로 설정
df.fillna(0,inplace=True)
train_data = df

test 데이터도 똑같이 처리

In [14]:
df = test_data
df['not_working_day'] = df['DAYS_BIRTH'] - df['working_day']
df['age_y'] = df['DAYS_BIRTH'] // 365
df['age_m'] = df['DAYS_BIRTH'] % 365 // 30
df['age_w'] = df['DAYS_BIRTH'] % 365 % 30 // 7

df['working_y'] = df['working_day'] // 365
df['working_m'] = df['working_day'] % 365 // 30
df['working_w'] = df['working_day'] % 365 % 30 // 7

df['begin_y'] = df['begin_month'] // 12
df['begin_m'] = df['begin_month'] % 12
df['begin_prop_income'] = np.floor(df['Annual_income']  / df['begin_month'])

df.replace(-np.inf,0,inplace=True)
df.replace(np.inf,0,inplace=True)
df.fillna(0,inplace=True)
test_data = df

train 데이터에서 클래스 분리

In [15]:
# 클래스만 train_data_y로 분리
train_data_y = train_data['credit'].astype(int)
# train_data에서 클래스와 index drop
train_data.drop('credit',axis=1,inplace=True)
train_data.reset_index(inplace=True)
train_data.drop('index',axis=1,inplace=True)

categorical 데이터 LabelEncoding.

In [16]:
for col in default_cate_col:
    encoder = LabelEncoder()
    train_data[col] = encoder.fit_transform(train_data[col])
    test_data[col] = encoder.transform(test_data[col])

In [17]:
train_data

Unnamed: 0,gender,Annual_income,income_type,Education,family_type,house_type,DAYS_BIRTH,working_day,work_phone,phone,email,occyp_type,begin_month,car_reality,not_working_day,age_y,age_m,age_w,working_y,working_m,working_w,begin_y,begin_m,begin_prop_income
0,0,202500.0,0,1,1,2,13899,4709,0,0,0,13,6.0,0,9190,38,0,4,12,10,4,0.0,6.0,33750.0
1,0,247500.0,0,4,0,1,11380,1540,0,0,1,8,5.0,1,9840,31,2,0,4,2,2,0.0,5.0,49500.0
2,1,450000.0,4,1,1,1,19087,4434,0,1,0,10,22.0,2,14653,52,3,2,12,1,3,1.0,10.0,20454.0
3,0,202500.0,0,4,1,1,15088,2092,0,1,0,16,37.0,1,12996,41,4,0,5,8,3,3.0,1.0,5472.0
4,0,157500.0,2,1,1,1,15037,2105,0,0,0,10,26.0,2,12932,41,2,1,5,9,1,2.0,2.0,6057.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,225000.0,2,4,1,1,12079,1984,0,0,0,3,2.0,0,10095,33,1,0,5,5,1,0.0,2.0,112500.0
26453,0,180000.0,4,1,2,1,15291,2475,0,0,0,13,47.0,1,12816,41,10,3,6,9,2,3.0,11.0,3829.0
26454,0,292500.0,4,4,0,5,10082,2015,0,0,0,3,25.0,1,8067,27,7,2,5,6,1,2.0,1.0,11700.0
26455,1,171000.0,4,2,3,1,10145,107,0,0,0,8,59.0,1,10038,27,9,2,0,3,2,4.0,11.0,2898.0


### Owner 변수 추가
동일한 인물로 추정되는 데이터가 여러개 있는것을 확인.
동일 인물을 구분하기 위한 **Owner** 변수 추가.  
gender, Annual_income, income_type, Education, family_type, house_type, DAYS_BIRTH, working_day, work_phone, phone, email, occyp_type, car_reality를 하나로 합쳐 변수 구성.

In [18]:
train_data['Owner'] = ''
test_data['Owner'] = ''
for col in ['gender','Annual_income','income_type','Education','family_type','house_type','DAYS_BIRTH','working_day','work_phone','phone','email','occyp_type','car_reality']:
    train_data['Owner'] = train_data['Owner']+ train_data[col].astype(int).astype(str)
    test_data['Owner'] = test_data['Owner'] + test_data[col].astype(int).astype(str)

XGBClassifier, RandomForestClassifier, LGBMClassifier 학습을 위한 numerical 데이터를 categorical 데이터로 변환.  
categorical 데이터로 변환한 변수들로 **Owner_r** 변수 추가. 그냥 Owner 변수를 LabelEncoding 하기에는 경우의 수가 너무 많아져서 이런 방식을 사용.

In [19]:
# Annual_income 범위로 자르기
income_range = [0, 180000, 330000, 490000, 640000, 
800000, 950000, 1110000, 1260000, 1420000]
cnt = 0
for i in income_range:
    train_data.loc[train_data['Annual_income'] >= i, 'Annual_income_r'] = cnt
    test_data.loc[test_data['Annual_income'] >= i, 'Annual_income_r'] = cnt
    cnt += 1
# DAYS_BIRTH 범위로 자르기
birth_range = [0, 20 * 365, 30 * 365, 40 * 365, 50 * 365, 60 * 365]
cnt = 0
for i in birth_range:
    train_data.loc[train_data['DAYS_BIRTH'] >= i, 'DAYS_BIRTH_r'] = cnt
    test_data.loc[test_data['DAYS_BIRTH'] >= i, 'DAYS_BIRTH_r'] = cnt
    cnt += 1
# working_day 범위로 자르기
work_range = [0, 1 * 365, 3 * 365, 5 * 365, 7 * 365, 10 * 365]
cnt = 0
for i in work_range:
    train_data.loc[train_data['working_day'] >= i, 'working_day_r'] = cnt
    test_data.loc[test_data['working_day'] >= i, 'working_day_r'] = cnt
    cnt += 1

# 모두 categorical data 로만 구성된 Owner_r 변수
train_data['Owner_r'] = ''
test_data['Owner_r'] = ''
all_l = []
l = ['gender','income_type','Education','family_type','house_type',
'work_phone','phone','email','occyp_type','car_reality']
for col in l:
    train_data['Owner_r'] = train_data['Owner_r']+ train_data[col].astype(int).astype(str)
    test_data['Owner_r'] = test_data['Owner_r'] + test_data[col].astype(int).astype(str)
    all_l.append([i for i in range(len(train_data[col].unique().tolist()))])

In [20]:
# Owner, Owner_r 모두 str로 변환
train_data['Owner'] = train_data['Owner'].astype(str)
test_data['Owner'] = test_data['Owner'].astype(str)
train_data['Owner_r'] = train_data['Owner_r'].astype(str)
test_data['Owner_r'] = test_data['Owner_r'].astype(str)

In [21]:
train_data

Unnamed: 0,gender,Annual_income,income_type,Education,family_type,house_type,DAYS_BIRTH,working_day,work_phone,phone,email,occyp_type,begin_month,car_reality,not_working_day,age_y,age_m,age_w,working_y,working_m,working_w,begin_y,begin_m,begin_prop_income,Owner,Annual_income_r,DAYS_BIRTH_r,working_day_r,Owner_r
0,0,202500.0,0,1,1,2,13899,4709,0,0,0,13,6.0,0,9190,38,0,4,12,10,4,0.0,6.0,33750.0,02025000112138994709000130,1.0,2.0,5.0,00112000130
1,0,247500.0,0,4,0,1,11380,1540,0,0,1,8,5.0,1,9840,31,2,0,4,2,2,0.0,5.0,49500.0,0247500040111380154000181,1.0,2.0,2.0,0040100181
2,1,450000.0,4,1,1,1,19087,4434,0,1,0,10,22.0,2,14653,52,3,2,12,1,3,1.0,10.0,20454.0,14500004111190874434010102,2.0,4.0,5.0,14111010102
3,0,202500.0,0,4,1,1,15088,2092,0,1,0,16,37.0,1,12996,41,4,0,5,8,3,3.0,1.0,5472.0,02025000411150882092010161,1.0,3.0,3.0,00411010161
4,0,157500.0,2,1,1,1,15037,2105,0,0,0,10,26.0,2,12932,41,2,1,5,9,1,2.0,2.0,6057.0,01575002111150372105000102,0.0,3.0,3.0,02111000102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,225000.0,2,4,1,1,12079,1984,0,0,0,3,2.0,0,10095,33,1,0,5,5,1,0.0,2.0,112500.0,0225000241112079198400030,1.0,2.0,3.0,0241100030
26453,0,180000.0,4,1,2,1,15291,2475,0,0,0,13,47.0,1,12816,41,10,3,6,9,2,3.0,11.0,3829.0,01800004121152912475000131,1.0,3.0,3.0,04121000131
26454,0,292500.0,4,4,0,5,10082,2015,0,0,0,3,25.0,1,8067,27,7,2,5,6,1,2.0,1.0,11700.0,0292500440510082201500031,1.0,1.0,3.0,0440500031
26455,1,171000.0,4,2,3,1,10145,107,0,0,0,8,59.0,1,10038,27,9,2,0,3,2,4.0,11.0,2898.0,117100042311014510700081,0.0,1.0,0.0,1423100081


**Owner_r** LabelEncoding.

In [22]:
# product 함수로 모든 가능한 경우의 수를 계산
all_l = list(product(*all_l))
for i in range(len(all_l)):
    all_l[i] = ''.join([str(j) for j in all_l[i]])

# Owner_r 변수 LabelEncoding
e = LabelEncoder()
e.fit(all_l)
train_data['Owner_r'] = e.transform(train_data['Owner_r'])
test_data['Owner_r'] = e.transform(test_data['Owner_r'])

# categorical columns에 Owner와 Owner_r 추가
cate_col.append('Owner')
cate_col.append('Owner_r')

In [23]:
train_data

Unnamed: 0,gender,Annual_income,income_type,Education,family_type,house_type,DAYS_BIRTH,working_day,work_phone,phone,email,occyp_type,begin_month,car_reality,not_working_day,age_y,age_m,age_w,working_y,working_m,working_w,begin_y,begin_m,begin_prop_income,Owner,Annual_income_r,DAYS_BIRTH_r,working_day_r,Owner_r
0,0,202500.0,0,1,1,2,13899,4709,0,0,0,13,6.0,0,9190,38,0,4,12,10,4,0.0,6.0,33750.0,02025000112138994709000130,1.0,2.0,5.0,18255
1,0,247500.0,0,4,0,1,11380,1540,0,0,1,8,5.0,1,9840,31,2,0,4,2,2,0.0,5.0,49500.0,0247500040111380154000181,1.0,2.0,2.0,58195
2,1,450000.0,4,1,1,1,19087,4434,0,1,0,10,22.0,2,14653,52,3,2,12,1,3,1.0,10.0,20454.0,14500004111190874434010102,2.0,4.0,5.0,665886
3,0,202500.0,0,4,1,1,15088,2092,0,1,0,16,37.0,1,12996,41,4,0,5,8,3,3.0,1.0,5472.0,02025000411150882092010161,1.0,3.0,3.0,61105
4,0,157500.0,2,1,1,1,15037,2105,0,0,0,10,26.0,2,12932,41,2,1,5,9,1,2.0,2.0,6057.0,01575002111150372105000102,0.0,3.0,3.0,161766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,225000.0,2,4,1,1,12079,1984,0,0,0,3,2.0,0,10095,33,1,0,5,5,1,0.0,2.0,112500.0,0225000241112079198400030,1.0,2.0,3.0,204999
26453,0,180000.0,4,1,2,1,15291,2475,0,0,0,13,47.0,1,12816,41,10,3,6,9,2,3.0,11.0,3829.0,01800004121152912475000131,1.0,3.0,3.0,308656
26454,0,292500.0,4,4,0,5,10082,2015,0,0,0,3,25.0,1,8067,27,7,2,5,6,1,2.0,1.0,11700.0,0292500440510082201500031,1.0,1.0,3.0,348040
26455,1,171000.0,4,2,3,1,10145,107,0,0,0,8,59.0,1,10038,27,9,2,0,3,2,4.0,11.0,2898.0,117100042311014510700081,0.0,1.0,0.0,685975


numerical 데이터 정규화.

In [24]:
for col in numermic_col:
    minmax = StandardScaler()
    train_data[col] = minmax.fit_transform(train_data[[col]])
    test_data[col] = minmax.transform(test_data[[col]])

In [25]:
train_data

Unnamed: 0,gender,Annual_income,income_type,Education,family_type,house_type,DAYS_BIRTH,working_day,work_phone,phone,email,occyp_type,begin_month,car_reality,not_working_day,age_y,age_m,age_w,working_y,working_m,working_w,begin_y,begin_m,begin_prop_income,Owner,Annual_income_r,DAYS_BIRTH_r,working_day_r,Owner_r
0,0,0.149136,0,1,1,2,-0.490075,1.059227,0,0,0,13,-1.215231,0,-0.939166,-0.452819,-1.641152,1.885986,0.994411,1.427247,2.303833,-1.249606,0.183042,0.628472,02025000112138994709000130,1.0,2.0,5.0,18255
1,0,0.590848,0,4,0,1,-1.089621,-0.277849,0,0,1,8,-1.275620,1,-0.805573,-1.060808,-1.064590,-1.315703,-0.250447,-0.705356,0.575713,-1.249606,-0.109147,1.178125,0247500040111380154000181,1.0,2.0,2.0,58195
2,1,2.578550,4,1,1,1,0.744719,0.943198,0,1,0,10,-0.249003,2,0.183634,0.763158,-0.776310,0.285141,0.994411,-0.971932,1.439773,-0.526934,1.351799,0.164461,14500004111190874434010102,2.0,4.0,5.0,665886
3,0,0.149136,0,4,1,1,-0.207081,-0.044947,0,1,0,16,0.656836,1,-0.156926,-0.192253,-0.488029,-1.315703,-0.094839,0.894096,1.439773,0.918412,-1.277904,-0.358389,02025000411150882092010161,1.0,3.0,3.0,61105
4,0,-0.292575,2,1,1,1,-0.219220,-0.039462,0,0,0,10,-0.007446,2,-0.170080,-0.192253,-1.064590,-0.515281,-0.094839,1.160672,-0.288347,0.195739,-0.985715,-0.337974,01575002111150372105000102,0.0,3.0,3.0,161766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0.369992,2,4,1,1,-0.923252,-0.090515,0,0,0,3,-1.456788,0,-0.753163,-0.887097,-1.352871,-1.315703,-0.094839,0.094370,-0.288347,-1.249606,-0.985715,3.376734,0225000241112079198400030,1.0,2.0,3.0,204999
26453,0,-0.071719,4,1,2,1,-0.158765,0.116649,0,0,0,13,1.260729,1,-0.193921,-0.192253,1.241654,1.085563,0.060768,1.160672,0.575713,0.918412,1.643988,-0.415728,01800004121152912475000131,1.0,3.0,3.0,308656
26454,0,1.032559,4,4,0,5,-1.398558,-0.077435,0,0,0,3,-0.067835,1,-1.169974,-1.408230,0.376812,0.285141,-0.094839,0.360945,-0.288347,0.195739,-1.277904,-0.141041,0292500440510082201500031,1.0,1.0,3.0,348040
26455,1,-0.160062,4,2,3,1,-1.383563,-0.882466,0,0,0,8,1.985400,1,-0.764878,-1.408230,0.953373,0.285141,-0.872875,-0.438781,0.575713,1.641085,1.643988,-0.448218,117100042311014510700081,0.0,1.0,0.0,685975


## 모델 학습
과적합을 최소화 하기 위해 **CV Stacking** 사용.  
**CatBoostClassifier, XGBClassifier, RandomForestClassifier, LGBMClassifier**로 학습시킨 뒤, 결과를 합쳐 **LGBMClassifier**로 한번더 학습.

In [26]:
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping, log_evaluation
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import copy

In [27]:
# XGBClassifier, RandomForestClassifier, LGBMClassifier에서 사용할 데이터
X = train_data.drop(['Owner'], axis=1)
y = train_data_y
X_test = test_data.drop(['Owner'], axis=1)
categorical_columns = copy.deepcopy(cate_col)
categorical_columns.remove('Owner')

# CatBoostClassifier에서 사용할 데이터
X_cat = train_data.drop(['Owner_r'], axis=1)
y_cat = train_data_y
X_test_cat = test_data.drop(['Owner_r'], axis=1)
categorical_columns_cat = copy.deepcopy(cate_col)
categorical_columns_cat.remove('Owner_r')

### 하이퍼 파라미터 로딩
**optuna**로 튜닝. CatBoostClassifier 튜닝을 하지 않은 모델이 성능이 더 좋아서 튜닝 x.

#### LGBMClassifier 하이퍼 파라미터

In [28]:
param_lgbm = {}
with open('./data/params/best_param_lgbm', 'rb') as f:
    param_lgbm = pickle.load(f)
param_lgbm

{'learning_rate': 0.015,
 'objective': 'multiclass',
 'n_jobs': 12,
 'metric': 'multi_logloss',
 'device_type': 'cuda',
 'random_state': 1234,
 'n_estimators': 100000,
 'boosting_type': 'gbdt',
 'bagging_freq': 19,
 'bagging_fraction': 0.8361499274257655,
 'max_depth': 9,
 'min_data_in_leaf': 5,
 'colsample_bytree': 0.5545131276302593,
 'reg_alpha': 5.442681671789194,
 'reg_lambda': 0.2525850918547932,
 'max_bin': 131,
 'num_leaves': 307}

#### XGBClassifier 하이퍼 파라미터

In [29]:
param_xgb = {}
with open('./data/params/best_param_xgb', 'rb') as f:
    param_xgb = pickle.load(f)
param_xgb

{'gamma': 0.2065804393547938,
 'max_depth': 8,
 'min_child_weight': 0,
 'colsample_bytree': 0.6736188784034954,
 'reg_alpha': 2.040292632111543,
 'reg_lambda': 0.6260501553667925,
 'grow_policy': 'lossguide',
 'subsample': 0.8393160499864659,
 'max_leaves': 205,
 'learning_rate': 0.015,
 'n_jobs': 12,
 'n_estimators': 80000,
 'random_state': 1234,
 'booster': 'gbtree',
 'tree_method': 'gpu_hist',
 'objective': 'multi:softmax',
 'use_label_encoder': False}

#### RandomForestClassifier 하이퍼 파라미터

In [30]:
param_rf = {}
with open('./data/params/best_param_rf', 'rb') as f:
    param_rf = pickle.load(f)
param_rf

{'n_estimators': 252,
 'max_depth': 26,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'random_state': 540}

#### 최종 학습에 사용할 LGBMClassifier 하이퍼 파라미터

In [31]:
param_lgbm_last = {}
with open('./data/params/best_param_lgbm_stacked', 'rb') as f:
    param_lgbm_last = pickle.load(f)
param_lgbm_last

{'learning_rate': 0.015,
 'objective': 'multiclass',
 'n_jobs': 12,
 'metric': 'multi_logloss',
 'device_type': 'cuda',
 'random_state': 1234,
 'n_estimators': 100000,
 'boosting_type': 'gbdt',
 'bagging_freq': 14,
 'bagging_fraction': 0.8957366717057349,
 'max_depth': 8,
 'min_data_in_leaf': 617,
 'colsample_bytree': 0.4185219684831536,
 'reg_alpha': 21.574874226018196,
 'reg_lambda': 0.45933281999458775,
 'max_bin': 102,
 'num_leaves': 154}

### CatBoostClassifier 학습

In [32]:
def train_catboost(n_folds):
    # CV 적용
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=4558)
    splits = folds.split(X_cat, y_cat)
    cat_val = np.zeros((X_cat.shape[0], 3))
    cat_test = np.zeros((X_test_cat.shape[0], 3))

    for fold, (train_idx, valid_idx) in enumerate(splits):
        print(f"-----------Catboost {fold}번-----------")
        # 학습, 검증 셋 나누기
        X_train, X_valid = X_cat.iloc[train_idx], X_cat.iloc[valid_idx]
        y_train, y_valid = y_cat.iloc[train_idx], y_cat.iloc[valid_idx]
        
        train_data = Pool(data=X_train, label=y_train, cat_features=categorical_columns_cat)
        valid_data = Pool(data=X_valid, label=y_valid, cat_features=categorical_columns_cat)

        # 모델 학습
        model = CatBoostClassifier(n_estimators=10000)
        model.fit(train_data, eval_set=valid_data, early_stopping_rounds=100, verbose=50, use_best_model=True)

        # 모델 예측값 저장
        cat_val[valid_idx] = model.predict_proba(X_valid)
        cat_test += model.predict_proba(X_test_cat) / n_folds

    log_score = log_loss(y_cat, cat_val)
    print(f"Catboost Log Loss Score: {log_score:.5f}\n")
    return cat_val, cat_test

In [33]:
n_folds = 15
cat_val, cat_test = train_catboost(n_folds)

-----------Catboost 0번-----------
Learning rate set to 0.047773
0:	learn: 1.0710752	test: 1.0710582	best: 1.0710582 (0)	total: 62ms	remaining: 10m 19s
50:	learn: 0.7352424	test: 0.6858323	best: 0.6858323 (50)	total: 551ms	remaining: 1m 47s
100:	learn: 0.7184249	test: 0.6614029	best: 0.6614029 (100)	total: 1.13s	remaining: 1m 51s
150:	learn: 0.7124352	test: 0.6567157	best: 0.6567157 (150)	total: 1.73s	remaining: 1m 53s
200:	learn: 0.7079899	test: 0.6538627	best: 0.6538627 (200)	total: 2.35s	remaining: 1m 54s
250:	learn: 0.7029737	test: 0.6519333	best: 0.6519333 (250)	total: 3.02s	remaining: 1m 57s
300:	learn: 0.6989341	test: 0.6512645	best: 0.6512203 (295)	total: 3.7s	remaining: 1m 59s
350:	learn: 0.6950078	test: 0.6507418	best: 0.6506553 (336)	total: 4.38s	remaining: 2m
400:	learn: 0.6917447	test: 0.6502265	best: 0.6501992 (391)	total: 5.06s	remaining: 2m 1s
450:	learn: 0.6880769	test: 0.6503899	best: 0.6501921 (401)	total: 5.75s	remaining: 2m 1s
500:	learn: 0.6841970	test: 0.6502675	b

### LGBMClassifier 학습

In [34]:
def train_lgbm(n_folds):
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=4558)
    splits = folds.split(X, y)
    lgbm_val = np.zeros((X.shape[0], 3))
    lgbm_test = np.zeros((X_test.shape[0], 3))

    param_lgbm['learning_rate'] = 0.25

    for fold, (train_idx, valid_idx) in enumerate(splits):
        print(f"-----------LGBM {fold}번-----------")
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = LGBMClassifier(**param_lgbm)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
             callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=50)])

        lgbm_val[valid_idx] = model.predict_proba(X_valid)
        lgbm_test += model.predict_proba(X_test) / n_folds

    log_score = log_loss(y, lgbm_val)
    print(f"LGBM Log Loss Score: {log_score:.5f}\n")
    return lgbm_val, lgbm_test

In [35]:
lgbm_val, lgbm_test = train_lgbm(n_folds)

-----------LGBM 0번-----------
Training until validation scores don't improve for 100 rounds
[50]	training's multi_logloss: 0.548554	valid_1's multi_logloss: 0.725238
[100]	training's multi_logloss: 0.442031	valid_1's multi_logloss: 0.713716
[150]	training's multi_logloss: 0.385336	valid_1's multi_logloss: 0.716726
Early stopping, best iteration is:
[99]	training's multi_logloss: 0.443173	valid_1's multi_logloss: 0.713361
-----------LGBM 1번-----------
Training until validation scores don't improve for 100 rounds
[50]	training's multi_logloss: 0.548399	valid_1's multi_logloss: 0.734427
[100]	training's multi_logloss: 0.44317	valid_1's multi_logloss: 0.731314
[150]	training's multi_logloss: 0.385309	valid_1's multi_logloss: 0.735627
Early stopping, best iteration is:
[87]	training's multi_logloss: 0.46428	valid_1's multi_logloss: 0.729989
-----------LGBM 2번-----------
Training until validation scores don't improve for 100 rounds
[50]	training's multi_logloss: 0.544269	valid_1's multi_logl

### XGBClassifier 학습

In [36]:
def train_xgb(n_folds):
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=4558)
    splits = folds.split(X, y)
    xgb_val = np.zeros((X.shape[0], 3))
    xgb_test = np.zeros((X_test.shape[0], 3))

    for fold, (train_idx, valid_idx) in enumerate(splits):
        print(f"-----------XGB {fold}번-----------")
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = XGBClassifier(**param_xgb)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
              early_stopping_rounds=100, verbose=500, eval_metric='mlogloss')

        xgb_val[valid_idx] = model.predict_proba(X_valid)
        xgb_test += model.predict_proba(X_test) / n_folds

    log_score = log_loss(y, xgb_val)
    print(f"Xgb Log Loss Score: {log_score:.5f}\n")
    return xgb_val, xgb_test

In [37]:
xgb_val, xgb_test = train_xgb(n_folds)

-----------XGB 0번-----------
[0]	validation_0-mlogloss:1.09165	validation_1-mlogloss:1.09182
[500]	validation_0-mlogloss:0.61584	validation_1-mlogloss:0.72825
[1000]	validation_0-mlogloss:0.49712	validation_1-mlogloss:0.69773
[1500]	validation_0-mlogloss:0.42135	validation_1-mlogloss:0.68753
[1780]	validation_0-mlogloss:0.39233	validation_1-mlogloss:0.68672
-----------XGB 1번-----------
[0]	validation_0-mlogloss:1.09159	validation_1-mlogloss:1.09178
[500]	validation_0-mlogloss:0.61527	validation_1-mlogloss:0.74141
[1000]	validation_0-mlogloss:0.49526	validation_1-mlogloss:0.71835
[1500]	validation_0-mlogloss:0.41962	validation_1-mlogloss:0.71391
[1534]	validation_0-mlogloss:0.41555	validation_1-mlogloss:0.71397
-----------XGB 2번-----------
[0]	validation_0-mlogloss:1.09160	validation_1-mlogloss:1.09196
[500]	validation_0-mlogloss:0.61598	validation_1-mlogloss:0.74790
[1000]	validation_0-mlogloss:0.49645	validation_1-mlogloss:0.72139
[1500]	validation_0-mlogloss:0.42002	validation_1-mlog

### RandomForestClassifier 학습

In [38]:
def train_rf(n_folds):
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=4558)
    splits = folds.split(X, y)
    rf_val = np.zeros((X.shape[0], 3))
    rf_test = np.zeros((X_test.shape[0], 3))

    for fold, (train_idx, valid_idx) in enumerate(splits):
        print(f"-----------RandomForest {fold}번-----------")
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = RandomForestClassifier(**param_rf)
        model.fit(X_train, y_train)

        rf_val[valid_idx] = model.predict_proba(X_valid)
        rf_test += model.predict_proba(X_test) / n_folds
        print(f"Log Loss Score: {log_loss(y_valid, rf_val[valid_idx]):.5f}")

    log_score = log_loss(y, rf_val)
    print(f"RandomForest Log Loss Score: {log_score:.5f}\n")
    return rf_val, rf_test

In [39]:
rf_val, rf_test = train_rf(n_folds)

-----------RandomForest 0번-----------
Log Loss Score: 0.69864
-----------RandomForest 1번-----------
Log Loss Score: 0.72244
-----------RandomForest 2번-----------
Log Loss Score: 0.72294
-----------RandomForest 3번-----------
Log Loss Score: 0.73472
-----------RandomForest 4번-----------
Log Loss Score: 0.71388
-----------RandomForest 5번-----------
Log Loss Score: 0.70889
-----------RandomForest 6번-----------
Log Loss Score: 0.72111
-----------RandomForest 7번-----------
Log Loss Score: 0.69382
-----------RandomForest 8번-----------
Log Loss Score: 0.69702
-----------RandomForest 9번-----------
Log Loss Score: 0.72157
-----------RandomForest 10번-----------
Log Loss Score: 0.73967
-----------RandomForest 11번-----------
Log Loss Score: 0.74071
-----------RandomForest 12번-----------
Log Loss Score: 0.71317
-----------RandomForest 13번-----------
Log Loss Score: 0.72824
-----------RandomForest 14번-----------
Log Loss Score: 0.72194
RandomForest Log Loss Score: 0.71858



### 학습 결과 결합

In [40]:
# 각 모델에서의 예측 결과 모두 결합
train_pred = np.concatenate([cat_val, lgbm_val, xgb_val, rf_val], axis=1)
test_pred = np.concatenate([cat_test, lgbm_test, xgb_test, rf_test], axis=1)
print(f'train_pred\n{train_pred}\n{train_pred.shape}\n')
print(f'test_pred\n{test_pred}\n{test_pred.shape}\n')

train_pred
[[0.07699502 0.16798797 0.755017   ... 0.21653738 0.1937674  0.58969522]
 [0.65685546 0.08425705 0.25888749 ... 0.52458428 0.12820767 0.34720805]
 [0.04616581 0.09571085 0.85812334 ... 0.05534404 0.41102911 0.53362685]
 ...
 [0.09876626 0.21892352 0.68231022 ... 0.08765252 0.24270516 0.66964232]
 [0.07571    0.1378242  0.7864658  ... 0.12391886 0.21652534 0.6595558 ]
 [0.09027888 0.16667268 0.74304844 ... 0.11172791 0.24088516 0.64738693]]
(26457, 12)

test_pred
[[0.11244988 0.17371823 0.71383189 ... 0.05981103 0.17805626 0.7621327 ]
 [0.3371832  0.23365793 0.42915887 ... 0.2467788  0.22984613 0.52337507]
 [0.04327082 0.0747881  0.88194108 ... 0.0621314  0.08543021 0.85243839]
 ...
 [0.04724408 0.10725124 0.84550467 ... 0.05004236 0.11116913 0.83878851]
 [0.1454231  0.29471245 0.55986445 ... 0.13806908 0.27613325 0.58579766]
 [0.07445797 0.42042342 0.50511861 ... 0.16543603 0.26305406 0.57150991]]
(10000, 12)



### LGBMClassifier로 최종 학습

In [41]:
folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=4558)
stack_val = np.zeros((train_pred.shape[0], 3))
stack_test = np.zeros((test_pred.shape[0], 3))

for fold, (train_idx, valid_idx) in enumerate(folds.split(train_pred, y), 1):
    print(f"-----------Stacked LGBM {fold}번-----------")
    X_train, X_valid = train_pred[train_idx], train_pred[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMClassifier(**param_lgbm_last)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
              callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)])

    stack_val[valid_idx, :] = model.predict_proba(train_pred[valid_idx])
    stack_test += model.predict_proba(test_pred) / n_folds

loss = log_loss(y, stack_val)
print(f'{loss}')

-----------Stacked LGBM 1번-----------
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.686182	valid_1's multi_logloss: 0.674587
[200]	training's multi_logloss: 0.665489	valid_1's multi_logloss: 0.653541
[300]	training's multi_logloss: 0.659295	valid_1's multi_logloss: 0.649171
[400]	training's multi_logloss: 0.656164	valid_1's multi_logloss: 0.647793
[500]	training's multi_logloss: 0.653884	valid_1's multi_logloss: 0.647349
[600]	training's multi_logloss: 0.652135	valid_1's multi_logloss: 0.647078
[700]	training's multi_logloss: 0.650591	valid_1's multi_logloss: 0.646977
Early stopping, best iteration is:
[687]	training's multi_logloss: 0.650766	valid_1's multi_logloss: 0.646916
-----------Stacked LGBM 2번-----------
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.684768	valid_1's multi_logloss: 0.689101
[200]	training's multi_logloss: 0.664037	valid_1's multi_logloss: 0.672385
[300]	training'

In [42]:
submission = pd.read_csv('./data/sample_submission.csv')
submission.loc[:, 1:] = stack_test
submission

  indexer = self._get_setitem_indexer(key)


Unnamed: 0,index,0,1,2
0,26457,0.098574,0.206903,0.694523
1,26458,0.333605,0.247072,0.419324
2,26459,0.037885,0.057823,0.904292
3,26460,0.048531,0.067786,0.883683
4,26461,0.073703,0.205349,0.720949
...,...,...,...,...
9995,36452,0.055489,0.207363,0.737149
9996,36453,0.222826,0.296583,0.480591
9997,36454,0.055221,0.089584,0.855195
9998,36455,0.105940,0.300948,0.593113


In [43]:
submission.to_csv(f'./data/submission/stacking_{n_folds}_{loss}_xgboost.csv', index=False)