In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
pip install --upgrade scikit-learn

In [None]:
pip install -U scikit-learn==0.23

In [None]:
import lightgbm as lgb
import xgboost as xgb
import sklearn
import joblib
from sklearn.metrics import f1_score
# sklearn.externals.joblib is gone from the scikit-learn release pinned above;
# take Parallel/delayed from the standalone joblib package and keep the old
# location only as a fallback for very old environments.
try:
    from joblib import Parallel, delayed
except ImportError:
    from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import class_weight

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_Data(df):
    """Label-encode the ``idhogar`` column of ``df`` in place.

    The categorical values in ``df['idhogar']`` are replaced by the integer
    codes produced by ``LabelEncoder``. The frame is modified in place and
    nothing is returned.
    """
    df['idhogar'] = LabelEncoder().fit_transform(df['idhogar'])


# snake_case alias so both spellings of the helper resolve to the same function
encode_data = encode_Data

def feature_importance(forest, X_train, display_results=True):
    """Rank the columns of ``X_train`` by a fitted model's feature importances.

    Parameters
    ----------
    forest : fitted estimator exposing ``feature_importances_``.
    X_train : pandas.DataFrame whose columns line up with the importances.
    display_results : bool
        When True, print one line per feature (rank, index, importance, name).

    Returns
    -------
    ranked_list : list
        Column names ordered from most to least important.
    zero_features : list
        Column names whose importance is exactly 0.0.
    """
    importances = forest.feature_importances_
    # positions of the features, highest importance first
    indices = np.argsort(importances)[::-1]

    if display_results:
        print("Feature ranking:")

    ranked_list = []
    zero_features = []
    for rank in range(X_train.shape[1]):
        idx = indices[rank]
        name = X_train.columns[idx]

        if display_results:
            print("%d. feature %d (%f)" % (rank + 1, idx, importances[idx]) + "-" + name)

        ranked_list.append(name)
        if importances[idx] == 0.0:
            zero_features.append(name)

    return ranked_list, zero_features

In [None]:
def do_features(df):
    """Add ratio/difference features and per-household aggregates of adults.

    Steps:
      1. ``fe_*`` columns: ratios and differences of existing columns
         (these are written onto the passed-in frame).
      2. ``agg18_*`` columns: statistics of rows with ``age >= 18`` grouped by
         ``idhogar``, joined back onto every row of the same household.
      3. The ``Id`` column is removed from the returned frame.

    Returns the enriched DataFrame (a new object produced by the join).
    """
    # (new feature name, numerator column, denominator column)
    feats_div = [('children_fraction', 'r4t1', 'r4t3'),
                 ('working_man_fraction', 'r4h2', 'r4t3'),
                 ('all_man_fraction', 'r4h3', 'r4t3'),
                 ('human_density', 'tamviv', 'rooms'),
                 ('human_bed_density', 'tamviv', 'bedrooms'),
                 ('rent_per_person', 'v2a1', 'r4t3'),
                 ('rent_per_room', 'v2a1', 'rooms'),
                 ('mobile_density', 'qmobilephone', 'r4t3'),
                 ('tablet_density', 'v18q1', 'r4t3'),
                 ('mobile_adult_density', 'qmobilephone', 'r4t2'),
                 ('tablet_adult_density', 'v18q1', 'r4t2'),
                ]

    # (new feature name, minuend column, subtrahend column)
    feats_sub = [('people_not_living', 'tamhog', 'tamviv'),
                 ('people_weird_stat', 'tamhog', 'r4t3')]

    for f_new, f1, f2 in feats_div:
        df['fe_' + f_new] = (df[f1] / df[f2]).astype(np.float32)
    for f_new, f1, f2 in feats_sub:
        df['fe_' + f_new] = (df[f1] - df[f2]).astype(np.float32)

    # Numeric aggregations; these yield the agg18_age_* / agg18_escolari_*
    # columns (MIN/MAX/MEAN) that later cells refer to by name.
    aggs_num = {'age': ['min', 'max', 'mean'],
                'escolari': ['min', 'max', 'mean']}

    # Aggregations for 'dis' and every column starting with one of the prefixes.
    aggs_cat = {'dis': ['mean']}
    for s_ in ['estadocivil', 'parentesco', 'instlevel']:
        for f_ in [c for c in df.columns if c.startswith(s_)]:
            aggs_cat[f_] = ['mean', 'count']

    # Aggregate rows with age >= 18 per household and join the result back.
    for name_, df_ in [('18', df.query('age>=18'))]:
        df_agg = df_.groupby('idhogar').agg({**aggs_num, **aggs_cat}).astype(np.float32)
        df_agg.columns = pd.Index(['agg' + name_ + '_' + e[0] + "_" + e[1].upper()
                                   for e in df_agg.columns.tolist()])
        df = df.join(df_agg, how='left', on='idhogar')
        del df_agg

    # The row identifier column is named 'Id' (capital I).
    df = df.drop(['Id'], axis=1)

    return df

In [None]:
def convert_OHE2LE(df):
    """Collapse groups of one-hot columns into single label-encoded columns.

    For each prefix below, the one-hot columns of that group are replaced by
    one ``<prefix>_LE`` column of int16 codes. If some rows have no column set
    in a group, a ``<prefix>_dummy`` indicator is added first so that every row
    belongs to exactly one category. ``parentesco1`` is kept; every other
    source column of a group is dropped.

    Works on a deep copy and returns it; the input frame is not modified.
    """
    encoded = df.copy(deep=True)
    ohe_prefixes = ['pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu',
                    'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco',
                    'instlevel', 'lugar', 'tipovivi',
                    'manual_elec']

    for prefix in ohe_prefixes:
        # Regular groups are found by column-name prefix; the electricity
        # group has no common prefix, so its members are listed by hand.
        if 'manual_' not in prefix:
            group_cols = [name for name in df.columns if name.startswith(prefix)]
        elif 'elec' in prefix:
            group_cols = ['public', 'planpri', 'noelec', 'coopele']

        row_totals = encoded[group_cols].sum(axis=1)

        # A row total of 0 means no category is set for that row.
        if 0 in row_totals.unique():
            print('The OHE in {} is incomplete. A new column will be added before label encoding'
                  .format(prefix))
            dummy_name = prefix + '_dummy'
            encoded[dummy_name] = (row_totals == 0).astype(np.int8)
            group_cols.append(dummy_name)
            # sanity check: every row should now have a category
            if 0 in encoded[group_cols].sum(axis=1).unique():
                print("The category completion did not work")

        # The column holding the row maximum names the category; encode that name.
        active_col = encoded[group_cols].idxmax(axis=1)
        encoded[prefix + '_LE'] = LabelEncoder().fit_transform(active_col).astype(np.int16)

        # Drop the source columns, except parentesco1 which later cells still use.
        to_remove = [name for name in group_cols if name != 'parentesco1']
        encoded = encoded.drop(to_remove, axis=1)

    return encoded

Read in the data and clean it up

In [None]:
# Load the raw train / test tables.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Keep the test row ids for the submission files; 'Id' is dropped from the
# frames during feature engineering.
test_ids = test['Id']

In [None]:
def process_df(df_):
    """Label-encode the household id, then add the engineered features.

    ``df_`` is modified in place by the encoding step; the returned frame is
    the output of ``do_features``.
    """
    # encode the idhogar (the helper is defined as ``encode_Data``)
    encode_Data(df_)

    # create aggregate features
    return do_features(df_)

train = process_df(train)
test = process_df(test)

In [None]:
def _clean_household_fields(df):
    """Normalise education/dependency fields and fill missing values in place.

    The same cleaning is applied to the train and the test frame, so it lives
    in one helper instead of being written out twice.
    """
    # Rebuild 'dependency' as the square root of 'SQBdependency'.
    df['dependency'] = np.sqrt(df['SQBdependency'])

    for col in ('edjefa', 'edjefe'):
        # "no" becomes 0.
        df.loc[df[col] == "no", col] = 0

        # "yes" on a household-head row (parentesco1 == 1) is replaced by that
        # row's 'escolari' value.
        head_yes = (df[col] == "yes") & (df['parentesco1'] == 1)
        df.loc[head_yes, col] = df.loc[head_yes, "escolari"]

        # Any remaining "yes" becomes 4.
        df.loc[df[col] == "yes", col] = 4

        df[col] = df[col].astype("int")

    # Larger of the two education columns.
    df['edjef'] = np.max(df[['edjefa', 'edjefe']], axis=1)

    # Missing values are replaced by 0.
    for col in ('v2a1', 'v18q1', 'rez_esc'):
        df[col] = df[col].fillna(0)
    df.loc[df.meaneduc.isnull(), "meaneduc"] = 0
    df.loc[df.SQBmeaned.isnull(), "SQBmeaned"] = 0

    # Resolve inconsistent toilet / water flags.
    # if there is no water we'll assume they do not
    # NOTE(review): the mask is re-evaluated for the second statement after
    # v14a has already been zeroed on the matching rows, so the second
    # statement matches nothing. Left as-is because the intended rule is
    # unclear -- confirm before changing.
    df.loc[(df.v14a == 1) & (df.sanitario1 == 1) & (df.abastaguano == 0), "v14a"] = 0
    df.loc[(df.v14a == 1) & (df.sanitario1 == 1) & (df.abastaguano == 0), "sanitario1"] = 0


_clean_household_fields(train)
_clean_household_fields(test)

In [None]:
def train_test_apply_func(train_, test_, func_):
    """Apply ``func_`` to train and test stacked together, then split them again.

    Stacking lets ``func_`` see both frames at once. A placeholder ``Target``
    column of zeros is added to a copy of the test frame for the stacking and
    removed from the returned test frame. Neither input frame is modified.

    ``func_`` is assumed to keep the number and order of rows unchanged,
    because the result is split positionally at ``len(train_)``.

    Returns
    -------
    (train, test) : the transformed frames.
    """
    n_train = train_.shape[0]

    # assign() returns a new frame, so the caller's test frame stays untouched
    stacked = pd.concat([train_, test_.assign(Target=0)])

    transformed = func_(stacked)

    # copy() gives the caller an independent frame that is safe to assign into
    new_train = transformed.iloc[:n_train, :].copy()
    new_test = transformed.iloc[n_train:, :].drop('Target', axis=1)

    return new_train, new_test

In [None]:
# Replace the one-hot column groups by label-encoded columns; train and test
# are transformed together.
train, test = train_test_apply_func(train_=train, test_=test, func_=convert_OHE2LE)

Geo aggregates

In [None]:
# Label-encoded categorical columns that are one-hot encoded before averaging.
cols_2_ohe = [
    'eviv_LE', 'etecho_LE', 'epared_LE', 'elimbasu_LE',
    'energcocinar_LE', 'sanitario_LE', 'manual_elec_LE',
    'pared_LE',
]
# Numeric columns that are averaged as they are.
cols_nums = [
    'age', 'meaneduc', 'dependency',
    'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total',
    'bedrooms', 'overcrowding',
]

def convert_geo2aggs(df_):
    """Join per-``lugar_LE`` averages onto every row as ``geo_*`` columns.

    Values are first averaged within each (lugar_LE, idhogar) pair and those
    averages are then averaged per ``lugar_LE``. The result is cast to float32
    and left-joined on ``lugar_LE``.
    """
    numeric_part = df_[['lugar_LE', 'idhogar'] + cols_nums]
    dummy_part = pd.get_dummies(df_[cols_2_ohe], columns=cols_2_ohe)
    combined = pd.concat([numeric_part, dummy_part], axis=1)

    per_household = combined.groupby(['lugar_LE', 'idhogar']).mean()
    geo_agg = per_household.groupby('lugar_LE').mean().astype(np.float32)
    geo_agg = geo_agg.add_prefix('geo_')

    return df_.join(geo_agg, how='left', on='lugar_LE')

# Add the geographic averages, computed over train and test together.
train, test = train_test_apply_func(train_=train, test_=test, func_=convert_geo2aggs)

In [None]:
def _adults_per_household(df):
    """Return, for every row, how many rows of its household have age >= 18.

    Counting a single boolean Series yields a Series; the previous
    ``groupby(...).transform("count")`` on the whole frame produced a
    multi-column DataFrame, which cannot be stored in one column.
    Households without any adult get 0.
    """
    is_adult = df['age'] >= 18
    return is_adult.groupby(df['idhogar']).transform('sum').astype(float)


# Number of household members aged 18 or older, repeated on every row of the household.
train['num_over_18'] = _adults_per_household(train)
test['num_over_18'] = _adults_per_household(test)

# Additional ratio features.
def extract_features(df):
    """Add ratio features to ``df`` in place (returns None).

    Requires the columns bedrooms, rooms, v2a1, tamhog, r4t3, r4t1, hhsize
    and num_over_18.
    """
    rooms = df['rooms']
    rent = df['v2a1']

    df['bedrooms_to_rooms'] = df['bedrooms'] / rooms
    df['rent_to_rooms'] = rent / rooms
    df['tamhog_to_rooms'] = df['tamhog'] / rooms
    df['r4t3_to_tamhog'] = df['r4t3'] / df['tamhog']
    df['r4t3_to_rooms'] = df['r4t3'] / rooms
    # Rent divided by (r4t3 - r4t1). This column was previously assigned
    # v2a1 / r4t3 first and then immediately overwritten; only the value that
    # survived is computed now.
    df['v2a1_to_r4t3'] = rent / (df['r4t3'] - df['r4t1'])
    df['hhsize_to_rooms'] = df['hhsize'] / rooms
    df['rent_to_hhsize'] = rent / df['hhsize']

    # Rent per member aged 18+; where there is no such member, fall back to
    # the plain rent instead of a division by zero result.
    df['rent_to_over_18'] = rent / df['num_over_18']
    no_adults = df['num_over_18'] == 0
    df.loc[no_adults, "rent_to_over_18"] = df.loc[no_adults, 'v2a1']

# Add the ratio features to both frames (extract_features works in place).
for _frame in (train, test):
    extract_features(_frame)

In [None]:
# Drop columns treated as duplicated or unneeded, plus every column whose
# name contains 'instlevel'.
instlevel_cols = [col for col in train.columns.tolist() if 'instlevel' in col]

needless_cols = [
    'r4t3', 'tamhog', 'tamviv', 'hhsize', 'v18q', 'v14a', 'agesq',
    'mobilephone', 'female',
] + instlevel_cols

train = train.drop(columns=needless_cols)
test = test.drop(columns=needless_cols)

Split the data

In [None]:
# Household-level train / hold-out split (80% / 20% by default).
def split_data(train, y, sample_weight=None, households=None, test_percentage=0.20, seed=None):
    """Split rows into train and hold-out parts by household id.

    Parameters
    ----------
    train : DataFrame or array of features, one row per entry of ``households``.
    y : labels aligned with ``train`` (boolean-mask indexable).
    sample_weight : optional per-row weights aligned with ``train``.
    households : per-row household ids; required.
    test_percentage : fraction of distinct households placed in the hold-out part.
    seed : optional int. When given, the draw uses a private ``RandomState``
        and is reproducible; when None, numpy's global random state is used.

    Returns
    -------
    (X_train, y_train, X_test, y_test), plus the training sample weights as a
    fifth element when ``sample_weight`` is given.
    """
    if households is None:
        raise TypeError("split_data() needs the per-row `households` ids")

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Draw among distinct households (order of first appearance is kept, so
    # already-unique input yields the same candidates in the same order).
    unique_hhs = pd.unique(np.asarray(households))
    n_holdout = int(len(unique_hhs) * test_percentage)
    cv_hhs = rng.choice(unique_hhs, size=n_holdout, replace=False)

    # rows that belong to a held-out household
    cv_idx = np.isin(households, cv_hhs)

    X_test = train[cv_idx]
    y_test = y[cv_idx]

    X_train = train[~cv_idx]
    y_train = y[~cv_idx]

    if sample_weight is not None:
        y_train_weights = sample_weight[~cv_idx]
        return X_train, y_train, X_test, y_test, y_train_weights

    return X_train, y_train, X_test, y_test

In [None]:
# Model on the rows with parentesco1 == 1 only.
X = train[train['parentesco1'] == 1]
# X = train.copy()

# Labels are Target - 1; the label column is then removed from the features.
y = X['Target'] - 1
X = X.drop(columns=['Target'])

np.random.seed(seed=None)

# Work on a copy so X itself is left untouched.
train2 = X.copy()

train_hhs = train2.idhogar

# Put a random 15% of the distinct household ids into the validation set.
households = train2.idhogar.unique()
n_validation_hhs = int(len(households) * 0.15)
cv_hhs = np.random.choice(households, size=n_validation_hhs, replace=False)

# Boolean mask of the rows that belong to a selected household.
cv_idx = np.isin(train2.idhogar, cv_hhs)

X_test = train2[cv_idx]
y_test = y[cv_idx]

# train on entire dataset
# NOTE(review): the training data therefore also contains the X_test rows,
# so scores computed on X_test / y_test are not out-of-sample.
X_train = train2
y_train = y

train_households = X_train.idhogar

In [None]:
# Per-sample weights to counter class imbalance: 'balanced' weights each
# sample inversely to the frequency of its class in y_train.
y_train_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

In [None]:
# Features removed before fitting the XGBoost models.
extra_drop_features = [
 'agg18_estadocivil1_MEAN',
 'agg18_estadocivil6_COUNT',
 'agg18_estadocivil7_COUNT',
 'agg18_parentesco10_COUNT',
 'agg18_parentesco11_COUNT',
 'agg18_parentesco12_COUNT',
 'agg18_parentesco1_COUNT',
 'agg18_parentesco2_COUNT',
 'agg18_parentesco3_COUNT',
 'agg18_parentesco4_COUNT',
 'agg18_parentesco5_COUNT',
 'agg18_parentesco6_COUNT',
 'agg18_parentesco7_COUNT',
 'agg18_parentesco8_COUNT',
 'agg18_parentesco9_COUNT',
 'geo_elimbasu_LE_4',
 'geo_energcocinar_LE_1',
 'geo_energcocinar_LE_2',
 'geo_epared_LE_0',
 'geo_hogar_mayor',
 'geo_manual_elec_LE_2',
 'geo_pared_LE_3',
 'geo_pared_LE_4',
 'geo_pared_LE_5',
 'geo_pared_LE_6',
 'num_over_18',
 'parentesco_LE',
 'rez_esc',
]



In [None]:
# Columns removed before fitting / predicting with the XGBoost models.
xgb_drop_cols = [*extra_drop_features, "idhogar", 'parentesco1']

Fit a voting classifier

In [None]:
# Hyper-parameter sets tried for the XGBoost models. Set #5 is the one in
# effect; the others are kept as comments for reference.
#4
# opt_parameters = {'max_depth':35, 'eta':0.1, 'silent':0, 'objective':'multi:softmax', 'min_child_weight': 1, 'num_class': 4, 'gamma': 2.0, 'colsample_bylevel': 0.9, 'subsample': 0.84, 'colsample_bytree': 0.88, 'reg_lambda': 0.40 }
#5
opt_parameters = {
    'max_depth': 35,
    'eta': 0.15,
    'silent': 1,
    'objective': 'multi:softmax',
    'min_child_weight': 2,
    'num_class': 4,
    'gamma': 2.5,
    'colsample_bylevel': 1,
    'subsample': 0.95,
    'colsample_bytree': 0.85,
    'reg_lambda': 0.35,
}
#6
# opt_parameters = {'max_depth':35, 'eta':0.15, 'silent':0, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 2.75, 'colsample_bylevel': 0.95, 'subsample': 0.95, 'colsample_bytree': 0.85, 'reg_lambda': 0.35 }
# 7
# opt_parameters = {'max_depth':35, 'eta':0.12, 'silent':0, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 3.25, 'colsample_bylevel': 0.95, 'subsample': 0.88, 'colsample_bytree': 0.88, 'reg_lambda': 0.35 }

def evaluate_macroF1_lgb(predictions, truth):
    """Custom evaluation metric: one minus the macro-averaged F1 score.

    ``predictions`` is treated as a 2-D score array (argmax over axis 1 gives
    the predicted label) and ``truth`` as an object exposing ``get_label()``.
    Returns the pair ``('macroF1', 1 - f1)``, so lower is better.
    """
    # this follows the discussion in https://github.com/Microsoft/LightGBM/issues/1483
    predicted = predictions.argmax(axis=1)
    actual = truth.get_label()
    macro_f1 = f1_score(actual, predicted, average='macro')
    return ('macroF1', 1 - macro_f1)

# Keyword arguments forwarded to each booster's fit() call.
fit_params = dict(
    early_stopping_rounds=500,
    eval_metric=evaluate_macroF1_lgb,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False,
)

# Exponentially decaying learning-rate schedule with a lower bound.
def learning_rate_power_0997(current_iter):
    """Return ``0.1 * 0.995 ** current_iter``, but never less than 0.02.

    Note: despite the ``0997`` in the name, the decay factor used is 0.995.
    """
    floor_lr = 0.02
    decayed = 0.1 * np.power(.995, current_iter)
    return floor_lr if floor_lr > decayed else decayed

# Override the verbosity setting: use 50 instead of False.
fit_params.update(verbose=50)

In [None]:
np.random.seed(100)

def _parallel_fit_estimator(estimator1, X, y, sample_weight=None, threshold=True, **fit_params):
    """Fit a clone of ``estimator1`` on a fresh random household split.

    Each attempt holds out a random part of the households (via
    ``split_data`` and the module-level ``train_households``), fits on the
    rest and scores macro-F1 on both parts. With ``threshold=True`` the
    attempt is repeated on a new split until the scores pass the acceptance
    rule below; with ``threshold=False`` the first fit is returned.

    Extra-trees / random-forest estimators are fitted without sample weights
    and without ``fit_params``; every other estimator receives ``fit_params``
    with ``eval_set`` replaced by the held-out part.

    Retries run in a loop rather than by recursion, so a long run of rejected
    fits cannot exhaust the call stack. Scores are computed from predictions
    for every estimator type instead of reading ``evals_result_`` keys that
    only exist for particular eval-set configurations.
    """
    is_forest = isinstance(estimator1, (ExtraTreesClassifier, RandomForestClassifier))

    while True:
        estimator = clone(estimator1)

        # randomly split the data so we have a test set for early stopping
        if sample_weight is not None:
            X_train, y_train, X_test, y_test, y_train_weight = split_data(
                X, y, sample_weight, households=train_households)
        else:
            X_train, y_train, X_test, y_test = split_data(
                X, y, None, households=train_households)
            y_train_weight = None

        # update the fit params with our new split
        fit_params["eval_set"] = [(X_test, y_test)]

        # fit the estimator
        if is_forest:
            estimator.fit(X_train, y_train)
        elif y_train_weight is not None:
            estimator.fit(X_train, y_train, sample_weight=y_train_weight, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

        best_train = f1_score(y_train, estimator.predict(X_train), average="macro")
        best_cv = f1_score(y_test, estimator.predict(X_test), average="macro")
        print("Train F1:", best_train)
        print("Test F1:", best_cv)

        if not threshold:
            return estimator

        # reject some estimators based on their performance on train and test sets;
        # if the valid score is very high we'll allow a little more leeway with the train scores
        if ((best_cv > 0.37) and (best_train > 0.75)) or ((best_cv > 0.44) and (best_train > 0.65)):
            return estimator

        # else try again on a new split until we get a better one
        print("Unacceptable!!! Trying again...")

class VotingClassifierLGBM(VotingClassifier):
    """VotingClassifier whose ``fit`` forwards extra fit parameters.

    Every non-None estimator is fitted through ``_parallel_fit_estimator``,
    which receives ``sample_weight``, ``threshold`` and ``**fit_params``.
    """

    def fit(self, X, y, sample_weight=None, threshold=True, **fit_params):
        """Validate the configuration, encode ``y`` and fit all estimators.

        Raises NotImplementedError for 2-D multi-column ``y`` arrays,
        ValueError for an invalid ``voting`` value, a weights/estimators
        length mismatch or when all estimators are None, and AttributeError
        when ``estimators`` is missing or empty. Returns ``self``.
        """
        y_is_multi_column = isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1
        if y_is_multi_column:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % self.voting)

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if self.weights is not None and len(self.weights) != len(self.estimators):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d estimators'
                             % (len(self.weights), len(self.estimators)))

        names, clfs = zip(*self.estimators)
        self._validate_names(names)

        if all(clf is None for clf in clfs):
            raise ValueError('All estimators are None. At least one is '
                             'required to be a classifier!')

        # Encode the labels; the fitted encoder and its classes are stored on self.
        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)

        fit_jobs = (
            delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
                                             sample_weight=sample_weight,
                                             threshold=threshold, **fit_params)
            for clf in clfs if clf is not None
        )
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(fit_jobs)

        return self

In [None]:
# Build 15 XGBoost classifiers that differ only in their random seed
# (300 trees each, learning rate 0.15).
clfs = []
for i in range(15):
    clf = xgb.XGBClassifier(
        random_state=217 + i,
        n_estimators=300,
        learning_rate=0.15,
        n_jobs=4,
        **opt_parameters
    )
    clfs.append(('xgb%d' % i, clf))

# Combine them in a soft-voting ensemble.
vc = VotingClassifierLGBM(clfs, voting='soft')
del clfs

# Final fit.
_ = vc.fit(
    X_train.drop(columns=xgb_drop_cols),
    y_train,
    sample_weight=y_train_weights,
    threshold=False,
    **fit_params
)

clf_final = vc.estimators_[0]

In [None]:
# Macro-F1 on the validation rows for a single XGBoost model and for the
# ensemble under soft and hard voting.
X_val_xgb = X_test.drop(xgb_drop_cols, axis=1)

global_score = f1_score(y_test, clf_final.predict(X_val_xgb), average='macro')

vc.voting = 'soft'
global_score_soft = f1_score(y_test, vc.predict(X_val_xgb), average='macro')

vc.voting = 'hard'
global_score_hard = f1_score(y_test, vc.predict(X_val_xgb), average='macro')

# The labels name the models actually used (XGBoost) and their real count.
n_xgb = len(vc.estimators_)
print('Validation score of a single XGBoost classifier: {:.4f}'.format(global_score))
print('Validation score of a VotingClassifier on {} XGBoost models with soft voting strategy: {:.4f}'.format(n_xgb, global_score_soft))
print('Validation score of a VotingClassifier on {} XGBoost models with hard voting strategy: {:.4f}'.format(n_xgb, global_score_hard))

In [None]:
# see which features are not used by ANY models
useless_features = []   # zero-importance features, one list per model
drop_features = set()   # features with zero importance in every model
counter = 0             # number of models processed so far
xgb_train_features = X_train.drop(xgb_drop_cols, axis=1)
for est in vc.estimators_:
    # ranked features and zero-importance features of this model
    ranked_features, unused_features = feature_importance(est, xgb_train_features, display_results=False)
    useless_features.append(unused_features)
    # keep only the features that every model so far left unused
    unused_set = set(unused_features)
    drop_features = unused_set if counter == 0 else drop_features & unused_set
    counter += 1

drop_features

In [None]:
# Print the feature ranking of the first XGBoost model. The call returns the
# pair (ranked feature names, zero-importance feature names).
ranked_features = feature_importance(forest=clf_final, X_train=X_train.drop(xgb_drop_cols, axis=1))

Random Forest

In [None]:
# Columns removed before fitting the random forests: all agg18_* household
# aggregates, then the ids and a few fe_* ratio features.
# ('parentesco_LE' and 'rez_esc' were left commented out of this list.)
et_drop_cols = ['agg18_age_MAX', 'agg18_age_MEAN', 'agg18_age_MIN', 'agg18_dis_MEAN',
                'agg18_escolari_MAX', 'agg18_escolari_MEAN', 'agg18_escolari_MIN']

# agg18_estadocivil1..7, COUNT then MEAN for each
et_drop_cols += ['agg18_estadocivil{}_{}'.format(k, stat)
                 for k in range(1, 8)
                 for stat in ('COUNT', 'MEAN')]

# agg18_parentesco10..12 followed by 1..9, COUNT then MEAN for each
et_drop_cols += ['agg18_parentesco{}_{}'.format(k, stat)
                 for k in [10, 11, 12] + list(range(1, 10))
                 for stat in ('COUNT', 'MEAN')]

et_drop_cols += ["idhogar", "parentesco1", 'fe_rent_per_person', 'fe_rent_per_room',
                 'fe_tablet_adult_density', 'fe_tablet_density']

In [None]:
# Build 10 random forests that differ only in their random seed.
ets = []
for i in range(10):
    rf = RandomForestClassifier(
        max_depth=None,
        random_state=217 + i,
        n_jobs=4,
        n_estimators=700,
        min_impurity_decrease=1e-3,
        min_samples_leaf=2,
        verbose=0,
        class_weight="balanced",
    )
    ets.append(('rf%d' % i, rf))

# Combine the forests in a soft-voting ensemble and fit it.
vc2 = VotingClassifierLGBM(ets, voting='soft')
_ = vc2.fit(X_train.drop(columns=et_drop_cols), y_train, threshold=False)

In [None]:
# Macro-F1 of the random-forest ensemble on the validation rows under soft
# and hard voting.
X_val_rf = X_test.drop(et_drop_cols, axis=1)

vc2.voting = 'soft'
global_rf_score_soft = f1_score(y_test, vc2.predict(X_val_rf), average='macro')
vc2.voting = 'hard'
global_rf_score_hard = f1_score(y_test, vc2.predict(X_val_rf), average='macro')

# The labels name the models actually used (random forests) and their real count.
n_rf = len(vc2.estimators_)
print('Validation score of a VotingClassifier on {} random forests with soft voting strategy: {:.4f}'.format(n_rf, global_rf_score_soft))
print('Validation score of a VotingClassifier on {} random forests with hard voting strategy: {:.4f}'.format(n_rf, global_rf_score_hard))

In [None]:
# w/o threshold, extra drop cols
X_val_rf = X_test.drop(et_drop_cols, axis=1)

vc2.voting = 'soft'
global_rf_score_soft = f1_score(y_test, vc2.predict(X_val_rf), average='macro')
vc2.voting = 'hard'
global_rf_score_hard = f1_score(y_test, vc2.predict(X_val_rf), average='macro')

# The labels name the models actually used (random forests) and their real count.
n_rf = len(vc2.estimators_)
print('Validation score of a VotingClassifier on {} random forests with soft voting strategy: {:.4f}'.format(n_rf, global_rf_score_soft))
print('Validation score of a VotingClassifier on {} random forests with hard voting strategy: {:.4f}'.format(n_rf, global_rf_score_hard))

In [None]:
# Find the features that no random forest in the ensemble uses.
useless_features = []   # zero-importance features, one list per model
drop_features = set()   # features with zero importance in every model
counter = 0             # number of models processed so far
for est in vc2.estimators_:
    ranked_features, unused_features = feature_importance(est, X_train.drop(et_drop_cols, axis=1), display_results=False)
    useless_features.append(unused_features)
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features.intersection(set(unused_features))
    counter += 1

drop_features

In [None]:
# Blend the two voting ensembles into one prediction.
def combine_voters(data, weights=(0.5, 0.5)):
    """Predict labels from a weighted sum of both ensembles' probabilities.

    Parameters
    ----------
    data : DataFrame containing the columns both ensembles need; each
        ensemble's own drop list is applied before predicting.
    weights : pair ``(xgb_weight, rf_weight)`` applied to the class
        probabilities of ``vc`` and ``vc2``. The default is a tuple so the
        default object cannot be mutated between calls.

    Returns
    -------
    numpy array with the index of the highest blended probability per row.

    Side effect: sets ``voting = "soft"`` on both ``vc`` and ``vc2``.
    """
    # do soft voting with both classifiers
    vc.voting = "soft"
    vc1_probs = vc.predict_proba(data.drop(xgb_drop_cols, axis=1))
    vc2.voting = "soft"
    vc2_probs = vc2.predict_proba(data.drop(et_drop_cols, axis=1))

    # weighted sum of the two probability matrices
    final_vote = (vc1_probs * weights[0]) + (vc2_probs * weights[1])
    predictions = np.argmax(final_vote, axis=1)

    return predictions

In [None]:
# Validation macro-F1 of the blend with equal weights.
combo_preds = combine_voters(data=X_test, weights=[0.5, 0.5])
global_combo_score_soft = f1_score(y_true=y_test, y_pred=combo_preds, average='macro')
global_combo_score_soft

In [None]:
# Validation macro-F1 of the blend weighted 0.4 (XGBoost) / 0.6 (random forest).
combo_preds = combine_voters(data=X_test, weights=[0.4, 0.6])
global_combo_score_soft = f1_score(y_true=y_test, y_pred=combo_preds, average='macro')
global_combo_score_soft

In [None]:
# Validation macro-F1 of the blend weighted 0.6 (XGBoost) / 0.4 (random forest).
combo_preds = combine_voters(data=X_test, weights=[0.6, 0.4])
global_combo_score_soft = f1_score(y_true=y_test, y_pred=combo_preds, average='macro')
global_combo_score_soft

Prepare submission

In [None]:
# Submission skeleton: one row per test id.
y_subm = pd.DataFrame({'Id': test_ids})

In [None]:
# Predictions are 0-based class indices; adding 1 restores the original
# Target labels (the labels were shifted by -1 before training).

# XGBoost ensemble, soft voting
vc.voting = 'soft'
y_subm_lgb = y_subm.assign(Target=vc.predict(test.drop(xgb_drop_cols, axis=1)) + 1)

# random-forest ensemble, soft voting
vc2.voting = 'soft'
y_subm_rf = y_subm.assign(Target=vc2.predict(test.drop(et_drop_cols, axis=1)) + 1)

# blend of both ensembles
# NOTE(review): this uses combine_voters' default weights, while
# global_combo_score_soft (used in the ensemble file name) was last computed
# with weights [0.6, 0.4] -- confirm which blend is intended.
y_subm_ens = y_subm.assign(Target=combine_voters(test) + 1)

In [None]:
from datetime import datetime
now = datetime.now()
# one timestamp shared by all three file names
stamp = now.strftime('%Y-%m-%d-%H-%M')

# File names carry the validation score of the corresponding model.
sub_file_lgb = 'submission_soft_XGB_{:.4f}_{}.csv'.format(global_score_soft, stamp)
sub_file_rf = 'submission_soft_RF_{:.4f}_{}.csv'.format(global_rf_score_soft, stamp)
sub_file_ens = 'submission_ens_{:.4f}_{}.csv'.format(global_combo_score_soft, stamp)

y_subm_lgb.to_csv(sub_file_lgb, index=False)
y_subm_rf.to_csv(sub_file_rf, index=False)
y_subm_ens.to_csv(sub_file_ens, index=False)