# Costa Rican Household Poverty Level Prediction - XGBoost

참고 : https://www.kaggle.com/skooch/xgboost

# 1st

# LGBM with random split for early stopping

- 이 커널은 (가구의 집계를 추출한 후) 가정의 가장만 학습시킨다. 점수는 가정의 가장으로만 평가된다. 모든 가족구성원은 test + sample submission에 포함되지만, 가장만 점수가 매겨진다. 하지만, 현재 평가는 가장이 아닌 가족구성원에게도 이루어지는 것으로 보인다.
- 클래스 빈도의 균형을 맞추는 것은 아주 중요하다. 균형을 맞추지 않으면 학습된 모델은 성능이 낮다. 이 작업은 직접 할 수도 있고, 언더샘플링을 할 수도 있다. 하지만 가장 간단한 것은 (그리고 언더샘플링보다 더 강력한 것은) sklearn API의 LightGBM모델 생성시 class_weight='balanced'로 설정하는 것이다.
- 이 커널은 macro F1 score를 사용해 학습을 조기중단한다. 이 작업은 scoring 전략에 맞춰 수행한다.
- 범주형 변수는 blind label encoding 대신 적절히 매핑된 숫자로 변환된다.
- OHE(One-Hot Encoding)가 label encoding되면, 트리 모델에서 digest하기 더 쉽다. 이 트릭은 트리모델이 아닌 모델에 안 좋을 수 있으니 조심해야한다.
- idhogar은 학습에 사용되지 않는다. idhogar이 정보를 가질 수 있는 유일한 경우는 데이터 누수(data leak)가 있는 경우다.
- 가정 내에서 집계가 이루어지며 새로운 변수를 직접 생성한다. 대부분의 변수는 가정 수준에서 이미 사용되기 때문에 집계될 수 있는 변수가 많지 않다.
- voting classifier는 여러 LightGBM 모델의 평균을 내는 데 사용한다.

In [1]:
import numpy as np   # linear algebra
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score
#from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import class_weight

import warnings
warnings.filterwarnings("ignore")

범주형 변수 매핑

In [2]:
from sklearn.preprocessing import LabelEncoder

# Only the idhogar field is encoded here; the function is reused elsewhere.
def encode_data(df):
    """Replace the ``idhogar`` column of ``df`` with integer labels, in place.

    A fresh ``LabelEncoder`` is fitted on every call, so the codes produced
    for two different frames are not comparable with each other.
    Returns ``None``; the caller's frame is modified.
    """
    household_encoder = LabelEncoder()
    household_codes = household_encoder.fit_transform(df['idhogar'])
    df['idhogar'] = household_codes
    
# Feature-importance report for sklearn-style tree models
def feature_importance(forest, X_train, display_results=True):
    """Rank the columns of ``X_train`` by ``forest.feature_importances_``.

    Args:
        forest: fitted estimator exposing ``feature_importances_``.
        X_train: DataFrame whose columns line up with those importances.
        display_results: when true, print the ranking line by line.

    Returns:
        ``(ranked_list, zero_features)``: column names ordered from most to
        least important, and the names whose importance is exactly 0.0.
    """
    importances = forest.feature_importances_
    # Descending order of importance
    order = np.argsort(importances)[::-1]
    columns = X_train.columns

    if display_results:
        print("Feature ranking :")

    ranked_list = []
    zero_features = []
    for rank in range(X_train.shape[1]):
        col_idx = order[rank]
        score = importances[col_idx]
        name = columns[col_idx]

        if display_results:
            print("%d. feature %d (%f)" % (rank + 1, col_idx, score) + " - " + name)

        ranked_list.append(name)
        if score == 0.0:
            zero_features.append(name)

    return ranked_list, zero_features

In [3]:
def do_features(df):
    """Add ratio/difference features and adult (age >= 18) household aggregates.

    The ``fe_*`` columns are written onto the frame that was passed in, so the
    caller's ``df`` gains them as a side effect. The returned frame is a new
    object that additionally carries the ``agg18_*`` columns and no longer has
    the ``Id`` column.
    """
    # (new feature suffix, numerator, denominator)
    ratio_specs = [('children_fraction', 'r4t1', 'r4t3'),
                   ('working_man_fraction', 'r4h2', 'r4t3'),
                   ('all_man_fraction', 'r4h3', 'r4t3'),
                   ('human_density', 'tamviv', 'rooms'),
                   ('human_bed_density', 'tamviv', 'bedrooms'),
                   ('rent_per_person', 'v2a1', 'r4t3'),
                   ('rent_per_room', 'v2a1', 'rooms'),
                   ('mobile_density', 'qmobilephone', 'r4t3'),
                   ('tablet_density', 'v18q1', 'r4t3'),
                   ('mobile_adult_density', 'qmobilephone', 'r4t2'),
                   ('tablet_adult_density', 'v18q1', 'r4t2')]
    # (new feature suffix, minuend, subtrahend)
    diff_specs = [('people_not_living', 'tamhog', 'tamviv'),
                  ('people_weird_stat', 'tamhog', 'r4t3')]

    for suffix, numerator, denominator in ratio_specs:
        ratio = df[numerator] / df[denominator]
        df['fe_' + suffix] = ratio.astype(np.float32)
    for suffix, minuend, subtrahend in diff_specs:
        delta = df[minuend] - df[subtrahend]
        df['fe_' + suffix] = delta.astype(np.float32)

    # Per-household aggregation rules: numeric columns first, then `dis`,
    # then every one-hot column of the three listed families.
    agg_rules = {'age': ['min', 'max', 'mean'],
                 'escolari': ['min', 'max', 'mean'],
                 'dis': ['mean']}
    for prefix in ['estadocivil', 'parentesco', 'instlevel']:
        for column in df.columns:
            if column.startswith(prefix):
                agg_rules[column] = ['mean', 'count']

    # Aggregate over rows with age >= 18 only, then broadcast back to all rows
    adults = df.query('age >= 18')
    adult_agg = adults.groupby('idhogar').agg(agg_rules).astype(np.float32)
    adult_agg.columns = pd.Index(['agg18_' + column + "_" + stat.upper()
                                  for column, stat in adult_agg.columns.tolist()])
    joined = df.join(adult_agg, how='left', on='idhogar')

    # Drop the row identifier
    return joined.drop(['Id'], axis=1)

In [4]:
# Collapse one-hot encoded field groups into label-encoded columns
def convert_OHE2LE(df):
    """Return a copy of ``df`` where each one-hot group becomes one ``<group>_LE`` column.

    Groups are found by column-name prefix, except ``manual_elec`` whose four
    columns are listed by hand. If some row has no active column in a group,
    a ``<group>_dummy`` indicator is added first so every row has a category.
    The source one-hot columns are dropped afterwards, except ``parentesco1``,
    which is kept.
    """
    out = df.copy(deep=True)
    groups = ['pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu',
              'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 'instlevel', 'lugar',
              'tipovivi', 'manual_elec']

    for s_ in groups:
        if 'manual_' not in s_:
            # Prefix match against the *input* columns, so helper columns
            # added to `out` in earlier iterations are never picked up.
            group_cols = [name for name in df.columns if name.startswith(s_)]
        elif 'elec' in s_:
            group_cols = ['public', 'planpri', 'noelec', 'coopele']

        row_totals = out[group_cols].sum(axis=1)

        # Handle groups where some row has no active one-hot column
        if 0 in row_totals.unique():
            print('The OHE in {} is incomplete. A new column will be added before label encoding'.format(s_))
            dummy_name = s_ + '_dummy'
            out[dummy_name] = (row_totals == 0).astype(np.int8)
            group_cols.append(dummy_name)
            # Verify that every row now has a category
            if 0 in out[group_cols].sum(axis=1).unique():
                print('The category completion did not work')

        active_column = out[group_cols].idxmax(axis=1)
        encoded = LabelEncoder().fit_transform(active_column)
        out[s_ + '_LE'] = encoded.astype(np.int16)

        # parentesco1 stays in the frame; everything else in the group goes
        if 'parentesco1' in group_cols:
            group_cols.remove('parentesco1')
        out.drop(group_cols, axis=1, inplace=True)

    return out

# Read in the data and clean it up

In [5]:
# Load the train and test tables from the relative ../input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Keep the test Ids now: do_features() later drops the Id column, and the
# submission frame is built from this Series.
test_ids = test.Id

In [6]:
def process_df(df_):
    """Label-encode ``idhogar`` on ``df_`` in place, then return ``do_features(df_)``."""
    encode_data(df_)  # mutates df_: idhogar becomes integer codes
    engineered = do_features(df_)  # adds the engineered/aggregate columns
    return engineered


# Each frame is processed separately, so idhogar is encoded per frame.
train = process_df(train)
test = process_df(test)

결측 데이터를 처리하고 object를 numeric으로 변환한다.

In [7]:
# Per the original author, dependency has unusable (NA) entries, so it is
# rebuilt as the square root of SQBdependency.
train['dependency'] = np.sqrt(train['SQBdependency'])
test['dependency'] = np.sqrt(test['SQBdependency'])

# Replace the string "no" in the education fields with 0
train.loc[train['edjefa'] == "no", "edjefa"] = 0
train.loc[train['edjefe'] == "no", "edjefe"] = 0
test.loc[test['edjefa'] == "no", "edjefa"] = 0
test.loc[test['edjefe'] == "no", "edjefe"] = 0

# Where the value is "yes" and the row is the head of household
# (parentesco1 == 1), substitute that row's escolari
train.loc[(train['edjefa'] == "yes") & (train['parentesco1'] == 1), "edjefa"] = train.loc[(train['edjefa'] == "yes") & (train['parentesco1'] == 1), "escolari"]
train.loc[(train['edjefe'] == "yes") & (train['parentesco1'] == 1), "edjefe"] = train.loc[(train['edjefe'] == "yes") & (train['parentesco1'] == 1), "escolari"]
test.loc[(test['edjefa'] == "yes") & (test['parentesco1'] == 1), "edjefa"] = test.loc[(test['edjefa'] == "yes") & (test['parentesco1'] == 1), "escolari"]
test.loc[(test['edjefe'] == "yes") & (test['parentesco1'] == 1), "edjefe"] = test.loc[(test['edjefe'] == "yes") & (test['parentesco1'] == 1), "escolari"]

# Per the original author this field should be an interaction between gender
# and escolari, but the meaning of "yes" is unclear; remaining "yes" values
# are replaced with 4.
train.loc[train['edjefa'] == "yes", "edjefa"] = 4
train.loc[train['edjefe'] == "yes", "edjefe"] = 4
test.loc[test['edjefa'] == "yes", "edjefa"] = 4
test.loc[test['edjefe'] == "yes", "edjefe"] = 4

# Cast to int for the models
train['edjefe'] = train['edjefe'].astype("int")
train['edjefa'] = train['edjefa'].astype("int")
test['edjefe'] = test['edjefe'].astype("int")
test['edjefa'] = test['edjefa'].astype("int")

# New feature: the larger of edjefa and edjefe on each row
train['edjef'] = np.max(train[['edjefa','edjefe']], axis=1)
test['edjef'] = np.max(test[['edjefa','edjefe']], axis=1)

# Fill some NAs with 0
train['v2a1'] = train['v2a1'].fillna(0)
test['v2a1'] = test['v2a1'].fillna(0)
train['v18q1'] = train['v18q1'].fillna(0)
test['v18q1'] = test['v18q1'].fillna(0)
train['rez_esc']=train['rez_esc'].fillna(0)
test['rez_esc']=test['rez_esc'].fillna(0)

train.loc[train.meaneduc.isnull(), "meaneduc"] = 0
train.loc[train.SQBmeaned.isnull(), "SQBmeaned"] = 0
test.loc[test.meaneduc.isnull(), "meaneduc"] = 0
test.loc[test.SQBmeaned.isnull(), "SQBmeaned"] = 0


# Fix a data inconsistency: per the original author, some rows say both that
# the household has a toilet and that it does not; the stated assumption is
# that a household without water has no toilet.
# NOTE(review): the first assignment of each pair sets v14a to 0 on every
# matching row, so the second mask (which still requires v14a == 1) matches
# nothing and sanitario1 is left unchanged. Confirm whether that is intended.
train.loc[(train.v14a ==  1) & (train.sanitario1 ==  1) & (train.abastaguano == 0), "v14a"] = 0
train.loc[(train.v14a ==  1) & (train.sanitario1 ==  1) & (train.abastaguano == 0), "sanitario1"] = 0
test.loc[(test.v14a ==  1) & (test.sanitario1 ==  1) & (test.abastaguano == 0), "v14a"] = 0
test.loc[(test.v14a ==  1) & (test.sanitario1 ==  1) & (test.abastaguano == 0), "sanitario1"] = 0

In [8]:
def train_test_apply_func(train_, test_, func_):
    """Apply ``func_`` to train and test stacked together, then split them again.

    Running the transformation on the concatenated frame guarantees that both
    parts end up with the same derived columns and encodings.

    Args:
        train_: training frame; must contain a ``Target`` column.
        test_: test frame. It is not modified: a placeholder ``Target`` of 0
            is added on a temporary copy only.
        func_: callable taking the stacked frame and returning a frame with
            the same number of rows in the same order.

    Returns:
        ``(train, test)``: the first ``len(train_)`` rows of the result, and
        the remaining rows with the placeholder ``Target`` column removed.
    """
    n_train = train_.shape[0]

    # assign() returns a new frame, so the caller's test_ keeps its columns
    stacked = pd.concat([train_, test_.assign(Target=0)])
    transformed = func_(stacked)

    # Copy the train slice so later column assignments on it do not operate
    # on a view of the stacked frame.
    train_out = transformed.iloc[:n_train, :].copy()
    test_out = transformed.iloc[n_train:, :].drop('Target', axis=1)
    return train_out, test_out

In [9]:
# Collapse the one-hot groups into *_LE columns, on train and test stacked together
train, test = train_test_apply_func(train, test, convert_OHE2LE)

The OHE in techo is incomplete. A new column will be added before label encoding
The OHE in instlevel is incomplete. A new column will be added before label encoding
The OHE in manual_elec is incomplete. A new column will be added before label encoding


# Geo aggregate

In [10]:
# Label-encoded columns that convert_geo2aggs one-hot encodes before averaging
cols_2_ohe = ['eviv_LE', 'etecho_LE', 'epared_LE', 'elimbasu_LE', 
              'energcocinar_LE', 'sanitario_LE', 'manual_elec_LE', 'pared_LE']
# Numeric columns that convert_geo2aggs averages
cols_nums = ['age', 'meaneduc', 'dependency', 'hogar_nin', 'hogar_adul', 
             'hogar_mayor', 'hogar_total', 'bedrooms', 'overcrowding']

def convert_geo2aggs(df_):
    """Join per-``lugar_LE`` averages onto every row as ``geo_*`` columns.

    The columns in ``cols_nums`` and the one-hot expansion of ``cols_2_ohe``
    are first averaged per (``lugar_LE``, ``idhogar``) pair, and those
    averages are then averaged per ``lugar_LE``.

    NOTE(review): idhogar is label-encoded separately for train and test
    earlier in the file, while this function is applied to both stacked
    together, so equal codes from the two frames would be grouped as one
    household here. Confirm whether that matters.
    """
    keys_and_numeric = df_[['lugar_LE', 'idhogar'] + cols_nums]
    one_hot = pd.get_dummies(df_[cols_2_ohe], columns=cols_2_ohe)
    combined = pd.concat([keys_and_numeric, one_hot], axis=1)

    per_household = combined.groupby(['lugar_LE', 'idhogar']).mean()
    per_region = per_household.groupby('lugar_LE').mean().astype(np.float32)
    per_region.columns = pd.Index(['geo_' + name for name in per_region.columns.tolist()])

    return df_.join(per_region, how='left', on='lugar_LE')

# Add the geography aggregates (computed on train and test stacked together)
train, test = train_test_apply_func(train, test, convert_geo2aggs)

In [11]:
# Add the number of people aged 18 or over in each household
def _count_household_adults(df):
    """Return, for every row of ``df``, how many rows of its household have age >= 18.

    The previous implementation assigned the multi-column result of
    ``groupby(...).transform("count")`` to a single column, which current
    pandas rejects. Summing an adult indicator per ``idhogar`` gives the same
    numbers directly, including 0 for households without adults. The result
    is float, matching the dtype the old count/max/fillna chain produced.
    """
    is_adult = (df['age'] >= 18).astype(np.float64)
    return is_adult.groupby(df['idhogar']).transform('sum')


train['num_over_18'] = _count_household_adults(train)
test['num_over_18'] = _count_household_adults(test)


# Add ratio features
def extract_features(df):
    """Add room/rent/household-size ratio columns to ``df`` in place.

    Expects the columns bedrooms, rooms, v2a1, tamhog, r4t3, r4t1, hhsize and
    num_over_18. Returns ``None``; the caller's frame is modified.

    ``v2a1_to_r4t3`` used to be assigned twice in a row, so only the second
    value (rent divided by ``r4t3 - r4t1``) ever survived. The dead first
    assignment is removed; the resulting column is unchanged.
    """
    df['bedrooms_to_rooms'] = df['bedrooms'] / df['rooms']
    df['rent_to_rooms'] = df['v2a1'] / df['rooms']
    df['tamhog_to_rooms'] = df['tamhog'] / df['rooms']  # tamhog: household size
    df['r4t3_to_tamhog'] = df['r4t3'] / df['tamhog']  # r4t3: total persons in the household
    df['r4t3_to_rooms'] = df['r4t3'] / df['rooms']
    # Despite its name this is rent per (r4t3 - r4t1), not rent per r4t3.
    # NOTE(review): r4t1 appears to be the under-12 count, which would make
    # this rent per person aged 12+; confirm against the data dictionary.
    df['v2a1_to_r4t3'] = df['v2a1'] / (df['r4t3'] - df['r4t1'])
    df['hhsize_to_rooms'] = df['hhsize'] / df['rooms']
    df['rent_to_hhsize'] = df['v2a1'] / df['hhsize']
    df['rent_to_over_18'] = df['v2a1'] / df['num_over_18']

    # Households without anyone aged 18+ would divide by zero: use the total rent
    no_adults = df['num_over_18'] == 0
    df.loc[no_adults, 'rent_to_over_18'] = df.loc[no_adults, 'v2a1']
    
# extract_features adds its columns in place and returns nothing
extract_features(train)
extract_features(test)

In [12]:
# Drop redundant columns
needless_cols = ['r4t3', 'tamhog', 'tamviv', 'hhsize', 'v18q', 'v14a', 'agesq', 'mobilephone', 'female', ]
# Substring match: this picks up every column whose name contains 'instlevel',
# which includes instlevel_LE and the agg18_instlevel* aggregates if present.
instlevel_cols = [s for s in train.columns.tolist() if 'instlevel' in s]
needless_cols.extend(instlevel_cols)

train = train.drop(needless_cols, axis=1)
test = test.drop(needless_cols, axis=1)

## Split the data

동일한 가정에 속하는 행은 보통 동일한 target을 가지므로, 데이터 누수(leakage)를 방지하기 위해 가정별로 데이터를 분할한다. 가장만 포함하도록 데이터를 필터링하므로 기술적으로는 필요하지 않지만, 이렇게 해 두면 원할 때 전체 train 데이터를 쉽게 사용할 수 있다.

데이터를 분할한 후, train 데이터를 전체 데이터셋으로 덮어써, 모든 데이터를 학습시킬 수 있다. split_data 함수는 데이터를 덮어쓰지 않고 동일한 작업을 하며, train 반복 내에서 K-Fold 분할을 근사화하는 데 사용된다.

In [13]:
def split_data(train, y, sample_weight=None, households=None, test_percentage=0.20, seed=None):
    """Randomly hold out a fraction of households as a validation set.

    Args:
        train: feature frame/array, one row per entry of ``households``.
        y: targets aligned with ``train``.
        sample_weight: optional per-row weights aligned with ``train``.
        households: household id of every row of ``train`` (required).
        test_percentage: fraction of ``len(households)`` to draw for the
            hold-out set.
        seed: when given, the draw uses a private ``RandomState(seed)`` and is
            reproducible. When ``None`` the draw comes from NumPy's global
            random stream, as it always did (``seed`` used to be ignored).

    Returns:
        ``(X_train, y_train, X_test, y_test)``, with the training weights
        appended as a fifth element when ``sample_weight`` is given.

    Raises:
        ValueError: if ``households`` is not provided.
    """
    if households is None:
        raise ValueError("households must be provided (one household id per row of train)")

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Pick the households that go to the hold-out set
    n_holdout = int(len(households) * test_percentage)
    cv_hhs = rng.choice(households, size=n_holdout, replace=False)

    # Row mask of the selected households; boolean indexing below already
    # yields new objects, so no defensive copy of `train` is needed.
    cv_idx = np.isin(households, cv_hhs)
    X_test = train[cv_idx]
    y_test = y[cv_idx]
    X_train = train[~cv_idx]
    y_train = y[~cv_idx]

    if sample_weight is not None:
        y_train_weights = sample_weight[~cv_idx]
        return X_train, y_train, X_test, y_test, y_train_weights

    return X_train, y_train, X_test, y_test

In [14]:
# Train on heads of household only (parentesco1 == 1)
X = train.query('parentesco1==1')
# X = train.copy()

# Extract the target (shifted down by 1) and drop it from the features
y = X['Target'] - 1
X = X.drop(['Target'], axis=1)

# seed=None re-seeds NumPy's global generator from fresh entropy (per NumPy's
# documentation), so the hold-out drawn below differs from run to run.
np.random.seed(seed=None)
train2 = X.copy()
train_hhs = train2.idhogar

# Draw 15% of the households as a hold-out set
households = train2.idhogar.unique()
cv_hhs = np.random.choice(households, size=int(len(households) * 0.15), replace=False)

cv_idx = np.isin(train2.idhogar, cv_hhs)

X_test = train2[cv_idx]
y_test = y[cv_idx]
X_train = train2[~cv_idx]
y_train = y[~cv_idx]

# Overwrite the training split with the full data set.
# NOTE: after this X_test is a subset of X_train, so scores computed on
# X_test with models fitted on X_train are not out-of-sample.
X_train = train2
y_train = y

# Household id per training row; read as a global by _parallel_fit_estimator
train_households = X_train.idhogar

In [15]:
# Compute 'balanced' per-sample weights so the imbalanced classes can be trained on
y_train_weights = class_weight.compute_sample_weight('balanced', y_train, indices=None)

In [16]:
# Features the original author found unused by, or of very low importance to, LGBM;
# they are excluded from the XGB models through xgb_drop_cols.
extra_drop_features = [
 'agg18_estadocivil1_MEAN',
 'agg18_estadocivil6_COUNT',
 'agg18_estadocivil7_COUNT',
 'agg18_parentesco10_COUNT',
 'agg18_parentesco11_COUNT',
 'agg18_parentesco12_COUNT',
 'agg18_parentesco1_COUNT',
 'agg18_parentesco2_COUNT',
 'agg18_parentesco3_COUNT',
 'agg18_parentesco4_COUNT',
 'agg18_parentesco5_COUNT',
 'agg18_parentesco6_COUNT',
 'agg18_parentesco7_COUNT',
 'agg18_parentesco8_COUNT',
 'agg18_parentesco9_COUNT',
 'geo_elimbasu_LE_4',
 'geo_energcocinar_LE_1',
 'geo_energcocinar_LE_2',
 'geo_epared_LE_0',
 'geo_hogar_mayor',
 'geo_manual_elec_LE_2',
 'geo_pared_LE_3',
 'geo_pared_LE_4',
 'geo_pared_LE_5',
 'geo_pared_LE_6',
 'num_over_18',
 'parentesco_LE',
 'rez_esc']

In [17]:
# Columns removed before fitting or predicting with the XGB ensemble: the
# low-importance list plus the household id and the head-of-household flag.
xgb_drop_cols = extra_drop_features + ["idhogar", 'parentesco1']

# Fit a voting classifier

조기중지를 위해 fit_params를 전달할 수 있도록 파생된 VotingClassifier 클래스를 정의한다. vote는 macro F1 score에 기반한 조기정지와 학습률 저하가 있는 LGBM 모델에 기반한다.

In [34]:
# NOTE: this first parameter set is immediately overwritten by the assignment
# below and has no effect; it appears to be kept only for reference.
opt_parameters = {'max_depth':35, 'eta':0.1, 'objective':'multi:softmax', 'min_child_weight': 1, 'num_class': 4, 'gamma': 2.0, 'colsample_bylevel': 0.9, 'subsample': 0.84, 'colsample_bytree': 0.88, 'reg_lambda': 0.40 }
# Parameter set actually passed to xgb.XGBClassifier
opt_parameters = {'max_depth':35, 'eta':0.15, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 2.5, 'colsample_bylevel': 1, 'subsample': 0.95, 'colsample_bytree': 0.85, 'reg_lambda': 0.35 }

def evaluate_macroF1_lgb(predictions, truth):
    """Custom eval metric returning ``('macroF1', 1 - macro F1)``.

    ``predictions`` is reduced to class labels with ``argmax(axis=1)`` and
    ``truth`` must expose ``get_label()``. The score is reported as
    ``1 - F1`` so that a lower value means a better model.
    """
    true_labels = truth.get_label()
    predicted_labels = predictions.argmax(axis=1)
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    return ('macroF1', 1 - macro_f1)

# Keyword arguments forwarded to each estimator's fit().
# "eval_set" here is only a placeholder: _parallel_fit_estimator replaces it
# with the hold-out part of its own random split before fitting.
fit_params={"early_stopping_rounds":500,
            "eval_metric" : evaluate_macroF1_lgb, 
            "eval_set" : [(X_train,y_train), (X_test,y_test)],
            'verbose': False,}

def learning_rate_power_0997(current_iter, base_learning_rate=0.1, min_learning_rate=0.02, decay=0.995):
    """Exponentially decayed learning rate with a lower bound.

    Returns ``max(base_learning_rate * decay ** current_iter, min_learning_rate)``.

    NOTE: despite the "0997" in the name, the default decay factor is 0.995.
    The name and the default are both kept so existing callers see the same
    values; the constants are now parameters for anyone needing a different
    schedule.
    """
    lr = base_learning_rate * np.power(decay, current_iter)
    return max(lr, min_learning_rate)

# Override the 'verbose': False set when fit_params was created
fit_params['verbose'] = 50

In [52]:
np.random.seed(100)


def _parallel_fit_estimator(estimator1, X, y, sample_weight=None, threshold=True, **fit_params):
    """Fit a clone of ``estimator1`` on a fresh random household split.

    Each attempt clones the estimator, splits ``X``/``y`` with ``split_data``
    (using the module-level ``train_households``, which must therefore align
    row-for-row with ``X``), fits on the training part and scores on the
    held-out part.

    Args:
        estimator1: unfitted estimator to clone.
        X, y: training features and targets.
        sample_weight: optional per-row weights. They are passed to ``fit``
            for every estimator except ExtraTrees/RandomForest, which are
            fitted without them.
        threshold: when true, a fit is accepted only if its scores pass the
            acceptance rule below; otherwise a new split is drawn and the
            estimator is refitted until one passes. When false the first fit
            is returned.
        **fit_params: extra ``fit`` keyword arguments for non-forest
            estimators; ``eval_set`` is always replaced by the held-out split.

    Returns:
        The fitted estimator clone.

    Retries used to be implemented by recursion, which would eventually hit
    the interpreter's recursion limit if no split was ever accepted. They now
    run in a loop with otherwise identical behaviour.
    """
    is_forest = isinstance(estimator1, (ExtraTreesClassifier, RandomForestClassifier))
    is_xgb = isinstance(estimator1, xgb.XGBClassifier)

    while True:
        estimator = clone(estimator1)

        # Random split: the held-out part is used for early stopping/scoring
        if sample_weight is not None:
            X_train, y_train, X_test, y_test, y_train_weight = split_data(
                X, y, sample_weight, households=train_households)
        else:
            X_train, y_train, X_test, y_test = split_data(
                X, y, None, households=train_households)

        # Point the fit params at the new split
        fit_params['eval_set'] = [(X_test, y_test)]

        # Fit the estimator
        if is_forest:
            estimator.fit(X_train, y_train)
        elif sample_weight is not None:
            estimator.fit(X_train, y_train, sample_weight=y_train_weight, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

        if is_forest or is_xgb:
            best_train = f1_score(y_train, estimator.predict(X_train), average='macro')
            best_cv = f1_score(y_test, estimator.predict(X_test), average='macro')
            print('Train F1 :', best_train)
            print('Test F1 :', best_cv)
        else:
            # NOTE(review): kept as in the original. This branch takes the
            # max/argmax of 'mlogloss' and reads a 'train' entry, although
            # eval_set above holds a single evaluation set. It is not reached
            # by the XGB/RandomForest estimators used in this file; verify
            # before using another estimator type.
            best_cv_round = np.argmax(estimator.evals_result_['validation_0']['mlogloss'])
            best_cv = np.max(estimator.evals_result_['validation_0']['mlogloss'])
            best_train = estimator.evals_result_['train']['macroF1'][best_cv_round]

        if not threshold:
            return estimator

        # Acceptance rule: a higher validation score allows a lower train score
        accepted = ((best_cv > 0.37) and (best_train > 0.75)) or ((best_cv > 0.44) and (best_train > 0.64))
        if accepted:
            return estimator

        # Otherwise draw a new split and try again
        print('Unacceptable!! Trying again...')
        
        
# VotingClassifier whose fit() forwards fit_params to the member estimators
class VotingClassifierLGBM(VotingClassifier):
    """VotingClassifier variant that fits members through ``_parallel_fit_estimator``.

    Only ``fit`` is overridden; prediction is inherited from VotingClassifier.
    NOTE(review): fit() calls ``self._validate_names`` from the base class, so
    it depends on the installed scikit-learn version providing that method.
    """

    def fit(self, X, y, sample_weight=None, threshold=True, **fit_params):
        """Validate the configuration, label-encode ``y`` and fit every member.

        Each non-None estimator is cloned and fitted by
        ``_parallel_fit_estimator``, which receives ``sample_weight``,
        ``threshold`` and ``fit_params`` unchanged. Sets ``le_``,
        ``classes_`` and ``estimators_`` and returns ``self``.

        Raises NotImplementedError for 2-D multi-column ``y`` given as an
        ndarray, ValueError for an invalid ``voting`` value, a weights/
        estimators length mismatch or all-None estimators, and AttributeError
        for a missing or empty ``estimators`` list.
        """
        if isinstance(y, np.ndarray) and len(y.shape)>1 and y.shape[1]>1:
            raise NotImplementedError('Multilabel and multi-output'
                                     ' classification is not supported.')
        if self.voting not in ('soft','hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)" %self.voting)
        
        if self.estimators is None or len(self.estimators)==0:
            raise AttributeError("Invalid 'estimators' attribute, 'estimators' should be a list of (string, estimator)"" tuples")
            
        if (self.weights is not None and len(self.weights)!=len(self.estimators)):
            raise ValueError('Number of classifiers and weights must be equal'
                            '; got %d weights %d estimators' %(len(self.weights),len(self.estimators)))
    
        names, clfs = zip(*self.estimators)
        self._validate_names(names)
        
        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is '
                            'required to be a classifier!')
        
        # Encode the targets once; members are trained on the encoded labels
        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []
        
        transformed_y = self.le_.transform(y)
        
        # One fit per non-None estimator; each fit draws its own random split
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_parallel_fit_estimator)
                                      (clone(clf),X,transformed_y,sample_weight=sample_weight,threshold=threshold,**fit_params)
                                                        for clf in clfs if clf is not None)
        return self

In [40]:
# Build 15 XGB classifiers that differ only in random_state.
# NOTE(review): learning_rate=0.15 is passed explicitly and opt_parameters
# also carries 'eta': 0.15, which looks like the same setting given twice.
clfs = []
for i in range(15):
    clf = xgb.XGBClassifier(random_state=217+i, n_estimators=300, learning_rate=0.15, n_jobs=4, **opt_parameters)
    clfs.append(('xgb{}'.format(i), clf))
    
vc = VotingClassifierLGBM(clfs, voting='soft')
del(clfs)

# Fit the final ensemble on the full training set with the class-balancing
# sample weights; threshold=False accepts every member's first fit.
# NOTE(review): the original comment mentions learning-rate decay, but no
# decay schedule is passed to fit() here.
_ = vc.fit(X_train.drop(xgb_drop_cols, axis=1), y_train, sample_weight=y_train_weights, threshold=False, **fit_params)

# First fitted member, used as the single-model reference
clf_final = vc.estimators_[0]

[0]	validation_0-mlogloss:1.29894	validation_0-macroF1:0.63874
[50]	validation_0-mlogloss:0.89615	validation_0-macroF1:0.56756
[100]	validation_0-mlogloss:0.89371	validation_0-macroF1:0.55565
[150]	validation_0-mlogloss:0.89062	validation_0-macroF1:0.56537
[200]	validation_0-mlogloss:0.89161	validation_0-macroF1:0.56447
[250]	validation_0-mlogloss:0.89255	validation_0-macroF1:0.56603
[299]	validation_0-mlogloss:0.89179	validation_0-macroF1:0.56650
Train F1 : 0.9115660908746297
Test F1 : 0.44462641001160175
[0]	validation_0-mlogloss:1.30399	validation_0-macroF1:0.62540
[50]	validation_0-mlogloss:0.91589	validation_0-macroF1:0.58098
[100]	validation_0-mlogloss:0.91543	validation_0-macroF1:0.59476
[150]	validation_0-mlogloss:0.91787	validation_0-macroF1:0.60346
[200]	validation_0-mlogloss:0.91642	validation_0-macroF1:0.60225
[250]	validation_0-mlogloss:0.91610	validation_0-macroF1:0.60209
[299]	validation_0-mlogloss:0.91594	validation_0-macroF1:0.60731
Train F1 : 0.8976233528333496
Test F

[0]	validation_0-mlogloss:1.29873	validation_0-macroF1:0.64866
[50]	validation_0-mlogloss:0.93059	validation_0-macroF1:0.57796
[100]	validation_0-mlogloss:0.93042	validation_0-macroF1:0.58456
[150]	validation_0-mlogloss:0.93141	validation_0-macroF1:0.58464
[200]	validation_0-mlogloss:0.93280	validation_0-macroF1:0.58348
[250]	validation_0-mlogloss:0.93234	validation_0-macroF1:0.58326
[299]	validation_0-mlogloss:0.93234	validation_0-macroF1:0.58357
Train F1 : 0.8890905288434153
Test F1 : 0.4291004079449764
[0]	validation_0-mlogloss:1.29814	validation_0-macroF1:0.63167
[50]	validation_0-mlogloss:0.93109	validation_0-macroF1:0.60363
[100]	validation_0-mlogloss:0.92695	validation_0-macroF1:0.60890
[150]	validation_0-mlogloss:0.92862	validation_0-macroF1:0.61470
[200]	validation_0-mlogloss:0.92751	validation_0-macroF1:0.60809
[250]	validation_0-mlogloss:0.92977	validation_0-macroF1:0.60797
[299]	validation_0-mlogloss:0.93202	validation_0-macroF1:0.61026
Train F1 : 0.9022537754659561
Test F1

In [42]:
# Original run note: "400 early stop - 15 estimators - l1 used features - weighted"
# (fit_params in this file sets early_stopping_rounds to 500).
# NOTE: X_test is a subset of the rows the models were fitted on, so these
# scores are optimistic rather than true out-of-sample estimates.
global_score = f1_score(y_test, clf_final.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')
vc.voting = 'soft'
global_score_soft = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')
vc.voting = 'hard'
global_score_hard = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')

# The labels previously said "LGBM" and "3 LGBMs"; the ensemble consists of
# XGB classifiers, so report the real model type and member count.
print('Validation score of a single XGB Classifier: {:.4f}'.format(global_score))
print('Validation score of a VotingClassifier on {} XGBs with soft voting strategy: {:.4f}'.format(len(vc.estimators_), global_score_soft))
print('Validation score of a VotingClassifier on {} XGBs with hard voting strategy: {:.4f}'.format(len(vc.estimators_), global_score_hard))

Validation score of a single LGBM Classifier: 0.8403
Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: 0.9253
Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: 0.9321


In [45]:
# Find the features that no member of the XGB ensemble uses
useless_features = []
drop_features = set()

for counter, est in enumerate(vc.estimators_):
    ranked_features, unused_features = feature_importance(
        est, X_train.drop(xgb_drop_cols, axis=1), display_results=False)
    useless_features.append(unused_features)
    # Running intersection: keep only features unused by every member so far
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features & set(unused_features)
# As before, counter ends up holding the number of estimators inspected
counter = len(useless_features)

drop_features

{'agg18_estadocivil4_COUNT',
 'agg18_estadocivil5_COUNT',
 'geo_energcocinar_LE_0',
 'geo_epared_LE_2'}

In [46]:
# Print the feature ranking of the first fitted XGB member.
# NOTE(review): the variable name is misspelled ("feautres") and receives the
# whole (ranked_list, zero_features) tuple returned by feature_importance.
ranked_feautres = feature_importance(clf_final, X_train.drop(xgb_drop_cols, axis=1))

Feature ranking :
1. feature 114 (0.030279) - geo_epared_LE_1
2. feature 42 (0.019682) - fe_children_fraction
3. feature 74 (0.018983) - agg18_parentesco2_MEAN
4. feature 59 (0.017248) - agg18_escolari_MAX
5. feature 133 (0.016442) - geo_pared_LE_1
6. feature 60 (0.014158) - agg18_escolari_MEAN
7. feature 40 (0.013327) - SQBdependency
8. feature 22 (0.012823) - dependency
9. feature 34 (0.012632) - SQBescolari
10. feature 12 (0.011761) - r4t1
11. feature 37 (0.011632) - SQBedjefe
12. feature 112 (0.011575) - geo_etecho_LE_1
13. feature 126 (0.010669) - geo_sanitario_LE_3
14. feature 96 (0.010495) - estadocivil_LE
15. feature 116 (0.010293) - geo_elimbasu_LE_0
16. feature 11 (0.010211) - r4m3
17. feature 100 (0.010105) - geo_age
18. feature 94 (0.010058) - etecho_LE
19. feature 17 (0.009895) - male
20. feature 105 (0.009877) - geo_hogar_total
21. feature 87 (0.009871) - piso_LE
22. feature 39 (0.009789) - SQBovercrowding
23. feature 63 (0.009531) - agg18_estadocivil2_MEAN
24. feature 11

## Random Forest

In [47]:
# Columns excluded from the random-forest ensemble: all agg18_* household aggregates ...
et_drop_cols = ['agg18_age_MAX', 'agg18_age_MEAN', 'agg18_age_MIN', 'agg18_dis_MEAN',
       'agg18_escolari_MAX', 'agg18_escolari_MEAN', 'agg18_escolari_MIN',
       'agg18_estadocivil1_COUNT', 'agg18_estadocivil1_MEAN',
       'agg18_estadocivil2_COUNT', 'agg18_estadocivil2_MEAN',
       'agg18_estadocivil3_COUNT', 'agg18_estadocivil3_MEAN',
       'agg18_estadocivil4_COUNT', 'agg18_estadocivil4_MEAN',
       'agg18_estadocivil5_COUNT', 'agg18_estadocivil5_MEAN',
       'agg18_estadocivil6_COUNT', 'agg18_estadocivil6_MEAN',
       'agg18_estadocivil7_COUNT', 'agg18_estadocivil7_MEAN',
       'agg18_parentesco10_COUNT', 'agg18_parentesco10_MEAN',
       'agg18_parentesco11_COUNT', 'agg18_parentesco11_MEAN',
       'agg18_parentesco12_COUNT', 'agg18_parentesco12_MEAN',
       'agg18_parentesco1_COUNT', 'agg18_parentesco1_MEAN',
       'agg18_parentesco2_COUNT', 'agg18_parentesco2_MEAN',
       'agg18_parentesco3_COUNT', 'agg18_parentesco3_MEAN',
       'agg18_parentesco4_COUNT', 'agg18_parentesco4_MEAN',
       'agg18_parentesco5_COUNT', 'agg18_parentesco5_MEAN',
       'agg18_parentesco6_COUNT', 'agg18_parentesco6_MEAN',
       'agg18_parentesco7_COUNT', 'agg18_parentesco7_MEAN',
       'agg18_parentesco8_COUNT', 'agg18_parentesco8_MEAN',
       'agg18_parentesco9_COUNT', 'agg18_parentesco9_MEAN'] #+ ['parentesco_LE', 'rez_esc']

# ... plus the household id, the head-of-household flag and four fe_* ratio features
et_drop_cols.extend(["idhogar", "parentesco1", 'fe_rent_per_person', 'fe_rent_per_room',
       'fe_tablet_adult_density', 'fe_tablet_density'])

In [53]:
# Do the same with a second ensemble. The variable names (ets/et_*) and the
# original comment refer to extra trees, but the members are
# RandomForestClassifier instances differing only in random_state.
ets = []
for i in range(10):
    rf = RandomForestClassifier(max_depth=None, random_state=217+i, n_jobs=1, n_estimators=700, min_impurity_decrease=1e-3,
                               min_samples_leaf=2, verbose=0, class_weight='balanced')
    ets.append(('rf{}'.format(i), rf))
    
# No sample_weight or fit_params are passed; threshold=False accepts each first fit
vc2 = VotingClassifierLGBM(ets, voting='soft')    
_ = vc2.fit(X_train.drop(et_drop_cols, axis=1), y_train, threshold=False)    

Train F1 : 0.8970955314751967
Test F1 : 0.4339837350335132
Train F1 : 0.8978154737944584
Test F1 : 0.40102587275854407
Train F1 : 0.8984260514089941
Test F1 : 0.4178603966114821
Train F1 : 0.8989646974584208
Test F1 : 0.4474071002684059
Train F1 : 0.8946277247021607
Test F1 : 0.4040623090134163
Train F1 : 0.8942347190248382
Test F1 : 0.4420524924852226
Train F1 : 0.883377868986868
Test F1 : 0.38759359143567657
Train F1 : 0.8830889573677272
Test F1 : 0.3968373439696289
Train F1 : 0.8889972541497917
Test F1 : 0.4310054243471582
Train F1 : 0.8857021263049742
Test F1 : 0.4316117388522157


In [54]:
# Original run note: "w/ threshold, extra dropped columns".
# NOTE(review): vc2 above is fitted with threshold=False, so this note
# presumably dates from an earlier run.
vc2.voting = 'soft'
global_rf_score_soft = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')
vc2.voting = 'hard'
global_rf_score_hard = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')

# The labels previously said "3 LGBMs"; this ensemble consists of random
# forests, so report the real model type and member count.
print('Validation score of a VotingClassifier on {} RandomForests with soft voting strategy: {:.4f}'.format(len(vc2.estimators_), global_rf_score_soft))
print('Validation score of a VotingClassifier on {} RandomForests with hard voting strategy: {:.4f}'.format(len(vc2.estimators_), global_rf_score_hard))

Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: 0.8620
Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: 0.8883


In [55]:
# Original run note: "w/o threshold, extra dropped columns".
# This repeats the evaluation of the same fitted vc2 ensemble.
vc2.voting = 'soft'
global_rf_score_soft = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')
vc2.voting = 'hard'
global_rf_score_hard = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')

# The labels previously said "3 LGBMs"; this ensemble consists of random
# forests, so report the real model type and member count.
print('Validation score of a VotingClassifier on {} RandomForests with soft voting strategy: {:.4f}'.format(len(vc2.estimators_), global_rf_score_soft))
print('Validation score of a VotingClassifier on {} RandomForests with hard voting strategy: {:.4f}'.format(len(vc2.estimators_), global_rf_score_hard))

Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: 0.8620
Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: 0.8883


In [56]:
# Find the features that no member of the random-forest ensemble uses
useless_features = []
drop_features = set()

for counter, est in enumerate(vc2.estimators_):
    ranked_features, unused_features = feature_importance(
        est, X_train.drop(et_drop_cols, axis=1), display_results=False)
    useless_features.append(unused_features)
    # Running intersection: keep only features unused by every member so far
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features & set(unused_features)
# As before, counter ends up holding the number of estimators inspected
counter = len(useless_features)

drop_features

{'parentesco_LE', 'rez_esc'}

In [65]:
def combine_voters(data, weights=(0.5, 0.5)):
    """Blend the class probabilities of the two voting ensembles.

    Args:
        data: feature frame still containing the columns listed in
            ``xgb_drop_cols`` and ``et_drop_cols``; each ensemble gets the
            frame with its own list removed.
        weights: ``(xgb_weight, rf_weight)`` applied to the probabilities of
            ``vc`` and ``vc2`` respectively. The default is now an immutable
            tuple instead of a shared mutable list; lists are still accepted.

    Returns:
        Array with, for every row, the index of the class with the highest
        blended probability.

    Side effect: sets ``voting`` to ``'soft'`` on both ``vc`` and ``vc2``.
    """
    # Soft voting on both ensembles
    vc.voting = 'soft'
    vc1_probs = vc.predict_proba(data.drop(xgb_drop_cols, axis=1))
    vc2.voting = 'soft'
    vc2_probs = vc2.predict_proba(data.drop(et_drop_cols, axis=1))

    final_vote = (vc1_probs * weights[0]) + (vc2_probs * weights[1])
    return np.argmax(final_vote, axis=1)

In [66]:
# Blend with equal weights (XGB 0.5 / RF 0.5) and score on the hold-out rows
combo_preds = combine_voters(X_test, weights=[0.5,0.5])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.9058558642748031

In [67]:
# Blend with XGB 0.4 / RF 0.6; overwrites the score of the previous cell
combo_preds = combine_voters(X_test, weights=[0.4, 0.6])
global_combo_score_soft= f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.89532954063404

In [68]:
# Blend with XGB 0.6 / RF 0.4; this is the value left in
# global_combo_score_soft when the submission file names are built.
combo_preds = combine_voters(X_test, weights=[0.6, 0.4])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.9121894008700175

# Prepare submission

In [69]:
# Submission skeleton: the test Ids saved before the Id column was dropped
y_subm = pd.DataFrame()
y_subm['Id'] = test_ids

In [71]:
# XGB ensemble submission (the *_lgb names are historical). Predictions are
# shifted by +1, undoing the -1 applied to Target when y was built.
vc.voting = 'soft'
y_subm_lgb = y_subm.copy(deep=True)
y_subm_lgb['Target'] = vc.predict(test.drop(xgb_drop_cols, axis=1)) + 1

# Random-forest ensemble submission
vc2.voting = 'soft'
y_subm_rf = y_subm.copy(deep=True)
y_subm_rf['Target'] = vc2.predict(test.drop(et_drop_cols, axis=1)) + 1

# Blended submission.
# NOTE(review): combine_voters is called with its default 0.5/0.5 weights
# here, while global_combo_score_soft (used in this submission's file name)
# was last computed with weights [0.6, 0.4]. Confirm which is intended.
y_subm_ens = y_subm.copy(deep=True)
y_subm_ens['Target'] = combine_voters(test) + 1

In [74]:
from datetime import datetime
now = datetime.now()

# One minute-resolution timestamp shared by the three file names
stamp = now.strftime('%Y-%m-%d-%H-%M')

# File names embed the hold-out score of the corresponding model
sub_file_lgb = 'submission_soft_XGB_{:.4f}_{}.csv'.format(global_score_soft, stamp)
sub_file_rf = 'submission_soft_RF_{:.4f}_{}.csv'.format(global_rf_score_soft, stamp)
sub_file_ens = 'submission_ens_{:.4f}_{}.csv'.format(global_combo_score_soft, stamp)

# Write each submission without the DataFrame index
for frame, path in ((y_subm_lgb, sub_file_lgb),
                    (y_subm_rf, sub_file_rf),
                    (y_subm_ens, sub_file_ens)):
    frame.to_csv(path, index=False)