# 1회차

- LightGBM 모델은 XGBoost로 대체되었으며 그에 따라 코드가 업데이트됐습니다.
- 또한 랜덤 포레스트의 투표 분류기를 장착하고 XGB의 결과를 RF와 결합합니다.
- 데이터를 한 번 쪼개서 LGBM 조기 정지에 대한 검증 데이터를 사용하는 대신 전체 교육 세트를 훈련할 수 있도록 교육 중 데이터를 분할했습니다. 이것이 이 케이스에서 kfold 분할보다 더 효과가 있음을 발견했습니다.

- 이 커널은 하이퍼 파라미터 최적화를 실행하는 대신 해당 커널의 최적값을 사용하여 더 빨리 실행됩니다.

- __몇 가지 주요 요점은 다음과 같습니다.__:
    - 이 커널은 가장에 대해서만 교육을 실행합니다. 
    - 클래스 크기의 균형을 맞추는 것이 매우 중요해보입니다. 이는 수작업으로 할 수 있고, 언더 샘플링으로 달성할 수 있습니다. 그러나 sklearn API의 LGBM 모델 생성자에 `class_weight='balance'`을 설정하는 것이 가장 간단합니다.
    - 이 커널은 매크로 F1점수를 사용하여 교육을 조기 중단합니다. 
    - 범주는 블라인드 레이블 인코딩 대신 적절한 매핑을 가진 숫자로 바뀝니다.
    - OHE를 라벨 인코딩으로 반전시키면, 트리 모델에 더 적합할 수 있으나 , 논트리 모델에는 안 좋을 수 있으니 유의해야합니다.
    - `idhogar`는 훈련에서 사용되지 않습니다. 정보가 있을 수 있는 유일한 방법은 데이터 누출입니다. 
    - 가구 내에서 집계가 이루어지며 새로운 기능은 수작업으로 제작됩니다. 대부분의 기능이 이미 가정 수준에서 인용되었기 때문에 집계할 수 있는 기능이 많지 않습니다.
    - 투표 분류기는 여러 LGBM 모델에서 평균을 내는데 사용됩니다

### import

In [113]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import class_weight
import warnings
warnings.filterwarnings('ignore')

### mapping

In [114]:
from sklearn.preprocessing import LabelEncoder

# this only transforms the idhogar field, the other things this function used to do are done elsewhere
def encode_data(df):
    df['idhogar'] = LabelEncoder().fit_transform(df['idhogar'])

# plot feature importance for sklearn decision trees    
def feature_importance(forest, X_train, display_results=True):
    ranked_list = []
    zero_features = []
    
    importances = forest.feature_importances_

    indices = np.argsort(importances)[::-1]
    
    if display_results:
        # Print the feature ranking
        print("Feature ranking:")

    for f in range(X_train.shape[1]):
        if display_results:
            print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]) + " - " + X_train.columns[indices[f]])
        
        ranked_list.append(X_train.columns[indices[f]])
        
        if importances[indices[f]] == 0.0:
            zero_features.append(X_train.columns[indices[f]])
            
    return ranked_list, zero_features

### feature engineering

In [115]:
def do_features(df):
    feats_div = [('children_fraction', 'r4t1', 'r4t3'), 
                 ('working_man_fraction', 'r4h2', 'r4t3'),
                 ('all_man_fraction', 'r4h3', 'r4t3'),
                 ('human_density', 'tamviv', 'rooms'),
                 ('human_bed_density', 'tamviv', 'bedrooms'),
                 ('rent_per_person', 'v2a1', 'r4t3'),
                 ('rent_per_room', 'v2a1', 'rooms'),
                 ('mobile_density', 'qmobilephone', 'r4t3'),
                 ('tablet_density', 'v18q1', 'r4t3'),
                 ('mobile_adult_density', 'qmobilephone', 'r4t2'),
                 ('tablet_adult_density', 'v18q1', 'r4t2'),
                ]
    
    feats_sub = [('people_not_living', 'tamhog', 'tamviv'),
                 ('people_weird_stat', 'tamhog', 'r4t3')]

    for f_new, f1, f2 in feats_div:
        df['fe_' + f_new] = (df[f1] / df[f2]).astype(np.float32)       
    for f_new, f1, f2 in feats_sub:
        df['fe_' + f_new] = (df[f1] - df[f2]).astype(np.float32)
    
    # aggregation rules over household
    aggs_num = {'age': ['min', 'max', 'mean'],
                'escolari': ['min', 'max', 'mean']
               }
    
    aggs_cat = {'dis': ['mean']}
    for s_ in ['estadocivil', 'parentesco', 'instlevel']:
        for f_ in [f_ for f_ in df.columns if f_.startswith(s_)]:
            aggs_cat[f_] = ['mean', 'count']

    # aggregation over household
    for name_, df_ in [('18', df.query('age >= 18'))]:
        df_agg = df_.groupby('idhogar').agg({**aggs_num, **aggs_cat}).astype(np.float32)
        df_agg.columns = pd.Index(['agg' + name_ + '_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
        df = df.join(df_agg, how='left', on='idhogar')
        del df_agg

    # Drop id's
    df.drop(['Id'], axis=1, inplace=True)
    
    return df

In [116]:
# convert one hot edcoded fields to label encoding
def convert_OHE2LE(df):
    tmp_df = df.copy(deep=True)
    for s_ in ['pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu', 
               'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 
               'instlevel', 'lugar', 'tipovivi',
               'manual_elec']:
        if 'manual_' not in s_:
            cols_s_ = [f_ for f_ in df.columns if f_.startswith(s_)]
        elif 'elec' in s_:
            cols_s_ = ['public', 'planpri', 'noelec', 'coopele']
        sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
        #deal with those OHE, where there is a sum over columns == 0
        if 0 in sum_ohe:
            print('The OHE in {} is incomplete. A new column will be added before label encoding'
                  .format(s_))
            # dummy colmn name to be added
            col_dummy = s_+'_dummy'
            # add the column to the dataframe
            tmp_df[col_dummy] = (tmp_df[cols_s_].sum(axis=1) == 0).astype(np.int8)
            # add the name to the list of columns to be label-encoded
            cols_s_.append(col_dummy)
            # proof-check, that now the category is complete
            sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
            if 0 in sum_ohe:
                 print("The category completion did not work")
        tmp_cat = tmp_df[cols_s_].idxmax(axis=1)
        tmp_df[s_ + '_LE'] = LabelEncoder().fit_transform(tmp_cat).astype(np.int16)
        if 'parentesco1' in cols_s_:
            cols_s_.remove('parentesco1')
        tmp_df.drop(cols_s_, axis=1, inplace=True)
    return tmp_df

## Read in the data and clean it up

In [117]:
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

test_ids = test.Id

In [118]:
def process_df(df_):
    # encode the idhogar
    encode_data(df_)
    
    # create aggregate features
    return do_features(df_)

train = process_df(train)
test = process_df(test)

결측치를 정리하고 오브젝트를 숫자형으로 변환합니다.

In [119]:
# Some dependencies are Na, fill those with the square root of the square
train['dependency'] = np.sqrt(train['SQBdependency'])
test['dependency'] = np.sqrt(test['SQBdependency'])

# fill 'no's for education with 0s
train.loc[train['edjefa'] == 'no', 'edjefa'] = 0
train.loc[train['edjefe'] == 'no', 'edjefe'] = 0
test.loc[test['edjefa'] == 'no', 'edjefa'] = 0
test.loc[test['edjefe'] == 'no', 'edjefe'] = 0

# if education is 'yes' and person is head of household, fill with escolari
train.loc[(train['edjefa'] == 'yes') & (train['parentesco1'] == 1), 'edjefa'] = train.loc[(train['edjefa'] == 'yes') & (train['parentesco1'] == 1), 'escolari']
train.loc[(train['edjefe'] == 'yes') & (train['parentesco1'] == 1), 'edjefe'] = train.loc[(train['edjefe'] == 'yes') & (train['parentesco1'] == 1), 'escolari']

test.loc[(test['edjefa'] == 'yes') & (test['parentesco1'] == 1), 'edjefa'] = test.loc[(test['edjefa'] == 'yes') & (test['parentesco1'] == 1), 'escolari']
test.loc[(test['edjefe'] == 'yes') & (test['parentesco1'] == 1), 'edjefe'] = test.loc[(test['edjefe'] == 'yes') & (test['parentesco1'] == 1), 'escolari']

# this field is supposed to be interaction between gender and escolari, but it isn't clear what 'yes' means. let's fill it with 4
train.loc[train['edjefa'] == 'yes', 'edjefa'] = 4
train.loc[train['edjefe'] == 'yes', 'edjefe'] = 4

test.loc[test['edjefa'] == 'yes', 'edjefa'] = 4
test.loc[test['edjefe'] == 'yes', 'edjefe'] = 4

# Convert to int for our models
train['edjefe'] = train['edjefe'].astype('int')
train['edjefa'] = train['edjefa'].astype('int')
test['edjefe'] = test['edjefe'].astype('int')
test['edjefa'] = test['edjefa'].astype('int')

# create feature with max education of either head of household
train['edjef'] = np.max(train[['edjefa', 'edjefe']], axis=1)
test['edjef'] = np.max(test[['edjefa', 'edjefe']], axis=1)

# fill some nas
train['v2a1'] = train['v2a1'].fillna(0)
test['v2a1'] = test['v2a1'].fillna(0)

train['v18q1'] = train['v18q1'].fillna(0)
test['v18q1'] = test['v18q1'].fillna(0)

train['rez_esc'] = train['rez_esc'].fillna(0)
test['rez_esc'] = test['rez_esc'].fillna(0)

train.loc[train.meaneduc.isnull(), 'meaneduc'] = 0
train.loc[train.SQBmeaned.isnull(), 'SQBmeaned'] = 0

test.loc[test.meaneduc.isnull(), 'meaneduc'] = 0
test.loc[test.SQBmeaned.isnull(), 'SQBmeaned'] = 0

# fix some inconsistencies in the data - some rows indicate both that the household does and does not not have a toilet,
# if there is no water we'll assume they do not
train.loc[(train.v14a == 1) & (train.sanitario1 == 1) & (train.abastaguano == 0), 'v14a'] = 0
train.loc[(train.v14a == 1) & (train.sanitario1 == 1) & (train.abastaguano == 0), 'sanitario'] = 0

test.loc[(test.v14a == 1) & (test.sanitario1 == 1) & (test.abastaguano == 0), 'v14a'] = 0
test.loc[(test.v14a == 1) & (test.sanitario1 == 1) & (test.abastaguano == 0), 'sanitario'] = 0

In [120]:
def train_test_apply_func(train_, test_, func_):
    test_['Target'] = 0
    xx = pd.concat([train_, test_])
    
    xx_func = func_(xx)
    train_ = xx_func.iloc[:train_.shape[0], :]
    test_ = xx_func.iloc[train_.shape[0]:, :].drop('Target', axis=1)
    
    del xx, xx_func
    return train_, test_

In [121]:
# convert the one hot fields into label encoded
train, test = train_test_apply_func(train, test, convert_OHE2LE)

The OHE in techo is incomplete. A new column will be added before label encoding
The OHE in instlevel is incomplete. A new column will be added before label encoding
The OHE in manual_elec is incomplete. A new column will be added before label encoding


### Geo aggregates

In [122]:
cols_2_ohe = ['eviv_LE', 'etecho_LE', 'epared_LE', 'elimbasu_LE', 'energcocinar_LE', 'sanitario_LE', 'manual_elec_LE', 'pared_LE']
cols_nums = ['age', 'meaneduc', 'dependency', 'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total', 'bedrooms' ,'overcrowding']

def convert_geo2aggs(df_):
    tmp_df = pd.concat([df_[(['lugar_LE', 'idhogar'] + cols_nums)], pd.get_dummies(df_[cols_2_ohe], columns=cols_2_ohe)], axis=1)
    
    geo_agg = tmp_df.groupby(['lugar_LE', 'idhogar']).mean().groupby('lugar_LE').mean().astype(np.float32)
    geo_agg.columns = pd.Index(['geo_' + e for e in geo_agg.columns.tolist()])
    
    del tmp_df
    return df_.join(geo_agg, how='left', on='lugar_LE')

# add some aggregates by geography
train, test = train_test_apply_func(train, test, convert_geo2aggs)

In [123]:
# add the number of people over 18 in each household
train['num_over_18'] = 0
train['num_over_18'] = train[train.age >= 18].groupby('idhogar').transform('count')
train['num_over_18'] = train.groupby('idhogar')['num_over_18'].transform('max')
train['num_over_18'] = train['num_over_18'].fillna(0)

test['num_over_18'] = 0
test['num_over_18'] = test[test.age >= 18].groupby('idhogar').transform('count')
test['num_over_18'] = test.groupby('idhogar')['num_over_18'].transform('max')
test['num_over_18'] = test['num_over_18'].fillna(0)

# add some extra features, these were taken from another kernel
def extract_features(df):
    df['bedrooms_to_rooms'] = df['bedrooms'] / df['rooms']
    df['rent_to_rooms'] = df['v2a1'] / df['rooms']
    df['tamhog_to_rooms'] = df['tamhog'] / df['rooms'] # tamhog - size of the household
    df['r4t3_to_tamhog'] = df['r4t3'] / df['tamhog'] # r4t3 - Total persons in the household
    df['r4t3_to_rooms'] = df['r4t3'] / df['rooms']
    df['v2a1_to_r4t3'] = df['v2a1'] / df['r4t3'] # rent to people in household
    df['v2a1_to_under_12'] = df['v2a1'] / (df['r4t3'] - df['r4t1']) # rent to people under age 12
    df['hhsize_to_rooms'] = df['hhsize'] / df['rooms'] # rooms per person
    df['rent_to_hhsize'] = df['v2a1'] / df['hhsize'] # rent to household size
    df['rent_to_over_18'] = df['v2a1'] / df['num_over_18'] # some households have no one over 18, use the total rent for those
    df.loc[df.num_over_18 == 0, 'rent_to_over_18'] = df[df.num_over_18 == 0].v2a1
    
extract_features(train)
extract_features(test)

In [124]:
# drop duplicated columns
needless_cols = ['r4t3', 'tamhog', 'tamviv', 'hhsize', 'v18q', 'v14a', 'agesq', 'mobilephone', 'female']

instlevel_cols = [s for s in train.columns.tolist() if 'instlevel' in s]

needless_cols.extend(instlevel_cols)

train = train.drop(needless_cols, axis=1)
test = test.drop(needless_cols, axis=1)

### Split the data
동일한 가구에 속하는 행은 대개 동일한 타겟을 가지므로, 누출을 방지하기 위해 가구별로 데이터를 분할합니다. 가장만 포함하도록 데이터를 필터링하고 이는 기술적으로는 필요하지 않지만, 그렇게 하면 전체 교육 데이터 세트를 쉽게 사용할 수 있습니다.

데이터를 분할한 후에는 열차 데이터를 전체 데이터 세트로 덮어써 모든 데이터를 훈련할 수 있습니다. split_data 기능은 데이터를 덮어쓰지 않고 동일한 작업을 수행하며, 교육 루프 내에서 K-Fold 분할을 근사화하는 데 사용됩니다.

In [125]:
def split_data(train, y, sample_weight=None, households=None, test_percentage=0.20, seed=None):
    
    train2 = train.copy()
    
    # pick some random households to use for the test data
    cv_hhs = np.random.choice(households, size=int(len(households) * test_percentage), replace=False)
    
    # select households which are in the random selection
    cv_idx = np.isin(households, cv_hhs)
    X_test = train2[cv_idx]
    y_test = y[cv_idx]
    
    X_train = train2[~cv_idx]
    y_train = y[~cv_idx]
    
    if sample_weight is not None:
        y_train_weights = sample_weight[~cv_idx]
        return X_train, y_train, X_test, y_test, y_train_weights
    
    return X_train, y_train, X_test, y_test

In [126]:
X = train.query('parentesco1==1')

y = X['Target'] - 1
X = X.drop(['Target'], axis=1)

np.random.seed(seed=None)

train2 = X.copy()

train_hhs = train2.idhogar

households = train2.idhogar.unique()
cv_hhs = np.random.choice(households, size=int(len(households) * 0.15), replace=False)

cv_idx = np.isin(train2.idhogar, cv_hhs)

X_test = train2[cv_idx]
y_test = y[cv_idx]

X_train = train2[~cv_idx]
y_train = y[~cv_idx]

# train on entire dataset
X_train = train2
y_train = y

train_households = X_train.idhogar

In [127]:
# figure out the class weights for training with unbalanced classes
y_train_weights = class_weight.compute_sample_weight('balanced', y_train, indices=None)

In [128]:
# drop some features which aren't used by the LGBM or have very low importance
extra_drop_features = [
 'agg18_estadocivil1_MEAN',
 'agg18_estadocivil6_COUNT',
 'agg18_estadocivil7_COUNT',
 'agg18_parentesco10_COUNT',
 'agg18_parentesco11_COUNT',
 'agg18_parentesco12_COUNT',
 'agg18_parentesco1_COUNT',
 'agg18_parentesco2_COUNT',
 'agg18_parentesco3_COUNT',
 'agg18_parentesco4_COUNT',
 'agg18_parentesco5_COUNT',
 'agg18_parentesco6_COUNT',
 'agg18_parentesco7_COUNT',
 'agg18_parentesco8_COUNT',
 'agg18_parentesco9_COUNT',
 'geo_elimbasu_LE_4',
 'geo_energcocinar_LE_1',
 'geo_energcocinar_LE_2',
 'geo_epared_LE_0',
 'geo_hogar_mayor',
 'geo_manual_elec_LE_2',
 'geo_pared_LE_3',
 'geo_pared_LE_4',
 'geo_pared_LE_5',
 'geo_pared_LE_6',
 'num_over_18',
 'parentesco_LE',
 'rez_esc']

In [129]:
xgb_drop_cols = extra_drop_features + ['idhogar', 'parentesco1']

### Fit a voting classifier
조기 중지를 위해 `fit_params`를 전달할 수 있도록 VotingClassifier을 정의합니다. LGBM에 기반하여 매크로 F1과 학습률의 줄어듬에 따라 조기 중지합니다.

In [130]:
# 4
opt_parameters = {'max_depth':35, 'eta':0.1, 'silent':0, 'objective':'multi:softmax', 'min_child_weight': 1, 'num_class': 4, 'gamma': 2.0, 'colsample_bylevel': 0.9, 'subsample': 0.84, 'colsample_bytree': 0.88, 'reg_lambda': 0.40}
# 5
opt_parameters = {'max_depth':35, 'eta':0.15, 'silent':1, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 2.5, 'colsample_bylevel': 1, 'subsample': 0.95, 'colsample_bytree': 0.85, 'reg_lambda': 0.35}

def evaluate_macroF1_lgb(predictions, truth):
    pred_labels = predictions.argmax(axis=1)
    truth = truth.get_label()
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', 1-f1)

fit_params = {'early_stopping_rounds': 500,
             'eval_metric': evaluate_macroF1_lgb,
             'eval_set': [(X_train, y_train), (X_test, y_test)],
             'verbose': False}

def learning_rate_power_0997(current_iter):
    base_learning_rate = 0.1
    min_learning_rate = 0.02
    lr = base_learning_rate * np.power(.995, current_iter)
    return max(lr, min_learning_rate)

fit_params['verbose'] = 50

In [131]:
np.random.seed(100)

def _parallel_fit_estimator(estimator1, X, y, sample_weight=None, threshold=True, **fit_params):
    estimator = clone(estimator1)
    
    # randomly split the data so we have a test set for early stopping
    if sample_weight is not None:
        X_train, y_train, X_test, y_test, y_train_weight = split_data(X, y, sample_weight, households=train_households)
    else:
        X_train, y_train, X_test, y_test = split_data(X, y, None, households=train_households)
        
    # update the fit params with our new split
    fit_params["eval_set"] = [(X_test,y_test)]
    
    # fit the estimator
    if sample_weight is not None:
        if isinstance(estimator1, ExtraTreesClassifier) or isinstance(estimator1, RandomForestClassifier):
            estimator.fit(X_train, y_train)
        else:
            _ = estimator.fit(X_train, y_train, sample_weight=y_train_weight, **fit_params)
    else:
        if isinstance(estimator1, ExtraTreesClassifier) or isinstance(estimator1, RandomForestClassifier):
            estimator.fit(X_train, y_train)
        else:
            _ = estimator.fit(X_train, y_train, **fit_params)
    
    if not isinstance(estimator1, ExtraTreesClassifier) and not isinstance(estimator1, RandomForestClassifier) and not isinstance(estimator1, xgb.XGBClassifier):
        best_cv_round = np.argmax(estimator.evals_result_['validation_0']['mlogloss'])
        best_cv = np.max(estimator.evals_result_['validation_0']['mlogloss'])
        best_train = estimator.evals_result_['train']['macroF1'][best_cv_round]
    else:
        best_train = f1_score(y_train, estimator.predict(X_train), average="macro")
        best_cv = f1_score(y_test, estimator.predict(X_test), average="macro")
        print("Train F1:", best_train)
        print("Test F1:", best_cv)
        
    # reject some estimators based on their performance on train and test sets
    if threshold:
        # if the valid score is very high we'll allow a little more leeway with the train scores
        if ((best_cv > 0.37) and (best_train > 0.75)) or ((best_cv > 0.44) and (best_train > 0.65)):
            return estimator

        # else recurse until we get a better one
        else:
            print("Unacceptable!!! Trying again...")
            return _parallel_fit_estimator(estimator1, X, y, sample_weight=sample_weight, **fit_params)
    
    else:
        return estimator
    
class VotingClassifierLGBM(VotingClassifier):
    '''
    This implements the fit method of the VotingClassifier propagating fit_params
    '''
    def fit(self, X, y, sample_weight=None, threshold=True, **fit_params):
        
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % self.voting)

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if (self.weights is not None and
                len(self.weights) != len(self.estimators)):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d estimators'
                             % (len(self.weights), len(self.estimators)))

        names, clfs = zip(*self.estimators)
        self._validate_names(names)

        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is '
                             'required to be a classifier!')

        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
                                                 sample_weight=sample_weight, threshold=threshold, **fit_params)
                for clf in clfs if clf is not None)

        return self

In [132]:
clfs = []
for i in range(15):
    clf = xgb.XGBClassifier(random_state=217*i, n_estimators=300, learning_rate=0.15, n_jobs=-1, **opt_parameters)
    
    clfs.append(('xgb{}'.format(i), clf))
    
vc = VotingClassifierLGBM(clfs, voting='soft')
del(clfs)

# Train the final model with learning rate decay
_ = vc.fit(X_train.drop(xgb_drop_cols, axis=1), y_train, sample_weight=y_train_weights, threshold=False, **fit_params)

clf_final = vc.estimators_[0]

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.30039	validation_0-macroF1:0.64244
[50]	validation_0-mlogloss:0.91911	validation_0-macroF1:0.59801
[100]	validation_0-mlogloss:0.91816	validation_0-macroF1:0.61256
[150]	validation_0-mlogloss:0.91795	validation_0-macroF1:0.62226
[200]	validation_0-mlogloss:0.91941	validation_0-macroF1:0.61902
[250]	validation_0-mlogloss:0.92048	validation_0-macroF1:0.61853
[299]	validation_0-mlogloss:0.92179	validation_0-macroF1:0.62086
Train F1: 0.9023897544564117
Test F1: 0.40598167996933926
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip thr

[250]	validation_0-mlogloss:0.91859	validation_0-macroF1:0.58896
[299]	validation_0-mlogloss:0.91927	validation_0-macroF1:0.59010
Train F1: 0.8952954970083489
Test F1: 0.4185828757772069
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.30113	validation_0-macroF1:0.62662
[50]	validation_0-mlogloss:0.94249	validation_0-macroF1:0.61305
[100]	validation_0-mlogloss:0.93815	validation_0-macroF1:0.61161
[150]	validation_0-mlogloss:0.94009	validation_0-macroF1:0.62866
[200]	validation_0-mlogloss:0.94015	validation_0-macroF1:0.62751
[250]	validation_0-mlogloss:0.93901	validation_0-macroF1:0.62519
[299]	validation_0-mlogloss:0.93804	validation_0-macroF1:0.62389
Train F1: 0.8938783060148262
Test F1: 0.39977809923009056
Parameters: { silent 

[100]	validation_0-mlogloss:0.90577	validation_0-macroF1:0.54980
[150]	validation_0-mlogloss:0.90260	validation_0-macroF1:0.55784
[200]	validation_0-mlogloss:0.90185	validation_0-macroF1:0.56932
[250]	validation_0-mlogloss:0.90101	validation_0-macroF1:0.55274
[299]	validation_0-mlogloss:0.90182	validation_0-macroF1:0.56280
Train F1: 0.9121177304574287
Test F1: 0.45772563588510834
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.29922	validation_0-macroF1:0.63620
[50]	validation_0-mlogloss:0.88286	validation_0-macroF1:0.56866
[100]	validation_0-mlogloss:0.87406	validation_0-macroF1:0.55138
[150]	validation_0-mlogloss:0.87344	validation_0-macroF1:0.54867
[200]	validation_0-mlogloss:0.87268	validation_0-macroF1:0.55747
[250]	validat

In [133]:
# params 4 - 400 early stop - 15 estimators - l1 used features - weighted
global_score = f1_score(y_test, clf_final.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')
vc.voting = 'soft'
global_score_soft = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')
vc.voting = 'hard'
global_score_hard = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')

print('Validation score of a single LGBM Classifier: {:.4f}'.format(global_score))
print('Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: {:.4f}'.format(global_score_soft))
print('Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: {:.4f}'.format(global_score_hard))

Validation score of a single LGBM Classifier: 0.8236
Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: 0.9088
Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: 0.9178


In [134]:
# see which features are not used by ANY models
useless_features = []
drop_features = set()
counter = 0
for est in vc.estimators_:
    ranked_features, unused_features = feature_importance(est, X_train.drop(xgb_drop_cols, axis=1), display_results=False)
    useless_features.append(unused_features)
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features.intersection(set(unused_features))
        counter += 1
        
drop_features

{'agg18_estadocivil3_COUNT',
 'agg18_estadocivil4_COUNT',
 'agg18_estadocivil5_COUNT',
 'agg18_parentesco10_MEAN',
 'fe_human_bed_density',
 'fe_people_weird_stat',
 'geo_elimbasu_LE_3',
 'geo_energcocinar_LE_0',
 'geo_epared_LE_2',
 'geo_eviv_LE_0',
 'geo_manual_elec_LE_3',
 'geo_manual_elec_LE_4',
 'geo_pared_LE_0',
 'geo_pared_LE_1',
 'geo_sanitario_LE_0',
 'geo_sanitario_LE_2',
 'hacapo'}

In [135]:
ranked_features = feature_importance(clf_final, X_train.drop(xgb_drop_cols, axis=1))

Feature ranking:
1. feature 59 (0.022441) - agg18_escolari_MAX
2. feature 85 (0.021199) - edjef
3. feature 42 (0.017726) - fe_children_fraction
4. feature 123 (0.015770) - geo_sanitario_LE_0
5. feature 119 (0.015614) - geo_elimbasu_LE_3
6. feature 17 (0.015298) - male
7. feature 74 (0.015210) - agg18_parentesco2_MEAN
8. feature 37 (0.014125) - SQBedjefe
9. feature 3 (0.013019) - hacapo
10. feature 107 (0.011342) - geo_overcrowding
11. feature 104 (0.011086) - geo_hogar_adul
12. feature 112 (0.011028) - geo_etecho_LE_1
13. feature 22 (0.010883) - dependency
14. feature 35 (0.010827) - SQBage
15. feature 41 (0.010744) - SQBmeaned
16. feature 15 (0.010707) - cielorazo
17. feature 110 (0.010428) - geo_eviv_LE_2
18. feature 87 (0.010364) - piso_LE
19. feature 10 (0.010159) - r4m2
20. feature 38 (0.009978) - SQBhogar_nin
21. feature 60 (0.009952) - agg18_escolari_MEAN
22. feature 109 (0.009545) - geo_eviv_LE_1
23. feature 12 (0.009511) - r4t1
24. feature 49 (0.009490) - fe_mobile_density
25.

### Random Forest

In [136]:
et_drop_cols = ['agg18_age_MAX', 'agg18_age_MEAN', 'agg18_age_MIN', 'agg18_dis_MEAN',
       'agg18_escolari_MAX', 'agg18_escolari_MEAN', 'agg18_escolari_MIN',
       'agg18_estadocivil1_COUNT', 'agg18_estadocivil1_MEAN',
       'agg18_estadocivil2_COUNT', 'agg18_estadocivil2_MEAN',
       'agg18_estadocivil3_COUNT', 'agg18_estadocivil3_MEAN',
       'agg18_estadocivil4_COUNT', 'agg18_estadocivil4_MEAN',
       'agg18_estadocivil5_COUNT', 'agg18_estadocivil5_MEAN',
       'agg18_estadocivil6_COUNT', 'agg18_estadocivil6_MEAN',
       'agg18_estadocivil7_COUNT', 'agg18_estadocivil7_MEAN',
       'agg18_parentesco10_COUNT', 'agg18_parentesco10_MEAN',
       'agg18_parentesco11_COUNT', 'agg18_parentesco11_MEAN',
       'agg18_parentesco12_COUNT', 'agg18_parentesco12_MEAN',
       'agg18_parentesco1_COUNT', 'agg18_parentesco1_MEAN',
       'agg18_parentesco2_COUNT', 'agg18_parentesco2_MEAN',
       'agg18_parentesco3_COUNT', 'agg18_parentesco3_MEAN',
       'agg18_parentesco4_COUNT', 'agg18_parentesco4_MEAN',
       'agg18_parentesco5_COUNT', 'agg18_parentesco5_MEAN',
       'agg18_parentesco6_COUNT', 'agg18_parentesco6_MEAN',
       'agg18_parentesco7_COUNT', 'agg18_parentesco7_MEAN',
       'agg18_parentesco8_COUNT', 'agg18_parentesco8_MEAN',
       'agg18_parentesco9_COUNT', 'agg18_parentesco9_MEAN']

et_drop_cols.extend(['idhogar', 'parentesco1', 'fe_rent_per_person', 'fe_rent_per_room', 'fe_tablet_adult_density', 'fe_tablet_density'])

In [137]:
# do the same thing for some extra trees classifiers
ets = []
for i in range(10):
    rf = RandomForestClassifier(max_depth=None, random_state=217+i, n_jobs=-1, n_estimators=700, min_impurity_decrease=1e-3, min_samples_leaf=2, verbose=0, class_weight='balanced')
    ets.append(('rf{}'.format(i), rf))
    
vc2 = VotingClassifierLGBM(ets, voting='soft')
_ = vc2.fit(X_train.drop(et_drop_cols, axis=1), y_train, threshold=False)

Train F1: 0.9017837485614384
Test F1: 0.4226694191575666
Train F1: 0.897162614447881
Test F1: 0.3979130421976129
Train F1: 0.8881944338931395
Test F1: 0.3875998252669628
Train F1: 0.8921515721754021
Test F1: 0.4279783767850778
Train F1: 0.8934553837403496
Test F1: 0.4601330497233288
Train F1: 0.8966981952722992
Test F1: 0.43569219669688775
Train F1: 0.8919247831845957
Test F1: 0.42471715056612247
Train F1: 0.894263835407065
Test F1: 0.46390572961302057
Train F1: 0.906635460295664
Test F1: 0.4338689224939225
Train F1: 0.895577038796373
Test F1: 0.4242719526777626


In [138]:
# w/o threshold, extra drop cols
vc2.voting = 'soft'
global_rf_score_soft = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')
vc2.voting = 'hard'
global_rf_score_hard = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')

print('Validation score of a VotingClassifier on 3 LGBM with soft voting strategy: {:.4f}'.format(global_rf_score_soft))
print('Validation score of a VotingClassifier on 3 LGBM with hard voting strategy: {:.4f}'.format(global_rf_score_hard))

Validation score of a VotingClassifier on 3 LGBM with soft voting strategy: 0.8588
Validation score of a VotingClassifier on 3 LGBM with hard voting strategy: 0.8821


In [139]:
# see which features are not used by ANY models
useless_features =[]
drop_features = set()
counter = 0
for est in vc2.estimators_:
    ranked_features, unused_features = feature_importance(est, X_train.drop(et_drop_cols, axis=1), display_results=False)
    useless_features.append(unused_features)
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features.intersection(set(unused_features))
    counter += 1
    
drop_features

{'parentesco_LE', 'rez_esc'}

In [142]:
def combine_voters(data, weights=[0.5, 0.5]):
    # do soft voting with both classifiers
    vc.voting="soft"
    vc1_probs = vc.predict_proba(data.drop(xgb_drop_cols, axis=1))
    vc2.voting="soft"
    vc2_probs = vc2.predict_proba(data.drop(et_drop_cols, axis=1))
    
    final_vote = (vc1_probs * weights[0]) + (vc2_probs * weights[1])
    predictions = np.argmax(final_vote, axis=1)
    
    return predictions

In [143]:
combo_preds = combine_voters(X_test, weights=[0.5, 0.5])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.9033903866168669

In [144]:
combo_preds = combine_voters(X_test, weights=[0.4, 0.6])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.8957383332482677

In [145]:
combo_preds = combine_voters(X_test, weights=[0.6, 0.4])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.9037477506086342

### Prepare submission

In [146]:
y_subm = pd.DataFrame()
y_subm['Id'] = test_ids

In [147]:
vc.voting = 'soft'
y_subm_lgb = y_subm.copy(deep=True)
y_subm_lgb['Target'] = vc.predict(test.drop(xgb_drop_cols, axis=1)) + 1

vc2.voting = 'soft'
y_subm_rf = y_subm.copy(deep=True)
y_subm_rf['Target'] = vc2.predict(test.drop(et_drop_cols, axis=1)) + 1

y_subm_ens = y_subm.copy(deep=True)
y_subm_ens['Target'] = combine_voters(test) + 1

In [148]:
from datetime import datetime
now = datetime.now()

sub_file_lgb = 'submission_soft_XGB_{:.4f}_{}.csv'.format(global_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))
sub_file_rf = 'submission_soft_RF_{:.4f}_{}.csv'.format(global_rf_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))
sub_file_ens = 'submission_ens_{:.4f}_{}.csv'.format(global_combo_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))

y_subm_lgb.to_csv(sub_file_lgb, index=False)
y_subm_rf.to_csv(sub_file_rf, index=False)
y_subm_ens.to_csv(sub_file_ens, index=False)

# 2회차

- LightGBM 모델은 XGBoost로 대체되었으며 그에 따라 코드가 업데이트됐습니다.
- 또한 랜덤 포레스트의 투표 분류기를 장착하고 XGB의 결과를 RF와 결합합니다.
- 데이터를 한 번 쪼개서 LGBM 조기 정지에 대한 검증 데이터를 사용하는 대신 전체 교육 세트를 훈련할 수 있도록 교육 중 데이터를 분할했습니다. 이것이 이 케이스에서 kfold 분할보다 더 효과가 있음을 발견했습니다.

- 이 커널은 하이퍼 파라미터 최적화를 실행하는 대신 해당 커널의 최적값을 사용하여 더 빨리 실행됩니다.

- __몇 가지 주요 요점은 다음과 같습니다.__:
    - 이 커널은 가장에 대해서만 교육을 실행합니다. 
    - 클래스 크기의 균형을 맞추는 것이 매우 중요해보입니다. 이는 수작업으로 할 수 있고, 언더 샘플링으로 달성할 수 있습니다. 그러나 sklearn API의 LGBM 모델 생성자에 `class_weight='balance'`을 설정하는 것이 가장 간단합니다.
    - 이 커널은 매크로 F1점수를 사용하여 교육을 조기 중단합니다. 
    - 범주는 블라인드 레이블 인코딩 대신 적절한 매핑을 가진 숫자로 바뀝니다.
    - OHE를 라벨 인코딩으로 반전시키면, 트리 모델에 더 적합할 수 있으나 , 논트리 모델에는 안 좋을 수 있으니 유의해야합니다.
    - `idhogar`는 훈련에서 사용되지 않습니다. 정보가 있을 수 있는 유일한 방법은 데이터 누출입니다. 
    - 가구 내에서 집계가 이루어지며 새로운 기능은 수작업으로 제작됩니다. 대부분의 기능이 이미 가정 수준에서 인용되었기 때문에 집계할 수 있는 기능이 많지 않습니다.
    - 투표 분류기는 여러 LGBM 모델에서 평균을 내는데 사용됩니다

### import 

In [149]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import class_weight
import warnings
warnings.filterwarnings('ignore')

### mapping

In [151]:
from sklearn.preprocessing import LabelEncoder

def encode_data(df):
    df['idhogar'] = LabelEncoder().fit_transform(df['idhogar'])
    
def feature_importance(forest, X_train, display_results=True):
    ranked_list=[]
    zero_features=[]
    
    importances = forest.feature_importances_
    
    indices = np.argsort(importances)[::-1]
    
    if display_results:
        print('Feature ranking:')
        
    for f in range(X_train.shape[1]):
        if display_results:
            print('%d. feature %d (%f)' % (f+1, indices[f], importances[indices[f]]) + ' - ' + X_train.columns[indices[f]])
            
        ranked_list.append(X_train.columns[indices[f]])
        
        if importances[indices[f]] == 0.0:
            zero_features.append(X_train.columns[indices[f]])
            
    return ranked_list, zero_features

### feature engineering

In [152]:
def do_features(df):
    feats_div = [('children_fraction', 'r4t1', 'r4t3'), 
                 ('working_man_fraction', 'r4h2', 'r4t3'),
                 ('all_man_fraction', 'r4h3', 'r4t3'),
                 ('human_density', 'tamviv', 'rooms'),
                 ('human_bed_density', 'tamviv', 'bedrooms'),
                 ('rent_per_person', 'v2a1', 'r4t3'),
                 ('rent_per_room', 'v2a1', 'rooms'),
                 ('mobile_density', 'qmobilephone', 'r4t3'),
                 ('tablet_density', 'v18q1', 'r4t3'),
                 ('mobile_adult_density', 'qmobilephone', 'r4t2'),
                 ('tablet_adult_density', 'v18q1', 'r4t2'),
                ]
    
    feats_sub = [('people_not_living', 'tamhog', 'tamviv'),
                 ('people_weird_stat', 'tamhog', 'r4t3')]
    
    for f_new, f1, f2 in feats_div:
        df['fe_' + f_new] = (df[f1] / df[f2]).astype(np.float32)
    for f_new, f1, f2 in feats_sub:
        df['fe_' + f_new] = (df[f1] - df[f2]).astype(np.float32)
        
    aggs_num = {'age': ['min', 'max', 'mean'],
               'escolari': ['min', 'max', 'mean']}
    
    aggs_cat = {'dis': ['mean']}
    for s_ in ['estadocivil', 'parentesco', 'instlevel']:
        for f_ in [f_ for f_ in df.columns if f_.startswith(s_)]:
            aggs_cat[f_] = ['mean', 'count']
            
    for name_, df_ in [('18', df.query('age >= 18'))]:
        df_agg = df_.groupby('idhogar').agg({**aggs_num, **aggs_cat}).astype(np.float32)
        df_agg.columns = pd.Index(['agg' + name_ + '_' + e[0] + '_' + e[1].upper() for e in df_agg.columns.tolist()])
        df = df.join(df_agg, how='left', on='idhogar')
        del df_agg
        
    df.drop(['Id'], axis=1, inplace=True)
    
    return df

In [168]:
# convert one hot edcoded fields to label encoding
def convert_OHE2LE(df):
    tmp_df = df.copy(deep=True)
    for s_ in ['pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu', 
               'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 
               'instlevel', 'lugar', 'tipovivi',
               'manual_elec']:
        if 'manual_' not in s_:
            cols_s_ = [f_ for f_ in df.columns if f_.startswith(s_)]
        elif 'elec' in s_:
            cols_s_ = ['public', 'planpri', 'noelec', 'coopele']
        sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
        #deal with those OHE, where there is a sum over columns == 0
        if 0 in sum_ohe:
            print('The OHE in {} is incomplete. A new column will be added before label encoding'
                  .format(s_))
            # dummy colmn name to be added
            col_dummy = s_+'_dummy'
            # add the column to the dataframe
            tmp_df[col_dummy] = (tmp_df[cols_s_].sum(axis=1) == 0).astype(np.int8)
            # add the name to the list of columns to be label-encoded
            cols_s_.append(col_dummy)
            # proof-check, that now the category is complete
            sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
            if 0 in sum_ohe:
                 print("The category completion did not work")
        tmp_cat = tmp_df[cols_s_].idxmax(axis=1)
        tmp_df[s_ + '_LE'] = LabelEncoder().fit_transform(tmp_cat).astype(np.int16)
        if 'parentesco1' in cols_s_:
            cols_s_.remove('parentesco1')
        tmp_df.drop(cols_s_, axis=1, inplace=True)
    return tmp_df

### Read in the data and clean it up

In [169]:
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

test_ids = test.Id

In [170]:
def process_df(df_):
    encode_data(df_)
    
    return do_features(df_)

train = process_df(train)
test = process_df(test)

결측치를 정리하고 오브젝트를 숫자형으로 변환합니다.

In [171]:
# Some dependencies are Na, fill those with the square root of the square
train['dependency'] = np.sqrt(train['SQBdependency'])
test['dependency'] = np.sqrt(test['SQBdependency'])

# fill 'no's for education with 0s
train.loc[train['edjefa'] == 'no', 'edjefa'] = 0
train.loc[train['edjefe'] == 'no', 'edjefe'] = 0
test.loc[test['edjefa'] == 'no', 'edjefa'] = 0
test.loc[test['edjefe'] == 'no', 'edjefe'] = 0

# if education is 'yes' and person is head of household, fill with escolari
train.loc[(train['edjefa'] == 'yes') & (train['parentesco1'] == 1), 'edjefa'] = train.loc[(train['edjefa'] == 'yes') & (train['parentesco1'] == 1), 'escolari']
train.loc[(train['edjefe'] == 'yes') & (train['parentesco1'] == 1), 'edjefe'] = train.loc[(train['edjefe'] == 'yes') & (train['parentesco1'] == 1), 'escolari']

test.loc[(test['edjefa'] == 'yes') & (test['parentesco1'] == 1), 'edjefa'] = test.loc[(test['edjefa'] == 'yes') & (test['parentesco1'] == 1), 'escolari']
test.loc[(test['edjefe'] == 'yes') & (test['parentesco1'] == 1), 'edjefe'] = test.loc[(test['edjefe'] == 'yes') & (test['parentesco1'] == 1), 'escolari']

# this field is supposed to be interaction between gender and escolari, but it isn't clear what 'yes' means. let's fill it with 4
train.loc[train['edjefa'] == 'yes', 'edjefa'] = 4
train.loc[train['edjefe'] == 'yes', 'edjefe'] = 4

test.loc[test['edjefa'] == 'yes', 'edjefa'] = 4
test.loc[test['edjefe'] == 'yes', 'edjefe'] = 4

# Convert to int for our models
train['edjefe'] = train['edjefe'].astype('int')
train['edjefa'] = train['edjefa'].astype('int')
test['edjefe'] = test['edjefe'].astype('int')
test['edjefa'] = test['edjefa'].astype('int')

# create feature with max education of either head of household
train['edjef'] = np.max(train[['edjefa', 'edjefe']], axis=1)
test['edjef'] = np.max(test[['edjefa', 'edjefe']], axis=1)

# fill some nas
train['v2a1'] = train['v2a1'].fillna(0)
test['v2a1'] = test['v2a1'].fillna(0)

train['v18q1'] = train['v18q1'].fillna(0)
test['v18q1'] = test['v18q1'].fillna(0)

train['rez_esc'] = train['rez_esc'].fillna(0)
test['rez_esc'] = test['rez_esc'].fillna(0)

train.loc[train.meaneduc.isnull(), 'meaneduc'] = 0
train.loc[train.SQBmeaned.isnull(), 'SQBmeaned'] = 0

test.loc[test.meaneduc.isnull(), 'meaneduc'] = 0
test.loc[test.SQBmeaned.isnull(), 'SQBmeaned'] = 0

# fix some inconsistencies in the data - some rows indicate both that the household does and does not not have a toilet,
# if there is no water we'll assume they do not
train.loc[(train.v14a == 1) & (train.sanitario1 == 1) & (train.abastaguano == 0), 'v14a'] = 0
train.loc[(train.v14a == 1) & (train.sanitario1 == 1) & (train.abastaguano == 0), 'sanitario'] = 0

test.loc[(test.v14a == 1) & (test.sanitario1 == 1) & (test.abastaguano == 0), 'v14a'] = 0
test.loc[(test.v14a == 1) & (test.sanitario1 == 1) & (test.abastaguano == 0), 'sanitario'] = 0

In [172]:
def train_test_apply_func(train_, test_, func_):
    test_['Target'] = 0
    xx = pd.concat([train_, test_])
    
    xx_func = func_(xx)
    train_ = xx_func.iloc[:train_.shape[0], :]
    test_ = xx_func.iloc[train_.shape[0]:, :].drop('Target', axis=1)
    
    del xx, xx_func
    return train_, test_

In [173]:
# convert the one hot fields into label encoded
train, test = train_test_apply_func(train, test, convert_OHE2LE)

The OHE in techo is incomplete. A new column will be added before label encoding
The OHE in instlevel is incomplete. A new column will be added before label encoding
The OHE in manual_elec is incomplete. A new column will be added before label encoding


### geo aggregates

In [174]:
cols_2_ohe = ['eviv_LE', 'etecho_LE', 'epared_LE', 'elimbasu_LE', 'energcocinar_LE', 'sanitario_LE', 'manual_elec_LE', 'pared_LE']
cols_nums = ['age', 'meaneduc', 'dependency', 'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total', 'bedrooms' ,'overcrowding']

def convert_geo2aggs(df_):
    tmp_df = pd.concat([df_[(['lugar_LE', 'idhogar'] + cols_nums)], pd.get_dummies(df_[cols_2_ohe], columns=cols_2_ohe)], axis=1)
    
    geo_agg = tmp_df.groupby(['lugar_LE', 'idhogar']).mean().groupby('lugar_LE').mean().astype(np.float32)
    geo_agg.columns = pd.Index(['geo_' + e for e in geo_agg.columns.tolist()])
    
    del tmp_df
    return df_.join(geo_agg, how='left', on='lugar_LE')

train, test = train_test_apply_func(train, test, convert_geo2aggs)

In [175]:
# add the number of people over 18 in each household
train['num_over_18'] = 0
train['num_over_18'] = train[train.age >= 18].groupby('idhogar').transform('count')
train['num_over_18'] = train.groupby('idhogar')['num_over_18'].transform('max')
train['num_over_18'] = train['num_over_18'].fillna(0)

test['num_over_18'] = 0
test['num_over_18'] = test[test.age >= 18].groupby('idhogar').transform('count')
test['num_over_18'] = test.groupby('idhogar')['num_over_18'].transform('max')
test['num_over_18'] = test['num_over_18'].fillna(0)

# add some extra features, these were taken from another kernel
def extract_features(df):
    df['bedrooms_to_rooms'] = df['bedrooms'] / df['rooms']
    df['rent_to_rooms'] = df['v2a1'] / df['rooms']
    df['tamhog_to_rooms'] = df['tamhog'] / df['rooms'] # tamhog - size of the household
    df['r4t3_to_tamhog'] = df['r4t3'] / df['tamhog'] # r4t3 - Total persons in the household
    df['r4t3_to_rooms'] = df['r4t3'] / df['rooms']
    df['v2a1_to_r4t3'] = df['v2a1'] / df['r4t3'] # rent to people in household
    df['v2a1_to_under_12'] = df['v2a1'] / (df['r4t3'] - df['r4t1']) # rent to people under age 12
    df['hhsize_to_rooms'] = df['hhsize'] / df['rooms'] # rooms per person
    df['rent_to_hhsize'] = df['v2a1'] / df['hhsize'] # rent to household size
    df['rent_to_over_18'] = df['v2a1'] / df['num_over_18'] # some households have no one over 18, use the total rent for those
    df.loc[df.num_over_18 == 0, 'rent_to_over_18'] = df[df.num_over_18 == 0].v2a1
    
extract_features(train)
extract_features(test)

In [176]:
needless_cols = ['r4t3', 'tamhog', 'tamviv', 'hhsize', 'v18q', 'v14a' ,'agesq', 'mobilephone', 'female']

instlevel_cols = [s for s in train.columns.tolist() if 'instlevel' in s]

needless_cols.extend(instlevel_cols)

train = train.drop(needless_cols, axis=1)
test = test.drop(needless_cols, axis=1)

### Split the data
동일한 가구에 속하는 행은 대개 동일한 타겟을 가지므로, 누출을 방지하기 위해 가구별로 데이터를 분할합니다. 가장만 포함하도록 데이터를 필터링하고 이는 기술적으로는 필요하지 않지만, 그렇게 하면 전체 교육 데이터 세트를 쉽게 사용할 수 있습니다.

데이터를 분할한 후에는 열차 데이터를 전체 데이터 세트로 덮어써 모든 데이터를 훈련할 수 있습니다. split_data 기능은 데이터를 덮어쓰지 않고 동일한 작업을 수행하며, 교육 루프 내에서 K-Fold 분할을 근사화하는 데 사용됩니다.

In [177]:
def split_data(train, y, sample_weight=None, households=None, test_percentage=0.20, seed=None):
    
    train2 = train.copy()
    
    cv_hhs = np.random.choice(households, size=int(len(households) * test_percentage), replace=False)
    
    cv_idx = np.isin(households, cv_hhs)
    X_test = train2[cv_idx]
    y_test = y[cv_idx]
    
    X_train = train2[~cv_idx]
    y_train = y[~cv_idx]
    
    if sample_weight is not None:
        y_train_weights = sample_weight[~cv_idx]
        return X_train, y_train, X_test, y_test, y_train_weights
    
    return X_train, y_train, X_test, y_test

In [178]:
X = train.query('parentesco1==1')

y = X['Target'] - 1
X = X.drop(['Target'], axis=1)

np.random.seed(seed=None)

train2 = X.copy()

train_hhs = train2.idhogar

households = train2.idhogar.unique()
cv_hhs = np.random.choice(households, size=int(len(households) * 0.15), replace=False)

cv_idx = np.isin(train2.idhogar, cv_hhs)

X_test = train2[cv_idx]
y_test = y[cv_idx]

X_train = train2[~cv_idx]
y_train = y[~cv_idx]

X_train = train2
y_train = y

train_households = X_train.idhogar

In [179]:
y_train_weights = class_weight.compute_sample_weight('balanced', y_train, indices=None)

In [180]:
# drop some features which aren't used by the LGBM or have very low importance
extra_drop_features = [
 'agg18_estadocivil1_MEAN',
 'agg18_estadocivil6_COUNT',
 'agg18_estadocivil7_COUNT',
 'agg18_parentesco10_COUNT',
 'agg18_parentesco11_COUNT',
 'agg18_parentesco12_COUNT',
 'agg18_parentesco1_COUNT',
 'agg18_parentesco2_COUNT',
 'agg18_parentesco3_COUNT',
 'agg18_parentesco4_COUNT',
 'agg18_parentesco5_COUNT',
 'agg18_parentesco6_COUNT',
 'agg18_parentesco7_COUNT',
 'agg18_parentesco8_COUNT',
 'agg18_parentesco9_COUNT',
 'geo_elimbasu_LE_4',
 'geo_energcocinar_LE_1',
 'geo_energcocinar_LE_2',
 'geo_epared_LE_0',
 'geo_hogar_mayor',
 'geo_manual_elec_LE_2',
 'geo_pared_LE_3',
 'geo_pared_LE_4',
 'geo_pared_LE_5',
 'geo_pared_LE_6',
 'num_over_18',
 'parentesco_LE',
 'rez_esc']

In [181]:
xgb_drop_cols = extra_drop_features + ['idhogar', 'parentesco1']

### Fit a voting classifier
조기 중지를 위해 fit_params를 전달할 수 있도록 VotingClassifier을 정의합니다. LGBM에 기반하여 매크로 F1과 학습률의 줄어듬에 따라 조기 중지합니다.

In [182]:
opt_parameters = {'max_depth':35, 'eta':0.15, 'silent':1, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 2.5, 'colsample_bylevel': 1, 'subsample': 0.95, 'colsample_bytree': 0.85, 'reg_lambda': 0.35}

def evaluate_macroF1_lgb(predictions, truth):
    pred_labels = predictions.argmax(axis=1)
    truth = truth.get_label()
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macro1', 1-f1)

fit_params = {'early_stopping_rounds': 500,
             'eval_metric': evaluate_macroF1_lgb,
             'eval_set': [(X_train, y_train), (X_test, y_test)],
             'verbose': False}

def learning_rate_power_0997(current_iter):
    base_learning_rate = 0.1
    min_learning_rate = 0.02
    lr = base_learning_rate * np.power(.995, current_iter)
    return max(lr, min_learning_rate)

fit_params['verbose'] = 50

In [183]:
np.random.seed(100)

def _parallel_fit_estimator(estimator1, X, y, sample_weight=None, threshold=True, **fit_params):
    estimator = clone(estimator1)
    
    # randomly split the data so we have a test set for early stopping
    if sample_weight is not None:
        X_train, y_train, X_test, y_test, y_train_weight = split_data(X, y, sample_weight, households=train_households)
    else:
        X_train, y_train, X_test, y_test = split_data(X, y, None, households=train_households)
        
    # update the fit params with our new split
    fit_params["eval_set"] = [(X_test,y_test)]
    
    # fit the estimator
    if sample_weight is not None:
        if isinstance(estimator1, ExtraTreesClassifier) or isinstance(estimator1, RandomForestClassifier):
            estimator.fit(X_train, y_train)
        else:
            _ = estimator.fit(X_train, y_train, sample_weight=y_train_weight, **fit_params)
    else:
        if isinstance(estimator1, ExtraTreesClassifier) or isinstance(estimator1, RandomForestClassifier):
            estimator.fit(X_train, y_train)
        else:
            _ = estimator.fit(X_train, y_train, **fit_params)
    
    if not isinstance(estimator1, ExtraTreesClassifier) and not isinstance(estimator1, RandomForestClassifier) and not isinstance(estimator1, xgb.XGBClassifier):
        best_cv_round = np.argmax(estimator.evals_result_['validation_0']['mlogloss'])
        best_cv = np.max(estimator.evals_result_['validation_0']['mlogloss'])
        best_train = estimator.evals_result_['train']['macroF1'][best_cv_round]
    else:
        best_train = f1_score(y_train, estimator.predict(X_train), average="macro")
        best_cv = f1_score(y_test, estimator.predict(X_test), average="macro")
        print("Train F1:", best_train)
        print("Test F1:", best_cv)
        
    # reject some estimators based on their performance on train and test sets
    if threshold:
        # if the valid score is very high we'll allow a little more leeway with the train scores
        if ((best_cv > 0.37) and (best_train > 0.75)) or ((best_cv > 0.44) and (best_train > 0.65)):
            return estimator

        # else recurse until we get a better one
        else:
            print("Unacceptable!!! Trying again...")
            return _parallel_fit_estimator(estimator1, X, y, sample_weight=sample_weight, **fit_params)
    
    else:
        return estimator
    
class VotingClassifierLGBM(VotingClassifier):
    '''
    This implements the fit method of the VotingClassifier propagating fit_params
    '''
    def fit(self, X, y, sample_weight=None, threshold=True, **fit_params):
        
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % self.voting)

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if (self.weights is not None and
                len(self.weights) != len(self.estimators)):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d estimators'
                             % (len(self.weights), len(self.estimators)))

        names, clfs = zip(*self.estimators)
        self._validate_names(names)

        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is '
                             'required to be a classifier!')

        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
                                                 sample_weight=sample_weight, threshold=threshold, **fit_params)
                for clf in clfs if clf is not None)

        return self

In [184]:
clfs = []
for i in range(15):
    clf = xgb.XGBClassifier(random_state=217*i, n_estimators=300, learning_rate=0.15, n_jobs=-1, **opt_parameters)
    
    clfs.append(('xgb{}'.format(i), clf))
    
vc = VotingClassifierLGBM(clfs, voting='soft')

_ = vc.fit(X_train.drop(xgb_drop_cols, axis=1), y_train, sample_weight=y_train_weights, threshold=False, **fit_params)

clf_final = vc.estimators_[0]

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.30039	validation_0-macro1:0.64244
[50]	validation_0-mlogloss:0.91911	validation_0-macro1:0.59801
[100]	validation_0-mlogloss:0.91816	validation_0-macro1:0.61256
[150]	validation_0-mlogloss:0.91795	validation_0-macro1:0.62226
[200]	validation_0-mlogloss:0.91941	validation_0-macro1:0.61902
[250]	validation_0-mlogloss:0.92048	validation_0-macro1:0.61853
[299]	validation_0-mlogloss:0.92179	validation_0-macro1:0.62086
Train F1: 0.9023897544564117
Test F1: 0.40598167996933926
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through th

[250]	validation_0-mlogloss:0.91859	validation_0-macro1:0.58896
[299]	validation_0-mlogloss:0.91927	validation_0-macro1:0.59010
Train F1: 0.8952954970083489
Test F1: 0.4185828757772069
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.30113	validation_0-macro1:0.62662
[50]	validation_0-mlogloss:0.94249	validation_0-macro1:0.61305
[100]	validation_0-mlogloss:0.93815	validation_0-macro1:0.61161
[150]	validation_0-mlogloss:0.94009	validation_0-macro1:0.62866
[200]	validation_0-mlogloss:0.94015	validation_0-macro1:0.62751
[250]	validation_0-mlogloss:0.93901	validation_0-macro1:0.62519
[299]	validation_0-mlogloss:0.93804	validation_0-macro1:0.62389
Train F1: 0.8938783060148262
Test F1: 0.39977809923009056
Parameters: { silent } might n

[150]	validation_0-mlogloss:0.90260	validation_0-macro1:0.55784
[200]	validation_0-mlogloss:0.90185	validation_0-macro1:0.56932
[250]	validation_0-mlogloss:0.90101	validation_0-macro1:0.55274
[299]	validation_0-mlogloss:0.90182	validation_0-macro1:0.56280
Train F1: 0.9121177304574287
Test F1: 0.45772563588510834
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.29922	validation_0-macro1:0.63620
[50]	validation_0-mlogloss:0.88286	validation_0-macro1:0.56866
[100]	validation_0-mlogloss:0.87406	validation_0-macro1:0.55138
[150]	validation_0-mlogloss:0.87344	validation_0-macro1:0.54867
[200]	validation_0-mlogloss:0.87268	validation_0-macro1:0.55747
[250]	validation_0-mlogloss:0.87244	validation_0-macro1:0.56708
[299]	validation_0-mlog

In [185]:
global_score = f1_score(y_test, clf_final.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')
vc.voting = 'soft'
global_score_soft = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')
vc.voting = 'hard'
global_score_hard = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')

print('단일 lgbm 분류기 교차 평가 점수: {:.4f}'.format(global_score))
print('3 lgbm + soft 보팅: {:.4f}'.format(global_score_soft))
print('3 lgbm + hard 보팅: {:.4f}'.format(global_score_hard))

단일 lgbm 분류기 교차 평가 점수: 0.8463
3 lgbm + soft 보팅: 0.9342
3 lgbm + hard 보팅: 0.9361


In [186]:
useless_features = []
drop_features = set()
counter = 0
for est in vc.estimators_:
    ranked_features, unused_features = feature_importance(est, X_train.drop(xgb_drop_cols, axis=1), display_results=False)
    useless_features.append(unused_features)
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features.intersection(set(unused_features))
        counter+=1
        
drop_features

{'agg18_estadocivil3_COUNT',
 'agg18_estadocivil4_COUNT',
 'agg18_estadocivil5_COUNT',
 'agg18_parentesco10_MEAN',
 'fe_human_bed_density',
 'fe_people_weird_stat',
 'geo_elimbasu_LE_3',
 'geo_energcocinar_LE_0',
 'geo_epared_LE_2',
 'geo_eviv_LE_0',
 'geo_manual_elec_LE_3',
 'geo_manual_elec_LE_4',
 'geo_pared_LE_0',
 'geo_pared_LE_1',
 'geo_sanitario_LE_0',
 'geo_sanitario_LE_2',
 'hacapo'}

In [187]:
ranked_features = feature_importance(clf_final, X_train.drop(xgb_drop_cols, axis=1))

Feature ranking:
1. feature 59 (0.022441) - agg18_escolari_MAX
2. feature 85 (0.021199) - edjef
3. feature 42 (0.017726) - fe_children_fraction
4. feature 123 (0.015770) - geo_sanitario_LE_0
5. feature 119 (0.015614) - geo_elimbasu_LE_3
6. feature 17 (0.015298) - male
7. feature 74 (0.015210) - agg18_parentesco2_MEAN
8. feature 37 (0.014125) - SQBedjefe
9. feature 3 (0.013019) - hacapo
10. feature 107 (0.011342) - geo_overcrowding
11. feature 104 (0.011086) - geo_hogar_adul
12. feature 112 (0.011028) - geo_etecho_LE_1
13. feature 22 (0.010883) - dependency
14. feature 35 (0.010827) - SQBage
15. feature 41 (0.010744) - SQBmeaned
16. feature 15 (0.010707) - cielorazo
17. feature 110 (0.010428) - geo_eviv_LE_2
18. feature 87 (0.010364) - piso_LE
19. feature 10 (0.010159) - r4m2
20. feature 38 (0.009978) - SQBhogar_nin
21. feature 60 (0.009952) - agg18_escolari_MEAN
22. feature 109 (0.009545) - geo_eviv_LE_1
23. feature 12 (0.009511) - r4t1
24. feature 49 (0.009490) - fe_mobile_density
25.

### Random Forest

In [188]:
et_drop_cols = ['agg18_age_MAX', 'agg18_age_MEAN', 'agg18_age_MIN', 'agg18_dis_MEAN',
       'agg18_escolari_MAX', 'agg18_escolari_MEAN', 'agg18_escolari_MIN',
       'agg18_estadocivil1_COUNT', 'agg18_estadocivil1_MEAN',
       'agg18_estadocivil2_COUNT', 'agg18_estadocivil2_MEAN',
       'agg18_estadocivil3_COUNT', 'agg18_estadocivil3_MEAN',
       'agg18_estadocivil4_COUNT', 'agg18_estadocivil4_MEAN',
       'agg18_estadocivil5_COUNT', 'agg18_estadocivil5_MEAN',
       'agg18_estadocivil6_COUNT', 'agg18_estadocivil6_MEAN',
       'agg18_estadocivil7_COUNT', 'agg18_estadocivil7_MEAN',
       'agg18_parentesco10_COUNT', 'agg18_parentesco10_MEAN',
       'agg18_parentesco11_COUNT', 'agg18_parentesco11_MEAN',
       'agg18_parentesco12_COUNT', 'agg18_parentesco12_MEAN',
       'agg18_parentesco1_COUNT', 'agg18_parentesco1_MEAN',
       'agg18_parentesco2_COUNT', 'agg18_parentesco2_MEAN',
       'agg18_parentesco3_COUNT', 'agg18_parentesco3_MEAN',
       'agg18_parentesco4_COUNT', 'agg18_parentesco4_MEAN',
       'agg18_parentesco5_COUNT', 'agg18_parentesco5_MEAN',
       'agg18_parentesco6_COUNT', 'agg18_parentesco6_MEAN',
       'agg18_parentesco7_COUNT', 'agg18_parentesco7_MEAN',
       'agg18_parentesco8_COUNT', 'agg18_parentesco8_MEAN',
       'agg18_parentesco9_COUNT', 'agg18_parentesco9_MEAN']

et_drop_cols.extend(['idhogar', 'parentesco1', 'fe_rent_per_person', 'fe_rent_per_room', 'fe_tablet_adult_density', 'fe_tablet_density'])

In [189]:
ets = []
for i in range(10):
    rf = RandomForestClassifier(max_depth=None, random_state=217*i, n_jobs=-1, n_estimators=700, min_impurity_decrease=1e-3, min_samples_leaf=2, verbose=0, class_weight='balanced')
    ets.append(('rf{}'.format(i), rf))
    
vc2 = VotingClassifierLGBM(ets, voting='soft')
_ = vc2.fit(X_train.drop(et_drop_cols, axis=1), y_train, threshold=False)

Train F1: 0.8960097986045293
Test F1: 0.4197620915001753
Train F1: 0.8949242875694011
Test F1: 0.3962328973434437
Train F1: 0.8896363360215604
Test F1: 0.3990621498954871
Train F1: 0.8916606744530696
Test F1: 0.4209929529814863
Train F1: 0.8930001004603103
Test F1: 0.4563449843062528
Train F1: 0.8952465781659489
Test F1: 0.4276967858518299
Train F1: 0.8917648965754281
Test F1: 0.4154181763083784
Train F1: 0.8935472605642061
Test F1: 0.4749914411703019
Train F1: 0.9055709347245716
Test F1: 0.4316801709058267
Train F1: 0.8944678384804032
Test F1: 0.4192605691105316


In [190]:
vc2.voting = 'soft'
global_rf_score_soft = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')
vc2.voting = 'hard'
global_rf_score_hard = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')

print('Validation score of a VotingClassifier on 3 LGBM with soft voting strategy: {:.4f}'.format(global_rf_score_soft))
print('Validation score of a VotingClassifier on 3 LGBM with hard voting strategy: {:.4f}'.format(global_rf_score_hard))

Validation score of a VotingClassifier on 3 LGBM with soft voting strategy: 0.8830
Validation score of a VotingClassifier on 3 LGBM with hard voting strategy: 0.8981


In [191]:
useless_features = []
drop_features = set()
counter = 0
for est in vc2.estimators_:
    ranked_features, unused_features = feature_importance(est, X_train.drop(et_drop_cols, axis=1), display_results=False)
    useless_features.append(unused_features)
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features.intersection(set(unused_features))
    counter += 1
    
drop_features

{'parentesco_LE', 'rez_esc'}

In [198]:
def combine_voters(data, weights=[0.5, 0.5]):
    vc.voting = 'soft'
    vc1_probs = vc.predict_proba(data.drop(xgb_drop_cols, axis=1))
    vc2.voting = 'soft'
    vc2_probs = vc2.predict_proba(data.drop(et_drop_cols, axis=1))
    
    final_vote = (vc1_probs * weights[0]) + (vc2_probs * weights[1])
    predictions = np.argmax(final_vote, axis=1)
    
    return predictions

In [199]:
combo_preds = combine_voters(X_test, weights=[0.5, 0.5])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.9256488044654121

In [200]:
combo_preds = combine_voters(X_test, weights=[0.4, 0.6])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.9169118840613286

In [201]:
combo_preds = combine_voters(X_test, weights=[0.6, 0.4])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.9297333720410643

### Prepare submission

In [202]:
y_subm = pd.DataFrame()
y_subm['Id'] = test_ids

In [203]:
vc.voting = 'soft'
y_subm_lgb = y_subm.copy(deep=True)
y_subm_lgb['Target'] = vc.predict(test.drop(xgb_drop_cols, axis=1)) + 1

vc2.voting = 'soft'
y_subm_rf = y_subm.copy(deep=True)
y_subm_rf['Target'] = vc2.predict(test.drop(et_drop_cols, axis=1)) + 1

y_subm_ens = y_subm.copy(deep=True)
y_subm_ens['Target'] = combine_voters(test) + 1

In [204]:
from datetime import datetime
now = datetime.now()

sub_file_lgb = 'submission_soft_XGB_{:.4f}_{}.csv'.format(global_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))
sub_file_rf = 'submission_soft_RF_{:.4f}_{}.csv'.format(global_rf_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))
sub_file_rns = 'submission_soft_ens_{:.4f}_{}.csv'.format(global_combo_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))

y_subm_lgb.to_csv(sub_file_lgb, index=False)
y_subm_rf.to_csv(sub_file_rf, index=False)
y_subm_ens.to_csv(sub_file_ens, index=False)