In [1]:
def Gini(actual, pred):
    assert (len(actual) == len(pred))
    all = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    all = all[np.lexsort((all[:, 2], -1 * all[:, 1]))]
    totalLosses = all[:, 0].sum()
    GiniSum = all[:, 0].cumsum().sum() / totalLosses

    GiniSum -= (len(actual) + 1) / 2.
    return GiniSum / len(actual)


def Gini_normalized(actual, pred):
    return Gini(actual, pred) / Gini(actual, actual)

In [2]:
import pandas as pd

# 훈련/테스트 데이터를 읽어온다
train = pd.read_csv("../input/train.csv")
train_label = train['target']
train_id = train['id']

In [3]:
train_id

0               7
1               9
2              13
3              16
4              17
           ...   
595207    1488013
595208    1488016
595209    1488017
595210    1488021
595211    1488027
Name: id, Length: 595212, dtype: int64

In [4]:
del train['target'], train['id']

In [5]:
test = pd.read_csv("../input/test.csv")
test_id = test['id']
del test['id']

In [6]:
# 파생 변수 01 : 결측값을 의미하는 “-1”의 개수를 센다
train['missing'] = (train==-1).sum(axis=1).astype(float)
test['missing'] = (test==-1).sum(axis=1).astype(float)

In [7]:
train['missing']

0         1.0
1         2.0
2         3.0
3         0.0
4         2.0
         ... 
595207    1.0
595208    3.0
595209    2.0
595210    2.0
595211    3.0
Name: missing, Length: 595212, dtype: float64

In [8]:
# 파생 변수 02 : 이진 변수의 합
bin_features = [c for c in train.columns if 'bin' in c]
train['bin_sum'] = train[bin_features].sum(axis=1)
test['bin_sum'] = test[bin_features].sum(axis=1)

In [9]:
train['bin_sum']

0         5
1         5
2         5
3         2
4         4
         ..
595207    6
595208    6
595209    3
595210    5
595211    3
Name: bin_sum, Length: 595212, dtype: int64

In [10]:
# 파생 변수 03 : 단일변수 타겟 비율 분석으로 선정한 변수를 기반으로 Target Encoding을 수행한다. Target Encoding은 교차 검증 과정에서 진행한다.
features = ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_12_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_ind_04_cat',
            'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
            'ps_car_11_cat', 'ps_ind_01', 'ps_ind_03', 'ps_ind_15', 'ps_car_11']

In [11]:
# LightGBM 모델의 설정값이다.
num_boost_round = 10000
params = {"objective": "binary",    # 목표: 이진 분류
          "boosting_type": "gbdt",  # 부스팅방법: Gradient Boosted Decision Trees
          "learning_rate": 0.1,     # 학습률: 기본값(0.1), 제약(learning_rate > 0.0)
          "num_leaves": 15,         # 한 나무의 최대 잎 수: 기본값(31), 제약(1 < num_leaves <= 131072)
          "max_bin": 256,           # 히스토그램 빈 갯수: 기본값(255), 제약(max_bin > 1)
          "feature_fraction": 0.6,  # colum 샘플링 비율: 기본값(1.0), 제약(0.0 < feature_fraction <= 1.0)
          "verbosity": 0,           # LightGBM의 상세 수준을 제어, 출력내용 조절: 기본값(1), 0=오류, 1=정보
          "drop_rate": 0.1,         # 드랍아웃 비율: 기본값(0.1), 제약(0.0 <= drop_rate <= 1.0), used only in dart(boosting_type) # 여기서는 왜 사용했는지 모르겠음
          "is_unbalance": False,    # 불균형 셋 조정: 기본값(False), used only in binary and multiclassova applications
          "max_drop": 50,           # 최대 트리 삭제 수: 기본값(50), used only in dart(boosting_type)
          "min_child_samples": 10,  # 한 잎사귀에 들어갈 데이터의 최소 개수: 기본값(20), 제약(min_data_in_leaf >= 0)
          "min_child_weight": 150,  # 분할을 유지하기 위해 잎에서 헤센의 최소 합: 기본값(1e-3), 제약(min_sum_hessian_in_leaf >= 0.0)
          "min_split_gain": 0,      # 분리되기 위한 최소한의 정보이득: 기본값(0.0), 제약(min_gain_to_split >= 0.0)
          "subsample": 0.9,         # row 샘플링 비율: 기본값(1.0), 제약(0.0 < bagging_fraction <= 1.0)
          "seed": 2018
}

In [12]:

# Stratified 5-Fold 내부 교차 검증을 준비한다
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)
kf = kfold.split(train, train_label)

In [13]:
cv_train = np.zeros(len(train_label))
cv_pred = np.zeros(len(test_id))    
best_trees = []
fold_scores = []

↓ for i, (train_fold, validate) in enumerate(kf): 행의 내용을 보고싶어서 출력함

In [14]:
for i, (train_fold, validate) in enumerate(kf):
    print(i, (train_fold, validate))

0 (array([     0,      2,      3, ..., 595208, 595209, 595210]), array([     1,      5,      7, ..., 595200, 595202, 595211]))
1 (array([     1,      2,      4, ..., 595209, 595210, 595211]), array([     0,      3,      6, ..., 595185, 595194, 595205]))
2 (array([     0,      1,      3, ..., 595209, 595210, 595211]), array([     2,     13,     19, ..., 595189, 595201, 595208]))
3 (array([     0,      1,      2, ..., 595209, 595210, 595211]), array([     8,     26,     29, ..., 595191, 595197, 595204]))
4 (array([     0,      1,      2, ..., 595205, 595208, 595211]), array([     4,     10,     12, ..., 595207, 595209, 595210]))


In [15]:
X_train, X_validate, label_train, label_validate = train.iloc[train_fold, :], train.iloc[validate, :], train_label[train_fold], train_label[validate]

In [16]:
pd.DataFrame([X_train['ps_ind_06_bin'], label_train]).T.groupby('ps_ind_06_bin')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000029717837438>

In [17]:
print(pd.DataFrame([X_train['ps_ind_06_bin'], label_train]).T.groupby('ps_ind_06_bin'))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000297166802E8>


In [18]:
pd.DataFrame([X_train['ps_ind_06_bin'], label_train]).T.groupby('ps_ind_06_bin').agg('mean')

Unnamed: 0_level_0,target
ps_ind_06_bin,Unnamed: 1_level_1
0,0.041418
1,0.028796


In [19]:
pd.DataFrame([X_train['ps_ind_06_bin'], label_train]).T.groupby('ps_ind_06_bin').agg('mean').to_dict()

{'target': {0: 0.041417801859730326, 1: 0.02879600695370242}}

In [20]:
pd.DataFrame([X_train['ps_ind_06_bin'], label_train]).T.groupby('ps_ind_06_bin').agg('mean').to_dict()['target']

{0: 0.041417801859730326, 1: 0.02879600695370242}

In [21]:
map_dic = pd.DataFrame([X_train['ps_ind_06_bin'], label_train]).T.groupby('ps_ind_06_bin').agg('mean')
map_dic = map_dic.to_dict()['target']

In [22]:
map_dic.get(0, ), map_dic.get(1, )

(0.041417801859730326, 0.02879600695370242)

In [14]:
for i, (train_fold, validate) in enumerate(kf):
    # 훈련/검증 데이터를 분리한다
    X_train, X_validate, label_train, label_validate = train.iloc[train_fold, :], train.iloc[validate, :], train_label[train_fold], train_label[validate]
    
    # target encoding 피쳐 엔지니어링을 수행한다
    for feature in features:
        # 훈련 데이터에서 feature 고유값별 타겟 변수의 평균을 구한다
        map_dic = pd.DataFrame([X_train[feature], label_train]).T.groupby(feature).agg('mean')
        map_dic = map_dic.to_dict()['target']
        # 훈련/검증/테스트 데이터에 평균값을 매핑한다
        X_train[feature + '_target_enc'] = X_train[feature].apply(lambda x: map_dic.get(x, 0))
        X_validate[feature + '_target_enc'] = X_validate[feature].apply(lambda x: map_dic.get(x, 0))
        test[feature + '_target_enc'] = test[feature].apply(lambda x: map_dic.get(x, 0))

    dtrain = lgbm.Dataset(X_train, label_train)
    dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
    # 훈련 데이터를 학습하고, evalerror() 함수를 통해 검증 데이터에 대한 정규화 Gini 계수 점수를 기준으로 최적의 트리 개수를 찾는다.
    # feval=evalerror 가 뭔지 몰라서 삭제함
    bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, verbose_eval=100, early_stopping_rounds=100)
    best_trees.append(bst.best_iteration)
    # 테스트 데이터에 대한 예측값을 cv_pred에 더한다.
    cv_pred += bst.predict(test, num_iteration=bst.best_iteration)
    cv_train[validate] += bst.predict(X_validate)

    # 검증 데이터에 대한 평가 점수를 출력한다.
    score = Gini(label_validate, cv_train[validate])
    print(score)
    fold_scores.append(score)

cv_pred /= NFOLDS

# 시드값별로 교차 검증 점수를 출력한다.
print("cv score:")
print(Gini(train_label, cv_train))
print(fold_scores)
print(best_trees, np.mean(best_trees))

# 테스트 데이터에 대한 결과물을 저장한다.
pd.DataFrame({'id': test_id, 'target': cv_pred}).to_csv('../model/lgbm_baseline.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.15162
[200]	valid_0's binary_logloss: 0.151561
Early stopping, best iteration is:
[142]	valid_0's binary_logloss: 0.151545
0.14055740338526007
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.152333
[200]	valid_0's binary_logloss: 0.15242
Early stopping, best iteration is:
[105]	valid_0's binary_logloss: 0.152323
0.12915459497334839
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.152118
[200]	valid_0's binary_logloss: 0.152077
Early stopping, best iteration is:
[144]	valid_0's binary_logloss: 0.152063
0.13266918148744083
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.151973
[200]	valid_0's binary_logloss: 0.152025
Early stopping, best iteration is:
[150]	valid_0's binary_logloss: 0.151954
0.13498718231918155
Training until validation scores don't imp