In [315]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, TargetEncoder
import lightgbm as lgb
# import optuna.integration.lightgbm as lgb

import category_encoders as ce
import math

# pandasの行を省略しない
# pd.set_option('display.max_columns', None)

In [316]:
train_df = pd.read_csv('C:/python/signate/data/train.csv', index_col=0)
test_df = pd.read_csv('C:/python/signate/data/test.csv', index_col=0)
sample_df = pd.read_csv('C:/python/signate/data/sample_submission.csv', header=None, index_col=0)

データはほとんどカテゴリカル
#### 思いついた精度向上案
- commonを個別名称として、latinの上を属名として利用する
- 地区が多すぎるので整理する
- 季節性を導入
- 郵便番号は連続性があるからカテゴリカルにしないほうがよいのでは
- 高い確率で状態が悪いと推測された木の近くの木はフラグ立てる。つまり2段階モデル
- 同じ人が記録した場合、1日のうちの0,1,2の割合は無意識にバイアスがかかって同じくらいにしてしまうのでは？
- 曜日の導入

In [317]:
def cleansing(df):
    df['created_at'] = pd.to_datetime(df['created_at'])
    # df['cos_day'] = df['created_at'].dt.dayofyear
    # df['cos_day'] = df['cos_day'].apply(lambda x: np.cos(math.radians(90 - (x/365)*365)))
    # df['sin_day'] = df['created_at'].dt.dayofyear
    # df['sin_day'] = df['sin_day'].apply(lambda x: np.sin(math.radians(90 - (x/365)*365)))  
    # df['year'] = df['created_at'].dt.year
    df['month'] = df['created_at'].dt.month
    # df['weekday'] = df['created_at'].dt.weekday
    # df['day'] = df.created_at.dt.day
    # df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    # df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    # df.drop('month', axis=1, inplace=True)
    df.drop('created_at', axis=1, inplace=True)
    df['steward'].fillna('0', inplace=True)
    df['guards'].fillna('0', inplace=True)
    df['problems'].fillna('NoProblem', inplace=True)
    df['spc_genus'] = df['spc_latin'].str.split(' ').str[0]
    df.drop('spc_latin', axis=1, inplace=True)

    # df['curb_loc'] = df['curb_loc'].map({'OnCurb':3, 'OffsetFromCurb':1})
    # df['sidewalk'] = df['sidewalk'].map({'NoDamage':1, 'Damage':3})
    # df['guards'] = df['guards'].map({'Helpful':1, 'Harmful':3, '0':2, 'unsure':2})
    # df['status_point'] = df['curb_loc'] * df['sidewalk'] * df['guards']
    return df

train_clean = cleansing(train_df)
test_clean = cleansing(test_df)

In [318]:

# 種目ごとの平均直径とその差を特徴量にする
test_clean['health'] = 0
all_df = pd.concat([train_clean, test_clean], axis=0)   
tmp_df = all_df[['tree_dbh', 'spc_common', ]]
dbh_mean_common = (pd.DataFrame(tmp_df.groupby('spc_common').mean()['tree_dbh']))
dbh_mean_common.rename(columns={'tree_dbh': 'dbh_mean_common'}, inplace=True)
all_df = pd.merge(all_df, dbh_mean_common, on='spc_common', how='left')
# all_df['dbh_diff_common'] = all_df['tree_dbh'] - all_df['dbh_mean_common']

tmp_df = all_df[['tree_dbh', 'cb_num']]
dbh_mean_cb = (pd.DataFrame(tmp_df.groupby('cb_num').mean()['tree_dbh']))
dbh_mean_cb.rename(columns={'tree_dbh': 'dbh_mean_cb'}, inplace=True)
all_df = pd.merge(all_df, dbh_mean_cb, on='cb_num', how='left')

# all_df['diff'] = all_df['dbh_mean_common'] - all_df['tree_dbh']

train_clean = all_df.iloc[:len(train_clean)]
test_clean = all_df.iloc[len(train_clean):]
test_clean.drop('health', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean.drop('health', axis=1, inplace=True)


In [319]:
'''
# trainとtestに含まれるproblemは全く同じ
# problemのonehotカラムを用意
problem_list = ['BranchLights', 'BranchOther', 'MetalGrates', 'RootOther', 'Stones', 'Sneakers', 'TrunkLights',
                 'TrunkOther', 'WiresRope', 'NoProblem']	

for problem in problem_list:
    train_clean.loc[:, problem] = 0
    test_clean.loc[:, problem] = 0

train_clean['problem_count'] = 0
test_clean['problem_count'] = 0

# problemlistにあったらonehotする
for i in train_clean.index:
    p_count = 0
    for problem in problem_list:
        if(problem in train_clean.loc[i, 'problems']):
            train_clean.loc[i, problem] = 1
            if(problem != 'NoProblem'):
                p_count+=1
    train_clean.loc[i, 'problem_count'] = p_count

for i in test_clean.index:
    p_count = 0
    for problem in problem_list:
        if(problem in test_clean.loc[i, 'problems']):
            test_clean.loc[i, problem] = 1
            if(problem != 'NoProblem'):
                p_count+=1
    test_clean.loc[i, 'problem_count'] = p_count

test_clean.head()
'''

"\n# trainとtestに含まれるproblemは全く同じ\n# problemのonehotカラムを用意\nproblem_list = ['BranchLights', 'BranchOther', 'MetalGrates', 'RootOther', 'Stones', 'Sneakers', 'TrunkLights',\n                 'TrunkOther', 'WiresRope', 'NoProblem']\t\n\nfor problem in problem_list:\n    train_clean.loc[:, problem] = 0\n    test_clean.loc[:, problem] = 0\n\ntrain_clean['problem_count'] = 0\ntest_clean['problem_count'] = 0\n\n# problemlistにあったらonehotする\nfor i in train_clean.index:\n    p_count = 0\n    for problem in problem_list:\n        if(problem in train_clean.loc[i, 'problems']):\n            train_clean.loc[i, problem] = 1\n            if(problem != 'NoProblem'):\n                p_count+=1\n    train_clean.loc[i, 'problem_count'] = p_count\n\nfor i in test_clean.index:\n    p_count = 0\n    for problem in problem_list:\n        if(problem in test_clean.loc[i, 'problems']):\n            test_clean.loc[i, problem] = 1\n            if(problem != 'NoProblem'):\n                p_count+=1\n    test_clean.

## やりたい実験
- あらゆるエンコーディング方法の比較 ok
- 地区ごとの予測モデルの実装 
- 365日で一周するsin, cos 意味無し


In [320]:
# エンコーディングパート

# すべてのカテゴリカル変数
all_categorical_cols = ['curb_loc', 'steward', 'guards', 'sidewalk',
       'user_type', 'problems', 'spc_common', 'nta', 'nta_name', 'borocode',
       'boro_ct', 'boroname', 'zip_city', 'cb_num', 'st_senate', 'st_assem',
       'cncldist', 'year', 'month',  'spc_genus']

# 

# 落とすカラム
drop_cols = ['nta_name', 'boroname']#'',
# カウントエンコーディング
ce_columns = ['curb_loc', 'steward', 'guards', 'sidewalk',
       'user_type', 'problems', 'spc_common', 'nta', 
              'borocode', 'boro_ct',  'zip_city', 'cb_num', 'st_senate',
       'st_assem', 'cncldist', 'spc_genus', ]
# 'nta',
#  
# 

# ターゲットエンコーディング
te_columns = []
# ラベルエンコーディング
le_columns = []
encoding_cals = list(drop_cols + ce_columns + te_columns + drop_cols)
if(encoding_cals != all_categorical_cols):
    print('error! categorical is not same')
    print('足りないカラム:', set(all_categorical_cols)-set(encoding_cals))

error! categorical is not same
足りないカラム: {'month', 'year'}


In [321]:
# ターゲットエンコーディングパート
if(len(te_columns) > 0):
    # onehot正解ラベルの作成
    for i in range(3):
        train_clean['onehot' + str(i)] = 0
        train_clean['onehot' + str(i)] = train_clean['onehot' + str(i)].mask(train_clean['health'] == i, 1)

    # ターゲットエンコーディングのカラムを3つずつに分ける
    te_columns_list = []

    for te_column in te_columns:
        tmp_list = []
        for i in range(3):
            train_clean[te_column + '-te' + str(i)] = train_clean[te_column]
            test_clean[te_column + '-te' + str(i)] = test_clean[te_column]
            tmp_list.append(te_column + '-te' + str(i))
        te_columns_list.append(tmp_list)
        train_clean.drop(te_column, axis=1, inplace=True)
        test_clean.drop(te_column, axis=1, inplace=True)

    te_columns_list = np.array(te_columns_list)

    # ターゲットエンコーディング
    for i in range(3):
        te_target = te_columns_list[:, i]
        target_enc = TargetEncoder(target_type='binary', random_state=42)
        
        train_clean[te_target] = target_enc.fit_transform(train_clean[te_target], train_clean['onehot' + str(i)])
        # テストデータにtransform
        test_clean[te_target]  = target_enc.transform(test_clean[te_target])

    # onehot正解ラベルを削除
    for i in range(3):
        dropcol = 'onehot' + str(i)
        train_clean.drop(dropcol, axis=1, inplace=True)


# 目的変数を指定
y = train_clean['health']

# 使わないcolumnを削除
train_clean_drop = train_clean.drop(['health'], axis=1)
train_clean_drop = train_clean_drop.drop(drop_cols, axis=1)
test_clean_drop = test_clean.drop(drop_cols,axis=1)

# ラベルエンコーディングパート
le = LabelEncoder()
for column in le_columns:
    train_clean_drop[column] = le.fit_transform(train_clean_drop[column])
    test_clean_drop[column] = le.transform(test_clean_drop[column])

# カテゴリカル変数を指定
for col in le_columns:
    train_clean_drop[col] = train_clean_drop[col].astype('category')
    test_clean_drop[col] = test_clean_drop[col].astype('category')  

# カウントエンコーディングパート
# mergeしてfit, そのあとtransform
all_df = pd.concat([train_clean_drop, test_clean_drop], axis=0)

# count encoding
cencoder = ce.CountEncoder(cols=ce_columns)
cencoder.fit(all_df)

# count encodingを適用
train_clean_drop = cencoder.transform(train_clean_drop)
test_clean_drop = cencoder.transform(test_clean_drop)

In [322]:
train_df = pd.concat([train_clean_drop, y], axis=1)
origin_df = pd.read_csv('C:/python/signate/data/train.csv', index_col=0)
group_num_big = 1
group_num_small = 0
solo_num = 2

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
first_valid_scores = []
second_valid_scores = []
first_models = []
second_models = []
# train_dfと同じindexを持つデータフレームを作成
lgb_pred_proba = origin_df[['health']].copy()
lgb_pred_proba.rename(columns= {'health':'lgb'}, inplace=True)

for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df, y)):
    print(f'fold{fold} start')

    # 0,2と1の予測モデル
    X_train = train_df.iloc[train_idx]
    X_valid = train_df.iloc[valid_idx]

    # 元々のyを保持
    y_all_train = X_train['health']
    y_all_valid = X_valid['health']

    # 0,2のhelathを0とする
    X_train_first = X_train.copy()
    X_valid_first = X_valid.copy()

    # groupは0,soloは1にする
    X_train_first['health'] = X_train_first['health'].mask(X_train_first['health'] == group_num_big, 1)
    X_valid_first['health'] = X_valid_first['health'].mask(X_valid_first['health'] == group_num_big, 1)
    X_train_first['health'] = X_train_first['health'].mask(X_train_first['health'] == group_num_small, 1)
    X_valid_first['health'] = X_valid_first['health'].mask(X_valid_first['health'] == group_num_small, 1)
    X_train_first['health'] = X_train_first['health'].mask(X_train_first['health'] == solo_num, 0)
    X_valid_first['health'] = X_valid_first['health'].mask(X_valid_first['health'] == solo_num, 0)

    # yを分離
    y_train_first = X_train_first['health']
    X_train_first_drop = X_train_first.drop('health', axis=1)
    y_valid_first = X_valid_first['health']
    X_valid_first_drop = X_valid_first.drop('health', axis=1)

    # 1段目の予測
    lgb_train = lgb.Dataset(X_train_first_drop, y_train_first, weight=compute_sample_weight(class_weight='balanced', y=y_train_first).astype('float32'))
    lgb_eval = lgb.Dataset(X_valid_first_drop, y_valid_first, weight=np.ones(len(X_valid_first)).astype('float32'))

    verbose_eval = -1 # この数字を1にすると学習時のスコア推移がコマンドライン表示される
    params = {'objective': 'binary',
    'metric': 'binary_logloss', 'seed': 0,}
    
    first_model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=1000,
                       callbacks=[lgb.early_stopping(stopping_rounds=50, 
                                verbose=True), # early_stopping用コールバック関数
                           lgb.log_evaluation(verbose_eval)], # コマンドライン出力用コールバック関数))
                        # feval=f1,
                        # categorical_feature=le_columns,
                    )

    y_pred = first_model.predict(X_valid_first_drop, num_iteration=first_model.best_iteration)
    # lgb_pred_proba.iloc[valid_idx] = y_pred
    # 予測結果の2値化
    y_pred_max = np.where(y_pred < 0.5, 0, 1)
    score = f1_score(y_valid_first, y_pred_max, average='macro')
    print(f'fold{fold} f1_score: {score:.4f}')
    first_valid_scores.append(score)
    first_models.append(first_model)

    # 0と2の予測
    X_train_second = X_train.copy()
    X_valid_second = X_valid.copy()
    # groupnumを抽出
    X_train_second = X_train_second[X_train_second['health'] != solo_num]
    X_valid_second = X_valid_second[X_valid_second['health'] != solo_num]

    # yをmask
    X_train_second['health'] = X_train_second['health'].mask(X_train_second['health'] == group_num_big, 1)
    X_train_second['health'] = X_train_second['health'].mask(X_train_second['health'] == group_num_small, 0)
    X_valid_second['health'] = X_valid_second['health'].mask(X_valid_second['health'] == group_num_big, 1)
    X_valid_second['health'] = X_valid_second['health'].mask(X_valid_second['health'] == group_num_small, 0)

    # yを分離
    y_train_second = X_train_second['health']
    X_train_second_drop = X_train_second.drop('health', axis=1)
    y_valid_second = X_valid_second['health']
    X_valid_second_drop = X_valid_second.drop('health', axis=1, )

    lgb_train = lgb.Dataset(X_train_second_drop, y_train_second, weight=compute_sample_weight(class_weight='balanced', y=y_train_second).astype('float32'))
    lgb_eval = lgb.Dataset(X_valid_second_drop, y_valid_second, weight=np.ones(len(X_valid_second)).astype('float32'))

    verbose_eval = -1 # この数字を1にすると学習時のスコア推移がコマンドライン表示される
    params = {'objective': 'binary',
    'metric': 'binary_logloss', 
    'seed': 0,}
    
    second_model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=1000,
                       callbacks=[lgb.early_stopping(stopping_rounds=50, 
                                verbose=True), # early_stopping用コールバック関数
                           lgb.log_evaluation(verbose_eval)], # コマンドライン出力用コールバック関数))
                    )

    y_pred = second_model.predict(X_valid_second_drop, num_iteration=second_model.best_iteration)
    # lgb_pred_proba.iloc[valid_idx] = y_pred
    y_pred_max = np.where(y_pred<0.5, 0, 1)
    score = f1_score(y_valid_second, y_pred_max, average='macro')
    print(f'fold{fold} f1_score: {score:.4f}')
    second_valid_scores.append(score)
    second_models.append(second_model)
print(f'first-CV: {np.mean(first_valid_scores):.4f}')
print(f'second-CV: {np.mean(second_valid_scores):.4f}')
# lgb_pred_proba.to_csv('C:/python/signate/data/lgb_pred_proba.csv')


fold0 start
[LightGBM] [Info] Number of positive: 15428, number of negative: 559
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 978
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[475]	training's binary_logloss: 0.0466128	valid_1's binary_logloss: 0.221791
fold0 f1_score: 0.5076
[LightGBM] [Info] Number of positive: 12600, number of negative: 2828
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 974
[LightGBM] [Info] Number of data points in the train set: 15428, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[Light

In [323]:
# モデルで評価
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
second_valid_scores = []
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df, y)):
    X_train = train_df.iloc[train_idx]
    X_valid = train_df.iloc[valid_idx]
    y_valid = X_valid['health']
    X_valid_first = X_valid.copy()
    X_valid_first.drop('health', axis=1, inplace=True)

    # 1段目の予測
    first_model = first_models[fold]
    y_pred_first = first_model.predict(X_valid_first, num_iteration=first_model.best_iteration)
    # 予測結果の2値化
    y_pred_first = np.where(y_pred_first < 0.5, solo_num, 1)
    
    # 0, 2の予測
    # 1段目の予測結果を結合
    X_valid_second = pd.concat([X_valid_first, pd.Series(y_pred_first, index=X_valid_first.index)], axis=1)
    # 使うのは1段目で0と予測したデータ    
    X_valid_second_target = X_valid_second[X_valid_second[0] == 1]
    X_valid_second_drop = X_valid_second_target.drop(0, axis=1)
    
    # 2段目の予測
    second_model = second_models[fold]
    y_pred_second = second_model.predict(X_valid_second_drop, num_iteration=second_model.best_iteration)
    # 予測結果の2値化は2と0
    y_pred_second = np.where(y_pred_second > 0.5, group_num_big, group_num_small)
    
    # 1段目で0と予測したデータに2段目の予測結果を代入
    y_pred = pd.Series(y_pred_first, index=X_valid_first.index)
    y_pred[y_pred == 1] = y_pred_second
    score = f1_score(y_valid, y_pred, average='macro')
    print(f'fold{fold} f1_score: {score:.4f}')
    lgb_pred_proba.loc[valid_idx, 'lgb'] = y_pred
    second_valid_scores.append(score)
    
print(f'CV: {np.mean(second_valid_scores):.4f}')
lgb_pred_proba.to_csv('C:/python/signate/data/lgb_dive-train.csv')
lgb_pred_proba.head()

fold0 f1_score: 0.3482
fold1 f1_score: 0.3559
fold2 f1_score: 0.3578
fold3 f1_score: 0.3431
fold4 f1_score: 0.3693
CV: 0.3549


Unnamed: 0,lgb
0,0
1,1
2,1
3,1
4,1


In [335]:
# テストデータデータで予測
# 1段目の予測
first_model = first_models[4]
y_pred_first = first_model.predict(test_clean_drop, num_iteration=first_model.best_iteration)
# 予測結果の2値化
y_pred_first = np.where(y_pred_first < 0.5, solo_num, 1)

# 0, 2の予測
# 1段目の予測結果を結合
test_second = pd.concat([test_clean_drop, pd.Series(y_pred_first, index=test_clean_drop.index)], axis=1)
# 使うのは1段目で0と予測したデータ
test_second_target = test_second[test_second[0] == 1]
test_second_drop = test_second_target.drop(0, axis=1)
# 2段目の予測
second_model = second_models[4]
y_pred_second = second_model.predict(test_second_drop, num_iteration=second_model.best_iteration)
# 予測結果の2値化は2と0
y_pred_second_th = np.where(y_pred_second > 0.5, group_num_big, group_num_small)

# 1段目で0と予測したデータに2段目の予測結果を代入
y_pred_series = pd.Series(y_pred_first, index=test_clean_drop.index)
y_pred_series_copy = y_pred_series.copy()
y_pred_series_copy[y_pred_series_copy == 1] = y_pred_second_th
y_pred_series_copy.index = sample_df.index
y_pred_series_copy.to_csv('C:/python/signate/data/lgb-dive_2to0.9.csv', header=False)

In [336]:
y_pred_series_copy

0
19984    0
19985    1
19986    0
19987    1
19988    0
        ..
39964    1
39965    1
39966    1
39967    1
39968    2
Length: 19702, dtype: int32

In [326]:
# 全データを利用して学習
'''
lgb_train = lgb.Dataset(train_clean_drop, y, weight=compute_sample_weight(class_weight='balanced', y=y).astype('float32'))
verbose_eval = -1 # この数字を1にすると学習時のスコア推移がコマンドライン表示される
params = {'objective': 'multiclass',
'metric': 'multi_logloss',
'num_class': 3,
'seed': 0
}

model = lgb.train(params,
                    lgb_train,
                    num_boost_round=520,
                )
'''

"\nlgb_train = lgb.Dataset(train_clean_drop, y, weight=compute_sample_weight(class_weight='balanced', y=y).astype('float32'))\nverbose_eval = -1 # この数字を1にすると学習時のスコア推移がコマンドライン表示される\nparams = {'objective': 'multiclass',\n'metric': 'multi_logloss',\n'num_class': 3,\n'seed': 0\n}\n\nmodel = lgb.train(params,\n                    lgb_train,\n                    num_boost_round=520,\n                )\n"

In [327]:
lgb_pred_proba_test = test_df[['borocode', 'tree_dbh', 'nta']].copy()
lgb_pred_proba_test.rename(columns= {'borocode':'lgb-0', 'tree_dbh':'lgb-1', 'nta':'lgb-2'}, inplace=True)

y_pred = model.predict(test_clean_drop)
lgb_pred_proba_test.iloc[:] = y_pred[:]
lgb_pred_proba_test.to_csv('C:/python/signate/data/lgb_pred_proba_test.csv')
y_pred_max = np.argmax(y_pred, axis=1) # 最尤と判断したクラスの値にする
sample_df[1] = y_pred_max
# sample_df.to_csv('C:/python/signate/data/base3(cv0.3586).csv', header=None)

NameError: name 'model' is not defined

In [None]:
import pandas as pd

importance = first_model.feature_importance()
feature_names = first_model.feature_name()

df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
# importtane順に並び替え
df_importance = df_importance.sort_values('Importance', ascending=False)
df_importance


Unnamed: 0,Feature,Importance
0,tree_dbh,1763
10,boro_ct,1692
8,nta,1177
7,spc_common,1094
6,problems,990
19,dbh_mean_common,880
18,spc_genus,853
17,month_cos,752
14,st_assem,723
16,month_sin,683
