In [103]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, TargetEncoder
from xgboost import XGBClassifier
import category_encoders as ce

# pandasの行を省略しない
pd.set_option('display.max_columns', None)

In [104]:
train_df = pd.read_csv('C:/python/signate/data/train.csv', index_col=0)
test_df = pd.read_csv('C:/python/signate/data/test.csv', index_col=0)
sample_df = pd.read_csv('C:/python/signate/data/sample_submission.csv', header=None, index_col=0)


In [105]:
def cleansing(df):
    df['created_at'] = pd.to_datetime(df['created_at'])
    # df['year'] = df['created_at'].dt.year
    # df['month'] = df['created_at'].dt.month
    # df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    # df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df.drop('created_at', axis=1, inplace=True)
    df['steward'].fillna('0', inplace=True)
    df['guards'].fillna('0', inplace=True)
    df['problems'].fillna('0', inplace=True)
    df['spc_genus'] = df['spc_latin'].str.split(' ').str[0]
    df.drop('spc_latin', axis=1, inplace=True)

    # df['spc_genus'] = df['spc_latin'].str.split(' ').str[0]
    # df.drop('spc_latin', axis=1, inplace=True)
    return df

train_clean = cleansing(train_df)
test_clean = cleansing(test_df)

In [106]:
# 種目ごとの平均直径とその差を特徴量にする
test_clean['health'] = 0
all_df = pd.concat([train_clean, test_clean], axis=0)   
tmp_df = all_df[['tree_dbh', 'spc_common', ]]
dbh_mean_common = (pd.DataFrame(tmp_df.groupby('spc_common').mean()['tree_dbh']))
dbh_mean_common.rename(columns={'tree_dbh': 'dbh_mean_common'}, inplace=True)
all_df = pd.merge(all_df, dbh_mean_common, on='spc_common', how='left')
# all_df['dbh_diff_common'] = all_df['tree_dbh'] - all_df['dbh_mean_common']

tmp_df = all_df[['tree_dbh', 'cb_num', ]]
dbh_mean_cb = (pd.DataFrame(tmp_df.groupby('cb_num').mean()['tree_dbh']))
dbh_mean_cb.rename(columns={'tree_dbh': 'dbh_mean_cb'}, inplace=True)
all_df = pd.merge(all_df, dbh_mean_cb, on='cb_num', how='left')

train_clean = all_df.iloc[:len(train_clean)]
test_clean = all_df.iloc[len(train_clean):]
test_clean.drop('health', axis=1, inplace=True)
test_clean.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean.drop('health', axis=1, inplace=True)


Unnamed: 0,tree_dbh,curb_loc,steward,guards,sidewalk,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,spc_genus,dbh_mean_common,dbh_mean_cb
19984,15,OnCurb,0,0,Damage,NYC Parks Staff,0,Callery pear,BK17,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,3,3062000,Brooklyn,Brooklyn,315,22,45,48,Pyrus,9.993088,11.813235
19985,5,OnCurb,1or2,0,NoDamage,Volunteer,0,cherry,BX62,Woodlawn-Wakefield,2,2045101,Bronx,Bronx,212,34,81,11,Prunus,6.937519,13.062044
19986,4,OnCurb,0,Unsure,NoDamage,Volunteer,0,littleleaf linden,QN60,Kew Gardens,4,4013800,Queens,Kew Gardens,409,14,27,29,Tilia,9.373364,11.174312
19987,7,OnCurb,0,0,NoDamage,TreesCount Staff,0,dawn redwood,BK09,Brooklyn Heights-Cobble Hill,3,3000301,Brooklyn,Brooklyn,302,26,52,33,Metasequoia,9.541667,9.89467
19988,6,OnCurb,1or2,Helpful,NoDamage,TreesCount Staff,Stones,purple-leaf plum,BK75,Bedford,3,3025100,Brooklyn,Brooklyn,303,25,56,36,Prunus,9.447942,12.23681


In [107]:
# エンコーディングパート

# すべてのカテゴリカル変数
all_categorical_cols = ['curb_loc', 'steward', 'guards', 'sidewalk',
       'user_type', 'problems', 'spc_common', 'nta', 'nta_name', 'borocode',
       'boro_ct', 'boroname', 'zip_city', 'cb_num', 'st_senate', 'st_assem',
       'cncldist', 'year', 'month',  'spc_genus', ]

# 

# 落とすカラム
drop_cols = ['nta_name', 'boroname']#'',
# カウントエンコーディング
ce_columns = ['curb_loc', 'steward', 'guards', 'sidewalk',
       'user_type', 'problems', 'spc_common', 'nta', 'borocode',
       'boro_ct',  'zip_city', 'cb_num', 'st_senate', 'st_assem',
       'cncldist',  'spc_genus', ]
# 'nta',
#  
# 

# ターゲットエンコーディング
te_columns = []
# ラベルエンコーディング
le_columns = []
encoding_cals = list(drop_cols + ce_columns + te_columns + drop_cols)
if(encoding_cals != all_categorical_cols):
    print('error! categorical is not same')
    print('足りないカラム:', set(all_categorical_cols)-set(encoding_cals))

error! categorical is not same
足りないカラム: {'year', 'month'}


In [108]:
# ターゲットエンコーディングパート
if(len(te_columns) > 0):
    # onehot正解ラベルの作成
    for i in range(3):
        train_clean['onehot' + str(i)] = 0
        train_clean['onehot' + str(i)] = train_clean['onehot' + str(i)].mask(train_clean['health'] == i, 1)

    # ターゲットエンコーディングのカラムを3つずつに分ける
    te_columns_list = []

    for te_column in te_columns:
        tmp_list = []
        for i in range(3):
            train_clean[te_column + '-te' + str(i)] = train_clean[te_column]
            test_clean[te_column + '-te' + str(i)] = test_clean[te_column]
            tmp_list.append(te_column + '-te' + str(i))
        te_columns_list.append(tmp_list)
        train_clean.drop(te_column, axis=1, inplace=True)
        test_clean.drop(te_column, axis=1, inplace=True)

    te_columns_list = np.array(te_columns_list)

    # ターゲットエンコーディング
    for i in range(3):
        te_target = te_columns_list[:, i]
        target_enc = TargetEncoder(target_type='binary', random_state=42)
        
        train_clean[te_target] = target_enc.fit_transform(train_clean[te_target], train_clean['onehot' + str(i)])
        # テストデータにtransform
        test_clean[te_target]  = target_enc.transform(test_clean[te_target])

    # onehot正解ラベルを削除
    for i in range(3):
        dropcol = 'onehot' + str(i)
        train_clean.drop(dropcol, axis=1, inplace=True)


# 目的変数を指定
y = train_clean['health']

# 使わないcolumnを削除
train_clean_drop = train_clean.drop(['health'], axis=1)
train_clean_drop = train_clean_drop.drop(drop_cols, axis=1)
test_clean_drop = test_clean.drop(drop_cols,axis=1)

# ラベルエンコーディングパート
le = LabelEncoder()
for column in le_columns:
    train_clean_drop[column] = le.fit_transform(train_clean_drop[column])
    test_clean_drop[column] = le.transform(test_clean_drop[column])

# カテゴリカル変数を指定
for col in le_columns:
    train_clean_drop[col] = train_clean_drop[col].astype('category')
    test_clean_drop[col] = test_clean_drop[col].astype('category')  

# カウントエンコーディングパート
# mergeしてfit, そのあとtransform
all_df = pd.concat([train_clean_drop, test_clean_drop], axis=0)

# count encoding
cencoder = ce.CountEncoder(cols=ce_columns)
cencoder.fit(all_df)

# count encodingを適用
train_clean_drop = cencoder.transform(train_clean_drop)
test_clean_drop = cencoder.transform(test_clean_drop)

In [109]:
train_clean_drop.head()

Unnamed: 0,tree_dbh,curb_loc,steward,guards,sidewalk,user_type,problems,spc_common,nta,borocode,boro_ct,zip_city,cb_num,st_senate,st_assem,cncldist,spc_genus,dbh_mean_common,dbh_mean_cb
0,14,37270,29409,29510,18509,22298,24288,2579,413,13234,129,413,1654,3780,1145,1694,7853,12.346646,11.311971
1,5,37270,2068,7366,21177,22298,24288,254,59,4479,24,4479,166,564,162,380,3613,10.15748,16.289157
2,26,37270,29409,29510,21177,22298,1529,2579,616,7292,189,7292,3436,5720,2868,3155,7853,12.346646,9.683062
3,15,37270,29409,29510,18509,6031,24288,2104,678,7292,121,7292,3436,5720,2868,3155,2104,11.484316,9.683062
4,23,37270,29409,29510,21177,22298,4455,4339,144,4522,39,4473,289,948,418,542,4339,19.432819,11.678201


In [110]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
valid_scores = []
models = []
# train_clean_dropと同じindexを持つデータフレームを作成
xgb_pred_proba = train_df[['health', 'tree_dbh', 'nta']].copy()
xgb_pred_proba.rename(columns= {'health':'xgb-0', 'tree_dbh':'xgb-1', 'nta':'xgb-2'}, inplace=True)


for fold, (train_idx, valid_idx) in enumerate(kf.split(train_clean_drop, y)):
    print(f'fold{fold} start')
    X_train, y_train = train_clean_drop.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train_clean_drop.iloc[valid_idx], y.iloc[valid_idx]

    # xgb_train = xgb.DMatrix(X_train, label=y_train, weight=compute_sample_weight(class_weight='balanced', y=y_train).astype('float32'))
    # xgb_eval = xgb.DMatrix(X_valid, label=y_valid, weight=np.ones(len(X_valid)).astype('float32'))

    param = {'objective': 'multi:softmax', 'num_class': 3,
             'n_estimators': 10000, 'random_state': 0,
             'eval_metric': 'merror'}

    verbose_eval = -1 # この数字を1にすると学習時のスコア推移がコマンドライン表示される

    model = XGBClassifier(**param, early_stopping_rounds=10,)
    model.fit(X_train, y_train, 
              sample_weight=compute_sample_weight(class_weight='balanced', y=y_train).astype('float32'),
              eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=100)
    
    # probaを予測
    y_pred = model.predict_proba(X_valid,)
    # 予測結果を格納
    xgb_pred_proba.iloc[valid_idx] = y_pred
    y_pred_max = np.argmax(y_pred, axis=1)
    score = f1_score(y_valid, y_pred_max, average='macro')
    print(f'fold{fold} f1_score: {score:.4f}')
    valid_scores.append(score)
    models.append(model)
print(f'CV: {np.mean(valid_scores):.4f}')
# 予測結果を保存
xgb_pred_proba.to_csv('C:/python/signate/data/xgb_pred_proba.csv')


fold0 start
[0]	validation_0-merror:0.62282	validation_1-merror:0.64373
[100]	validation_0-merror:0.21705	validation_1-merror:0.42082
[125]	validation_0-merror:0.18015	validation_1-merror:0.40655
fold0 f1_score: 0.3513
fold1 start
[0]	validation_0-merror:0.67455	validation_1-merror:0.69677
[100]	validation_0-merror:0.19697	validation_1-merror:0.41556
[124]	validation_0-merror:0.17214	validation_1-merror:0.40756
fold1 f1_score: 0.3504
fold2 start
[0]	validation_0-merror:0.58954	validation_1-merror:0.63698
[100]	validation_0-merror:0.19378	validation_1-merror:0.42432
[200]	validation_0-merror:0.10721	validation_1-merror:0.38104
[231]	validation_0-merror:0.09358	validation_1-merror:0.37128
fold2 f1_score: 0.3458
fold3 start
[0]	validation_0-merror:0.54682	validation_1-merror:0.57643
[100]	validation_0-merror:0.20154	validation_1-merror:0.41281
[200]	validation_0-merror:0.11059	validation_1-merror:0.36352
[224]	validation_0-merror:0.09558	validation_1-merror:0.36027
fold3 f1_score: 0.3551


In [111]:
xgb_pred_proba_test = test_df[['borocode', 'tree_dbh', 'nta']].copy()
xgb_pred_proba_test.rename(columns= {'borocode':'xgb-0', 'tree_dbh':'xgb-1', 'nta':'xgb-2'}, inplace=True)

y_pred = model.predict_proba(test_clean_drop)
xgb_pred_proba_test.iloc[:] = y_pred[:]
xgb_pred_proba_test.to_csv('C:/python/signate/data/xgb_pred_proba_test.csv')
y_pred_max = np.argmax(y_pred, axis=1)
sample_df[1] = y_pred_max
# sample_df.to_csv('C:/python/signate/data/xgb_diff_te.csv', header=None)

In [112]:
xgb_pred_proba_test

Unnamed: 0,xgb-0,xgb-1,xgb-2
19984,0.545216,0.404843,0.049941
19985,0.362207,0.587780,0.050013
19986,0.255562,0.733134,0.011305
19987,0.337109,0.617176,0.045715
19988,0.480037,0.468837,0.051126
...,...,...,...
39964,0.045060,0.944733,0.010206
39965,0.358874,0.481291,0.159836
39966,0.107937,0.139446,0.752617
39967,0.339700,0.639044,0.021256


In [113]:
importances = model.feature_importances_
display(pd.DataFrame(importances, index=train_clean_drop.columns).sort_values(by=0, ascending=False))

Unnamed: 0,0
steward,0.066767
zip_city,0.06078
cb_num,0.060659
boro_ct,0.058311
dbh_mean_cb,0.058077
cncldist,0.055186
st_assem,0.054775
st_senate,0.05435
nta,0.053964
dbh_mean_common,0.053182


In [114]:
# numpyで保存
# np.save('C:/python/signate/data/xgb_simple.npy', y_pred)