In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, TargetEncoder
import lightgbm as lgb
import category_encoders as ce
import re

# pandasの行を省略しない
pd.set_option('display.max_columns', None)

In [96]:
train_df = pd.read_csv('C:/python/signate/data/train.csv', index_col=0)
test_df = pd.read_csv('C:/python/signate/data/test.csv', index_col=0)
sample_df = pd.read_csv('C:/python/signate/data/sample_submission.csv', header=None, index_col=0)


In [97]:
# train_dfのcolumnを取得
columns = test_df.columns

In [98]:
def cleansing(df):
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['year'] = df['created_at'].dt.year
    df['month'] = df['created_at'].dt.month
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df.drop('created_at', axis=1, inplace=True)
    df['steward'].fillna('0', inplace=True)
    df['guards'].fillna('0', inplace=True)
    df['problems'].fillna('NoProblem', inplace=True)

    df['spc_genus'] = df['spc_latin'].str.split(' ').str[0]
    df.drop('spc_latin', axis=1, inplace=True)
    return df

train_clean = cleansing(train_df)
test_clean = cleansing(test_df)

In [99]:
# 一旦lightgbmで学習してみる
# target encodingとラベル削除を行う

# 使わないcolumnを指定
drop_col = []
# カテゴリカル変数を指定
te_columns = ['guards', 'sidewalk']

# カテゴリカルカラムから使わないcolumnを削除
te_columns = list(set(te_columns) - set(drop_col))

for i in range(3):
    train_clean['onehot' + str(i)] = 0
    train_clean['onehot' + str(i)] = train_clean['onehot' + str(i)].mask(train_clean['health'] == i, 1)

te_columns_list = []

for te_column in te_columns:
    tmp_list = []
    for i in range(3):
        train_clean[te_column + '-te' + str(i)] = train_clean[te_column]
        test_clean[te_column + '-te' + str(i)] = test_clean[te_column]
        tmp_list.append(te_column + '-te' + str(i))
    te_columns_list.append(tmp_list)
    train_clean.drop(te_column, axis=1, inplace=True)
    test_clean.drop(te_column, axis=1, inplace=True)

print(te_columns_list)
te_columns_list = np.array(te_columns_list)

test_clean.head()

[['sidewalk-te0', 'sidewalk-te1', 'sidewalk-te2'], ['guards-te0', 'guards-te1', 'guards-te2']]


Unnamed: 0,tree_dbh,curb_loc,steward,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,month_sin,month_cos,spc_genus,sidewalk-te0,sidewalk-te1,sidewalk-te2,guards-te0,guards-te1,guards-te2
19984,15,OnCurb,0,NYC Parks Staff,NoProblem,Callery pear,BK17,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,3,3062000,Brooklyn,Brooklyn,315,22,45,48,2015,9,-1.0,-1.83697e-16,Pyrus,Damage,Damage,Damage,0,0,0
19985,5,OnCurb,1or2,Volunteer,NoProblem,cherry,BX62,Woodlawn-Wakefield,2,2045101,Bronx,Bronx,212,34,81,11,2015,10,-0.866025,0.5,Prunus,NoDamage,NoDamage,NoDamage,0,0,0
19986,4,OnCurb,0,Volunteer,NoProblem,littleleaf linden,QN60,Kew Gardens,4,4013800,Queens,Kew Gardens,409,14,27,29,2016,8,-0.866025,-0.5,Tilia,NoDamage,NoDamage,NoDamage,Unsure,Unsure,Unsure
19987,7,OnCurb,0,TreesCount Staff,NoProblem,dawn redwood,BK09,Brooklyn Heights-Cobble Hill,3,3000301,Brooklyn,Brooklyn,302,26,52,33,2015,8,-0.866025,-0.5,Metasequoia,NoDamage,NoDamage,NoDamage,0,0,0
19988,6,OnCurb,1or2,TreesCount Staff,Stones,purple-leaf plum,BK75,Bedford,3,3025100,Brooklyn,Brooklyn,303,25,56,36,2015,10,-0.866025,0.5,Prunus,NoDamage,NoDamage,NoDamage,Helpful,Helpful,Helpful


In [102]:
for i in range(3):
    te_target = te_columns_list[:, i]
    target_enc = TargetEncoder(target_type='auto', random_state=42)
    
    train_clean[te_target] = target_enc.fit_transform(train_clean[te_target], train_clean['onehot' + str(i)])
    # テストデータにtransform
    test_clean[te_target]  = target_enc.transform(test_clean[te_target])
test_clean.head()


Unnamed: 0,tree_dbh,curb_loc,steward,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,month_sin,month_cos,spc_genus,sidewalk-te0,sidewalk-te1,sidewalk-te2,guards-te0,guards-te1,guards-te2
19984,15,OnCurb,0,NYC Parks Staff,NoProblem,Callery pear,BK17,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,3,3062000,Brooklyn,Brooklyn,315,22,45,48,2015,9,-1.0,-1.83697e-16,Pyrus,0.176892,0.788181,0.034928,0.176892,0.788181,0.034928
19985,5,OnCurb,1or2,Volunteer,NoProblem,cherry,BX62,Woodlawn-Wakefield,2,2045101,Bronx,Bronx,212,34,81,11,2015,10,-0.866025,0.5,Prunus,0.176892,0.788181,0.034928,0.176892,0.788181,0.034928
19986,4,OnCurb,0,Volunteer,NoProblem,littleleaf linden,QN60,Kew Gardens,4,4013800,Queens,Kew Gardens,409,14,27,29,2016,8,-0.866025,-0.5,Tilia,0.176892,0.788181,0.034928,0.176892,0.788181,0.034928
19987,7,OnCurb,0,TreesCount Staff,NoProblem,dawn redwood,BK09,Brooklyn Heights-Cobble Hill,3,3000301,Brooklyn,Brooklyn,302,26,52,33,2015,8,-0.866025,-0.5,Metasequoia,0.176892,0.788181,0.034928,0.176892,0.788181,0.034928
19988,6,OnCurb,1or2,TreesCount Staff,Stones,purple-leaf plum,BK75,Bedford,3,3025100,Brooklyn,Brooklyn,303,25,56,36,2015,10,-0.866025,0.5,Prunus,0.176892,0.788181,0.034928,0.176892,0.788181,0.034928


In [101]:
train_clean.head()

Unnamed: 0,tree_dbh,curb_loc,health,steward,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,month_sin,month_cos,spc_genus,onehot0,onehot1,onehot2,sidewalk-te0,sidewalk-te1,sidewalk-te2,guards-te0,guards-te1,guards-te2
0,14,OnCurb,1,0,Volunteer,NoProblem,English oak,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23,2015,6,1.224647e-16,-1.0,Quercus,0,1,0,0.182747,0.784939,0.035834,0.164804,0.800267,0.034977
1,5,OnCurb,1,3or4,Volunteer,NoProblem,crimson king maple,BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15,2016,9,-1.0,-1.83697e-16,Acer,0,1,0,0.174502,0.790928,0.036055,0.205831,0.752257,0.038867
2,26,OnCurb,2,0,Volunteer,StonesBranchLights,English oak,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51,2015,9,-1.0,-1.83697e-16,Quercus,0,0,1,0.172193,0.790928,0.035387,0.163806,0.799114,0.034915
3,15,OnCurb,0,0,NYC Parks Staff,NoProblem,honeylocust,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51,2016,5,0.5,-0.8660254,Gleditsia,1,0,0,0.18232,0.779998,0.034896,0.163806,0.801522,0.034748
4,23,OnCurb,1,0,Volunteer,Stones,London planetree,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9,2016,6,1.224647e-16,-1.0,Platanus,0,1,0,0.17179,0.790994,0.034507,0.164804,0.800267,0.034708


In [80]:
print(len(encoded))
print(len(train_clean))
encoded_test.head()

19984
19984


Unnamed: 0,sidewalk-te2,guards-te2
0,0.034857,0.034866
1,0.034989,0.034866
2,0.034989,0.035278
3,0.034989,0.034866
4,0.034989,0.038081


In [60]:
train_clean.head()

Unnamed: 0,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,nta_name,borocode,boro_ct,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,month_sin,month_cos,spc_genus,onehot0,onehot1,onehot2,boroname-te1,boroname-te2,nta-te1,nta-te2,boroname-te0,nta-te0
0,14,OnCurb,1,0,0,Damage,Volunteer,NoProblem,English oak,Douglas Manor-Douglaston-Little Neck,4,4152901,Little Neck,411,11,25,23,2015,6,1.224647e-16,-1.0,Quercus,0,1,0,Queens,Queens,QN45,QN45,0.178103,0.162867
1,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,NoProblem,crimson king maple,Bedford Park-Fordham North,2,2039901,Bronx,207,33,78,15,2016,9,-1.0,-1.83697e-16,Acer,0,1,0,Bronx,Bronx,BX05,BX05,0.175987,0.134456
2,26,OnCurb,2,0,0,NoDamage,Volunteer,StonesBranchLights,English oak,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,503,24,62,51,2015,9,-1.0,-1.83697e-16,Quercus,0,0,1,Staten Island,Staten Island,SI01,SI01,0.174459,0.184069
3,15,OnCurb,0,0,0,Damage,NYC Parks Staff,NoProblem,honeylocust,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,503,24,62,51,2016,5,0.5,-0.8660254,Gleditsia,1,0,0,Staten Island,Staten Island,SI11,SI11,0.174459,0.150957
4,23,OnCurb,1,0,0,NoDamage,Volunteer,Stones,London planetree,Central Harlem North-Polo Grounds,1,1022102,New York,110,30,70,9,2016,6,1.224647e-16,-1.0,Platanus,0,1,0,Manhattan,Manhattan,MN03,MN03,0.17356,0.233289


In [40]:
train_clean.head()

Unnamed: 0,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,month_sin,month_cos,spc_genus,onehot0,onehot1,onehot2,nta-te0,nta-te1,nta-te2
0,14,OnCurb,1,0,0,Damage,Volunteer,NoProblem,English oak,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23,2015,6,1.224647e-16,-1.0,Quercus,0,1,0,0.162867,0.786526,0.058594
1,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,NoProblem,crimson king maple,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15,2016,9,-1.0,-1.83697e-16,Acer,0,1,0,0.134456,0.785799,0.03832
2,26,OnCurb,2,0,0,NoDamage,Volunteer,StonesBranchLights,English oak,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51,2015,9,-1.0,-1.83697e-16,Quercus,0,0,1,0.184069,0.748023,0.03376
3,15,OnCurb,0,0,0,Damage,NYC Parks Staff,NoProblem,honeylocust,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51,2016,5,0.5,-0.8660254,Gleditsia,1,0,0,0.150957,0.804741,0.03728
4,23,OnCurb,1,0,0,NoDamage,Volunteer,Stones,London planetree,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9,2016,6,1.224647e-16,-1.0,Platanus,0,1,0,0.233289,0.72004,0.03449


In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
valid_scores = []
models = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(train_clean_drop, y)):
    print(f'fold{fold} start')
    X_train, y_train = train_clean_drop.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train_clean_drop.iloc[valid_idx], y.iloc[valid_idx]

    lgb_train = lgb.Dataset(X_train, y_train, weight=compute_sample_weight(class_weight='balanced', y=y_train).astype('float32'))
    lgb_eval = lgb.Dataset(X_valid, y_valid, weight=np.ones(len(X_valid)).astype('float32'))

    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 3,
        'seed': 0
    }
    verbose_eval = -1 # この数字を1にすると学習時のスコア推移がコマンドライン表示される

    model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=1000,
                       callbacks=[lgb.early_stopping(stopping_rounds=10, 
                                verbose=True), # early_stopping用コールバック関数
                           lgb.log_evaluation(verbose_eval)] # コマンドライン出力用コールバック関数))
                    )

    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    y_pred_max = np.argmax(y_pred, axis=1)
    score = f1_score(y_valid, y_pred_max, average='macro')
    print(f'fold{fold} f1_score: {score:.4f}')
    valid_scores.append(score)
    models.append(model)
print(f'CV: {np.mean(valid_scores):.4f}')


fold0 start
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2468
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 31
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[407]	training's multi_logloss: 0.224635	valid_1's multi_logloss: 0.753399
fold0 f1_score: 0.3318
fold1 start
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2471
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 31
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [In

In [None]:
y_pred = model.predict(test_clean_drop)
y_pred_max = np.argmax(y_pred, axis=1) # 最尤と判断したクラスの値にする
sample_df[1] = y_pred_max
# sample_df.to_csv('C:/python/signate/data/base2-sincos-genus-NoCategory-OneHotProblemFix.csv', header=None)

In [None]:
import pandas as pd

importance = model.feature_importance()
feature_names = model.feature_name()

df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
df_importance


Unnamed: 0,Feature,Importance
0,tree_dbh,5179
1,curb_loc,252
2,steward,738
3,guards,671
4,sidewalk,790
5,user_type,1219
6,problems,1808
7,spc_common,3756
8,nta,3927
9,borocode,415
