In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, TargetEncoder
import lightgbm as lgb
import category_encoders as ce
import re

# pandasの行を省略しない
pd.set_option('display.max_columns', None)

In [12]:
train_df = pd.read_csv('C:/python/signate/data/train.csv', index_col=0)
test_df = pd.read_csv('C:/python/signate/data/test.csv', index_col=0)
sample_df = pd.read_csv('C:/python/signate/data/sample_submission.csv', header=None, index_col=0)


データはほとんどカテゴリカル
#### 思いついた精度向上案
- commonを個別名称として、latinの上を属名として利用する
- 地区が多すぎるので整理する
- 季節性を導入
- 郵便番号は連続性があるからカテゴリカルにしないほうがよいのでは
- 高い確率で状態が悪いと推測された木の近くの木はフラグ立てる。つまり2段階モデル

In [13]:
# train_dfのcolumnを取得
columns = test_df.columns

In [14]:
def cleansing(df):
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['year'] = df['created_at'].dt.year
    df['month'] = df['created_at'].dt.month
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df.drop('created_at', axis=1, inplace=True)
    df['steward'].fillna('0', inplace=True)
    df['guards'].fillna('0', inplace=True)
    df['problems'].fillna('NoProblem', inplace=True)

    df['spc_genus'] = df['spc_latin'].str.split(' ').str[0]
    df.drop('spc_latin', axis=1, inplace=True)
    return df

train_clean = cleansing(train_df)
test_clean = cleansing(test_df)

In [15]:
# trainとtestに含まれるproblemは全く同じ

# problemのonehotカラムを用意
problem_list = ['BranchLights', 'BranchOther', 'MetalGrates', 'RootOther', 'Stones', 'Sneakers', 'TrunkLights',
                 'TrunkOther', 'WiresRope', 'NoProblem']	

for problem in problem_list:
    train_clean[problem] = 0
    test_clean[problem] = 0

train_clean['problem_count'] = 0
test_clean['problem_count'] = 0

# problemlistにあったらonehotする
for i in train_clean.index:
    p_count = 0
    for problem in problem_list:
        if(problem in train_clean.loc[i, 'problems']):
            train_clean.loc[i, problem] = 1
            if(problem != 'NoProblem'):
                p_count+=1
    train_clean.loc[i, 'problem_count'] = p_count

for i in test_clean.index:
    p_count = 0
    for problem in problem_list:
        if(problem in test_clean.loc[i, 'problems']):
            test_clean.loc[i, problem] = 1
            if(problem != 'NoProblem'):
                p_count+=1
    test_clean.loc[i, 'problem_count'] = p_count

'''
def str_split(string):
    big_ite = re.findall(r'[A-Z][a-z]+', string)
    return big_ite

# 使われているproblemを取得しonehot化
for i in range(len(train_clean)):
    tmp = str_split(train_clean.loc[i, 'problems'])
    for word in tmp:
        train_clean.loc[i, word] = 1

for i in test_clean.index:
    tmp = str_split(test_clean.loc[i, 'problems'])
    for word in tmp:
        test_clean.loc[i, word] = 1 
        
test_clean.head()
'''
test_clean.head(10)

Unnamed: 0,tree_dbh,curb_loc,steward,guards,sidewalk,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,month_sin,month_cos,spc_genus,BranchLights,BranchOther,MetalGrates,RootOther,Stones,Sneakers,TrunkLights,TrunkOther,WiresRope,NoProblem,problem_count
19984,15,OnCurb,0,0,Damage,NYC Parks Staff,NoProblem,Callery pear,BK17,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,3,3062000,Brooklyn,Brooklyn,315,22,45,48,2015,9,-1.0,-1.83697e-16,Pyrus,0,0,0,0,0,0,0,0,0,1,0
19985,5,OnCurb,1or2,0,NoDamage,Volunteer,NoProblem,cherry,BX62,Woodlawn-Wakefield,2,2045101,Bronx,Bronx,212,34,81,11,2015,10,-0.8660254,0.5,Prunus,0,0,0,0,0,0,0,0,0,1,0
19986,4,OnCurb,0,Unsure,NoDamage,Volunteer,NoProblem,littleleaf linden,QN60,Kew Gardens,4,4013800,Queens,Kew Gardens,409,14,27,29,2016,8,-0.8660254,-0.5,Tilia,0,0,0,0,0,0,0,0,0,1,0
19987,7,OnCurb,0,0,NoDamage,TreesCount Staff,NoProblem,dawn redwood,BK09,Brooklyn Heights-Cobble Hill,3,3000301,Brooklyn,Brooklyn,302,26,52,33,2015,8,-0.8660254,-0.5,Metasequoia,0,0,0,0,0,0,0,0,0,1,0
19988,6,OnCurb,1or2,Helpful,NoDamage,TreesCount Staff,Stones,purple-leaf plum,BK75,Bedford,3,3025100,Brooklyn,Brooklyn,303,25,56,36,2015,10,-0.8660254,0.5,Prunus,0,0,0,0,1,0,0,0,0,0,1
19989,15,OnCurb,0,0,NoDamage,TreesCount Staff,NoProblem,green ash,QN66,Laurelton,4,4062600,Queens,Springfield Gardens,413,14,29,31,2015,10,-0.8660254,0.5,Fraxinus,0,0,0,0,0,0,0,0,0,1,0
19990,4,OnCurb,0,0,NoDamage,Volunteer,NoProblem,hedge maple,QN66,Laurelton,4,4062600,Queens,Springfield Gardens,413,14,29,31,2015,6,1.224647e-16,-1.0,Acer,0,0,0,0,0,0,0,0,0,1,0
19991,32,OnCurb,0,0,Damage,Volunteer,Stones,pin oak,MN12,Upper West Side,1,1018700,Manhattan,New York,107,31,69,6,2015,10,-0.8660254,0.5,Quercus,0,0,0,0,1,0,0,0,0,0,1
19992,19,OnCurb,0,0,Damage,Volunteer,StonesRootOtherTrunkOther,English oak,QN48,Auburndale,4,4141700,Queens,Flushing,411,11,25,20,2016,7,-0.5,-0.8660254,Quercus,0,0,0,1,1,0,0,1,0,0,3
19993,5,OnCurb,0,0,Damage,Volunteer,RootOther,littleleaf linden,BK90,East Williamsburg,3,3047700,Brooklyn,Brooklyn,301,18,50,34,2016,7,-0.5,-0.8660254,Tilia,0,0,0,1,0,0,0,0,0,0,1


In [16]:
# 一旦lightgbmで学習してみる
# target encodingとラベル削除を行う

# 使わないcolumnを指定
drop_col = ['nta_name', 'boroname']
# カテゴリカル変数を指定
te_columns = ['curb_loc', 'steward', 'guards',
       'sidewalk', 'user_type', 'problems', 'spc_common', 'nta',
       'nta_name', 'borocode', 'boro_ct', 'boroname', 'zip_city', 'cb_num',
       'st_senate', 'st_assem', 'cncldist', 'spc_genus']

# カテゴリカルカラムから使わないcolumnを削除
te_columns = list(set(te_columns) - set(drop_col))

# 目的変数を指定
y = train_df['health']

# 使わないcolumnを削除
train_clean_drop = train_clean.drop(['health'], axis=1)
train_clean_drop = train_clean_drop.drop(drop_col, axis=1)
test_clean_drop = test_clean.drop(drop_col, axis=1)

# target encoding
# インスタンス作成、変換
target_enc = TargetEncoder(target_type='auto', random_state=42)
encoded = target_enc.fit_transform(train_clean_drop[te_columns], y)
# ndarrayをdfに変換
encoded = pd.DataFrame(encoded)
# カラムが消えたので直し、元のデータと結合
for i, j in enumerate(te_columns):
    encoded = encoded.rename(columns={i: j})
train_clean_drop = train_clean_drop.drop(te_columns, axis=1)
train_clean_drop = pd.merge(train_clean_drop, encoded, right_index=True, left_index=True)

# テストデータにtransform
encoded_test = target_enc.transform(test_clean_drop[te_columns])
# ndarrayをdfに変換
encoded_test = pd.DataFrame(encoded_test)
# カラムが消えたので直し、元のデータと結合
for i, j in enumerate(te_columns):
    encoded_test = encoded_test.rename(columns={i: j})
test_clean_drop = test_clean_drop.drop(te_columns, axis=1)
test_clean_drop = pd.merge(test_clean_drop, encoded_test, right_index=True, left_index=True)


'''
# onehotをカテゴリカル化
categorical_columns = problem_list
# カテゴリカル変数を指定
for col in categorical_columns:
    train_clean_drop[col] = train_clean_drop[col].astype('category')
    test_clean_drop[col] = test_clean_drop[col].astype('category')  
'''


ValueError: Target type was inferred to be 'multiclass'. Only ('binary', 'continuous') are supported.

In [None]:
train_clean_drop.head()

Unnamed: 0,tree_dbh,year,month,month_sin,month_cos,BranchLights,BranchOther,MetalGrates,RootOther,Stones,Sneakers,TrunkLights,TrunkOther,WiresRope,NoProblem,problem_count,guards,st_senate,st_assem,cb_num,spc_genus,steward,borocode,boro_ct,curb_loc,zip_city,user_type,problems,sidewalk,cncldist,spc_common,nta
0,14,2015,6,1.224647e-16,-1.0,0,0,0,0,0,0,0,0,0,1,0,0.872048,0.857429,0.86021,0.857785,0.854931,0.864933,0.856218,0.891552,0.858009,0.906789,0.851432,0.869497,0.850177,0.87734,0.807256,0.906789
1,5,2016,9,-1.0,-1.83697e-16,0,0,0,0,0,0,0,0,0,1,0,0.824911,0.86607,0.812225,0.835921,0.893696,0.858295,0.854849,0.738517,0.858446,0.854849,0.850375,0.870739,0.863242,0.840906,0.932871,0.828704
2,26,2015,9,-1.0,-1.83697e-16,1,0,0,0,1,0,0,0,0,0,2,0.870216,0.857826,0.862005,0.8631,0.852832,0.863088,0.855556,0.801044,0.858446,0.855556,0.850375,0.829,0.863242,0.856351,0.817962,0.821143
3,15,2016,5,0.5,-0.8660254,0,0,0,0,0,0,0,0,0,1,0,0.871324,0.86844,0.870901,0.872902,0.861814,0.864641,0.863419,0.977775,0.858522,0.863419,0.866773,0.871105,0.852804,0.873812,0.861814,0.897035
4,23,2016,6,1.224647e-16,-1.0,0,0,0,0,1,0,0,0,0,0,1,0.872048,0.829581,0.83535,0.814878,0.840399,0.864933,0.864699,0.835001,0.858009,0.862635,0.851432,0.82402,0.864763,0.829615,0.840399,0.770902


In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
valid_scores = []
models = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(train_clean_drop, y)):
    print(f'fold{fold} start')
    X_train, y_train = train_clean_drop.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train_clean_drop.iloc[valid_idx], y.iloc[valid_idx]

    lgb_train = lgb.Dataset(X_train, y_train, weight=compute_sample_weight(class_weight='balanced', y=y_train).astype('float32'))
    lgb_eval = lgb.Dataset(X_valid, y_valid, weight=np.ones(len(X_valid)).astype('float32'))

    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 3,
        'seed': 0
    }
    verbose_eval = -1 # この数字を1にすると学習時のスコア推移がコマンドライン表示される

    model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=1000,
                       callbacks=[lgb.early_stopping(stopping_rounds=10, 
                                verbose=True), # early_stopping用コールバック関数
                           lgb.log_evaluation(verbose_eval)] # コマンドライン出力用コールバック関数))
                    )

    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    y_pred_max = np.argmax(y_pred, axis=1)
    score = f1_score(y_valid, y_pred_max, average='macro')
    print(f'fold{fold} f1_score: {score:.4f}')
    valid_scores.append(score)
    models.append(model)
print(f'CV: {np.mean(valid_scores):.4f}')


fold0 start
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2468
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 31
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[407]	training's multi_logloss: 0.224635	valid_1's multi_logloss: 0.753399
fold0 f1_score: 0.3318
fold1 start
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2471
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 31
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [In

In [None]:
y_pred = model.predict(test_clean_drop)
y_pred_max = np.argmax(y_pred, axis=1) # 最尤と判断したクラスの値にする
sample_df[1] = y_pred_max
# sample_df.to_csv('C:/python/signate/data/base2-sincos-genus-NoCategory-OneHotProblemFix.csv', header=None)

In [None]:
import pandas as pd

importance = model.feature_importance()
feature_names = model.feature_name()

df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
df_importance


Unnamed: 0,Feature,Importance
0,tree_dbh,5179
1,curb_loc,252
2,steward,738
3,guards,671
4,sidewalk,790
5,user_type,1219
6,problems,1808
7,spc_common,3756
8,nta,3927
9,borocode,415
