In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
# import optuna.integration.lightgbm as lgb

import category_encoders as ce
import re

# Show every column when a DataFrame is displayed (max_columns=None disables column truncation)
pd.set_option('display.max_columns', None)

In [12]:
# Directory holding the competition CSV files. Keeping it in one constant means the
# location only has to be changed in a single place.
DATA_DIR = 'C:/python/signate/data'

# The first CSV column is used as the row index for all three files.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv', index_col=0)
test_df = pd.read_csv(f'{DATA_DIR}/test.csv', index_col=0)
# The sample submission is read with header=None, so its first line is treated as data.
sample_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv', header=None, index_col=0)
train_df.head()

Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist
0,2015-06-29,14,OnCurb,1,,,Damage,Volunteer,,English oak,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23
1,2016-09-21,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,crimson king maple,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15
2,2015-09-13,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,English oak,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51
3,2016-05-09,15,OnCurb,0,,,Damage,NYC Parks Staff,,honeylocust,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51
4,2016-06-24,23,OnCurb,1,,,NoDamage,Volunteer,Stones,London planetree,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9


データはほとんどカテゴリカル
#### 思いついた精度向上案
- spc_commonを個別名称として、spc_latinの先頭の単語を属名として利用する
- 地区が多すぎるので整理する
- 季節性を導入
- 郵便番号は連続性があるからカテゴリカルにしないほうがよいのでは
- 高い確率で状態が悪いと推測された木の近くの木はフラグ立てる。つまり2段階モデル
- 同じ人が記録した場合、1日のうちの0,1,2の割合は無意識にバイアスがかかって同じくらいにしてしまうのでは？

In [13]:
# Keep the column index of test_df (note: this reads test_df, not train_df)
columns = test_df.columns

In [14]:
def cleansing(df):
    """Return a cleaned copy of a raw tree-survey frame.

    The input frame is left untouched, so the function can be called again on
    the same frame (for example when the notebook cell is re-run).

    Steps:
      * parse ``created_at`` and derive ``year``, ``month`` and a cyclic
        encoding of the month (``month_sin`` / ``month_cos``);
      * replace missing ``steward`` / ``guards`` with ``'0'`` and missing
        ``problems`` with ``'NoProblem'``;
      * derive ``spc_genus`` as the first space-separated token of ``spc_latin``;
      * drop ``created_at`` and ``spc_latin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``created_at``, ``steward``, ``guards``,
        ``problems`` and ``spc_latin``.

    Returns
    -------
    pandas.DataFrame
        New frame: the remaining original columns in their original order,
        followed by ``year``, ``month``, ``month_sin``, ``month_cos`` and
        ``spc_genus``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy: mutating the caller's frame made a second call fail with
    # KeyError because 'created_at' had already been dropped.
    cleaned = df.copy()

    created_at = pd.to_datetime(cleaned['created_at'])
    cleaned['year'] = created_at.dt.year
    cleaned['month'] = created_at.dt.month
    # cleaned['day'] = created_at.dt.day
    # Map the month onto the unit circle so that December and January are neighbours.
    month_angle = 2 * np.pi * cleaned['month'] / 12
    cleaned['month_sin'] = np.sin(month_angle)
    cleaned['month_cos'] = np.cos(month_angle)

    # Assign the filled column back instead of calling fillna(inplace=True) on a
    # column selection; that chained form does not update the frame under
    # pandas Copy-on-Write.
    fill_values = {'steward': '0', 'guards': '0', 'problems': 'NoProblem'}
    for column, value in fill_values.items():
        cleaned[column] = cleaned[column].fillna(value)

    # The first token of the Latin name is used as the genus.
    cleaned['spc_genus'] = cleaned['spc_latin'].str.split(' ').str[0]

    return cleaned.drop(columns=['created_at', 'spc_latin'])

# Run the same cleansing on the training and the test frame, then preview the training result.
train_clean = cleansing(train_df)
test_clean = cleansing(test_df)
train_clean.head()

Unnamed: 0,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,month_sin,month_cos,spc_genus
0,14,OnCurb,1,0,0,Damage,Volunteer,NoProblem,English oak,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23,2015,6,1.224647e-16,-1.0,Quercus
1,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,NoProblem,crimson king maple,BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15,2016,9,-1.0,-1.83697e-16,Acer
2,26,OnCurb,2,0,0,NoDamage,Volunteer,StonesBranchLights,English oak,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51,2015,9,-1.0,-1.83697e-16,Quercus
3,15,OnCurb,0,0,0,Damage,NYC Parks Staff,NoProblem,honeylocust,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51,2016,5,0.5,-0.8660254,Gleditsia
4,23,OnCurb,1,0,0,NoDamage,Volunteer,Stones,London planetree,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9,2016,6,1.224647e-16,-1.0,Platanus


problemのonehotは意味ない、むしろスコア悪化させる

In [15]:
# Disabled experiment, kept as a bare string literal so that none of it executes:
# it one-hot encodes every entry of `problem_list` found in the `problems` column
# and adds a `problem_count` column to train_clean / test_clean.
'''
# trainとtestに含まれるproblemは全く同じ
# problemのonehotカラムを用意
problem_list = ['BranchLights', 'BranchOther', 'MetalGrates', 'RootOther', 'Stones', 'Sneakers', 'TrunkLights',
                 'TrunkOther', 'WiresRope', 'NoProblem']	

for problem in problem_list:
    train_clean[problem] = 0
    test_clean[problem] = 0

train_clean['problem_count'] = 0
test_clean['problem_count'] = 0

# problemlistにあったらonehotする
for i in train_clean.index:
    p_count = 0
    for problem in problem_list:
        if(problem in train_clean.loc[i, 'problems']):
            train_clean.loc[i, problem] = 1
            if(problem != 'NoProblem'):
                p_count+=1
    train_clean.loc[i, 'problem_count'] = p_count

for i in test_clean.index:
    p_count = 0
    for problem in problem_list:
        if(problem in test_clean.loc[i, 'problems']):
            test_clean.loc[i, problem] = 1
            if(problem != 'NoProblem'):
                p_count+=1
    test_clean.loc[i, 'problem_count'] = p_count

test_clean.head(10)
'''

"\n# trainとtestに含まれるproblemは全く同じ\n# problemのonehotカラムを用意\nproblem_list = ['BranchLights', 'BranchOther', 'MetalGrates', 'RootOther', 'Stones', 'Sneakers', 'TrunkLights',\n                 'TrunkOther', 'WiresRope', 'NoProblem']\t\n\nfor problem in problem_list:\n    train_clean[problem] = 0\n    test_clean[problem] = 0\n\ntrain_clean['problem_count'] = 0\ntest_clean['problem_count'] = 0\n\n# problemlistにあったらonehotする\nfor i in train_clean.index:\n    p_count = 0\n    for problem in problem_list:\n        if(problem in train_clean.loc[i, 'problems']):\n            train_clean.loc[i, problem] = 1\n            if(problem != 'NoProblem'):\n                p_count+=1\n    train_clean.loc[i, 'problem_count'] = p_count\n\nfor i in test_clean.index:\n    p_count = 0\n    for problem in problem_list:\n        if(problem in test_clean.loc[i, 'problems']):\n            test_clean.loc[i, problem] = 1\n            if(problem != 'NoProblem'):\n                p_count+=1\n    test_clean.loc[i, 'proble

In [16]:
# Baseline feature preparation for LightGBM:
# drop the unused columns and count-encode the categorical ones.

# Columns to drop regardless of encoding (currently none).
# Earlier drop list, kept for reference only (not used):
#   ['Sneakers', 'BranchLights', 'BranchOther', 'MetalGrates', 'RootOther',
#    'Stones', 'Sneakers', 'TrunkLights', 'TrunkOther', 'WiresRope', 'NoProblem']
drop_col = []
# Categorical columns that are removed from the feature set entirely.
ce_drop_col = ['nta_name', 'boroname',]
# Categorical columns (candidates for count encoding).
ce_columns = ['curb_loc', 'steward', 'guards',
       'sidewalk', 'user_type', 'problems', 'nta',
       'nta_name',  'boro_ct', 'boroname', 'zip_city',
       'spc_genus',  'spc_common', 'borocode', 'cb_num', 'st_senate', 'st_assem', 'cncldist', ]
# Columns to cast to pandas `category` dtype (currently none).
le_columns = []

# Remove the dropped columns from the encoding list. The comprehension keeps the
# declared order; the former `list(set(...) - set(...))` produced an order that
# depended on string hashing and could differ between kernel sessions.
ce_columns = [col for col in ce_columns if col not in ce_drop_col]

# Target variable.
y = train_df['health']

# Remove the target and the unused columns from the feature frames.
drops = drop_col + ce_drop_col
train_clean_drop = train_clean.drop(['health'], axis=1).drop(drops, axis=1)
test_clean_drop = test_clean.drop(drops, axis=1)

# Disabled alternative (label encoding of `le_columns`), kept for reference:
#   le = LabelEncoder()
#   for column in le_columns:
#       train_clean_drop[column] = le.fit_transform(train_clean_drop[column])
#       test_clean_drop[column] = le.transform(test_clean_drop[column])

# Cast the columns listed in `le_columns` to `category` dtype (no-op while the list is empty).
for col in le_columns:
    train_clean_drop[col] = train_clean_drop[col].astype('category')
    test_clean_drop[col] = test_clean_drop[col].astype('category')

# Fit the count encoder on train and test stacked together so both frames are
# encoded with the same counts, then transform each frame separately.
all_df = pd.concat([train_clean_drop, test_clean_drop], axis=0)

cencoder = ce.CountEncoder(cols=ce_columns)
cencoder.fit(all_df)

train_clean_drop = cencoder.transform(train_clean_drop)
test_clean_drop = cencoder.transform(test_clean_drop)



In [17]:
# Preview the first rows of the count-encoded training features.
train_clean_drop.head()

Unnamed: 0,tree_dbh,curb_loc,steward,guards,sidewalk,user_type,problems,spc_common,nta,borocode,boro_ct,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,month_sin,month_cos,spc_genus
0,14,37270,29409,29510,18509,22298,24288,2579,413,13234,129,413,1654,3780,1145,1694,2015,6,1.224647e-16,-1.0,7853
1,5,37270,2068,7366,21177,22298,24288,254,59,4479,24,4479,166,564,162,380,2016,9,-1.0,-1.83697e-16,3613
2,26,37270,29409,29510,21177,22298,1529,2579,616,7292,189,7292,3436,5720,2868,3155,2015,9,-1.0,-1.83697e-16,7853
3,15,37270,29409,29510,18509,6031,24288,2104,678,7292,121,7292,3436,5720,2868,3155,2016,5,0.5,-0.8660254,2104
4,23,37270,29409,29510,21177,22298,4455,4339,144,4522,39,4473,289,948,418,542,2016,6,1.224647e-16,-1.0,4339


In [18]:
def f1(y_pred, data):
    """Custom LightGBM evaluation function that reports macro-averaged F1.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Multiclass predictions. Either a 2-D array with one row per sample and
        one column per class, or the flat 1-D class-major layout (all samples
        of class 0, then all samples of class 1, ...) that LightGBM releases
        before 4.0 hand to ``feval``.
    data : object with a ``get_label()`` method
        The evaluated dataset; ``get_label()`` supplies the true labels.

    Returns
    -------
    tuple
        ``('custom', score, True)``: metric name, macro F1, and a flag that
        marks the metric as higher-is-better.
    """
    y_true = np.asarray(data.get_label())
    scores = np.asarray(y_pred)
    if scores.ndim == 1:
        # Flat class-major predictions: rebuild the (n_samples, n_classes) matrix
        # so that argmax over axis 1 is valid.
        scores = scores.reshape(-1, len(y_true)).T
    predicted = np.argmax(scores, axis=1)
    # f1_score expects (y_true, y_pred) in this order.
    score = f1_score(y_true, predicted, average='macro')
    return 'custom', score, True

In [22]:
# Stratified 5-fold cross-validation of the LightGBM baseline.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
valid_scores = []
models = []

# Period handed to lgb.log_evaluation; set it to 1 to print the metrics every round.
verbose_eval = -1

# Training parameters are the same for every fold, so they are built once here.
# The boosting-round budget is given only through `num_boost_round` below; the
# former duplicate 'num_iterations': 1000 entry is an alias of that argument.
params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 3,
    'seed': 0,
}
# Alternative parameter set kept for reference (not used):
#   {'objective': 'multiclass',
#    'metric': 'multi_logloss',
#    'num_class': 3,
#    'seed': 0,
#    'feature_pre_filter': False,
#    'lambda_l1': 0.0015923361719036968,
#    'lambda_l2': 0.004786876640032096,
#    'num_leaves': 252,
#    'feature_fraction': 0.8,
#    'bagging_fraction': 0.9931160872840541,
#    'bagging_freq': 7,
#    'min_child_samples': 5,
#    'num_iterations': 1000}

for fold, (train_idx, valid_idx) in enumerate(kf.split(train_clean_drop, y)):
    print(f'fold{fold} start')
    X_train, y_train = train_clean_drop.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train_clean_drop.iloc[valid_idx], y.iloc[valid_idx]

    # Training rows get 'balanced' class weights; validation rows are all weighted 1.
    train_weight = compute_sample_weight(class_weight='balanced', y=y_train).astype('float32')
    valid_weight = np.ones(len(X_valid)).astype('float32')
    lgb_train = lgb.Dataset(X_train, y_train, weight=train_weight)
    lgb_eval = lgb.Dataset(X_valid, y_valid, weight=valid_weight)

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        num_boost_round=1000,
        callbacks=[
            # Stop when the validation metric has not improved for 10 rounds.
            lgb.early_stopping(stopping_rounds=10, verbose=True),
            # Controls the per-round metric log (see verbose_eval above).
            lgb.log_evaluation(verbose_eval),
        ],
        # feval=f1,
        # categorical_feature=le_columns,
    )

    # Score the fold with macro F1 on the arg-max class at the best iteration.
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    y_pred_max = np.argmax(y_pred, axis=1)
    score = f1_score(y_valid, y_pred_max, average='macro')
    print(f'fold{fold} f1_score: {score:.4f}')
    valid_scores.append(score)
    models.append(model)
print(f'CV: {np.mean(valid_scores):.4f}')

fold0 start
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 828
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 21
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds




Early stopping, best iteration is:
[466]	training's multi_logloss: 0.257997	valid_1's multi_logloss: 0.791261
fold0 f1_score: 0.3539
fold1 start
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 828
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 21
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds




Early stopping, best iteration is:
[470]	training's multi_logloss: 0.259554	valid_1's multi_logloss: 0.772076
fold1 f1_score: 0.3575
fold2 start
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 826
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 21
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds




Early stopping, best iteration is:
[467]	training's multi_logloss: 0.254432	valid_1's multi_logloss: 0.771162
fold2 f1_score: 0.3593
fold3 start
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 827
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 21
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds




Early stopping, best iteration is:
[489]	training's multi_logloss: 0.245128	valid_1's multi_logloss: 0.783708
fold3 f1_score: 0.3446
fold4 start
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 827
[LightGBM] [Info] Number of data points in the train set: 15988, number of used features: 21
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds




Early stopping, best iteration is:
[511]	training's multi_logloss: 0.236012	valid_1's multi_logloss: 0.776939
fold4 f1_score: 0.3396
CV: 0.3510


In [26]:
# Disabled alternative, kept as a bare string literal so that none of it executes:
# predict the test set with every fold model, turn each model's prediction into a
# one-hot vote, and pick the class with the most votes.
# NOTE(review): inside the disabled block the row loop runs over `range(len(y_pred))`,
# where `y_pred` is whatever was assigned before this cell (the validation predictions
# of the CV loop), not the number of test rows - verify before re-enabling it.
'''
# テストデータ格納用のnumpy行列を作成
test_pred = np.zeros((len(test_clean), 3, 5))
# 5モデルのアンサンブル
# 5個のモデル
for fold_, model in enumerate(models):
    # testを予測
    pred_ = model.predict(test_clean_drop, num_iteration=model.best_iteration) 
    # testの予測を保存
    test_pred[:, :, fold_] = pred_

test_pred_copy = test_pred.copy()
# テストデータの行
for i in range(len(y_pred)):
    # 各モデル
    for j in range(5):
        a = test_pred_copy[i, :, j]
        # 最大値の列番号
        test_pred_copy_max = np.argmax(a)
        # 最大値の列を1、他を0
        if test_pred_copy_max == 0:
            test_pred_copy[i, 0, j] = 1
            test_pred_copy[i, 1, j] = 0
            test_pred_copy[i, 2, j] = 0
        elif test_pred_copy_max == 1:
            test_pred_copy[i, 0, j] = 0
            test_pred_copy[i, 1, j] = 1
            test_pred_copy[i, 2, j] = 0
        elif test_pred_copy_max == 2:
            test_pred_copy[i, 0, j] = 0
            test_pred_copy[i, 1, j] = 0
            test_pred_copy[i, 2, j] = 1
# ここまで追記（以下は、test_pred_copyを使用）          

# テストデータで予測する
y_pred = np.zeros((len(test_clean), 3))

y_pred[:, 0] = np.mean(test_pred_copy[:, 0, :], axis=1)
y_pred[:, 1] = np.mean(test_pred_copy[:, 1, :], axis=1)
y_pred[:, 2] = np.mean(test_pred_copy[:, 2, :], axis=1)

y_pred_max = np.argmax(y_pred, axis=1)
'''
# Active path: predict the test set with `model` only.
# NOTE(review): `model` is the variable left over from the CV loop, i.e. the model of
# the last fold; the other entries of `models` are not used here.
y_pred = model.predict(test_clean_drop)
y_pred_max = np.argmax(y_pred, axis=1) # take the class with the highest predicted score
# Write the predicted class into column 1 of the sample submission and save it without a header row.
sample_df[1] = y_pred_max
sample_df.to_csv('C:/python/signate/data/base2-noProblem-Noday-Nomonth-monthFromStart.csv', header=None)
# sample_df.to_csv('C:/python/signate/data/BEST2-anth5Vote.csv', header=None)

In [None]:
import pandas as pd

# Pull the feature names and their importance values from the trained booster.
feature_names = model.feature_name()
importance = model.feature_importance()

# One row per feature, most important first.
df_importance = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importance})
    .sort_values('Importance', ascending=False)
)
df_importance


Unnamed: 0,Feature,Importance
0,tree_dbh,5759
10,boro_ct,5674
8,nta,4304
7,spc_common,4241
20,spc_genus,2956
6,problems,2799
14,st_assem,2661
12,cb_num,2312
15,cncldist,2024
11,zip_city,1826
