In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import category_encoders as ce

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [20]:
# Directory that holds the competition CSV files. Every input path below is
# built from this single constant, so it is the only place to edit when the
# notebook is run on another machine.
DATA_DIR = 'C:/python/signate/data'

# The first CSV column is used as the row index of each frame.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv', index_col=0)
test_df = pd.read_csv(f'{DATA_DIR}/test.csv', index_col=0)
# The sample submission is read without a header row.
sample_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv', header=None, index_col=0)


データはほとんどカテゴリカル
#### 思いついた精度向上案
- spc_commonを種の個別名称として使い、spc_latinの先頭の単語を属名（spc_genus）として利用する
- 地区が多すぎるので整理する
- 季節性を導入

In [21]:
# Column names of test_df (note: read from the test frame, not from train_df)
columns = test_df.columns

In [22]:
def cleansing(df):
    """Return a cleaned copy of ``df`` with date and genus features added.

    The input frame is left untouched, so the cell can be re-run safely
    (the previous in-place version dropped ``created_at`` from the caller's
    frame and failed with ``KeyError`` on a second call).

    Steps:
      * parse ``created_at`` and derive integer ``year`` / ``month`` columns;
      * fill missing ``steward`` / ``guards`` / ``problems`` values with the
        string ``'0'``;
      * add ``spc_genus``: the text of ``spc_latin`` before the first space;
      * drop the ``created_at`` and ``spc_latin`` source columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``created_at``, ``steward``, ``guards``, ``problems``
        and ``spc_latin``. Any other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the derived columns are appended in the order
        ``year``, ``month``, ``spc_genus``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    out = df.copy()

    created = pd.to_datetime(out['created_at'])
    out['year'] = created.dt.year
    out['month'] = created.dt.month
    # Cyclical month encoding, tried and currently disabled:
    # out['month_sin'] = np.sin(2 * np.pi * out['month'] / 12)
    # out['month_cos'] = np.cos(2 * np.pi * out['month'] / 12)

    # Assign the filled column back instead of calling
    # `out[col].fillna(..., inplace=True)`: the chained in-place form acts on a
    # temporary object, is deprecated in pandas 2.x and does nothing under
    # Copy-on-Write.
    for col in ('steward', 'guards', 'problems'):
        out[col] = out[col].fillna('0')

    out['spc_genus'] = out['spc_latin'].str.split(' ').str[0]

    return out.drop(columns=['created_at', 'spc_latin'])

# Run the same cleaning function on the training frame and on the test frame
train_clean = cleansing(train_df)
test_clean = cleansing(test_df)

In [23]:
# Baseline: prepare features for a first LightGBM run.
# Unused columns are dropped and the categorical columns are count-encoded.

# Columns to exclude from the features (none at the moment)
drop_col = []
# Columns treated as categorical
categorical_columns = ['curb_loc', 'steward', 'guards',
       'sidewalk', 'user_type', 'problems', 'spc_common', 'nta',
       'nta_name', 'borocode', 'boro_ct', 'boroname', 'zip_city', 'cb_num',
       'st_senate', 'st_assem', 'cncldist', 'spc_genus']

# Remove the dropped columns from the categorical list. Filtering the list
# (instead of taking a set difference) keeps the declared order, so the list
# is the same on every kernel start.
categorical_columns = [col for col in categorical_columns if col not in drop_col]

# Target variable
y = train_df['health']

# Feature frames: the training features exclude the target; both frames
# exclude everything listed in drop_col.
train_clean_drop = train_clean.drop(columns=['health', *drop_col])
test_clean_drop = test_clean.drop(columns=drop_col)

# Count encoding: the encoder is fitted on the training features only ...
cencoder = ce.CountEncoder(cols=categorical_columns)
cencoder.fit(train_clean_drop)

# ... and the fitted encoder is then applied to both frames.
train_clean_drop = cencoder.transform(train_clean_drop)
test_clean_drop = cencoder.transform(test_clean_drop)

# Disabled experiment, kept for reference: cast the categorical columns to the
# pandas 'category' dtype.
# for col in categorical_columns:
#     train_clean_drop[col] = train_clean_drop[col].astype('category')
#     test_clean_drop[col] = test_clean_drop[col].astype('category')

"\n# カテゴリカル変数を指定\nfor col in categorical_columns:\n    train_clean_drop[col] = train_clean_drop[col].astype('category')\n    test_clean_drop[col] = test_clean_drop[col].astype('category')  \n\n"

In [24]:
# Preview the first rows of the count-encoded training features
train_clean_drop.head()

Unnamed: 0,tree_dbh,curb_loc,steward,guards,sidewalk,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,spc_genus
0,14,18777,14883,14943,9295,11189,12243,1277,222,222,6572,77,6572,222,843,1885,589,849,2015,6,3956
1,5,18777,1079,3650,10689,11189,12243,135,37,37,2270,14,2270,2270,89,308,88,207,2016,9,1879
2,26,18777,14883,14943,10689,11189,762,1277,300,300,3735,90,3735,3735,1735,2905,1459,1604,2015,9,3956
3,15,18777,14883,14943,9295,3000,12243,1034,359,359,3735,62,3735,3735,1735,2905,1459,1604,2016,5,1034
4,23,18777,14883,14943,10689,11189,2219,2237,79,79,2288,24,2288,2267,149,478,219,270,2016,6,2237


In [25]:
# 5-fold stratified cross-validation of a LightGBM multiclass model.
# Each fold's macro F1 score and trained model are collected.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
valid_scores = []
models = []

# The training configuration is the same for every fold, so it is defined
# once here instead of being rebuilt on each loop iteration.
params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 3,
    'seed': 0
}
verbose_eval = -1  # set to 1 to print the metric at every boosting round

for fold, (train_idx, valid_idx) in enumerate(kf.split(train_clean_drop, y)):
    print(f'fold{fold} start')
    X_train, y_train = train_clean_drop.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train_clean_drop.iloc[valid_idx], y.iloc[valid_idx]

    # Training rows get 'balanced' class weights; validation rows all get weight 1.
    train_weight = compute_sample_weight(class_weight='balanced', y=y_train).astype('float32')
    valid_weight = np.ones(len(X_valid)).astype('float32')
    lgb_train = lgb.Dataset(X_train, y_train, weight=train_weight)
    lgb_eval = lgb.Dataset(X_valid, y_valid, weight=valid_weight)

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        num_boost_round=1000,
        callbacks=[
            # Stop once the validation score has not improved for 10 rounds
            lgb.early_stopping(stopping_rounds=10, verbose=True),
            # Controls how often the metric is printed during training
            lgb.log_evaluation(verbose_eval),
        ],
    )

    # Score the fold: predict with the best iteration, take the arg-max class
    # per row and compare it with the true labels using macro F1.
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    y_pred_max = np.argmax(y_pred, axis=1)
    score = f1_score(y_valid, y_pred_max, average='macro')
    print(f'fold{fold} f1_score: {score:.4f}')
    valid_scores.append(score)
    models.append(model)
print(f'CV: {np.mean(valid_scores):.4f}')


fold0 start
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 21
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds


Early stopping, best iteration is:
[418]	training's multi_logloss: 0.28963	valid_1's multi_logloss: 0.796834
fold0 f1_score: 0.3400
fold1 start
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 21
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[494]	training's multi_logloss: 0.248994	valid_1's multi_logloss: 0.767016
fold1 f1_score: 0.3525
fold2 start
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 21
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [

In [26]:
# Predict the test set with every fold model collected in `models` and average
# the per-class outputs. Relying on the bare loop variable `model` would use
# only the model trained in the last CV fold and ignore the other four.
y_pred = np.mean([fold_model.predict(test_clean_drop) for fold_model in models], axis=0)
y_pred_max = np.argmax(y_pred, axis=1)  # class with the highest averaged score per row
sample_df[1] = y_pred_max
# Writing the submission file is currently disabled:
# sample_df.to_csv('C:/python/signate/data/base2-sincos-genus.csv', header=None)

In [27]:
# Save the raw `y_pred` array (the per-class scores, before argmax) in NumPy .npy format
np.save('C:/python/signate/data/lgb_best.npy', y_pred)

In [28]:
# Feature-importance table for `model`, i.e. the model left in scope by the
# last iteration of the cross-validation loop.
# pandas is already imported as `pd` in the notebook's first cell, so it is
# not re-imported here.
importance = model.feature_importance()
feature_names = model.feature_name()

df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
df_importance


Unnamed: 0,Feature,Importance
0,tree_dbh,5552
1,curb_loc,283
2,steward,784
3,guards,705
4,sidewalk,852
5,user_type,1267
6,problems,2544
7,spc_common,3975
8,nta,4117
9,nta_name,0
