In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import category_encoders as ce

# Show every column when displaying DataFrames (no column truncation)
pd.set_option('display.max_columns', None)

In [22]:
# Names of the columns handled as categorical features.
categorical_columns = [
    'curb_loc',
    'steward',
    'guards',
    'sidewalk',
    'user_type',
    'problems',
    'spc_common',
    'spc_latin',
    'nta',
    'nta_name',
    'borocode',
    'boro_ct',
    'boroname',
    'zip_city',
    'cb_num',
    'st_senate',
    'st_assem',
    'cncldist',
]

In [23]:
# Load the train / test tables, using the first CSV column as the index.
train_df = pd.read_csv(
    'C:/python/signate/data/train.csv',
    index_col=0,
)
test_df = pd.read_csv(
    'C:/python/signate/data/test.csv',
    index_col=0,
)
# The sample submission file has no header row.
sample_df = pd.read_csv(
    'C:/python/signate/data/sample_submission.csv',
    header=None,
    index_col=0,
)

# Preview the first rows of the training table.
train_df.head()


Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist
0,2015-06-29,14,OnCurb,1,,,Damage,Volunteer,,English oak,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23
1,2016-09-21,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,crimson king maple,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15
2,2015-09-13,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,English oak,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51
3,2016-05-09,15,OnCurb,0,,,Damage,NYC Parks Staff,,honeylocust,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51
4,2016-06-24,23,OnCurb,1,,,NoDamage,Volunteer,Stones,London planetree,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9


データはほとんどカテゴリカル
#### 思いついた精度向上案
- spc_commonを樹種の個別名称として使い、spc_latinの先頭の単語を属名として利用する
- 地区が多すぎるので整理する

In [24]:
# Keep the column index of test_df (note: taken from test_df, not train_df)
columns = test_df.columns

In [25]:
def cleansing(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Missing values in every column are replaced with the string "None", and
    ``created_at`` is parsed as a date and reduced to its month number.

    The work is done on a copy so the function is safe to run repeatedly on
    the same input: when the input was modified in place, a second run
    re-parsed the already-extracted month integers as timestamps.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``created_at`` column of parseable dates.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``.
    """
    # fillna without inplace returns a new frame, so the caller's data is preserved
    cleaned = df.fillna("None")

    # Reduce the date to its month number only
    cleaned['created_at'] = pd.to_datetime(cleaned['created_at']).dt.month
    return cleaned

# Run the same cleaning routine over the training frame, then the test frame
train_clean, test_clean = (cleansing(frame) for frame in (train_df, test_df))

In [26]:
# Preview the first rows of the cleaned training frame
train_clean.head()

Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist
0,6,14,OnCurb,1,,,Damage,Volunteer,,English oak,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23
1,9,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,crimson king maple,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15
2,9,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,English oak,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51
3,5,15,OnCurb,0,,,Damage,NYC Parks Staff,,honeylocust,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51
4,6,23,OnCurb,1,,,NoDamage,Volunteer,Stones,London planetree,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9


In [27]:
# List the distinct values found in the curb_loc column
train_df['curb_loc'].unique()

array(['OnCurb', 'OffsetFromCurb'], dtype=object)

In [28]:
# First pass: train a LightGBM baseline.
# Count-encode the categorical columns and remove the label column.
# NOTE(review): drop_columns is not referenced by any other visible cell.
drop_columns = [
    'created_at',
    'curb_loc',
    'steward',
    'guards',
    'sidewalk',
    'user_type',
    'problems',
    'spc_common',
    'spc_latin',
    'nta',
    'nta_name',
    'borocode',
    'boro_ct',
    'boroname',
    'zip_city',
    'cb_num',
    'st_senate',
    'st_assem',
    'cncldist',
]

# Separate the target from the training features; the test frame is used as-is.
y = train_df['health']
train_clean_drop = train_clean.drop(['health'], axis=1)
test_clean_drop = test_clean

# Bind the encoder to its own name. Assigning it to `ce` would replace the
# `category_encoders` module alias with an encoder instance, so running this
# cell a second time would fail on `ce.CountEncoder`.
count_encoder = ce.CountEncoder(cols=categorical_columns)
count_encoder.fit(train_clean_drop)  # counts are learned from the training frame only

train_clean_drop = count_encoder.transform(train_clean_drop)
test_clean_drop = count_encoder.transform(test_clean_drop)

# Mark the encoded columns as pandas categoricals
category_dtypes = {col: 'category' for col in categorical_columns}
train_clean_drop = train_clean_drop.astype(category_dtypes)
test_clean_drop = test_clean_drop.astype(category_dtypes)


# Hold out 20% of the training rows for validation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    train_clean_drop,
    y,
    test_size=0.2,
    random_state=0,
)
print(len(X_train))

# Wrap both splits as LightGBM datasets
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test)

# LightGBM hyper-parameters
params = {
    'objective': 'multiclass',   # task: multi-class classification
    'metric': 'multi_logloss',   # evaluation metric: multi-class log loss
    'num_class': 3,              # number of classes
}

# Logging period (in boosting rounds) handed to the log-evaluation callback;
# 1 prints the scores after every round.
verbose_eval = 1

# Train the LightGBM model
training_callbacks = [
    # Stop when the validation score has not improved for 10 rounds
    lgb.early_stopping(stopping_rounds=10, verbose=True),
    # Print the evaluation scores every `verbose_eval` rounds
    lgb.log_evaluation(verbose_eval),
]
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, test_data],
    callbacks=training_callbacks,
)
# Predict on the held-out validation split (X_test comes from train_test_split,
# not from the competition test file)
y_pred = model.predict(X_test)
y_pred_max = np.argmax(y_pred, axis=1)  # take the highest-scoring class per row
# Report the macro-averaged F1 score; the label used to read "Accuracy",
# which does not match the metric being computed.
print("Macro F1:", f1_score(y_test, y_pred_max, average='macro'))


15987
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 837
[LightGBM] [Info] Number of data points in the train set: 15987, number of used features: 20
[LightGBM] [Info] Start training from score -1.738591
[LightGBM] [Info] Start training from score -0.237206
[LightGBM] [Info] Start training from score -3.340937
[1]	training's multi_logloss: 0.602527	valid_1's multi_logloss: 0.610775
Training until validation scores don't improve for 10 rounds
[2]	training's multi_logloss: 0.594718	valid_1's multi_logloss: 0.6096
[3]	training's multi_logloss: 0.587221	valid_1's multi_logloss: 0.609216
[4]	training's multi_logloss: 0.580569	valid_1's multi_logloss: 0.609104
[5]	training's multi_logloss: 0.574553	valid_1's multi_logloss: 0.609323
[6]	training's multi_logloss: 0.56844	valid_1's multi_logloss: 0.609458
[7]	training's multi_logloss: 0.562511	valid_1's multi_logloss: 0.609717
[8]	training's multi_logloss: 0.557346	valid_1's multi_logloss: 0.610449
[9]	tr

In [31]:
# Score the encoded test frame and keep the highest-scoring class per row
y_pred = model.predict(test_clean_drop)
y_pred_max = y_pred.argmax(axis=1)

# Store the predictions in column 1 of the sample frame and write it without a header
sample_df[1] = y_pred_max
sample_df.to_csv(
    'C:/python/signate/data/base2t.csv',
    header=None,
)