In [59]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, TargetEncoder
from catboost import CatBoost, Pool

import category_encoders as ce
import math

# pandasの行を省略しない
pd.set_option('display.max_columns', None)

In [60]:
train_df = pd.read_csv('C:/python/signate/data/train.csv', index_col=0)
test_df = pd.read_csv('C:/python/signate/data/test.csv', index_col=0)
sample_df = pd.read_csv('C:/python/signate/data/sample_submission.csv', header=None, index_col=0)
train_df.head()

Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist
0,2015-06-29,14,OnCurb,1,,,Damage,Volunteer,,English oak,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23
1,2016-09-21,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,crimson king maple,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15
2,2015-09-13,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,English oak,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51
3,2016-05-09,15,OnCurb,0,,,Damage,NYC Parks Staff,,honeylocust,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51
4,2016-06-24,23,OnCurb,1,,,NoDamage,Volunteer,Stones,London planetree,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9


データはほとんどカテゴリカル
#### 思いついた精度向上案
- commonを個別名称として、latinの上を属名として利用する
- 地区が多すぎるので整理する
- 季節性を導入
- 郵便番号は連続性があるからカテゴリカルにしないほうがよいのでは
- 高い確率で状態が悪いと推測された木の近くの木はフラグ立てる。つまり2段階モデル
- 同じ人が記録した場合、1日のうちの0,1,2の割合は無意識にバイアスがかかって同じくらいにしてしまうのでは？
- 曜日の導入

In [61]:
def cleansing(df):
    df['created_at'] = pd.to_datetime(df['created_at'])
    # df['cos_day'] = df['created_at'].dt.dayofyear
    # df['cos_day'] = df['cos_day'].apply(lambda x: np.cos(math.radians(90 - (x/365)*365)))
    # df['sin_day'] = df['created_at'].dt.dayofyear
    # df['sin_day'] = df['sin_day'].apply(lambda x: np.sin(math.radians(90 - (x/365)*365)))  
    df['year'] = df['created_at'].dt.year
    df['month'] = df['created_at'].dt.month
    # df['weekday'] = df['created_at'].dt.weekday
    # df['day'] = df.created_at.dt.day
    df.drop('created_at', axis=1, inplace=True)
    df['steward'].fillna('0', inplace=True)
    df['guards'].fillna('0', inplace=True)
    df['problems'].fillna('NoProblem', inplace=True)
    df['spc_genus'] = df['spc_latin'].str.split(' ').str[0]
    df.drop('spc_latin', axis=1, inplace=True)

    return df

train_clean = cleansing(train_df)
test_clean = cleansing(test_df)
train_clean.head()

Unnamed: 0,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,spc_genus
0,14,OnCurb,1,0,0,Damage,Volunteer,NoProblem,English oak,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23,2015,6,Quercus
1,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,NoProblem,crimson king maple,BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15,2016,9,Acer
2,26,OnCurb,2,0,0,NoDamage,Volunteer,StonesBranchLights,English oak,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51,2015,9,Quercus
3,15,OnCurb,0,0,0,Damage,NYC Parks Staff,NoProblem,honeylocust,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51,2016,5,Gleditsia
4,23,OnCurb,1,0,0,NoDamage,Volunteer,Stones,London planetree,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9,2016,6,Platanus


In [62]:
# 種目ごとの平均直径とその差を特徴量にする
test_clean['health'] = 0
all_df = pd.concat([train_clean, test_clean], axis=0)   
tmp_df = all_df[['tree_dbh', 'spc_common']]
dbh_mean = (pd.DataFrame(tmp_df.groupby('spc_common').mean()['tree_dbh']))
dbh_mean.rename(columns={'tree_dbh': 'dbh_mean'}, inplace=True)
all_df = pd.merge(all_df, dbh_mean, on='spc_common', how='left')
all_df['dbh_diff'] = all_df['tree_dbh'] - all_df['dbh_mean']
train_clean = all_df.iloc[:len(train_clean)]
test_clean = all_df.iloc[len(train_clean):]
test_clean.drop('health', axis=1, inplace=True)
test_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean.drop('health', axis=1, inplace=True)


Unnamed: 0,tree_dbh,curb_loc,steward,guards,sidewalk,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,year,month,spc_genus,dbh_mean,dbh_diff
19984,15,OnCurb,0,0,Damage,NYC Parks Staff,NoProblem,Callery pear,BK17,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,3,3062000,Brooklyn,Brooklyn,315,22,45,48,2015,9,Pyrus,9.993088,5.006912
19985,5,OnCurb,1or2,0,NoDamage,Volunteer,NoProblem,cherry,BX62,Woodlawn-Wakefield,2,2045101,Bronx,Bronx,212,34,81,11,2015,10,Prunus,6.937519,-1.937519
19986,4,OnCurb,0,Unsure,NoDamage,Volunteer,NoProblem,littleleaf linden,QN60,Kew Gardens,4,4013800,Queens,Kew Gardens,409,14,27,29,2016,8,Tilia,9.373364,-5.373364
19987,7,OnCurb,0,0,NoDamage,TreesCount Staff,NoProblem,dawn redwood,BK09,Brooklyn Heights-Cobble Hill,3,3000301,Brooklyn,Brooklyn,302,26,52,33,2015,8,Metasequoia,9.541667,-2.541667
19988,6,OnCurb,1or2,Helpful,NoDamage,TreesCount Staff,Stones,purple-leaf plum,BK75,Bedford,3,3025100,Brooklyn,Brooklyn,303,25,56,36,2015,10,Prunus,9.447942,-3.447942


In [63]:
'''
# trainとtestに含まれるproblemは全く同じ
# problemのonehotカラムを用意
problem_list = ['BranchLights', 'BranchOther', 'MetalGrates', 'RootOther', 'Stones', 'Sneakers', 'TrunkLights',
                 'TrunkOther', 'WiresRope', 'NoProblem']	

for problem in problem_list:
    train_clean.loc[:, problem] = 0
    test_clean.loc[:, problem] = 0

train_clean['problem_count'] = 0
test_clean['problem_count'] = 0

# problemlistにあったらonehotする
for i in train_clean.index:
    p_count = 0
    for problem in problem_list:
        if(problem in train_clean.loc[i, 'problems']):
            train_clean.loc[i, problem] = 1
            if(problem != 'NoProblem'):
                p_count+=1
    train_clean.loc[i, 'problem_count'] = p_count

for i in test_clean.index:
    p_count = 0
    for problem in problem_list:
        if(problem in test_clean.loc[i, 'problems']):
            test_clean.loc[i, problem] = 1
            if(problem != 'NoProblem'):
                p_count+=1
    test_clean.loc[i, 'problem_count'] = p_count

test_clean.head()
'''

"\n# trainとtestに含まれるproblemは全く同じ\n# problemのonehotカラムを用意\nproblem_list = ['BranchLights', 'BranchOther', 'MetalGrates', 'RootOther', 'Stones', 'Sneakers', 'TrunkLights',\n                 'TrunkOther', 'WiresRope', 'NoProblem']\t\n\nfor problem in problem_list:\n    train_clean.loc[:, problem] = 0\n    test_clean.loc[:, problem] = 0\n\ntrain_clean['problem_count'] = 0\ntest_clean['problem_count'] = 0\n\n# problemlistにあったらonehotする\nfor i in train_clean.index:\n    p_count = 0\n    for problem in problem_list:\n        if(problem in train_clean.loc[i, 'problems']):\n            train_clean.loc[i, problem] = 1\n            if(problem != 'NoProblem'):\n                p_count+=1\n    train_clean.loc[i, 'problem_count'] = p_count\n\nfor i in test_clean.index:\n    p_count = 0\n    for problem in problem_list:\n        if(problem in test_clean.loc[i, 'problems']):\n            test_clean.loc[i, problem] = 1\n            if(problem != 'NoProblem'):\n                p_count+=1\n    test_clean.

## やりたい実験
- あらゆるエンコーディング方法の比較 ok
- 地区ごとの予測モデルの実装 
- 365日で一周するsin, cos 意味無し


In [64]:
train_clean.columns

Index(['tree_dbh', 'curb_loc', 'health', 'steward', 'guards', 'sidewalk',
       'user_type', 'problems', 'spc_common', 'nta', 'nta_name', 'borocode',
       'boro_ct', 'boroname', 'zip_city', 'cb_num', 'st_senate', 'st_assem',
       'cncldist', 'year', 'month', 'spc_genus', 'dbh_mean', 'dbh_diff'],
      dtype='object')

In [65]:
# エンコーディングパート

# すべてのカテゴリカル変数
all_categorical_cols = ['curb_loc', 'steward', 'guards', 'sidewalk',
       'user_type', 'problems', 'spc_common', 'nta', 'nta_name', 'borocode',
       'boro_ct', 'boroname', 'zip_city', 'cb_num', 'st_senate', 'st_assem',
       'cncldist', 'year', 'month',  'spc_genus', ]

# 落とすカラム
drop_cols = ['nta_name', 'boroname', 'dbh_mean',]
# カウントエンコーディング
ce_columns = ['curb_loc',  'steward', 'guards', 'sidewalk',
       'user_type',  'spc_common','borocode', 'nta',
       'boro_ct', 'zip_city', 'cb_num', 'st_senate', 'st_assem',
       'cncldist', 'year', 'month', 'spc_genus', 'problems', ]
# ターゲットエンコーディング
te_columns = []
# ラベルエンコーディング
le_columns = []
encoding_cals = list(drop_cols + ce_columns + te_columns + drop_cols)
if(encoding_cals != all_categorical_cols):
    print('error! categorical is not same')
    print('足りないカラム:', set(all_categorical_cols)-set(encoding_cals))

error! categorical is not same
足りないカラム: set()


In [66]:
y = train_clean['health']
train_clean_drop = train_clean.drop('health', axis=1)

In [67]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
valid_scores = []
models = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(train_clean_drop, y)):
    print(f'fold{fold} start')
    X_train, y_train = train_clean_drop.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train_clean_drop.iloc[valid_idx], y.iloc[valid_idx]

    lgb_train = Pool(X_train, y_train, cat_features=all_categorical_cols, weight=compute_sample_weight(class_weight='balanced', y=y_train).astype('float32'))
    lgb_eval = Pool(X_valid, y_valid, cat_features=all_categorical_cols, weight=np.ones(len(X_valid)).astype('float32'))

    params = {
        'loss_function': 'MultiClass',
        'num_boost_round': 1000,
        'early_stopping_rounds': 10,
        'eval_metric': 'TotalF1:average=Macro',
        'verbose': 100,
        
    }
   #  verbose_eval = -1 # この数字を1にすると学習時のスコア推移がコマンドライン表示される    
    model = CatBoost(
    params
)
    model.fit(lgb_train, eval_set=[lgb_eval])

    y_pred = model.predict(X_valid, prediction_type='Class')
    y_pred_max = np.argmax(y_pred, axis=1)
    score = f1_score(y_valid, y_pred_max, average='macro')
    print(f'fold{fold} f1_score: {score:.4f}')
    valid_scores.append(score)
    models.append(model)
print(f'CV: {np.mean(valid_scores):.4f}')


fold0 start


Learning rate set to 0.114131
0:	learn: 0.3729581	test: 0.2570049	best: 0.2570049 (0)	total: 211ms	remaining: 3m 30s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.3010211734
bestIteration = 14

Shrink model to first 15 iterations.


CatBoostError: Invalid value of prediction_type=MultiClass: must be Class, RawFormulaVal, Probability, LogProbability, Exponent, RMSEWithUncertainty.

In [None]:
# 全データを利用して学習
lgb_train = lgb.Dataset(train_clean_drop, y, weight=compute_sample_weight(class_weight='balanced', y=y).astype('float32'))
verbose_eval = -1 # この数字を1にすると学習時のスコア推移がコマンドライン表示される
params = {'objective': 'multiclass',
'metric': 'multi_logloss',
'num_class': 3,
'seed': 0
}

model = lgb.train(params,
                    lgb_train,
                    num_boost_round=520,
                )


NameError: name 'lgb' is not defined

In [None]:
y_pred = model.predict(test_clean_drop)
y_pred_max = np.argmax(y_pred, axis=1) # 最尤と判断したクラスの値にする
sample_df[1] = y_pred_max
sample_df.to_csv('C:/python/signate/data/BEST2-allce-problemsDrop-trainAllData.csv', header=None)

In [None]:
import pandas as pd

importance = model.feature_importance()
feature_names = model.feature_name()

df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
# importtane順に並び替え
df_importance = df_importance.sort_values('Importance', ascending=False)
df_importance


Unnamed: 0,Feature,Importance
19,dbh_diff,6742
10,boro_ct,5214
8,nta,4220
7,spc_common,3606
0,tree_dbh,3425
17,month,3212
6,problems,2756
18,spc_genus,2561
14,st_assem,2400
12,cb_num,2370
