In [346]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
# import optuna.integration.lightgbm as lgb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import category_encoders as ce
import re

# pandasの行を省略しない
pd.set_option('display.max_columns', None)

In [347]:
train_df = pd.read_csv('C:/python/signate/data/train.csv', index_col=0)
test_df = pd.read_csv('C:/python/signate/data/test.csv', index_col=0)
sample_df = pd.read_csv('C:/python/signate/data/sample_submission.csv', header=None, index_col=0)
train_df.head()

Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist
0,2015-06-29,14,OnCurb,1,,,Damage,Volunteer,,English oak,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23
1,2016-09-21,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,crimson king maple,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15
2,2015-09-13,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,English oak,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51
3,2016-05-09,15,OnCurb,0,,,Damage,NYC Parks Staff,,honeylocust,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51
4,2016-06-24,23,OnCurb,1,,,NoDamage,Volunteer,Stones,London planetree,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9


データはほとんどカテゴリカル
#### 思いついた精度向上案
- commonを個別名称として、latinの上を属名として利用する
- 地区が多すぎるので整理する
- 季節性を導入
- 郵便番号は連続性があるからカテゴリカルにしないほうがよいのでは
- 高い確率で状態が悪いと推測された木の近くの木はフラグ立てる。つまり2段階モデル
- 同じ人が記録した場合、1日のうちの0,1,2の割合は無意識にバイアスがかかって同じくらいにしてしまうのでは？

In [348]:
# train_dfのcolumnを取得
columns = test_df.columns

In [349]:
def cleansing(df):
    df['created_at'] = pd.to_datetime(df['created_at'])
    # df['year'] = df['created_at'].dt.year
    # df['month'] = df['created_at'].dt.month
    # df['day'] = df['created_at'].dt.day
    # df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    # df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df.drop('created_at', axis=1, inplace=True)
    df['steward'].fillna('0', inplace=True)
    df['guards'].fillna('0', inplace=True)
    df['problems'].fillna('NoProblem', inplace=True)

    df['spc_genus'] = df['spc_latin'].str.split(' ').str[0]
    df.drop('spc_latin', axis=1, inplace=True)

    return df

train_clean = cleansing(train_df)
test_clean = cleansing(test_df)
train_clean.head()

Unnamed: 0,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,spc_genus
0,14,OnCurb,1,0,0,Damage,Volunteer,NoProblem,English oak,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23,Quercus
1,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,NoProblem,crimson king maple,BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15,Acer
2,26,OnCurb,2,0,0,NoDamage,Volunteer,StonesBranchLights,English oak,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51,Quercus
3,15,OnCurb,0,0,0,Damage,NYC Parks Staff,NoProblem,honeylocust,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51,Gleditsia
4,23,OnCurb,1,0,0,NoDamage,Volunteer,Stones,London planetree,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9,Platanus


problemのonehotは意味ない、むしろスコア悪化させる

In [350]:

# 種目ごとの平均直径とその差を特徴量にする
test_clean['health'] = 0
all_df = pd.concat([train_clean, test_clean], axis=0)   
tmp_df = all_df[['tree_dbh', 'spc_common', ]]
dbh_mean_common = (pd.DataFrame(tmp_df.groupby('spc_common').mean()['tree_dbh']))
dbh_mean_common.rename(columns={'tree_dbh': 'dbh_mean_common'}, inplace=True)
all_df = pd.merge(all_df, dbh_mean_common, on='spc_common', how='left')

tmp_df = all_df[['tree_dbh', 'cb_num']]
dbh_mean_cb = (pd.DataFrame(tmp_df.groupby('cb_num').mean()['tree_dbh']))
dbh_mean_cb.rename(columns={'tree_dbh': 'dbh_mean_cb'}, inplace=True)
all_df = pd.merge(all_df, dbh_mean_cb, on='cb_num', how='left')


train_clean = all_df.iloc[:len(train_clean)]
test_clean = all_df.iloc[len(train_clean):]
test_clean.drop('health', axis=1, inplace=True)
test_clean.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean.drop('health', axis=1, inplace=True)


Unnamed: 0,tree_dbh,curb_loc,steward,guards,sidewalk,user_type,problems,spc_common,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,spc_genus,dbh_mean_common,dbh_mean_cb
19984,15,OnCurb,0,0,Damage,NYC Parks Staff,NoProblem,Callery pear,BK17,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,3,3062000,Brooklyn,Brooklyn,315,22,45,48,Pyrus,9.993088,11.813235
19985,5,OnCurb,1or2,0,NoDamage,Volunteer,NoProblem,cherry,BX62,Woodlawn-Wakefield,2,2045101,Bronx,Bronx,212,34,81,11,Prunus,6.937519,13.062044
19986,4,OnCurb,0,Unsure,NoDamage,Volunteer,NoProblem,littleleaf linden,QN60,Kew Gardens,4,4013800,Queens,Kew Gardens,409,14,27,29,Tilia,9.373364,11.174312
19987,7,OnCurb,0,0,NoDamage,TreesCount Staff,NoProblem,dawn redwood,BK09,Brooklyn Heights-Cobble Hill,3,3000301,Brooklyn,Brooklyn,302,26,52,33,Metasequoia,9.541667,9.89467
19988,6,OnCurb,1or2,Helpful,NoDamage,TreesCount Staff,Stones,purple-leaf plum,BK75,Bedford,3,3025100,Brooklyn,Brooklyn,303,25,56,36,Prunus,9.447942,12.23681


In [351]:
# アンダーサンプリング
# rs = RandomUnderSampler(random_state=42)
# train_clean ,_ = rs.fit_resample(train_clean, train_clean['health'])

# 一旦lightgbmで学習してみる
# count encodingとラベル削除を行う

# 使わないcolumnを指定
'''
drop_col = [ 'Sneakers', "BranchLights",	"BranchOther",	"MetalGrates",	
            "RootOther",	"Stones",	"Sneakers",	"TrunkLights",	"TrunkOther",
            	"WiresRope","NoProblem"]
'''
drop_col = []
ce_drop_col = ['nta_name', 'boroname', ]
# カテゴリカル変数を指定
ce_columns = ['curb_loc', 'steward', 'guards',
       'sidewalk', 'user_type', 'problems', 'nta',
       'nta_name',  'boro_ct', 'boroname', 'zip_city', 
       'spc_genus',  'spc_common', 'borocode', 'cb_num', 'st_senate', 'st_assem', 'cncldist', 
       ]
le_columns = []

# カテゴリカルカラムから使わないcolumnを削除
ce_columns = list(set(ce_columns) - set(ce_drop_col))

# 目的変数を指定
y = train_clean['health']

# 使わないcolumnを削除
train_clean_drop = train_clean.drop(['health'], axis=1)
drops = drop_col+ce_drop_col
train_clean_drop = train_clean_drop.drop(drops, axis=1)
test_clean_drop = test_clean.drop(drops,axis=1)

'''
# le
le = LabelEncoder()
for column in le_columns:
    train_clean_drop[column] = le.fit_transform(train_clean_drop[column])
    test_clean_drop[column] = le.transform(test_clean_drop[column])
'''
# カテゴリカル変数を指定
for col in le_columns:
    train_clean_drop[col] = train_clean_drop[col].astype('category')
    test_clean_drop[col] = test_clean_drop[col].astype('category')  

# mergeしてfit, そのあとtransform
all_df = pd.concat([train_clean_drop, test_clean_drop], axis=0)

# count encoding
cencoder = ce.CountEncoder(cols=ce_columns)
cencoder.fit(all_df)

# count encodingを適用
train_clean_drop = cencoder.transform(train_clean_drop)
test_clean_drop = cencoder.transform(test_clean_drop)



In [352]:
train_clean_drop.head()

Unnamed: 0,tree_dbh,curb_loc,steward,guards,sidewalk,user_type,problems,spc_common,nta,borocode,boro_ct,zip_city,cb_num,st_senate,st_assem,cncldist,spc_genus,dbh_mean_common,dbh_mean_cb
0,14,37270,29409,29510,18509,22298,24288,2579,413,13234,129,413,1654,3780,1145,1694,7853,12.346646,11.311971
1,5,37270,2068,7366,21177,22298,24288,254,59,4479,24,4479,166,564,162,380,3613,10.15748,16.289157
2,26,37270,29409,29510,21177,22298,1529,2579,616,7292,189,7292,3436,5720,2868,3155,7853,12.346646,9.683062
3,15,37270,29409,29510,18509,6031,24288,2104,678,7292,121,7292,3436,5720,2868,3155,2104,11.484316,9.683062
4,23,37270,29409,29510,21177,22298,4455,4339,144,4522,39,4473,289,948,418,542,4339,19.432819,11.678201


In [353]:
def f1(y_pred, data):
    y_true = data.get_label()
    score = f1_score(np.argmax(y_pred, axis=1), y_true, average='macro')
    return 'custom', score, True

In [354]:
print(len(train_clean_drop))

19984


In [355]:
train_clean_drop_np = train_clean_drop.to_numpy()
y_np = y.to_numpy()

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
valid_scores = []
models = []
knn_pred = train_df['health'].copy()
knn_pred.rename('knn_pred', inplace=True)

for fold, (train_idx, valid_idx) in enumerate(kf.split(train_clean_drop_np, y)):
   print(f'fold{fold} start')

   X_train, y_train = train_clean_drop_np[train_idx],y_np[train_idx]
   X_valid, y_valid = train_clean_drop_np[valid_idx], y_np[valid_idx]

   knc= KNeighborsClassifier(n_neighbors=1)
   knc.fit(X_train, y_train)

   #  予測
   y_pred = knc.predict(X_valid)
   knn_pred.iloc[valid_idx] = y_pred
   score = f1_score(y_valid, y_pred, average='macro')
   print(f'fold{fold} f1_score: {score:.4f}')
   valid_scores.append(score)
   print(f'CV: {np.mean(valid_scores):.4f}')
# 予測結果を保存
   
knn_pred.rename('knn_predict', inplace=True)
knn_pred.to_csv('C:/python/signate/data/knn_pred.csv', header=True)

fold0 start
fold0 f1_score: 0.3456
CV: 0.3456
fold1 start
fold1 f1_score: 0.3371
CV: 0.3413
fold2 start
fold2 f1_score: 0.3442
CV: 0.3423
fold3 start
fold3 f1_score: 0.3475
CV: 0.3436
fold4 start
fold4 f1_score: 0.3427
CV: 0.3434


In [356]:
test_clean_drop = test_clean_drop.to_numpy()
y_pred = knc.predict(test_clean_drop)
sample_df[1] = y_pred
predict = sample_df.copy()
predict.rename(columns={1: 'knn_predict'}, inplace=True)

predict.to_csv('C:/python/signate/data/knn_predict_test.csv', header=True)
sample_df.to_csv('C:/python/signate/data/knn.csv', header=None)

In [357]:
predict

Unnamed: 0_level_0,knn_predict
0,Unnamed: 1_level_1
19984,1
19985,1
19986,1
19987,1
19988,1
...,...
39964,0
39965,1
39966,2
39967,0
