In [1]:
from pathlib import Path
import catboost as cb
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import CountEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import f1_score,recall_score,precision_score,confusion_matrix

from sklearn.metrics import classification_report
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [11]:
train_date = pd.read_csv('C:/Users/wgkdj/smbc_comp/input/train.csv')
test_date = pd.read_csv('C:/Users/wgkdj/smbc_comp/input/test.csv')
sub_df = pd.read_csv('C:/Users/wgkdj/smbc_comp/input/sample_submission.csv',header=None)
sub_df.columns = ["index", "health"]
row_data = pd.read_csv('C:/Users/wgkdj/work12_07/newrows.csv')

In [12]:
class PreProcessTransformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self
    
    def transform(self,X):
        X['created_at'] = pd.to_datetime(X['created_at'])
        X['created_at_year'] = X['created_at'].dt.year
        X['created_at_month'] = X['created_at'].dt.month
        X['created_at_day'] = X['created_at'].dt.day
        X['problems'].fillna(' ', inplace=True)
        X['problems'] = X['problems'].astype('category')
        problem = ["Stones", "BranchLights", "BranchOther", "RootOther", "TrunkOther", "WiresRope", "MetalGrates", "TrunkLights"]
        for substring in problem:
            X[substring] = X['problems'].apply(lambda row: 1 if (pd.notnull(row) and substring in row) else 0)
        X['int_steward'] = X['steward'].replace({'1or2': 1, '3or4': 2, '4orMore': 3})
        X['null_problems'] = X['problems'].apply(lambda x: 1 if pd.isnull(x) else 0)
        X['weekdays'] = X['created_at'].dt.day_name()
        X['status'] = 0
        X.loc[X['steward'].isnull() & X['guards'].isnull(), 'status'] = 1
        X['status'] = X['status'].astype('str')
        X['nta_prefix'] = X['nta'].str[:2]
        X['nta_suffix'] = X['nta'].str[2:].astype('int')
        return X

class OriginalTransformer(TransformerMixin, BaseEstimator):
    """数値特徴はそのまま、カテゴリ特徴はcategory型に変換"""

    def __init__(self, numeric_cols, categorical_cols):
        self.numeric_cols = numeric_cols
        self.categorical_cols = categorical_cols
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_new = X.copy()

        # 数値変数
        X_new[self.numeric_cols] = X_new[self.numeric_cols].astype("float32")

        # カテゴリ変数
        X_new[self.categorical_cols] = X_new[self.categorical_cols].astype("category")
        
        return X_new[self.get_feature_names_out()]

    def get_feature_names_out(self):
        return self.numeric_cols + self.categorical_cols
class AggTransformer(TransformerMixin, BaseEstimator):
    """集約特徴量"""

    def __init__(self, key, numeric_cols, agg_func: dict):
        self.key = key
        self.numeric_cols = numeric_cols
        self.agg_func = agg_func

    def fit(self, X, y=None):
        X = X.copy()
        X[self.key] = X[self.key].astype("category")
        self.agg_df = X.groupby(self.key)[self.numeric_cols].agg(self.agg_func)
        self.agg_df.columns = [f"{col}_{func}" for col, func in self.agg_df.columns.tolist()]

        return self

    def transform(self, X):
        X_new = pd.merge(X, self.agg_df, on=self.key, how="left")
        return X_new[self.get_feature_names_out()]

    def get_feature_names_out(self, input_features=None):
        return self.agg_df.columns.tolist()

In [13]:
import category_encoders as ce



train_date = train_date[~train_date['spc_common'].isin(['Himalayan cedar', 'Chinese chestnut'])]

numeric_cols = [
    'tree_dbh',
    'borocode',
    'boro_ct',
    'cb_num',
    'st_senate',
    'st_assem',
    'cncldist',
    'created_at_year',
    'created_at_month',
    'created_at_day',
    'int_steward',
    #'nta_suffix',
]
categorical_cols = [
    'curb_loc',
    'guards',
    'sidewalk',
    'user_type',
    #'problems',
    'Stones',
    'BranchLights',
    'BranchOther',
    'RootOther',
    'TrunkOther',
    'WiresRope',
    'MetalGrates',
    'TrunkLights',
    'status',
    'null_problems',
    #'weekdays',
]

# ColumnTransformer
ct = ColumnTransformer(
    transformers=[
        (
            "ori",
            OriginalTransformer(numeric_cols, categorical_cols),
            numeric_cols + categorical_cols
        ),
        (
            "tgt",
            ce.TargetEncoder(cols=[ 'steward', 'guards','user_type', 'problems',
                                   'spc_common', 'nta','zip_city']),
            [ 'steward', 'guards', 'user_type','problems',
             'spc_common','nta','zip_city']
        ),
    ],
    verbose=True
)

# Pipeline
ct.set_output(transform="pandas")
pipe = Pipeline(
    steps=[
        ("preprocess", PreProcessTransformer()),
        ('ct', ct),
    ]
)
# Fit and transform

train_feat_df = pipe.fit_transform(row_data, row_data['health'])
test_feat_df = pipe.transform(train_date.drop('health', axis=1))


[ColumnTransformer] ........... (1 of 2) Processing ori, total=   0.5s
[ColumnTransformer] ........... (2 of 2) Processing tgt, total=   2.0s


In [8]:
row_data['health'].value_counts()

health
1    526787
0     96164
2     26683
Name: count, dtype: int64

In [26]:
lgbm_params = {
    'eta':0.05,
    'n_estimators':1000,
    'objective': 'multiclass',
    'num_class': 3,  # クラスの数を指定
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'class_weight': {0: 3, 1: 1, 2: 8},
    #'importance_type' : 'gain'
    #'device_type' :'gpu'
    #1以外の数値を上げるとreal１がより当該ラベルに落ちるようになる
}

gbm = lgb.LGBMClassifier(**lgbm_params)
gbm.fit(train_feat_df ,row_data['health'] )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1127
[LightGBM] [Info] Number of data points in the train set: 649634, number of used features: 31
[LightGBM] [Info] Start training from score -1.271426
[LightGBM] [Info] Start training from score -0.669297
[LightGBM] [Info] Start training from score -1.572625


In [27]:
pred = gbm.predict(test_feat_df)



In [28]:
f1 = f1_score(train_date['health'], pred, average='macro')
print(f1)

0.32830943554736364
