# Library Import

In [None]:
!pip install tensorflow_addons
!pip install catboost

In [None]:
# *------------ base library ------------*
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# *------------ sklearn ------------*
from sklearn.metrics import f1_score, log_loss, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.preprocessing import Normalizer, LabelEncoder


# *------------tf & keras ------------*
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from tensorflow.keras.metrics import Metric

# *------------ Catboost ------------*
import catboost as cat
from catboost import CatBoostClassifier

# *------------default setting & read files ------------*
# Show up to 100 rows when pandas truncates a long frame in the notebook output.
pd.set_option("display.min_rows", 100)

# Competition tables; train/test use their first CSV column as the row index.
train = pd.read_csv("/content/drive/MyDrive/data/train.csv", index_col=0)
test = pd.read_csv("/content/drive/MyDrive/data/test.csv", index_col=0)
ss = pd.read_csv("/content/drive/MyDrive/data/submission.csv")

seed = 1617


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's `random`, NumPy and TensorFlow.
    """
    # Local import: the notebook's import cell does not bring `random` into scope.
    import random

    random.seed(seed)
    np.random.seed(seed)
    # Exported for any child process; the hash seed of the already-running
    # interpreter is fixed at start-up and is not affected by this assignment.
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)


seed_everything(seed)

# Embedding 생성

In [None]:
# Declaration-date features and numeric clean-up
def preprocessing(df):
    """Derive calendar features and log-scale the numeric columns.

    Args:
        df: Frame with a parseable date column '일시' and the numeric columns
            '무게' and '금액'.

    Returns:
        A new DataFrame in which 'day' (day of month) and 'weekend'
        (1 for Saturday/Sunday, otherwise 0) are appended, '무게' and '금액'
        are replaced by their log1p, and '일시' is removed. The input frame is
        left untouched, so the function can be called again on the same object.
    """
    # Work on a copy: the original version mutated the caller's frame, which
    # made a second call fail because '일시' had already been dropped.
    df = df.copy()

    # Date features
    date_time = pd.to_datetime(df['일시'])
    df['day'] = date_time.dt.day
    df['weekend'] = date_time.dt.weekday.isin([5, 6]).astype(int)

    # Numeric features
    for col in ('무게', '금액'):
        df[col] = np.log1p(df[col])

    return df.drop(columns=['일시'])

# Run the same feature engineering on both splits (train first, then test).
train, test = (preprocessing(split) for split in (train, test))

In [None]:
# Every column except the two targets and the two numeric columns is categorical.
num_features = ['무게','금액']
non_categorical = ["target_1", "target_2"] + num_features
cat_features = [col for col in train.columns if col not in non_categorical]

# Tag test rows with a sentinel target (-1) so they can be split off again
# after the joint encoding.
for target_col in ('target_1', 'target_2'):
    test.loc[:, target_col] = -1

data = pd.concat([train, test])

# Integer-encode each categorical column over train and test together;
# missing values become the token "-1".
for feat in cat_features:
    lbl_enc = LabelEncoder()
    tokens = data[feat].fillna("-1").astype(str).values
    data[feat] = lbl_enc.fit_transform(tokens)

# Split back into train / test using the sentinel.
is_test_row = data.target_1 == -1
train = data[~is_test_row]
test = data[is_test_row]

In [None]:
# Callback definitions
dir_name = '/content/drive/MyDrive/data'

################# binary -> target_1 #############################
# All three callbacks watch the validation value of the metric named
# 'state_full_binary_f1' (higher is better).
bin_monitor = 'val_state_full_binary_f1'

ES_bin = tf.keras.callbacks.EarlyStopping(monitor=bin_monitor,
                                          min_delta=1e-02, patience=5,
                                          verbose=0,
                                          mode='max',
                                          baseline=None, restore_best_weights=True)

LRPlateau_bin = tf.keras.callbacks.ReduceLROnPlateau(monitor=bin_monitor,
                                                     factor=0.5, patience=3, verbose=0,
                                                     min_lr=1e-6, mode='max')

model_bin_name = "Embed_special_target_1_classifier"
checkpoint_bin_path = os.path.join(dir_name, model_bin_name+'weights.h5')

# mode='max' is spelled out: without it the callback uses mode='auto', which
# guesses the direction from the metric name and may treat this custom F1 as a
# quantity to minimise, i.e. keep the worst epoch instead of the best one.
CP_bin = tf.keras.callbacks.ModelCheckpoint(checkpoint_bin_path, monitor=bin_monitor,
                                            verbose=False, save_best_only=True,
                                            save_weights_only=True, mode='max')


################# multi -> target_2 #############################
# Same trio for the metric named 'state_full_multiclass_f1'.
# NOTE(review): min_delta is 1e-05 here but 1e-02 for the binary model; confirm
# the difference is intended.
mul_monitor = 'val_state_full_multiclass_f1'

ES_mul = tf.keras.callbacks.EarlyStopping(monitor=mul_monitor,
                                          min_delta=1e-05, patience=5,
                                          verbose=0,
                                          mode='max',
                                          baseline=None, restore_best_weights=True)

LRPlateau_mul = tf.keras.callbacks.ReduceLROnPlateau(monitor=mul_monitor,
                                                     factor=0.5, patience=3, verbose=0,
                                                     min_lr=1e-6, mode='max')

model_mul_name = "Embed_special_target_2_classifier"
checkpoint_mul_path = os.path.join(dir_name, model_mul_name+'weights.h5')

CP_mul = tf.keras.callbacks.ModelCheckpoint(checkpoint_mul_path, monitor=mul_monitor,
                                            verbose=False, save_best_only=True,
                                            save_weights_only=True, mode='max')

In [None]:
# Ref : https://towardsdatascience.com/f-beta-score-in-keras-part-ii-15f91f07c9a4
# custom Total F1 정의

class StatefullBinaryFBeta(Metric):
  """Streaming F-beta score for thresholded predictions.

  True-positive, actual-positive and predicted-positive counts are accumulated
  across batches and turned into precision / recall / F-beta in `result`.
  Counts are summed over every element of the tensors, so when the targets
  have several columns all columns are pooled into one score.

  Args:
    name: Metric name used in the training logs.
    beta: Weight of recall relative to precision (1 gives F1).
    threshold: Predictions greater than or equal to this value count as positive.
    epsilon: Small constant added to denominators to avoid division by zero.
  """

  def __init__(self, name='state_full_binary_f1', beta=1, threshold=0.5, epsilon=1e-7, **kwargs): # f1 ==> beta:1
    super().__init__(name=name, **kwargs)

    # Running counts. The weight names now describe what is stored; the
    # previous labels 'fp'/'fn' did not match the quantities they held.
    self.tp = self.add_weight(name='tp', initializer='zeros')
    self.actual_positive = self.add_weight(name='ap', initializer='zeros')
    self.predicted_positive = self.add_weight(name='pp', initializer='zeros')

    # Fixed hyper-parameters of this metric instance.
    self.beta_squared = beta**2
    self.threshold = threshold
    self.epsilon = epsilon

  def update_state(self, ytrue, ypred, sample_weight=None):
    """Add one batch to the running counts.

    `sample_weight` is accepted for Keras API compatibility and is not used.
    """
    ytrue = tf.cast(ytrue, tf.float32)
    ypred = tf.cast(ypred, tf.float32)

    # Binarise the predictions; the threshold is cast so an integer threshold
    # does not cause a dtype mismatch in the comparison.
    threshold = tf.cast(self.threshold, tf.float32)
    ypred = tf.cast(tf.greater_equal(ypred, threshold), tf.float32)

    self.tp.assign_add(tf.reduce_sum(ytrue*ypred))
    self.predicted_positive.assign_add(tf.reduce_sum(ypred))
    self.actual_positive.assign_add(tf.reduce_sum(ytrue))

  def result(self):
    """Return the F-beta score computed from the accumulated counts."""
    # Kept as attributes (as before) so they can be inspected after a call.
    self.precision = self.tp/(self.predicted_positive+self.epsilon)
    self.recall = self.tp/(self.actual_positive+self.epsilon)

    self.fb = (1+self.beta_squared)*self.precision*self.recall / (self.beta_squared*self.precision + self.recall + self.epsilon)

    return self.fb

  def reset_state(self):
    """Zero all running counts (method name used by TensorFlow >= 2.5)."""
    for counter in (self.tp, self.predicted_positive, self.actual_positive):
      counter.assign(0)

  def reset_states(self):
    """Alias of `reset_state` for TensorFlow versions that call the old name."""
    self.reset_state()


class StatefullMultiClassFBeta(Metric):
    """Streaming multi-class F-beta score for one-hot targets.

    Per-class true-positive, actual-positive and predicted-positive counts are
    accumulated across batches; `result` turns them into per-class F-beta and
    averages them.

    Args:
        name: Metric name used in the training logs.
        beta: Weight of recall relative to precision (1 gives F1).
        n_class: Number of classes (length of the one-hot target vectors).
        average: 'macro' (plain mean, also used for any unrecognised value),
            'weighted' (weighted by actual class counts) or 'raw' (per-class vector).
        epsilon: Small constant added to denominators to avoid division by zero.
    """

    def __init__(self, name='state_full_multiclass_f1', beta=1, n_class=3, average='macro', epsilon=1e-7, **kwargs): # f1 ==> beta:1
        super().__init__(name=name, **kwargs)

        # Per-class running counts.
        self.tp = self.add_weight(name='tp', shape=(n_class,), initializer='zeros')
        self.actual_positives = self.add_weight(name='ap', shape=(n_class,), initializer='zeros')
        self.predicted_positives = self.add_weight(name='pp', shape=(n_class,), initializer='zeros')

        # Fixed hyper-parameters of this metric instance.
        self.beta_squared = beta**2
        self.n_class = n_class
        self.average = average
        self.epsilon = epsilon

    def update_state(self, ytrue, ypred, sample_weight=None):
        """Add one batch to the running counts.

        `sample_weight` is accepted for Keras API compatibility and is not used.
        """
        ytrue = tf.cast(ytrue, tf.float32)
        ypred = tf.cast(ypred, tf.float32)

        # One-hot encode the arg-max class. Comparing against the row maximum
        # (the previous approach) flags every tied class as predicted, which
        # inflates the predicted-positive counts; argmax picks exactly one.
        predicted_class = tf.argmax(ypred, axis=-1)
        ypred = tf.one_hot(predicted_class, depth=self.n_class, dtype=tf.float32)

        self.tp.assign_add(tf.reduce_sum(ytrue*ypred, axis=0))
        self.predicted_positives.assign_add(tf.reduce_sum(ypred, axis=0))
        self.actual_positives.assign_add(tf.reduce_sum(ytrue, axis=0))

    def result(self):
        """Return the averaged (or raw per-class) F-beta from the running counts."""
        # Kept as attributes (as before) so they can be inspected after a call.
        self.precision = self.tp/(self.predicted_positives+self.epsilon)
        self.recall = self.tp/(self.actual_positives+self.epsilon)

        self.fb = (1+self.beta_squared)*self.precision*self.recall / (self.beta_squared*self.precision + self.recall + self.epsilon)

        if self.average == 'weighted':
            # epsilon guards the case where no sample has been counted yet.
            total_actual = tf.reduce_sum(self.actual_positives) + self.epsilon
            return tf.reduce_sum(self.fb*self.actual_positives / total_actual)

        elif self.average == 'raw':
            return self.fb

        return tf.reduce_mean(self.fb)

    def reset_state(self):
        """Zero all running counts (method name used by TensorFlow >= 2.5)."""
        for counter in (self.tp, self.predicted_positives, self.actual_positives):
            counter.assign(tf.zeros(self.n_class))

    def reset_states(self):
        """Alias of `reset_state` for TensorFlow versions that call the old name."""
        self.reset_state()

## 모델 세팅

모델 후보 1

In [None]:
def create_model(data, cat_features, target, num_features):
    """Build the entity-embedding classifier (candidate 1: 512 -> 256 dense trunk).

    Each categorical feature gets its own scalar input and an Embedding layer
    named ``feature_{i}`` (i = position in `cat_features`). The numeric features
    enter through one extra input and are concatenated with the flattened
    embeddings.

    NOTE: a later cell defines another `create_model`; whichever cell ran last
    is the one the training cells use.

    Args:
        data: Frame whose columns give the cardinality of each categorical feature.
        cat_features: Names of the label-encoded categorical columns.
        target: 'target_2' for the 3-unit softmax head, 'target_1' for the
            2-unit sigmoid head.
        num_features: Names of the numeric columns (only their count is used).

    Returns:
        An uncompiled Keras Model whose inputs are ordered as `cat_features`
        followed by the numeric block.

    Raises:
        ValueError: If `target` is neither 'target_1' nor 'target_2'.
    """
    # Fail early with a clear message; previously an unknown target built the
    # whole network and then crashed on an unbound output variable.
    if target not in ('target_1', 'target_2'):
        raise ValueError(f"target must be 'target_1' or 'target_2', got {target!r}")

    inputs = []
    outputs = []

    name_scope = [f'feature_{i}' for i in range(len(cat_features))]

    for i,c in enumerate(cat_features):
        num_unique_values = int(data[c].nunique())

        # Embedding width: the two lists are placeholders to be filled with
        # medium- / high-cardinality feature names; every other feature uses
        # half its cardinality, capped at 50.
        if c in ['''cardinaliy 중간 수준 피처''']:
            embed_dim = 64
        elif c in ['''cardinaliy 큰 피처들''']:
            embed_dim = 128
        else:
            embed_dim = int(min(np.ceil((num_unique_values)/2), 50))

        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=name_scope[i])(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)

    # Numeric features are passed through unchanged.
    num_inp = layers.Input(shape=(len(num_features),))
    inputs.append(num_inp)
    outputs.append(num_inp)

    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(512, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)

    if target == 'target_2':
        y = layers.Dense(3,
                         activation='softmax',
                         )(x)
    else:  # 'target_1'
        y = layers.Dense(2,
                         activation='sigmoid',
                         )(x)

    model = Model(inputs=inputs, outputs=y)
    return model

모델 후보 2

In [None]:
def create_model(data, cat_features, target, num_features):
    """Build the entity-embedding classifier (candidate 2: deeper trunk).

    Same input / embedding front-end as candidate 1 (one scalar input and one
    Embedding layer named ``feature_{i}`` per categorical feature, plus one
    numeric input), followed by 1024 -> 512 dense layers and a stack of three
    weight-normalised dense layers (256 -> 128 -> 64).

    NOTE: this definition replaces the `create_model` from the earlier cell
    once this cell has run.

    Args:
        data: Frame whose columns give the cardinality of each categorical feature.
        cat_features: Names of the label-encoded categorical columns.
        target: 'target_2' for the 3-unit softmax head, 'target_1' for the
            2-unit sigmoid head.
        num_features: Names of the numeric columns (only their count is used).

    Returns:
        An uncompiled Keras Model whose inputs are ordered as `cat_features`
        followed by the numeric block.

    Raises:
        ValueError: If `target` is neither 'target_1' nor 'target_2'.
    """
    # Fail early with a clear message; previously an unknown target built the
    # whole network and then crashed on an unbound output variable.
    if target not in ('target_1', 'target_2'):
        raise ValueError(f"target must be 'target_1' or 'target_2', got {target!r}")

    inputs = []
    outputs = []

    name_scope = [f'feature_{i}' for i in range(len(cat_features))]

    for i,c in enumerate(cat_features):
        num_unique_values = int(data[c].nunique())

        # Embedding width: the two lists are placeholders to be filled with
        # medium- / high-cardinality feature names; every other feature uses
        # half its cardinality, capped at 50.
        if c in ['''cardinaliy 중간 수준 피처''']:
            embed_dim = 64
        elif c in ['''cardinaliy 큰 피처들''']:
            embed_dim = 128
        else:
            embed_dim = int(min(np.ceil((num_unique_values)/2), 50))

        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=name_scope[i])(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)

    # Numeric features are passed through unchanged.
    num_inp = layers.Input(shape=(len(num_features),))
    inputs.append(num_inp)
    outputs.append(num_inp)

    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(1024, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(512, activation="relu")(x)
    x = layers.Dropout(0.4)(x)
    x = layers.BatchNormalization()(x)

    #----------- Weight-normalised dense stack ----------------------
    # (Previously labelled "Residual blocks"; there is no skip connection here.)
    x = tfa.layers.WeightNormalization(
        layers.Dense(256,
                     activation='selu',
                     kernel_initializer="lecun_normal"))(x)

    drop_out = layers.Dropout(0.3)(x)
    x = tfa.layers.WeightNormalization(
        layers.Dense(128,
                     activation='relu',
                     kernel_initializer="he_normal"))(drop_out)
    x = layers.Dropout(0.4)(x)
    x = tfa.layers.WeightNormalization(
        layers.Dense(units=64,
                     activation='elu',
                     kernel_initializer="lecun_normal"))(x)

    if target == 'target_2':
        y = layers.Dense(3,
                         activation='softmax',
                         )(x)
    else:  # 'target_1'
        y = layers.Dense(2,
                         activation='sigmoid',
                         )(x)

    model = Model(inputs=inputs, outputs=y)
    return model

## target_2 모델

In [None]:
N_FOLDS = 5
SEED = seed
EPOCH = 100

target = 'target_2'
train_y = train['target_2']
train_x = train.drop(columns=["target_1", "target_2"])

# Out-of-fold class probabilities (3 columns) and the fitted model of each fold.
oof_mul = np.zeros((len(train), 3))
models_core = []


def _as_model_inputs(frame):
    """Return one 1-D array per categorical column followed by the 2-D numeric block.

    This is the order in which create_model declares its inputs.
    """
    cat_block = frame.loc[:, cat_features].values
    cat_arrays = [cat_block[:, k] for k in range(cat_block.shape[1])]
    return cat_arrays + [frame.loc[:, num_features].values]


skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[target])):
    print(f"\n ====== TRAINING FOLD {fold} =======\n")

    K.clear_session()

    X_train = _as_model_inputs(train_x.iloc[train_idx])
    X_valid = _as_model_inputs(train_x.iloc[valid_idx])

    # One-hot targets for the softmax head.
    y_train = utils.to_categorical(train_y.iloc[train_idx])
    y_valid = utils.to_categorical(train_y.iloc[valid_idx])

    # ---------------- embedding model: training ----------------
    print("\n-----Embedding model Training-----\n")

    model = create_model(data, cat_features, target, num_features)

    # Open question from the author: are weighted metrics available as well?
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='categorical_crossentropy',
                  metrics=StatefullMultiClassFBeta())

    model.fit(X_train, y_train,
              validation_data=(X_valid, y_valid),
              epochs=EPOCH,
              batch_size=256,
              class_weight={0: 1.0, 1: 3.0, 2: 3.5},
              callbacks=[ES_mul, LRPlateau_mul, CP_mul],
              verbose=False)

    # ---------------- embedding model: out-of-fold prediction ----------------
    pred_mul = model.predict(X_valid)
    oof_mul[valid_idx] = pred_mul

    multiclass_score = f1_score(y_true=y_valid.argmax(axis=1),
                                y_pred=pred_mul.argmax(axis=1),
                                average='macro')

    print(f"target_2 score : {multiclass_score}")
    models_core.append(model)


total_score = f1_score(y_true=train_y, y_pred=oof_mul.argmax(axis=1), average='macro')

print(f"\n=== FINAL target_2 SCORE CONVOLUTION MODEL : {total_score}===\n")

## target_1 모델

In [None]:
N_FOLDS = 5
SEED = seed
EPOCH = 100

target = 'target_1'
train_y = train['target_1']
train_x = train.drop(columns=["target_1", "target_2"])

# Out-of-fold outputs (2 columns) and the fitted model of each fold.
oof_bin = np.zeros((len(train), 2))
models_crime = []


def _as_model_inputs(frame):
    """Return one 1-D array per categorical column followed by the 2-D numeric block.

    This is the order in which create_model declares its inputs.
    """
    cat_block = frame.loc[:, cat_features].values
    cat_arrays = [cat_block[:, k] for k in range(cat_block.shape[1])]
    return cat_arrays + [frame.loc[:, num_features].values]


skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[target])):
    print(f"\n ====== TRAINING FOLD {fold} =======\n")

    K.clear_session()

    X_train = _as_model_inputs(train_x.iloc[train_idx])
    X_valid = _as_model_inputs(train_x.iloc[valid_idx])

    # One-hot targets: the target_1 head has two sigmoid units.
    y_train = utils.to_categorical(train_y.iloc[train_idx])
    y_valid = utils.to_categorical(train_y.iloc[valid_idx])

    # ---------------- embedding model: training ----------------
    print("\n-----Embedding model Training----\n")

    model = create_model(data, cat_features, target, num_features)

    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='binary_crossentropy',
                  metrics=StatefullBinaryFBeta())

    model.fit(X_train, y_train,
              validation_data=(X_valid, y_valid),
              epochs=EPOCH,
              batch_size=256,
              class_weight={0: 1.0, 1: 3.0},
              callbacks=[ES_bin, LRPlateau_bin, CP_bin],
              verbose=False)

    # ---------------- embedding model: out-of-fold prediction ----------------
    pred_bin = model.predict(X_valid)
    oof_bin[valid_idx] = pred_bin

    binary_score = f1_score(y_true=y_valid.argmax(axis=1),
                            y_pred=pred_bin.argmax(axis=1),
                            average='binary')

    print(f"target_1 score : {binary_score}")
    models_crime.append(model)

total_score = f1_score(y_true=train_y, y_pred=oof_bin.argmax(axis=1), average='binary')

print(f"\n=== FINAL target_1 SCORE CONVOLUTION MODEL : {total_score}===\n")

# Embedding 추출

In [None]:
def _stack_categorical(frame):
    """Stack the categorical columns of `frame` into one float32 tensor.

    The result has one row per categorical feature and one column per sample.
    """
    cat_block = frame.loc[:, cat_features].values
    per_feature = [cat_block[:, k] for k in range(cat_block.shape[1])]
    return tf.convert_to_tensor(per_feature, dtype=tf.float32)


tr_cat = _stack_categorical(train)
ts_cat = _stack_categorical(test)

5개의 fold Embed 결과를 평균 내어 사용하는 코드 

In [None]:
# *--------------- target_2 ---------------*
def _mean_fold_embeddings(models, cat_tensor, index):
    """Average every categorical feature's embedding vectors over the fold models.

    Args:
        models: Fitted fold models built by create_model.
        cat_tensor: Tensor with one row of encoded values per categorical feature.
        index: Row index for the returned frame.

    Returns:
        DataFrame with one column '<feature>_<dim>' for EVERY embedding
        dimension. The previous loop stored only the last dimension of each
        feature because the final assignment sat outside the dimension loop.
    """
    blocks, names = [], []
    for idx, col in enumerate(cat_features):
        # create_model names the embedding layer of the idx-th categorical
        # feature 'feature_{idx}'; looking it up by name avoids depending on
        # the position of the layer inside model.layers.
        fold_vectors = [m.get_layer(f'feature_{idx}')(cat_tensor[idx]).numpy() for m in models]
        mean_vectors = np.mean(fold_vectors, axis=0, dtype=np.float64)
        blocks.append(mean_vectors)
        names.extend(f'{col}_{i}' for i in range(mean_vectors.shape[1]))
    return pd.DataFrame(np.hstack(blocks), index=index, columns=names)


# train
cat_core_train = _mean_fold_embeddings(models_core, tr_cat, train.index)

# test
cat_core_test = _mean_fold_embeddings(models_core, ts_cat, test.index)

display(cat_core_train.head(3))
display(cat_core_test.head(3))

In [None]:
# *--------------- target_1 ---------------*
def _mean_fold_embeddings(models, cat_tensor, index):
    """Average every categorical feature's embedding vectors over the fold models.

    Args:
        models: Fitted fold models built by create_model.
        cat_tensor: Tensor with one row of encoded values per categorical feature.
        index: Row index for the returned frame.

    Returns:
        DataFrame with one column '<feature>_<dim>' for EVERY embedding
        dimension. The previous loop stored only the last dimension of each
        feature because the final assignment sat outside the dimension loop.
    """
    blocks, names = [], []
    for idx, col in enumerate(cat_features):
        # create_model names the embedding layer of the idx-th categorical
        # feature 'feature_{idx}'; looking it up by name avoids depending on
        # the position of the layer inside model.layers.
        fold_vectors = [m.get_layer(f'feature_{idx}')(cat_tensor[idx]).numpy() for m in models]
        mean_vectors = np.mean(fold_vectors, axis=0, dtype=np.float64)
        blocks.append(mean_vectors)
        names.extend(f'{col}_{i}' for i in range(mean_vectors.shape[1]))
    return pd.DataFrame(np.hstack(blocks), index=index, columns=names)


# train
cat_crime_train = _mean_fold_embeddings(models_crime, tr_cat, train.index)

# test
cat_crime_test = _mean_fold_embeddings(models_crime, ts_cat, test.index)


display(cat_crime_train.head(3))
display(cat_crime_test.head(3))

5개의 fold Embed 결과 중 가장 높은 점수의 embed를 추출하는 코드 

In [None]:
# *--------------- target_2 ---------------*
# Fold whose embeddings are exported.
# NOTE(review): the section heading says the best-scoring fold is used, but the
# index is hard-coded; confirm fold 4 is the best one for this run.
best_core_fold = 4


def _fold_embeddings(model, cat_tensor, index):
    """Look up every categorical feature's embedding vectors in one fold model.

    Returns a DataFrame indexed by `index` with one column '<feature>_<dim>'
    per embedding dimension.
    """
    blocks, names = [], []
    for idx, col in enumerate(cat_features):
        # create_model names the embedding layer of the idx-th categorical
        # feature 'feature_{idx}'; looking it up by name avoids depending on
        # the position of the layer inside model.layers.
        vectors = model.get_layer(f'feature_{idx}')(cat_tensor[idx]).numpy()
        blocks.append(vectors)
        names.extend(f'{col}_{i}' for i in range(vectors.shape[1]))
    # One frame built in a single step instead of inserting column by column.
    return pd.DataFrame(np.hstack(blocks), index=index, columns=names)


# train
cat_core_train = _fold_embeddings(models_core[best_core_fold], tr_cat, train.index)

# test
cat_core_test = _fold_embeddings(models_core[best_core_fold], ts_cat, test.index)

display(cat_core_train.head(3))
display(cat_core_test.head(3))

In [None]:
# *--------------- target_1 ---------------*
# Fold whose embeddings are exported.
# NOTE(review): the section heading says the best-scoring fold is used, but the
# index is hard-coded; confirm fold 4 is the best one for this run.
best_crime_fold = 4


def _fold_embeddings(model, cat_tensor, index):
    """Look up every categorical feature's embedding vectors in one fold model.

    Returns a DataFrame indexed by `index` with one column '<feature>_<dim>'
    per embedding dimension.
    """
    blocks, names = [], []
    for idx, col in enumerate(cat_features):
        # create_model names the embedding layer of the idx-th categorical
        # feature 'feature_{idx}'; looking it up by name avoids depending on
        # the position of the layer inside model.layers.
        vectors = model.get_layer(f'feature_{idx}')(cat_tensor[idx]).numpy()
        blocks.append(vectors)
        names.extend(f'{col}_{i}' for i in range(vectors.shape[1]))
    # One frame built in a single step instead of inserting column by column.
    return pd.DataFrame(np.hstack(blocks), index=index, columns=names)


# train
cat_crime_train = _fold_embeddings(models_crime[best_crime_fold], tr_cat, train.index)

# test
cat_crime_test = _fold_embeddings(models_crime[best_crime_fold], ts_cat, test.index)


display(cat_crime_train.head(3))
display(cat_crime_test.head(3))

In [None]:
# Persist the four embedding tables (row index included) for the CatBoost stage.
embedding_exports = {
    "/content/drive/MyDrive/data/embed_cat_core_train_2.csv": cat_core_train,
    "/content/drive/MyDrive/data/embed_cat_core_test_2.csv": cat_core_test,
    "/content/drive/MyDrive/data/embed_cat_crime_train_2.csv": cat_crime_train,
    "/content/drive/MyDrive/data/embed_cat_crime_test_2.csv": cat_crime_test,
}

for csv_path, emb_frame in embedding_exports.items():
    emb_frame.to_csv(csv_path, encoding='utf-8', index=True)

# Catboost

In [None]:
# "crime" is the binary target_1 and "core" the 3-class target_2, matching
# models_crime / models_core in the embedding section and the binary / macro
# scoring used by the CatBoost cells below. The two assignments were swapped.
train_y_crime = train['target_1']
train_y_core = train['target_2']

train_y_crime.shape, train_y_core.shape

((89619,), (89619,))

In [None]:
# Reload the embedding features from disk.
# The files are presumably written like the '*_2.csv' exports above (index=True),
# so the first CSV column is the row index; restoring it keeps that column out
# of the feature set and gives the frames the same index as train / test.
# NOTE(review): the export cell writes '*_2.csv' files while these paths have
# no '_2' suffix; confirm which run is meant to be loaded here.
cat_crime_train = pd.read_csv("/content/drive/MyDrive/data/embed_cat_crime_train.csv", index_col=0)
cat_crime_test = pd.read_csv("/content/drive/MyDrive/data/embed_cat_crime_test.csv", index_col=0)

cat_core_train = pd.read_csv("/content/drive/MyDrive/data/embed_cat_core_train.csv", index_col=0)
cat_core_test = pd.read_csv("/content/drive/MyDrive/data/embed_cat_core_test.csv", index_col=0)

In [None]:
# Append the numeric columns to every embedding table.
# Values are copied by position instead of by index label: the embedding
# tables are assumed to hold the rows of train / test in the same order
# (TODO confirm for the files loaded above), while their index may differ from
# train / test after the CSV round trip. Label alignment would then fill the
# new columns with NaN or pair them with the wrong rows.
numeric_sources = (
    (cat_crime_train, train),
    (cat_crime_test, test),
    (cat_core_train, train),
    (cat_core_test, test),
)

for emb_frame, source in numeric_sources:
    assert len(emb_frame) == len(source), "embedding table and source frame differ in row count"
    for numeric_col in ('무게', '금액'):
        emb_frame[numeric_col] = source[numeric_col].to_numpy()

In [None]:
# Both CatBoost models currently share the same hyper-parameters; each gets
# its own dict so they can be tuned independently.
shared_catboost_params = {
    'bootstrap_type': 'Poisson',
    'custom_metric': 'F1',
    'random_seed': seed,
    'task_type': 'GPU',
    'learning_rate': 1e-1,
    'n_estimators': 2000,
    'auto_class_weights': "Balanced",
}

cat_core_params = dict(shared_catboost_params)
cat_crime_params = dict(shared_catboost_params)

target_2

In [None]:
n_fold = 5

# Out-of-fold class predictions, one row per training sample.
cat_pred = np.zeros((cat_core_train.shape[0], 1))
pred_core_test = pd.DataFrame()
feat_core_importance = pd.DataFrame({'fea_name': cat_core_train.columns.to_list()})

train_x = cat_core_train
test_x = cat_core_test
target_y = train_y_core

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx]
    y_train, y_valid = target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_core_params)

    model_cat.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  early_stopping_rounds=50,
                  verbose=100)

    # Force a column vector, as the sibling CatBoost cell does: predict() can
    # return a 1-D array, which cannot be assigned into the (N, 1) buffer.
    cat_pred[valid_idx] = np.asarray(model_cat.predict(X_valid)).reshape(-1, 1)
    feat_core_importance[f'importance_{fold}'] = model_cat.get_feature_importance()

    fold_pred = cat_pred[valid_idx].ravel()
    print('\nCV f1 Score:', f1_score(y_valid, fold_pred, average='macro'))
    print('\nCV precision Score:', precision_score(y_valid, fold_pred, average='macro'))
    print('CV recall Score:', recall_score(y_valid, fold_pred, average='macro'))

oof_pred = cat_pred.ravel()
print('\n\nf1 Score:', f1_score(target_y, oof_pred, average='macro'))
print('\nprecision Score:', precision_score(target_y, oof_pred, average='macro'))
print('recall Score:', recall_score(target_y, oof_pred, average='macro'))


----------------- Fold 0 -----------------

0:	learn: 1.0778348	test: 1.0776663	best: 1.0776663 (0)	total: 22.6ms	remaining: 45.2s
100:	learn: 0.9195796	test: 0.9558806	best: 0.9556670 (91)	total: 1.44s	remaining: 27s
200:	learn: 0.8873408	test: 0.9546067	best: 0.9540728 (180)	total: 2.67s	remaining: 23.9s
bestTest = 0.9540728187
bestIteration = 180
Shrink model to first 181 iterations.

CV f1 Score: 0.44918274787774876

CV precision Score: 0.4509730149189464
CV recall Score: 0.5036852593013844

----------------- Fold 1 -----------------

0:	learn: 1.0772399	test: 1.0784745	best: 1.0784745 (0)	total: 12.6ms	remaining: 25.1s
100:	learn: 0.9175765	test: 0.9578853	best: 0.9578853 (100)	total: 1.12s	remaining: 21s
bestTest = 0.957805917
bestIteration = 113
Shrink model to first 114 iterations.

CV f1 Score: 0.44620381730976755

CV precision Score: 0.4484841477694094
CV recall Score: 0.4997967771408665

----------------- Fold 2 -----------------

0:	learn: 1.0780366	test: 1.0783365	best: 1

target_1

In [None]:
n_fold = 5

# Out-of-fold class predictions, one row per training sample.
cat_pred = np.zeros((cat_crime_train.shape[0], 1))
pred_crime_test = pd.DataFrame()
feat_crime_importance = pd.DataFrame({'fea_name': cat_crime_train.columns.to_list()})

train_x = cat_crime_train
test_x = cat_crime_test
target_y = train_y_crime

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx]
    y_train, y_valid = target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_crime_params)

    model_cat.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  early_stopping_rounds=50,
                  verbose=100)

    # Store this fold's predictions as a column vector in the OOF buffer.
    cat_pred[valid_idx] = model_cat.predict(X_valid).reshape(-1,1)

    feat_crime_importance[f'importance_{fold}'] = model_cat.get_feature_importance()

    fold_reports = (
        ('\nCV f1 Score:', f1_score),
        ('\nCV precision Score:', precision_score),
        ('CV recall Score:', recall_score),
    )
    for label, scorer in fold_reports:
        print(label, scorer(y_valid, cat_pred[valid_idx], average='binary'))

overall_reports = (
    ('\n\nf1 Score:', f1_score),
    ('\nprecision Score:', precision_score),
    ('recall Score:', recall_score),
)
for label, scorer in overall_reports:
    print(label, scorer(target_y, cat_pred, average='binary'))



----------------- Fold 0 -----------------

0:	learn: 0.6668838	test: 0.6671397	best: 0.6671397 (0)	total: 9.84ms	remaining: 19.7s
100:	learn: 0.5201650	test: 0.5367427	best: 0.5367427 (100)	total: 896ms	remaining: 16.8s
200:	learn: 0.5046396	test: 0.5341767	best: 0.5341599 (193)	total: 1.79s	remaining: 16s
300:	learn: 0.4922046	test: 0.5333664	best: 0.5331743 (259)	total: 2.7s	remaining: 15.3s
bestTest = 0.5331743182
bestIteration = 259
Shrink model to first 260 iterations.

CV f1 Score: 0.5472493942385354

CV precision Score: 0.42423820787533045
CV recall Score: 0.7707280080889788

----------------- Fold 1 -----------------

0:	learn: 0.6668698	test: 0.6675375	best: 0.6675375 (0)	total: 9.03ms	remaining: 18.1s
100:	learn: 0.5196384	test: 0.5374085	best: 0.5374085 (100)	total: 871ms	remaining: 16.4s
200:	learn: 0.5045493	test: 0.5359109	best: 0.5357747 (169)	total: 1.76s	remaining: 15.8s
bestTest = 0.5357747291
bestIteration = 169
Shrink model to first 170 iterations.

CV f1 Score: 0

# 피처중요도

target_2

In [None]:
# Features ranked by their fold-1 importance, most important first.
feat_core_importance.sort_values(by="importance_1", ascending=False)

In [None]:
# Features whose importance is zero in every fold (all columns after 'fea_name').
core_zero_importance = feat_core_importance.apply(lambda row: row[1:].sum() == 0, axis=1)
feat_core_drop_list = feat_core_importance.loc[core_zero_importance, 'fea_name'].to_list()

# '금액' is always dropped, whatever importance it received.
if '금액' not in feat_core_drop_list:
    feat_core_drop_list.append('금액')

print('금액' in feat_core_drop_list)
print(feat_core_drop_list)
print(len(feat_core_drop_list))

target_1

In [None]:
# Features ranked by their fold-1 importance, most important first.
feat_crime_importance.sort_values(by="importance_1", ascending=False)

In [None]:
# Same table, least important features first.
feat_crime_importance.sort_values(by="importance_1", ascending=True)

In [None]:
# Rows whose importance is zero in every fold (all columns after 'fea_name').
feat_crime_importance[feat_crime_importance.apply(lambda row: row[1:].sum() == 0, axis=1)]

Unnamed: 0,fea_name,importance_0,importance_1,importance_2,importance_3,importance_4


In [None]:
# Features whose importance is zero in every fold (all columns after 'fea_name').
crime_zero_importance = feat_crime_importance.apply(lambda row: row[1:].sum() == 0, axis=1)
feat_crime_drop_list = feat_crime_importance.loc[crime_zero_importance, 'fea_name'].to_list()

# '금액' is always dropped, whatever importance it received.
if '금액' not in feat_crime_drop_list:
    feat_crime_drop_list.append('금액')

print('금액' in feat_crime_drop_list)
print(feat_crime_drop_list)
print(len(feat_crime_drop_list))

# bad 피처 제거 후 catboost 재실행

In [None]:
# Remove the selected columns from each embedding table, in place.
drop_plan = (
    (cat_core_train, feat_core_drop_list),
    (cat_core_test, feat_core_drop_list),
    (cat_crime_train, feat_crime_drop_list),
    (cat_crime_test, feat_crime_drop_list),
)

for emb_frame, dropped_columns in drop_plan:
    emb_frame.drop(columns=dropped_columns, inplace=True)

target_2

In [None]:
n_fold = 5

# Out-of-fold class predictions, one row per training sample.
cat_pred = np.zeros((cat_core_train.shape[0], 1))
pred_core_test = pd.DataFrame()
feat_core_importance = pd.DataFrame({'fea_name': cat_core_train.columns.to_list()})

train_x = cat_core_train
target_y = train_y_core

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx]
    y_train, y_valid = target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_core_params)

    model_cat.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  early_stopping_rounds=50,
                  verbose=100)

    # Force a column vector, as the binary CatBoost cell does: predict() can
    # return a 1-D array, which cannot be assigned into the (N, 1) buffer.
    cat_pred[valid_idx] = np.asarray(model_cat.predict(X_valid)).reshape(-1, 1)
    # Per-fold test predictions, one column per fold.
    pred_core_test[f'{fold}_pred'] = model_cat.predict(cat_core_test).reshape(-1,)
    feat_core_importance[f'importance_{fold}'] = model_cat.get_feature_importance()

    fold_pred = cat_pred[valid_idx].ravel()
    print('\nCV f1 Score:', f1_score(y_valid, fold_pred, average='macro'))
    print('\nCV precision Score:', precision_score(y_valid, fold_pred, average='macro'))
    print('CV recall Score:', recall_score(y_valid, fold_pred, average='macro'))

oof_pred = cat_pred.ravel()
print('\n\nf1 Score:', f1_score(target_y, oof_pred, average='macro'))
print('\nprecision Score:', precision_score(target_y, oof_pred, average='macro'))
print('recall Score:', recall_score(target_y, oof_pred, average='macro'))


----------------- Fold 0 -----------------

0:	learn: 1.0778351	test: 1.0776663	best: 1.0776663 (0)	total: 17.7ms	remaining: 35.4s
100:	learn: 0.9198997	test: 0.9546410	best: 0.9545356 (91)	total: 1.41s	remaining: 26.4s
200:	learn: 0.8880878	test: 0.9539131	best: 0.9530677 (161)	total: 2.66s	remaining: 23.8s
bestTest = 0.9530677496
bestIteration = 161
Shrink model to first 162 iterations.

CV f1 Score: 0.45041501078430696

CV precision Score: 0.45219121844181737
CV recall Score: 0.5057396353130962

----------------- Fold 1 -----------------

0:	learn: 1.0772401	test: 1.0784746	best: 1.0784746 (0)	total: 14.1ms	remaining: 28.1s
100:	learn: 0.9184089	test: 0.9575585	best: 0.9575585 (100)	total: 1.14s	remaining: 21.5s
bestTest = 0.9565721937
bestIteration = 140
Shrink model to first 141 iterations.

CV f1 Score: 0.44707990284687166

CV precision Score: 0.4489342929467088
CV recall Score: 0.5010540857957663

----------------- Fold 2 -----------------

0:	learn: 1.0780364	test: 1.0783365	b

target_1

In [None]:
# Stratified k-fold cross-validation of the CatBoost model on the crime
# feature set. Out-of-fold predictions go to `cat_pred`, per-fold test
# predictions to `pred_crime_test`, and per-fold feature importances to
# `feat_crime_importance`.
n_fold = 5

train_x = cat_crime_train
target_y = train_y_crime

cat_pred = np.zeros((cat_crime_train.shape[0], 1))
pred_crime_test = pd.DataFrame()
feat_crime_importance = pd.DataFrame({'fea_name':cat_crime_train.columns.to_list()})

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train = train_x.iloc[train_idx]
    X_valid = train_x.iloc[valid_idx]
    y_train = target_y.iloc[train_idx]
    y_valid = target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_crime_params)
    model_cat.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50, verbose=100)

    # Store the validation predictions as a column vector in the OOF buffer.
    cat_pred[valid_idx] = model_cat.predict(X_valid).reshape(-1,1)
    pred_crime_test[f'{fold}_pred'] = model_cat.predict(cat_crime_test).reshape(-1,)

    feat_crime_importance[f'importance_{fold}'] = model_cat.get_feature_importance()

    # Per-fold metrics, printed in the order f1 -> precision -> recall.
    fold_oof = cat_pred[valid_idx]
    for label, scorer in (
        ('\nCV f1 Score:', f1_score),
        ('\nCV precision Score:', precision_score),
        ('CV recall Score:', recall_score),
    ):
        print(label, scorer(y_valid, fold_oof, average='binary'))

# Metrics over all out-of-fold predictions, in the same order as above.
for label, scorer in (
    ('\n\nf1 Score:', f1_score),
    ('\nprecision Score:', precision_score),
    ('recall Score:', recall_score),
):
    print(label, scorer(target_y, cat_pred, average='binary'))



----------------- Fold 0 -----------------

0:	learn: 0.6668839	test: 0.6671399	best: 0.6671399 (0)	total: 9.27ms	remaining: 18.5s
100:	learn: 0.5203354	test: 0.5366359	best: 0.5365319 (95)	total: 861ms	remaining: 16.2s
200:	learn: 0.5055614	test: 0.5346990	best: 0.5346990 (200)	total: 1.7s	remaining: 15.2s
300:	learn: 0.4937688	test: 0.5341775	best: 0.5338523 (274)	total: 2.54s	remaining: 14.3s
bestTest = 0.5338522955
bestIteration = 274
Shrink model to first 275 iterations.

CV f1 Score: 0.54416897878815

CV precision Score: 0.4212276569211584
CV recall Score: 0.7684529828109201

----------------- Fold 1 -----------------

0:	learn: 0.6668698	test: 0.6675375	best: 0.6675375 (0)	total: 9.4ms	remaining: 18.8s
100:	learn: 0.5201080	test: 0.5370282	best: 0.5370282 (100)	total: 854ms	remaining: 16.1s
200:	learn: 0.5053450	test: 0.5354655	best: 0.5354600 (175)	total: 1.67s	remaining: 14.9s
bestTest = 0.5352200295
bestIteration = 249
Shrink model to first 250 iterations.

CV f1 Score: 0.54

# 제출

In [None]:
# Preview the first three rows of the submission template.
ss.head(n=3)

In [None]:
# Combine the per-fold test predictions by majority vote. After transposing,
# each column holds one test row's fold predictions, so mode() works per test
# row; when several labels tie, the first row of the mode() result is used.
for target_col, fold_preds in (("target_2", pred_core_test), ("target_1", pred_crime_test)):
    ss[target_col] = fold_preds.T.mode().iloc[0]