In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, PowerTransformer
from itertools import combinations

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
tf.compat.v1.enable_eager_execution()
import tensorflow.keras.backend as K

os.environ["CUDA_VISIBLE_DEVICES"]="0"
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

# Data Loading

In [2]:
train = pd.read_csv('./data/train_df.csv')
train.columns = ['index', '송장인번호', '수하인번호', '카테고리_대', '카테고리_중'] + ['운송장건수']
test = pd.read_csv('./data/test_df.csv')
test.columns = ['index', '송장인번호', '수하인번호', '카테고리_대', '카테고리_중']
sample = pd.read_csv('./data/sample_submission.csv')

train['송장인번호'] = train['송장인번호'].astype('str').apply(lambda x : x[:5])
train['수하인번호'] = train['수하인번호'].astype('str').apply(lambda x : x[:5])
test['송장인번호'] = test['송장인번호'].astype('str').apply(lambda x : x[:5])
test['수하인번호'] = test['수하인번호'].astype('str').apply(lambda x : x[:5])

In [3]:
train = train.sample(frac=1).reset_index(drop = True)

# Encoding

In [5]:
def target_encoding(train, test, columns) : 
    poly_features = []
    med = train['운송장건수'].median()
    func = 'mean'
    for column in columns : 
        new_column  = func + '_' + column
        d = train.groupby([column])['운송장건수'].agg(func).to_dict()
        train[new_column] = train[column].map(d)
        test[new_column]      = test[column].map(d)
        test[new_column] = test[new_column].fillna(med)
        poly_features.append(new_column)
    return train, test, poly_features

encoding_columns = ['송장인번호',	'수하인번호',	'카테고리_대',	'카테고리_중']
train,test, poly_features = target_encoding(train, test, encoding_columns)

poly = PolynomialFeatures(degree=2)
train_poly = poly.fit_transform(train.loc[:,poly_features])
poly_columns = poly.get_feature_names(poly_features)
train_poly = pd.DataFrame(train_poly, columns = poly_columns)

test_poly = poly.fit_transform(test.loc[:,poly_features])
test_poly = pd.DataFrame(test_poly, columns = poly_columns)

train = pd.concat([train.drop(columns = poly_features), train_poly ], axis = 1)
test = pd.concat([test.drop(columns = poly_features), test_poly ], axis = 1)

In [6]:
X_train = train.copy()
X_test = test.copy()
y_train = train['운송장건수']
for column in tqdm(, total = 4) : 
    dic = {value : key for key, value in train[column].to_dict().items()}
    dic = {value : num for num, value in enumerate(dic.keys())}
    dic['UNK'] = len(dic)
    train[column] = train[column].map(dic)
    test[column] = test[column].apply(lambda x : dic.get(x, len(dic) -1 ))
    
    train_onehot = pd.DataFrame(columns = dic.values(), index = train.index).fillna(0)
    test_onehot = pd.DataFrame(columns = dic.values(), index = test.index).fillna(0)
    
    idx = 0
    for c in train[column] : 
        train_onehot.loc[idx,c] = 1
        idx += 1
    train_onehot.columns = [column + '_' + str(x) for x in train_onehot.columns]
    
    idx = 0
    for c in test[column] : 
        test_onehot.loc[idx,c] = 1
        idx += 1
    test_onehot.columns = [column + '_' + str(x) for x in test_onehot.columns]
        
    X_train = pd.concat([X_train, train_onehot], axis =1)
    X_test = pd.concat([X_test, test_onehot], axis =1)

print(X_train.shape)
print(X_test.shape)

100%|██████████| 4/4 [00:27<00:00,  6.90s/it]

(32000, 536)
(4640, 535)





In [7]:
y_train = X_train['운송장건수'].astype('float32')
drop_cols = ['index', '송장인번호', '수하인번호', '카테고리_대', '카테고리_중']
X_train = X_train.drop(columns = ['운송장건수'] + drop_cols)
X_test = X_test.drop(columns = drop_cols)

#  Modeling

In [8]:
def inp_emb(data) :
    input_size = data.shape[1]
    emb_size = round(input_size ** 0.25)
    
    inp = Input(shape = (input_size,))
    emb = Embedding(input_dim = input_size, output_dim = emb_size)(inp)
    return inp, emb

def conv_layer(x, filters) : 
    x = Conv1D(filters, 1, padding = 'same')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU()(x)
    
    x = Conv1D(filters, 1, padding = 'same')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU()(x)
    
    x = MaxPooling1D()(x)
    x = Dropout(0.2)(x)
    
    return x

def fc_layer(x, unit, dr) : 
    x = Dense(unit, kernel_initializer = 'he_normal')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU()(x)
    x = Dropout(dr)(x)
    return x

def rmse(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 
    
def draw_plot() : 
    import matplotlib.pyplot as plt

    fig, loss_ax = plt.subplots()

    loss_ax.plot(history.history['loss'], 'y', label='train rmse')
    loss_ax.plot(history.history['val_loss'], 'r', label='val rmse')
    loss_ax.set_xlabel('epoch')
    loss_ax.set_ylabel('RMSE')
    loss_ax.legend(loc='upper left')

    plt.show()
    

In [9]:
def create_model():
    inputs = []
    concats = []
    for column in [ '송장인번호_', 
                    '수하인번호_', 
                    '카테고리_대_', 
                    '카테고리_중_'
                    ] : 
        data = X_train.filter(like = column)
        inp, emb = inp_emb(data)
        inputs.append(inp)

        start_shape = X_train.filter(like = column).shape[1] // 2
        start_filter = 4
        conv = conv_layer(emb, start_filter)
        
        while True : 
            if start_shape == 1 : 
                break;
            else : 
                start_shape = start_shape // 2
                start_filter = start_filter * 2
                conv = conv_layer(conv, start_filter)
        flat = Flatten()(conv)
        concats.append(flat)
    inp = Input(shape = (X_train.filter(like = 'mean_').shape[1],))
    inputs.append(inp)
    concats.append(inp)
    
    concat = Concatenate()(concats)
    drop = Dropout(0.2)(concat)

    fc=  fc_layer(drop, 128, 0.2)
    fc = fc_layer(fc, 64, 0.2)
    fc = fc_layer(fc, 32, 0.2)
    fc = fc_layer(fc, 16, 0.2)
    fc = fc_layer(fc, 8, 0.2)
    fc = fc_layer(fc, 4, 0.2)
    fc = fc_layer(fc, 2, 0.2)
    out = Dense(1)(fc)

    model = Model(inputs, out)
    optimizer = tf.keras.optimizers.Adam(0.1)
    model.compile(optimizer=optimizer, loss=rmse)
    return model

model.

In [10]:
test_inputs = []
for column in ['송장인번호_', '수하인번호_', '카테고리_대_', '카테고리_중_','mean'] : 
    test_inputs.append(X_test.filter(like = column))

In [11]:
preds = []
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=15,verbose = 0, min_delta=1e-7)
early = EarlyStopping(monitor = 'val_loss', patience = 30, verbose = 0)

for i in range(5) : 
    rs = np.random.randint(0,1234578,1)[0]
    fold = 0
    kf = KFold(n_splits=5, shuffle=True, random_state = rs)
    for train_idx, valid_idx in kf.split(X_train.index) : 
        print(f'    ===== FOLD : {fold} =====')
        X_tr = X_train.iloc[train_idx] ; X_val = X_train.iloc[valid_idx]
        y_tr = y_train.iloc[train_idx] ; y_val = y_train.iloc[valid_idx]
        f_path = f'./models/model_{fold}.h5'
        mck = ModelCheckpoint(filepath=f_path,  monitor='val_loss', 
                                                                save_best_only=True, verbose = 0, model = 'min')
        callbacks = [mck, early, reduce_lr]

        train_inputs=  []
        valid_inputs = []
        for column in ['송장인번호_', '수하인번호_', '카테고리_대_', '카테고리_중_','mean'] : 
            train_inputs.append(X_tr.filter(like = column))
            valid_inputs.append(X_val.filter(like = column))

        model = create_model()
        history = model.fit(train_inputs, y_tr,
                            epochs = 20000,
                            batch_size = X_tr.shape[0]//5,
                            validation_data=(valid_inputs, y_val),
                            callbacks = callbacks,
                            verbose = 0
                            )
        # draw_plot()
        score = min(history.history['val_loss'])
        print('    ',score)

        model.load_weights(f_path)
        pred = model.predict(test_inputs)
        preds.append([pred, score])

        fold += 1

    ===== FOLD : 0 =====
     5.49909782409668
    ===== FOLD : 1 =====
     5.201865196228027
    ===== FOLD : 2 =====
     5.11037015914917
    ===== FOLD : 3 =====
     4.911957740783691
    ===== FOLD : 4 =====
     5.612766742706299
    ===== FOLD : 0 =====
     5.367262363433838
    ===== FOLD : 1 =====
     5.589743614196777
    ===== FOLD : 2 =====
     5.417158603668213
    ===== FOLD : 3 =====
     5.047820568084717
    ===== FOLD : 4 =====
     4.862943649291992
    ===== FOLD : 0 =====
     5.548834323883057
    ===== FOLD : 1 =====
     4.499111175537109
    ===== FOLD : 2 =====
     5.382911205291748
    ===== FOLD : 3 =====
     5.729288101196289
    ===== FOLD : 4 =====
     5.106230735778809
    ===== FOLD : 0 =====
     5.871496677398682
    ===== FOLD : 1 =====
     4.752765655517578
    ===== FOLD : 2 =====
     5.31601095199585
    ===== FOLD : 3 =====
     5.329275131225586
    ===== FOLD : 4 =====
     5.2221360206604
    ===== FOLD : 0 =====
     5.6026062965393

In [12]:
minimum = y_train.min()
preds2 = sorted(preds, key = lambda x: x[1])[:5]
pred = pred.flatten()
pred[pred < minimum ] = minimum

display(pd.DataFrame(pred).describe())
sample['INVC_CONT'] = pred
sample.to_csv('./submissions/CNN.csv', index=False)

Unnamed: 0,0
count,4640.0
mean,4.679496
std,2.046795
min,4.254835
25%,4.32607
50%,4.367467
75%,4.600364
max,71.723289
