## This model has the following characteristics:
* No feature engineering
* Applying Conv1D to raw transaction sequences (intended to be bidirectional; the model built below currently uses only the single `forward` input)

In [3]:
%run import_modules.py
%matplotlib inline

# Turn TensorFlow's native (C++) log output off.
# NOTE(review): `os` is not imported in this cell — presumably it comes from
# import_modules.py; confirm. This variable is normally read when TensorFlow is
# first imported, so it presumably has no effect if import_modules.py already
# imported tensorflow — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import tensorflow as tf
import keras

# Older TF1-style call for the same purpose, kept disabled for reference:
#tf.logging.set_verbosity(tf.logging.ERROR)
import logging
# Raise the level of TensorFlow's Python logger so that only messages of
# severity ERROR and above are emitted.
logger = tf.get_logger()
logger.setLevel(logging.ERROR)

from keras import Input
from keras import layers
from keras import regularizers
from keras.constraints import max_norm
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence
from keras.preprocessing.text import *

##tf.random.set_seed(1)
from keras import backend as K
from keras.layers import * 
from keras.models import * 
from keras.optimizers import *
from keras.initializers import *
from keras.regularizers import *
from keras.utils.np_utils import *
from keras.utils.vis_utils import * #model_to_dot

Using TensorFlow backend.


### Read Data

In [4]:
# Load the train/test transaction logs; both files are read as CP949.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in ('X_train.csv', 'X_test.csv')
)

# Training labels: the `gender` column of y_train.csv.
y_train = pd.read_csv('y_train.csv')['gender']

# Test customer ids in order of first appearance; reused for the submission file.
IDtest = df_test['cust_id'].unique()

# Stack train and test so that both are encoded together later on.
df = pd.concat((df_train, df_test), ignore_index=True)

df.tail(10)

Unnamed: 0,cust_id,tran_date,store_nm,goods_id,gds_grp_nm,gds_grp_mclas_nm,amount
395552,5981,2007-01-12 00:00:00,영등포점,85150,포숑,가공식품,18000
395553,5981,2007-01-12 00:00:00,영등포점,657121,서양델리,가공식품,11000
395554,5981,2007-01-12 00:00:00,영등포점,85150,밥류,가공식품,3000
395555,5981,2007-01-12 00:00:00,영등포점,50109,일반가공식품,가공식품,178750
395556,5981,2007-01-12 00:00:00,영등포점,85150,패밀리레스토랑,가공식품,3000
395557,5981,2007-01-12 00:00:00,영등포점,50105,일반가공식품,가공식품,209000
395558,5981,2007-01-12 00:00:00,영등포점,50109,상품군미지정,기타,7150
395559,5981,2007-01-12 00:00:00,영등포점,50105,햄,축산가공,9500
395560,5981,2007-01-12 00:00:00,영등포점,50105,상품군미지정,기타,9500
395561,5981,2007-03-16 00:00:00,영등포점,77198,수입식품,차/커피,174800


### Transform Data

In [15]:
max_features = 2000 # 324  (size of the hashing space used for token ids)
max_len = 200        # every customer sequence is padded/truncated to this length
emb_dim = 64         # embedding dimension used by the model
N_REPEATS = 5        # number of shuffled copies of the unique tokens per customer
SHUFFLE_SEED = 2020  # fixed seed so the shuffled sequences are the same on every run


def shuffle_unique_repeat(tokens, n_repeats=N_REPEATS, rng=np.random):
    """Build a sequence made of `n_repeats` random permutations of the unique tokens.

    Parameters
    ----------
    tokens : sequence of int
        Token ids of one customer (duplicates allowed).
    n_repeats : int
        How many independently shuffled copies of the unique ids to concatenate.
    rng : object exposing ``permutation``
        Random source; defaults to NumPy's global generator.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_repeats * number_of_unique_tokens``.
    """
    unique_tokens = np.unique(tokens)
    return np.concatenate([rng.permutation(unique_tokens) for _ in range(n_repeats)])


df.goods_id = df.goods_id.astype(str)

# Converts a "gds_grp_nm" to a sequence of indexes in a fixed-size hashing space.
# Train and test are hashed together so that equal names get equal ids.
# NOTE(review): one_hot presumably relies on Python's built-in hash(), which is
# not stable across interpreter sessions for strings — confirm if ids must be
# reproducible between kernel restarts.
seq_by_cust = df.groupby('cust_id')['gds_grp_nm'].apply(
    lambda names: [one_hot(name, max_features)[0] for name in names])
train_test = seq_by_cust.values

# Select each split by customer id instead of a hard-coded row count.
# Train rows are in ascending cust_id order (the order the groupby produced
# before) — assumes y_train.csv is in that order too; TODO confirm.
# Test rows follow the order of first appearance in X_test.csv, i.e. the same
# order as the ids written to the submission file.
train_ids = np.sort(df_train.cust_id.unique())
test_ids = df_test.cust_id.unique()

# A dedicated generator keeps this step reproducible without touching the
# global NumPy random state.
rng = np.random.RandomState(SHUFFLE_SEED)
X_train = [shuffle_unique_repeat(seq, rng=rng) for seq in seq_by_cust.loc[train_ids]]
X_test = [shuffle_unique_repeat(seq, rng=rng) for seq in seq_by_cust.loc[test_ids]]

assert len(X_train) == len(y_train), "number of training customers does not match y_train"

# Pads sequences to the same length
X_train_conv1d = sequence.pad_sequences(X_train, maxlen=max_len)
X_test_conv1d = sequence.pad_sequences(X_test, maxlen=max_len)

X_train_conv1d.shape, X_test_conv1d.shape

((3500, 200), (2482, 200))

### Build Models

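### 1st try

- 전반적인 아키텍처의 파라미터 조정 시도 (패딩, activation 함수) -> 성능이 모두 더 떨어짐 (Tuning the overall architecture parameters — padding, activation function — lowered performance in every case.)
- Flattening 후 단순 DNN과 연결 -> 성능 떨어짐 (Flattening and feeding a plain DNN also lowered performance.)
- Dropout 값만 조정 (In the end only the dropout rate was adjusted.)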

In [None]:
%%time

# Set hyper-parameters for power mean ensemble
N = 5      # number of models in the ensemble, each trained with its own random seed
p = 3.5    # exponent of the power mean applied when the predictions are blended
preds = []  # test-set probabilities, one array per model
aucs = []   # validation AUC, one value per model
CHECKPOINT_PATH = 'best_model.h5'


def make_callbacks():
    """Return a fresh list of training callbacks for ONE model.

    A new ModelCheckpoint is created for every model because a checkpoint
    instance keeps its best-so-far value for its whole lifetime; sharing one
    across models would stop later models from being saved unless they beat
    an earlier model's validation loss.
    """
    return [keras.callbacks.EarlyStopping(monitor='val_loss',
                                          patience=10),
            keras.callbacks.ModelCheckpoint(filepath=CHECKPOINT_PATH,
                                            monitor='val_loss',
                                            save_best_only=True)]


for i in tqdm(range(N)):
    X_train, X_test = X_train_conv1d, X_test_conv1d

    # Release the previous iteration's model so that models do not pile up in
    # the backend session. Must happen before seeding and model building.
    K.clear_session()

    ##### STEP 1: Randomize Seed
    # A different seed per model is drawn on purpose to diversify the ensemble.
    SEED = np.random.randint(1, 10000)
    random.seed(SEED)
    np.random.seed(SEED)
    # Compare the numeric major version (a plain character comparison would
    # only look at the first digit of the version string).
    if int(tf.__version__.split('.')[0]) < 2:
        tf.set_random_seed(SEED)
    else:
        tf.random.set_seed(SEED)

    ##### STEP 2: Build a CNN Model

    # Define the Model architecture
    in_f = Input(shape=(max_len,), dtype='int32', name='forward')
    x = layers.Embedding(max_features, emb_dim)(in_f)
    x = layers.Conv1D(32, 7, activation='tanh', padding='same')(x)
    x = layers.MaxPooling1D(5)(x)
    x = layers.Conv1D(32, 7, activation='tanh', padding='same')(x)
    x = layers.Flatten()(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(8, activation='relu', kernel_initializer='he_normal', kernel_regularizer='l2')(x)
    out = layers.Dense(1, activation='sigmoid')(x)

    model1 = Model(in_f, out)
    model1.summary()

    # Choose the Optimizer and the Cost function
    model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc', tf.keras.metrics.AUC()])

    ##### STEP 3: Train the Model
    train_x, valid_x, train_y, valid_y = train_test_split(X_train, y_train, test_size=0.2)
    callbacks = make_callbacks()
    history = model1.fit(train_x, train_y, epochs=100, batch_size=128,
                         validation_data=(valid_x, valid_y), callbacks=callbacks, verbose=1)

    # After fit() the model holds the weights of the LAST epoch, i.e. `patience`
    # epochs past the best validation loss. Reload the best checkpoint so that
    # both the AUC and the test predictions come from the best epoch.
    model1.load_weights(CHECKPOINT_PATH)

    print(f'CNN learning curve {i+1}/{N}')
    plt.plot(history.history["loss"], label="train loss")
    plt.plot(history.history["val_loss"], label="validation loss")
    plt.legend()
    plt.title("Loss")
    plt.show()

    ##### STEP 4: Evaluate and Make Prediction
    auc = roc_auc_score(valid_y, model1.predict(valid_x).flatten())
    aucs.append(auc)
    print('AUC', auc)
    preds.append(model1.predict(X_test).flatten())

### Validate the Models
print('\nValidation Summary:')
aucs = pd.Series(aucs)
print(aucs.sort_values(ascending=False))
print('mean={:.5f}, std={:.3f}'.format(aucs.mean(), aucs.std()))

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Model: "model_94"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
forward (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_94 (Embedding)     (None, 200, 64)           128000    
_________________________________________________________________
conv1d_187 (Conv1D)          (None, 200, 32)           14368     
_________________________________________________________________
max_pooling1d_71 (MaxPooling (None, 40, 32)            0         
_________________________________________________________________
conv1d_188 (Conv1D)          (None, 40, 32)            7200      
_________________________________________________________________
flatten_92 (Flatten)         (None, 1280)              0         
_________________________________________________________________
dropout_112 (Dropout)        (None, 1280)              0  

### Make Submissions

In [33]:
cd C:\Users\john9\Downloads\3학년 2학기\머신러닝\과제\5차\Submission

C:\Users\john9\Downloads\3학년 2학기\머신러닝\과제\5차\Submission


In [34]:
# Power mean ensemble
THRESHOLD = 0.78  # Use only models whose AUC exceeds this value

if len(preds) == 0:
    raise ValueError("No predictions available - run the training cell first.")

# Pick the models to blend by iterating over what was actually trained.
auc_values = np.asarray(aucs, dtype=float)
selected = [k for k, auc_k in enumerate(auc_values) if auc_k > THRESHOLD]
if not selected:
    # Without this fallback the average below would divide by zero.
    selected = [int(np.argmax(auc_values))]
    print(f"No model exceeded THRESHOLD={THRESHOLD}; "
          f"falling back to the best model (AUC={auc_values[selected[0]]:.5f}).")

# Power mean: (mean(pred_k ** p)) ** (1/p) over the selected models.
n = len(selected)
pred = np.mean([preds[k] ** p for k in selected], axis=0) ** (1 / p)

# pd.concat would silently pad a length mismatch with NaN, so check it first.
assert len(pred) == len(IDtest), "prediction count does not match the number of test customers"

# Make a submission file
t = pd.Timestamp.now()
fname = f"ConV1d_p{p}n{n}_submit_{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}.csv"
submissions = pd.concat([pd.Series(IDtest, name="cust_id"), pd.Series(pred, name="gender")], axis=1)
submissions.to_csv(fname, index=False)
print(f"'{fname}' is ready to submit.")

'ConV1d_p3.5n1_submit_06152225.csv' is ready to submit.


## End