libraries

In [202]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pylab as plt
from matplotlib import font_manager, rc

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import PolynomialFeatures

# Modeling
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import VotingClassifier
from vecstack import stacking
from scipy.stats.mstats import gmean

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
from sklearn.externals import joblib
import pickle
from itertools import combinations
import gc
from tqdm import tqdm
import platform

# Keras
import tensorflow as tf
# Tensorflow warning off
if tf.__version__[0] < '2':
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)
import keras
from keras import backend as K
from keras.layers import * #Input, Dense
from keras.models import * #Model
from keras.optimizers import *
from keras.initializers import *
from keras.regularizers import *
from keras.constraints import *
from keras.utils.np_utils import *
from keras.utils.vis_utils import * #model_to_dot
from keras.preprocessing.image import *
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import *

from keras.models import Model, Sequential
from keras import Input
from keras import layers
from keras.optimizers import RMSprop
from keras.constraints import max_norm
from keras.callbacks import EarlyStopping


read data & set seed

In [16]:
# Load the train/test transaction tables and the training labels (cp949-encoded CSV files).
df_train, df_test = (
    pd.read_csv(csv_name, encoding='cp949') for csv_name in ('X_train.csv', 'X_test.csv')
)
y_train = pd.read_csv('y_train.csv', encoding='cp949')['gender']
# Distinct customer ids present in the test table.
IDtest = df_test['cust_id'].unique()

# Seed every random number generator used in this notebook with the same value.
seed = 2020
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(seed)

transform data

In [109]:
# Remove whitespace, dots, commas and slashes from product-group names.
#
# Each name is later encoded from its first token only (note the `[0]` in the
# encoding cell), so the same product must always end up with one spelling.

before = [
# names containing whitespace
'기초 화장품', '니  트', '캐릭터 여화', '캐릭터 남화', '수      저', '4대 B/D', 'N/B 아동복', '수입 아동복', '스낵형 델리','수입청소기 외','영 캐릭터','영 트렌디','식기 단독매입','커       튼','생활잡화 단독매입',
'가구 단독매입','커리어 행사','가 방','패션 단독매입','시티 단독매입','신생아완구 단독매입','MP3 外','인텔리젼스 행사',
'구두 단독매입','시네마 매점','시티웨어 행사','장신 단독매입','NB 남화','남성시티 직매입','영플라자 (올리브핫스텁)',
'색조 화장품','NB 여화','직수입 골프의류','트래디 행사','IT 게임기,S/W','TAKE OUT','주방 단독매입','셔츠 직매입(PB)', '남성정장 직매입',
# names containing dots
'N.B정장', '즉석.스넥(매장)','N.B', 'L.B',
# names containing commas
'기타(가발,포장,담배,끽연구,사진,수선)', '원목,학생,철재',
# names containing slashes
'N/B골프의류', 'L/C 아동복', 'L/C골프의류', '4대B/D', '우/양산(특정)', 'N/B아동복', 'L/C정장', '국산A/V','수입A/V','영플라자(진/유니)',
'IT게임기,S/W', '피아노/악기',
]


def strip_separators(name):
    """Return `name` with all whitespace and every '.', ',' and '/' removed."""
    without_whitespace = ''.join(name.split())
    return without_whitespace.translate(str.maketrans('', '', '.,/'))


# Derive the replacements from `before` instead of typing them by hand: the two
# lists can no longer drift apart, and spellings such as '4대 B/D' and '4대B/D'
# are guaranteed to collapse to the same cleaned name ('4대BD').
after = [strip_separators(name) for name in before]



# Apply the before -> after name normalization to both tables.
for frame in (df_train, df_test):
    frame['gds_grp_nm'] = frame['gds_grp_nm'].replace(before, after)

In [218]:
max_features_nm = 340 # size of the hashing space (middle-category nunique = 330)
maxlen_nm = 450 # chosen because np.mean of the sequence lengths was 397 with oversample n=3
emb_dim_nm = 256 # output dimension of the embedding, i.e. the size of each resulting embedding vector


def encode_product_sequences(df, n_buckets):
    """Encode each customer's purchases as a list of hashed product indexes.

    Rows of `df` are grouped by 'cust_id'. Every 'gds_grp_nm' value is hashed
    into a space of size `n_buckets`, and only the index of its first token is
    kept. Returns the `.values` array of the grouped result: one list of
    indexes per customer.

    The md5 hash is requested explicitly. `one_hot` hashes with Python's
    built-in `hash`, which the Keras docs describe as not stable across runs,
    so the indexes changed on every kernel restart whatever seeds were set.
    """
    # Local import: only `one_hot` is imported from this module at the top of the notebook.
    from keras.preprocessing.text import hashing_trick

    def first_token_index(name):
        return hashing_trick(name, n_buckets, hash_function='md5')[0]

    return (
        df.groupby('cust_id')['gds_grp_nm']
        .apply(lambda names: [first_token_index(name) for name in names])
        .values
    )


# Converts a "gds_grp_nm" to a sequence of indexes in a fixed-size hashing space
X_train_nm = encode_product_sequences(df_train, max_features_nm)
X_test_nm = encode_product_sequences(df_test, max_features_nm)


# oversample2, in short:
#   1. samples are drawn from the customer's whole sequence, duplicates included
#   2. draws are made with replacement (replace=True)
#   3. the more a customer bought, the more samples are added
#        -> to oversample every customer in the same ratio instead, drop the
#           inner loop over range(n) and use a fixed multiplier of 1

def oversample2(data, n=1, seed=seed):
    """Grow every product sequence in `data` by resampling it with replacement.

    NumPy's global RNG is re-seeded with `seed` first. Each sequence then goes
    through `n` passes; pass k (k = 0 .. n-1) appends k * (current length)
    elements drawn with replacement from the current sequence, so a pass
    multiplies the length by k + 1.

    Returns a list with one grown sequence per input sequence, in input order.
    With n=0 the sequences are returned exactly as they were passed in.
    """
    np.random.seed(seed)

    def grow(basket):
        for multiplier in range(n):
            extra = np.random.choice(basket, len(basket) * multiplier, replace=True)
            basket = list(np.append(basket, extra))
        return basket

    return [grow(basket) for basket in data]

In [193]:
# Oversample train and test with the same setting (n=3), train first.
X_train_nm, X_test_nm = [oversample2(sequences, 3) for sequences in (X_train_nm, X_test_nm)]

In [194]:
# TODO: compare whether truncating in pad_sequences or not truncating gives better performance.
# 1. For now, truncate so that we start with a smaller amount of data.
# 2.

# Pad and truncate at the end of every sequence so each has length maxlen_nm; fill with 0.
pad_options = dict(maxlen=maxlen_nm, padding='post', truncating='post', value=0)
X_train_nm = pad_sequences(X_train_nm, **pad_options)
X_test_nm = pad_sequences(X_test_nm, **pad_options)

X_train_nm.shape, X_test_nm.shape

((3500, 450), (2482, 450))

build models

In [None]:
# Define the model & its Architecture

In [207]:
# Symbolic int32 input named 'forward', holding maxlen_nm values per sample.
in_f = Input(name='forward', shape=(maxlen_nm,), dtype='int32')

In [208]:
in_f

<tf.Tensor 'forward_2:0' shape=(None, 450) dtype=int32>

In [219]:
# Embed the indexes of `in_f` (vocabulary size max_features_nm) into emb_dim_nm-dimensional vectors.
x = layers.Embedding(input_dim=max_features_nm, output_dim=emb_dim_nm)(in_f)

In [222]:
# 1-D convolution on top of the previous `x`: 32 filters, kernel size 3, ReLU activation.
x = layers.Conv1D(filters=32, kernel_size=3, activation='relu')(x)

In [223]:
# Interactive `layers.Conv1D?` lookup removed: it is IPython-only syntax and opens the
# help pager on every Run All. Use help(layers.Conv1D) in a scratch cell when needed.

In [215]:
# Interactive `layers.Embedding?` lookup removed: it is IPython-only syntax and opens the
# help pager on every Run All. Use help(layers.Embedding) in a scratch cell when needed.

In [None]:
# Sequential counterpart of the functional-API cells earlier in this section
# (Input -> Embedding -> Conv1D).
# TODO: the classifier head (pooling / output layers) has not been written yet.
model = Sequential(
    [
        # The keyword is `dtype`; `dtypes` is not an argument of Input.
        Input(shape=(maxlen_nm, ), dtype='int32', name='gds_grp_nm'),
        layers.Embedding(max_features_nm, emb_dim_nm),
        layers.Conv1D(32, 3, activation='relu'),
    ]
)

In [None]:
# Three stacked Conv2D layers (32 filters each) on inputs of shape (250, 250, 3).
initial_model = keras.Sequential([
    keras.Input(shape=(250, 250, 3)),
    layers.Conv2D(filters=32, kernel_size=5, strides=2, activation="relu"),
    layers.Conv2D(filters=32, kernel_size=3, activation="relu"),
    layers.Conv2D(filters=32, kernel_size=3, activation="relu"),
])

In [None]:
# Define Sequential model with 3 layers: two Dense layers with ReLU activation
# followed by a Dense layer without an explicit activation.
# NOTE(review): this rebinds `model` and `x`, which the gds_grp_nm model cells
# earlier in this section also use.
model = keras.Sequential(
    [
        layers.Dense(2, activation="relu", name="layer1"),
        layers.Dense(3, activation="relu", name="layer2"),
        layers.Dense(4, name="layer3"),
    ]
)
# Call model on a test input: a (3, 3) tensor of ones
x = tf.ones((3, 3))
y = model(x)


The Sequential model above is equivalent to this function:

# Create 3 layers (same sizes, activations and names as the Sequential example)
layer1 = layers.Dense(2, activation="relu", name="layer1")
layer2 = layers.Dense(3, activation="relu", name="layer2")
layer3 = layers.Dense(4, name="layer3")

# Call layers on a test input, chaining them by hand: layer1, then layer2, then layer3
# NOTE(review): `x` is rebound here as well; earlier cells use it for the Conv1D output.
x = tf.ones((3, 3))
y = layer3(layer2(layer1(x)))


You can also create a Sequential model incrementally via the add() method:

# Start from an empty Sequential model and append the Dense layers one at a time.
model = keras.Sequential()
model.add(layers.Dense(2, activation="relu"))
model.add(layers.Dense(3, activation="relu"))
model.add(layers.Dense(4))


Also note that the Sequential constructor accepts a name argument, just like any layer or model in Keras. This is useful to annotate TensorBoard graphs with semantically meaningful names.

# Same incremental construction, with an explicit name for the model and for each layer.
model = keras.Sequential(name="my_sequential")
model.add(layers.Dense(2, activation="relu", name="layer1"))
model.add(layers.Dense(3, activation="relu", name="layer2"))
model.add(layers.Dense(4, name="layer3"))

In [None]:
Feature extraction with a Sequential model
Once a Sequential model has been built, it behaves like a Functional API model. This means that every layer has an input and output attribute. These attributes can be used to do neat things, like quickly creating a model that extracts the outputs of all intermediate layers in a Sequential model:

# Sequential stack of three Conv2D layers on inputs of shape (250, 250, 3).
initial_model = keras.Sequential(
    [
        keras.Input(shape=(250, 250, 3)),
        layers.Conv2D(32, 5, strides=2, activation="relu"),
        layers.Conv2D(32, 3, activation="relu"),
        layers.Conv2D(32, 3, activation="relu"),
    ]
)
# Model sharing initial_model's inputs whose outputs are the outputs of every
# layer in initial_model.layers, collected in a list.
feature_extractor = keras.Model(
    inputs=initial_model.inputs,
    outputs=[layer.output for layer in initial_model.layers],
)

# Call feature extractor on test input (a single 250x250x3 tensor of ones).
x = tf.ones((1, 250, 250, 3))
features = feature_extractor(x)
Here's a similar example that only extracts features from one layer:

# Same Conv2D stack as before; the middle layer is given a name so that it can
# be looked up with get_layer() below.
initial_model = keras.Sequential(
    [
        keras.Input(shape=(250, 250, 3)),
        layers.Conv2D(32, 5, strides=2, activation="relu"),
        layers.Conv2D(32, 3, activation="relu", name="my_intermediate_layer"),
        layers.Conv2D(32, 3, activation="relu"),
    ]
)
# Model sharing initial_model's inputs whose single output is the output of
# the layer named "my_intermediate_layer".
feature_extractor = keras.Model(
    inputs=initial_model.inputs,
    outputs=initial_model.get_layer(name="my_intermediate_layer").output,
)
# Call feature extractor on test input.
x = tf.ones((1, 250, 250, 3))
features = feature_extractor(x)

In [None]:
### data reading
# NOTE(review): `tensorflow.examples.tutorials` appears to be a TensorFlow 1.x-only
# module, while the seeding cell calls `tf.random.set_seed` — confirm which
# TensorFlow version this cell is meant to run on.
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# The MNIST arrays get their own `mnist_` names: binding them to `y_train`
# would overwrite the gender labels loaded from y_train.csv at the top of the
# notebook.
mnist_x_train, mnist_y_train = mnist.train.images, mnist.train.labels
mnist_x_test, mnist_y_test = mnist.test.images, mnist.test.labels

from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation
from keras.optimizers import Adam, SGD
from keras import metrics

## sequential model: 784 inputs -> 512 -> 128 -> 32 -> 10-way softmax
seq_model = Sequential([
    Dense(512, input_shape=(784,), activation='relu'),
    Dense(128, activation='relu'),
    Dense(32, activation='relu'),
    Dense(10, activation='softmax'),
])

print("#### Sequential Model")
seq_model.summary()
seq_model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8),
              metrics=[metrics.categorical_accuracy])
# `.history` holds the loss and metric values recorded for each epoch
train_history = seq_model.fit(mnist_x_train, mnist_y_train, epochs=5, batch_size=500, verbose=2).history

# Report loss and metric on the training data, then on the held-out test data.
loss_and_metric = seq_model.evaluate(mnist_x_train, mnist_y_train, batch_size=128, verbose=0)
print("train, loss and metric: {}".format(loss_and_metric))
loss_and_metric = seq_model.evaluate(mnist_x_test, mnist_y_test, batch_size=128, verbose=0)
print("test, loss and metric: {}".format(loss_and_metric))