### Import modules and load data

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import clear_output

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import kstest

from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold

In [2]:
def f1(X_val, y_val, model, mapping):
    """
    Model evaluation function for a multiclass classification problem.

    Computes per-class precision and recall of ``model`` on the validation
    data and returns their joint harmonic mean:

        (len(pre) + len(rec)) / (sum(1 / pre) + sum(1 / rec))

    ROC and PR curves are left for later.

    Parameters
    ----------
    X_val : features passed to ``model.predict``.
    y_val : true class labels for ``X_val``.
    model : any object exposing ``predict(X_val)``.
    mapping : label mapping; kept only for backward compatibility with
        existing callers, it no longer influences the result.

    Returns
    -------
    The harmonic-mean F1 score. It is 0.0 when any per-class precision or
    recall is 0, because the corresponding reciprocal is infinite.
    """
    # Predict the validation labels.
    y_pred = model.predict(X_val)

    # Per-class precision / recall (average=None yields one value per class).
    pre = precision_score(y_true=y_val, y_pred=y_pred, average=None)
    rec = recall_score(y_true=y_val, y_pred=y_pred, average=None)

    # The numerator used to be a literal 8, which is only right for exactly
    # four classes; derive it from the number of scores actually returned.
    n_scores = len(pre) + len(rec)

    # A zero precision/recall gives 1/0 = inf, which drives the score to 0.
    with np.errstate(divide='ignore'):
        score = n_scores / (np.sum(1 / pre) + np.sum(1 / rec))

    return score

In [3]:
#### Load the feature tables
## train: every table gets a suffix identifying the file it came from
X_train1 = pd.read_csv('X_train_stat.csv')
X_train1.columns = ['new_id', *(name + '_stat' for name in X_train1.columns[1:])]
X_train2 = pd.read_csv('X_train_easy_time.csv').drop('new_id', axis=1).add_suffix('_basic_time')
X_train3 = pd.read_csv('X_train_게임활동_time.csv').drop('new_id', axis=1).add_suffix('_time_series')


## test: reuse the train column names so both splits line up
X_test1 = pd.read_csv('X_test_stat.csv')
X_test1.columns = X_train1.columns
X_test2 = pd.read_csv('X_test_easy_time.csv').drop('new_id', axis=1)
X_test2.columns = X_train2.columns
X_test3 = pd.read_csv('X_test_게임활동_time.csv').drop('new_id', axis=1)
X_test3.columns = X_train3.columns


## guild and trade tables (id column dropped, original column names kept)
X_train4, X_train5, X_test4, X_test5 = (
    pd.read_csv(path).drop('new_id', axis=1)
    for path in (
        '../MODELS/temp_data/temp_guild_train.csv',
        '../MODELS/temp_data/temp_trade_train.csv',
        '../MODELS/temp_data/temp_guild_test.csv',
        '../MODELS/temp_data/temp_trade_test.csv',
    )
)

In [4]:
# Join all feature tables column-wise, then discard the id column.
X_train = pd.concat(
    [X_train1, X_train2, X_train3, X_train4, X_train5],
    axis=1,
).drop('new_id', axis=1)
X_test = pd.concat(
    [X_test1, X_test2, X_test3, X_test4, X_test5],
    axis=1,
).drop('new_id', axis=1)

In [5]:
#### Load the class labels and encode them as integers
train_label = pd.read_csv('../lite_data/train_label_lite.csv')
#hasher = pd.read_csv('temp_data/test_id.csv')
label_map = {'retained': 0, '2month': 1, 'month': 2, 'week': 3}
# Reverse lookup: integer code -> label name.
inv_map = {code: name for name, code in label_map.items()}
y_train = pd.Series([label_map[name] for name in train_label['label']]).values

---

### Preprocessing

In [6]:
# Display every feature name of the combined training table.
list(X_train.columns)

['payment_amount_CAT_stat',
 'payment_amount_min_stat',
 'payment_amount_max_stat',
 'payment_amount_range_stat',
 'payment_amount_median_stat',
 'payment_amount_sum_stat',
 'cnt_dt_CAT_stat',
 'cnt_dt_min_stat',
 'cnt_dt_max_stat',
 'cnt_dt_range_stat',
 'cnt_dt_median_stat',
 'cnt_dt_sum_stat',
 'play_time_CAT_stat',
 'play_time_min_stat',
 'play_time_max_stat',
 'play_time_range_stat',
 'play_time_median_stat',
 'play_time_sum_stat',
 'game_combat_time_CAT_stat',
 'game_combat_time_min_stat',
 'game_combat_time_max_stat',
 'game_combat_time_range_stat',
 'game_combat_time_median_stat',
 'game_combat_time_sum_stat',
 'get_money_CAT_stat',
 'get_money_min_stat',
 'get_money_max_stat',
 'get_money_range_stat',
 'get_money_median_stat',
 'get_money_sum_stat',
 'cnt_use_buffitem_CAT_stat',
 'cnt_use_buffitem_min_stat',
 'cnt_use_buffitem_max_stat',
 'cnt_use_buffitem_range_stat',
 'cnt_use_buffitem_median_stat',
 'cnt_use_buffitem_sum_stat',
 'npc_exp_CAT_stat',
 'npc_exp_min_stat',
 'np

In [7]:
#### Column position -> column name, split on whether the name contains 'CAT'
numeric_col = {pos: name for pos, name in enumerate(X_train.columns) if 'CAT' not in name}
categoric_col = {pos: name for pos, name in enumerate(X_train.columns) if 'CAT' in name}

In [8]:
# Merge both position -> name mappings into a single lookup table.
total_col = {**numeric_col, **categoric_col}

In [9]:
#### Pad missing values with 0
X_train, X_test = (frame.fillna(0.0) for frame in (X_train, X_test))

In [10]:
#### scaling 
class Scaler(object):
    """Column-wise transform followed by a global min-max scaling.

    Each column is tested against the standard normal distribution with a
    Kolmogorov-Smirnov test and, depending on the p-value, standardised,
    log1p-transformed or sqrt-transformed. Afterwards the whole table is
    min-max scaled.
    """

    def __init__(self):
        self.sc = StandardScaler()
        self.mn = MinMaxScaler()

    def fit_transform(self, X_train, X_test):
        """Fit the transforms on ``X_train`` and apply them to both frames.

        Parameters
        ----------
        X_train, X_test : pandas DataFrames with the same columns.

        Returns
        -------
        (X_train, X_test) as returned by ``self.mn.fit_transform`` /
        ``self.mn.transform``.

        The input frames are copied first, so the caller's objects are left
        untouched.
        """
        # Work on copies: the per-column assignments below would otherwise
        # overwrite the caller's data in place.
        X_train = X_train.copy()
        X_test = X_test.copy()
        self.cols = X_train.columns.tolist()

        for c in self.cols:
            _, p_raw = kstest(X_train[c], 'norm')
            # NOTE(review): kstest returns (statistic, p-value). A p-value
            # below 0.1 means normality is *rejected*, yet those columns are
            # the ones left untouched here. The threshold may have been meant
            # for the statistic -- confirm the intended direction. Behaviour
            # is kept as in the original.
            if p_raw < 0.1:
                continue

            if X_train[c].min() < 0:
                # Columns with negative values are standardised. The scaler is
                # given 2-D input (one column) and the result is flattened
                # back to a single column.
                X_train[c] = self.sc.fit_transform(X_train[[c]]).ravel()
                X_test[c] = self.sc.transform(X_test[[c]]).ravel()
                continue

            # Keep the untransformed values of BOTH splits so the sqrt
            # fallback starts from the raw data. The test copy used to be
            # taken from X_train, which wrote train values into X_test.
            raw_tr = X_train[c].copy()
            raw_te = X_test[c].copy()
            X_train[c] = np.log1p(raw_tr)
            X_test[c] = np.log1p(raw_te)

            _, p_log = kstest(X_train[c], 'norm')
            if p_log >= 0.1:
                # Same p-value convention as above: fall back to sqrt.
                X_train[c] = np.sqrt(raw_tr)
                X_test[c] = np.sqrt(raw_te)

        X_train = self.mn.fit_transform(X_train)
        X_test = self.mn.transform(X_test)

        return X_train, X_test

In [11]:
#### Fit the scaler on the training table and apply it to both splits
# X_train / X_test are rebound to whatever Scaler.fit_transform returns.
scaler = Scaler()
X_train, X_test = scaler.fit_transform(X_train, X_test)

---

### Converting Data into Tensors

In [None]:
# One numeric feature column per column name; both groups use numeric_column.
Nums = list(map(tf.feature_column.numeric_column, numeric_col.values()))
Cats = list(map(tf.feature_column.numeric_column, categoric_col.values()))
feature_name = Nums + Cats

In [None]:
# Linear 4-class classifier trained with an L2-regularised FTRL optimizer.
optimizer = tf.train.FtrlOptimizer(
    learning_rate=0.1,
    l2_regularization_strength=1.0,
)
estimator = tf.estimator.LinearClassifier(
    feature_columns=feature_name,
    n_classes=4,
    optimizer=optimizer,
    model_dir='/tmp/svm',
)

In [None]:
# NOTE(review): unfinished draft. The `label = ` assignment below has no
# right-hand side, so this cell does not parse, and neither `_fn` nor
# `input_fn` returns anything yet. TODO: finish or delete this cell.
def input_fn(dataset):
    def _fn():
        # NOTE(review): `feature_name` is bound to a list (Nums + Cats) in this
        # notebook; a list cannot be used as a dict key -- confirm the intent.
        features = {feature_name : tf.constant(dataset.data)}
        label = 

In [None]:
## define input_fn_train
# NOTE(review): `train` / `test` are index arrays that this notebook only
# binds inside the cross-validation loop -- confirm they exist before running.
input_fn_train = tf.estimator.inputs.numpy_input_fn(
    x={total_col[i]: arr for i, arr in enumerate(X_train[train].T)},
    y=y_train[train], batch_size=400, num_epochs=5, shuffle=True)

## define input_fn_test
# Evaluation must see every held-out row exactly once. With num_epochs=None
# the input never ends, so estimator.evaluate() without `steps` would run
# forever; shuffling is unnecessary for evaluation.
input_fn_test = tf.estimator.inputs.numpy_input_fn(
    x={total_col[i]: arr for i, arr in enumerate(X_train[test].T)},
    y=y_train[test], batch_size=500, num_epochs=1, shuffle=False)

In [None]:
## Build the optimizer and the classifier, then train and evaluate once
optimizer = tf.train.FtrlOptimizer(
    learning_rate=0.1,
    l2_regularization_strength=1.0,
)
estimator = tf.estimator.LinearClassifier(
    feature_columns=Nums + Cats,
    n_classes=4,
    optimizer=optimizer,
)
estimator.train(input_fn=input_fn_train, steps=1000)
# evaluate() is called without `steps`, so it runs until input_fn_test is exhausted.
result = estimator.evaluate(input_fn_test)

In [12]:
#### cross validation
kfold = StratifiedKFold(n_splits=5, random_state=7, shuffle=True).split(X_train, y_train)
Nums = [tf.feature_column.numeric_column(c) for c in numeric_col.values()]
Cats = [tf.feature_column.numeric_column(c) for c in categoric_col.values()]

# Metrics of every finished fold; clear_output() wipes the cell output, so the
# earlier folds are printed again after each clear.
fold_results = []

for k, (train, test) in enumerate(kfold):

    ## define input_fn_train
    input_fn_train = tf.estimator.inputs.numpy_input_fn(
        x={total_col[i]: arr for i, arr in enumerate(X_train[train].T)},
        y=y_train[train], batch_size=400, num_epochs=5, shuffle=True)

    ## define input_fn_test
    # One pass, no shuffling: with num_epochs=None the input never ends and
    # estimator.evaluate() below would never return.
    input_fn_test = tf.estimator.inputs.numpy_input_fn(
        x={total_col[i]: arr for i, arr in enumerate(X_train[test].T)},
        y=y_train[test], batch_size=500, num_epochs=1, shuffle=False)

    ## optimizer
    # Build a fresh optimizer and estimator for every fold. Reusing a single
    # estimator would keep training the weights learned on earlier folds,
    # whose training rows include the current validation rows.
    optimizer = tf.train.FtrlOptimizer(learning_rate=0.1, l2_regularization_strength=1.0)
    estimator = tf.estimator.LinearClassifier(feature_columns=Nums + Cats, n_classes=4,
                                              optimizer=optimizer)

    estimator.train(input_fn=input_fn_train, steps=1000)
    result = estimator.evaluate(input_fn_test)
    fold_results.append(result)

    # Drop the TensorFlow log noise, then show all folds evaluated so far.
    clear_output()

    for fold, metrics in enumerate(fold_results):
        print('=========='+str(fold)+'==========')
        for key, value in sorted(metrics.items()):
            print('%s: %s' % (key, value))
    

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpv3izdesn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8f9edb74e0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpv3izdesn/model.ckpt.
INFO:tensorflow:loss = 554.5177, step = 0
INFO:tensorfl

KeyboardInterrupt: 