In [1]:
import numpy as np
import pandas as pd
from logging import StreamHandler , DEBUG , Formatter, FileHandler ,getLogger
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold , ParameterGrid
from load_data import load_train_data , load_test_data
from sklearn.metrics import log_loss , roc_auc_score , roc_curve , auc
from tqdm import tqdm

In [2]:
logger = getLogger(__name__)

DIR = 'result_tmp/'
SAMPLE_SUBMIT_FILE = '../input/sample_submission.csv'


In [3]:
def gini(y,pred):
    fpr , tpr , thr = roc_curve(y , pred , pos_label = 1)
    g = 2 * auc(fpr ,tpr ) - 1
    return g

In [4]:
if __name__  == '__main__':

  log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
  handler = StreamHandler()
  handler.setLevel('INFO')
  handler.setFormatter(log_fmt)
  logger.addHandler(handler)

  handler = FileHandler(DIR + 'train.py.log', 'a')
  handler.setLevel(DEBUG)
  handler.setFormatter(log_fmt)
  logger.setLevel(DEBUG)

  logger.info('start')

  df = load_train_data()

  x_train = df.drop('target' , axis =1 )
  y_train = df['target'].values

  use_cols = x_train.columns.values

  logger.debug('train columns: {} {} '.format(use_cols.shape,use_cols))

  logger.info('data preparation end : {} {} '.format(use_cols.shape,use_cols))

  logger.info('prepared data ended')

  clf = LogisticRegression(random_state = 0)
  clf.fit(x_train,y_train)

  logger.info('learning phase ended {} :'.format(x_train.shape))

  cv =StratifiedKFold(n_splits = 5,shuffle = True)
  # declare dictionary variable
  all_params = {'C' : [10**i for i in range (-1,2)],
                           'fit_intercept' : [True,False],
                           'penalty' : ['l2','l1'],
                           'random_state' : [0]}
  min_score = 100
  min_params = None
  for params in tqdm(list(ParameterGrid(all_params))):
      logger.info('params : {}'.format(params))
    
      list_gini_score = []
      list_logloss_score = []
      for train_idx , valid_idx in cv.split(x_train , y_train):
            trn_x = x_train.iloc[train_idx, :]
            val_x = x_train.iloc[valid_idx, :]
            trn_y = y_train[train_idx]
            val_y = y_train[valid_idx]
            clf = LogisticRegression(**params)
            clf.fit(trn_x,trn_y)
            pred = clf.predict_proba(val_x)[:,1]
            sc_logloss = log_loss(val_y,pred)
            sc_gini = - gini(val_y,pred)
#             logger.info('logloss : {} , auc : {}'.format(sc_logloss,sc_auc))
            list_gini_score.append(sc_gini)
            list_logloss_score.append(sc_logloss)
      
      sc_logloss = np.mean(list_logloss_score)
      sc_gini = np.mean(list_gini_score)
      logger.info('logloss : {} , gini : {}'.format(sc_logloss,sc_gini))
      logger.info('current highest score gini : {} , params : {}'.format(min_score,min_params))  
      if min_score > sc_gini : 
            min_score = sc_gini
            min_params = params
        
  logger.info('logloss : {}'.format(np.mean(list_gini_score)))
  logger.info('gini : {}'.format(np.mean(list_logloss_score)))
  df = load_test_data()

  x_test = df[use_cols].sort_values('id')

  logger.info('test data load end {}'.format(x_test.shape))
  pred_test = clf.predict_proba(x_test)

  df_submit = pd.read_csv(SAMPLE_SUBMIT_FILE).sort_values('id')
  df_submit['target'] = pred_test

  df_submit.to_csv(DIR + 'submit.csv',index = False)

  logger.info(df_submit)


2018-01-27 07:05:32,671 __main__ 14 [INFO][<module>] start 
2018-01-27 07:05:35,795 __main__ 25 [INFO][<module>] data preparation end : (58,) ['id' 'ps_ind_01' 'ps_ind_02_cat' 'ps_ind_03' 'ps_ind_04_cat'
 'ps_ind_05_cat' 'ps_ind_06_bin' 'ps_ind_07_bin' 'ps_ind_08_bin'
 'ps_ind_09_bin' 'ps_ind_10_bin' 'ps_ind_11_bin' 'ps_ind_12_bin'
 'ps_ind_13_bin' 'ps_ind_14' 'ps_ind_15' 'ps_ind_16_bin' 'ps_ind_17_bin'
 'ps_ind_18_bin' 'ps_reg_01' 'ps_reg_02' 'ps_reg_03' 'ps_car_01_cat'
 'ps_car_02_cat' 'ps_car_03_cat' 'ps_car_04_cat' 'ps_car_05_cat'
 'ps_car_06_cat' 'ps_car_07_cat' 'ps_car_08_cat' 'ps_car_09_cat'
 'ps_car_10_cat' 'ps_car_11_cat' 'ps_car_11' 'ps_car_12' 'ps_car_13'
 'ps_car_14' 'ps_car_15' 'ps_calc_01' 'ps_calc_02' 'ps_calc_03'
 'ps_calc_04' 'ps_calc_05' 'ps_calc_06' 'ps_calc_07' 'ps_calc_08'
 'ps_calc_09' 'ps_calc_10' 'ps_calc_11' 'ps_calc_12' 'ps_calc_13'
 'ps_calc_14' 'ps_calc_15_bin' 'ps_calc_16_bin' 'ps_calc_17_bin'
 'ps_calc_18_bin' 'ps_calc_19_bin' 'ps_calc_20_bin']  
2018-01-2

NameError: name 'ss_gini' is not defined