In [6]:
#coding=utf-8
import sys
import xgboost as xgb
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.metrics import precision_score,recall_score
import pprint
import pandas as pd
import time
import numpy as np
import ConfigParser
import argparse
import os
from sklearn.datasets import load_svmlight_file
import log_class
from sklearn.metrics import recall_score,precision_score,accuracy_score
import copy
from sklearn.metrics import confusion_matrix



# configure parser
def conf_parser():
    cf = ConfigParser.ConfigParser()
    cf.read('xgb_grid_search.config')
    booster = cf.get('xg_grid_search', 'booster')
    silent = int(cf.get('xg_grid_search','silent'))
    nthread = int(cf.get('xg_grid_search', 'nthread'))
    eta = float(cf.get('xg_grid_search', 'eta'))
    gamma = float(cf.get('xg_grid_search', 'gamma'))
    max_delta_step = float(cf.get('xg_grid_search','max_delta_step'))
    p_lambda = float(cf.get('xg_grid_search', 'lambda'))
    alpha = float(cf.get('xg_grid_search', 'alpha'))
    sketch_eps = float(cf.get('xg_grid_search', 'sketch_eps'))
    refresh_leaf = int(cf.get('xg_grid_search', 'refresh_leaf'))
    max_depth = int(cf.get('xg_grid_search', 'max_depth'))
    subsample = float(cf.get('xg_grid_search', 'subsample'))
    min_child_weight = float(cf.get('xg_grid_search', 'min_child_weight'))
    colsample_bytree = float(cf.get('xg_grid_search', 'colsample_bytree'))
    objective = cf.get('xg_grid_search', 'objective')
    base_score = float(cf.get('xg_grid_search', 'base_score'))
    eval_metric = cf.get('xg_grid_search', 'eval_metric')
    ascend = int(cf.get('xg_grid_search','ascend'))
    seed = int(cf.get('xg_grid_search', 'seed'))



    save_period = int(cf.get('xg_grid_search', 'save_period'))
    #eval = int(cf.get('xg_grid_search', 'eval'))
    cv = int(cf.get('xg_grid_search','cv'))

    t_num_round = int(cf.get('xg_grid_search_tune','num_round'))
    t_max_depth = [int(i) for i in cf.get('xg_grid_search_tune','max_depth').split(',')]
    t_subsample = [float(i) for i in cf.get('xg_grid_search_tune','subsample').split(',')]
    t_min_child_weight = [float(i) for i in cf.get('xg_grid_search_tune','min_child_weight').split(',')]
    t_colsample_bytree = [float(i) for i in cf.get('xg_grid_search_tune','colsample_bytree').split(',')]
    t_colsample_bylevel = [float(i) for i in cf.get('xg_grid_search_tune','colsample_bylevel').split(',')]
    t_max_delta_step = [int(i) for i in cf.get('xg_grid_search_tune','max_delta_step').split(',')]
    t_gamma = [float(i) for i in cf.get('xg_grid_search_tune','gamma').split(',')]
    t_param = {'num_round':t_num_round,
                'max_depth':t_max_depth,
                'subsample':t_subsample,
               'min_child_weight':t_min_child_weight,
               'colsample_bytree':t_colsample_bytree,
                'colsample_bylevel':t_colsample_bylevel,
                'max_delta_step':t_max_delta_step,
                'gamma':t_gamma}

    params = {'booster': booster, 'objective': objective, 'silent': silent, 'eta': eta, 'gamma': gamma,
             'max_delta_step':max_delta_step,'lambda':p_lambda,'alpha':alpha,'sketch_eps':sketch_eps,
             'refresh_leaf':refresh_leaf,'base_score':base_score,'max_depth':max_depth,'subsample':subsample,
              'min_child_weight':min_child_weight,'colsample_bytree':colsample_bytree,
              # 'eval_metric':eval_metric,
             'seed':seed,'nthread': nthread}

    others = {'num_round':t_num_round,'cv':cv,'ascend':ascend,'eval_metric':eval_metric}

    data = cf.get('xg_grid_search', 'data')
    dataname = cf.get('xg_grid_search','dataname')

    
    test_data = cf.get('xg_grid_search','test_data')
    log_dir = os.path.split(test_data)[1]
    others['log_dir'] = dataname
    return data, test_data, params,t_param,others

def get_negative_positive_ratio(y):
    labels_np = np.array(y)
    neg_num = np.sum(labels_np==0)
    pos_num = np.sum(labels_np==1)
    return neg_num/float(pos_num)

def tune_num_boost_round(params,dtrain,num_boost_round,log,watchlist,eval_metric,feval=None,ascend=True):
    evals_result = {}
    if(feval==None):
        params['eval_metric'] = eval_metric
    xgb.train(params=params,dtrain=dtrain,num_boost_round=num_boost_round,evals=watchlist,feval=feval,evals_result=evals_result,early_stopping_rounds = 100)
    evals_result = evals_result['eval'][eval_metric]
    if(ascend==True):
        loc = max(enumerate(evals_result), key=lambda x: x[1])[0]
    else:
        loc = min(enumerate(evals_result), key=lambda x: x[1])[0]
    loc += 1
    log.add("****num_boost_round : "+str(loc)+":"+str(evals_result[loc]))
    print('****  num_boost_round : %s : %s'%(loc,evals_result[loc-1]))
    return loc


def custom_eval_metirc_precison(preds,dtrain):
    labels = dtrain.get_label()
    flag1 = np.prod(preds<=1.0)
    flag2 = np.prod(preds>=0.0)
    flag = flag1*flag2
    assert flag == 1,"预测出来的值不是概率"
    preds = preds>=0.5
    preds = preds.astype(int)
    precison = precision_score(labels,preds)
    return 'precision',precison

def custom_eval_metirc_recall(preds,dtrain):
    labels = dtrain.get_label()
    flag1 = np.prod(preds<=1.0)
    flag2 = np.prod(preds>=0.0)
    flag = flag1*flag2
    assert flag == 1,"预测出来的值不是概率"
    preds = preds>=0.5
    preds = preds.astype(int)
    recall = recall_score(labels,preds)
    return 'recall',recall

def set_custom_eval_metirc(eval_metirc):

    custom_fs = dict(precision=custom_eval_metirc_precison,
                     recall=custom_eval_metirc_recall)
    for k,v in custom_fs.items():
        if(eval_metirc==k):
            return v
    return None

def predict_test(model,test_X,test_y,log):
    dtest = xgb.DMatrix(test_X,label = test_y)
    pred = model.predict(dtest)
    #split by 0.5
    pos_ind = pred>=0.5
    neg_ind = pred<0.5
    pred[pos_ind] = 1
    pred[neg_ind] = 0
    recall = recall_score(test_y,pred)
    precision = precision_score(test_y,pred)
    accuracy = accuracy_score(test_y,pred)
    log.add('test recall:'+str(recall))
    log.add('test precision:'+str(precision))
    log.add('test accuracy'+str(accuracy))
    print('test recall:'+str(recall))
    print('test precision:'+str(precision))
    print('test accuracy:'+str(accuracy))
    c= confusion_matrix(test_y,pred)
    if c.shape[0] >=2:
        cm = copy.deepcopy(c)
        cm[0,0] = int(c[1,1])
        cm[0,1] = int(c[1,0])
        cm[1,0] = int(c[0,1])
        cm[1,1] = int(c[0,0])
        print(cm)
        log.add('confusion matrix:')
        log.add(str(cm[0,0])+' '+str(cm[0,1]))
        log.add(str(cm[1,0])+' '+str(cm[1,1]))

def load_model_test():
    config = dict()
    config['model_path'] = './models/0727_shake_512/2017_08_15_11_50_27.xgmodel'
    config['data_path']='../datas/Partition/train-test-set-0727/shake_512/test_vec.dat.libsvm'
    X,y = load_svmlight_file(config['data_path'])
    X = X.todense()
    print('test_data shape:')
    print(X.shape)
    dtest = xgb.DMatrix(X,label = y)
    model = xgb.Booster(model_file=config['model_path'])
    params_xgb = model.attr('gamma')
    print(params_xgb)
    pred = model.predict(dtest)
    test_y = y
    pred = model.predict(dtest)
    #split by 0.5
    pos_ind = pred>=0.5
    neg_ind = pred<0.5
    pred[pos_ind] = 1
    pred[neg_ind] = 0
    recall = recall_score(test_y,pred)
    precision = precision_score(test_y,pred)
    accuracy = accuracy_score(test_y,pred)
    print('test recall:'+str(recall))
    print('test precision:'+str(precision))
    print('test accuracy:'+str(accuracy))
    c= confusion_matrix(test_y,pred)
    cm = copy.deepcopy(c)
    cm[0,0] = int(c[1,1])
    cm[0,1] = int(c[1,0])
    cm[1,0] = int(c[0,1])
    cm[1,1] = int(c[0,0])
    print(cm)
   




usage: __main__.py [-h] -c CONF
__main__.py: error: argument -c/--conf is required


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [7]:

xgdata,data_test,params,params_t,params_other = conf_parser()
'''x,y = load_svmlight_file(xgdata)
x = x.todense()
test_x,test_y = load_svmlight_file(data_test)
test_x = test_x.todense()'''

 

'x,y = load_svmlight_file(xgdata)\nx = x.todense()\ntest_x,test_y = load_svmlight_file(data_test)\ntest_x = test_x.todense()'

In [8]:
xgdata

'../datas/features-201707/train_flags_normalized.csv'

In [9]:
df_train = pd.read_csv(xgdata)


In [10]:
y = df_train['label'].values



array([ 0.,  0.,  0., ...,  1.,  1.,  1.])

In [12]:
x_columns = df_train.columns[df_train.columns!='label']




In [13]:
x = df_train[x_columns].as_matrix()

In [35]:
a = [item for item in df_train.columns if item not in ['id']]
a

['cert_valid',
 'cmd12_cmdsize_array_max',
 'cmd12_cmdsize_array_mean',
 'cmd12_cmdsize_array_min',
 'cmd12_cmdsize_array_size',
 'cmd12_offset_array_max_Normalized',
 'cmd12_offset_array_mean_Normalized',
 'cmd12_offset_array_min_Normalized',
 'cmd12_offset_array_size',
 'cmdsize_array_max',
 'cmdsize_array_mean',
 'cmdsize_array_min',
 'cmdsize_array_size',
 'cmd_count',
 'cmd_current_version.size',
 'cmd_filesize_array_max',
 'cmd_filesize_array_mean',
 'cmd_filesize_array_min',
 'cmd_filesize_array_size',
 'cmd_flags.size',
 'cmd_size',
 'cmd_timestamp.size',
 'cmd_version.size',
 'compatibility_version.size',
 'const_section_align',
 'const_section_flags_S_ATTR_EXT_RELOC',
 'const_section_flags_S_ATTR_LOC_RELOC',
 'const_section_flags_S_REGULAR',
 'const_section_flags_S_ZEROFILL',
 'const_section_nreloc',
 'const_section_offset_Normalized',
 'const_section_reloff_Normalized',
 'const_section_size',
 'cpu_type_i386',
 'cpu_type_x86_64',
 'dataoff_array_max_Normalized',
 'dataoff_ar

cert_valid                                     float64
cmd12_cmdsize_array_max                        float64
cmd12_cmdsize_array_mean                       float64
cmd12_cmdsize_array_min                        float64
cmd12_cmdsize_array_size                       float64
cmd12_offset_array_max_Normalized              float64
cmd12_offset_array_mean_Normalized             float64
cmd12_offset_array_min_Normalized              float64
cmd12_offset_array_size                        float64
cmdsize_array_max                              float64
cmdsize_array_mean                             float64
cmdsize_array_min                              float64
cmdsize_array_size                             float64
cmd_count                                      float64
cmd_current_version.size                       float64
cmd_filesize_array_max                         float64
cmd_filesize_array_mean                        float64
cmd_filesize_array_min                         float64
cmd_filesi

In [18]:
df_test = pd.read_csv(data_test)
test_y = df_test['label'].values
x_test_columns = df_test.columns[df_test.columns!='label']
test_x = df_test[x_test_columns].as_matrix()

In [19]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=42)
 

In [20]:
dtrain = xgb.DMatrix(x_train, label=y_train)

ValueError: invalid literal for float(): 5f7a09e4660aafc8c53e73bd2c9a2e97f1f1b1b23cf1338f74efca17654dc8db

In [None]:
   
    dval = xgb.DMatrix(x_val, y_val)
    dtrain_whole = xgb.DMatrix(x,label = y)
    watchlist = [(dtrain, 'train'), (dval, 'eval')]
    watchlist_whole = [(dtrain_whole, 'eval')]
    scale_pos_weight = get_negative_positive_ratio(y)

    params['scale_pos_weight'] = scale_pos_weight
    custom_feval = set_custom_eval_metirc(params_other['eval_metric'])
    log = log_class.log_class('grid_search_xgb',params_other['log_dir'])
    log.add('scale_pos_weight:'+str(scale_pos_weight))
    log.add('eval_metric:'+params_other['eval_metric'])
    print(params)
    num_round = tune_num_boost_round(params,dtrain,params_other['num_round'],log,watchlist,eval_metric=params_other['eval_metric'],feval=custom_feval,ascend=params_other['ascend'])

    params_t = [dict(max_depth=params_t['max_depth']),
                dict(subsample=params_t['subsample']),
                dict(min_child_weight=params_t['min_child_weight']),
                dict(colsample_bytree=params_t['colsample_bytree']),
                dict(colsample_bylevel = params_t['colsample_bylevel']),
                dict(max_delta_step = params_t['max_delta_step']),
                dict(gamma = params_t['gamma'])]
    for param_t in params_t:
        k = param_t.keys()[0]
        values = param_t[k]
        if(k=='num_round'):
            continue
        log.add("====="+str(k)+"======="+str(values))
        print('========== ',k,' ========== ',values)
        result = []
        if(len(values) == 1):
            params[k] = values[0]
            continue
        for v in values:
            print('**** for : %s ****\n'%(str(v)))
            log.add("**** for :"+str(v)+"****")
            params[k] = v
            if (custom_feval == None):
                params['eval_metric'] = params_other['eval_metric']
            result_df = xgb.cv(params=params,
                               dtrain=dtrain_whole,
                               num_boost_round=num_round,
                               nfold=params_other['cv'],
                               # metrics=params_other['eval_metric'],
                               feval=custom_feval,
                               stratified=True,
                               verbose_eval=False,
                               show_stdv=False,
                               shuffle=True,
                               early_stopping_rounds = 100)
            result_df = result_df[['test-'+params_other['eval_metric']+'-mean']]
            assert result_df.columns[0]=='test-'+params_other['eval_metric']+'-mean','choose the correct column\n'
            result_np = result_df.as_matrix()
            result.append(float(result_np[-1][0]))
        print(zip(values,result))
        if(params_other['ascend'] == 1):
            loc = max(enumerate(result),key=lambda x:x[1])[0]
        else:
            loc = min(enumerate(result),key=lambda x:x[1])[0]
        params[k] = values[loc]
        print('%s : %s\n'%(k,params[k]))
        log.add(k)
        log.add(str(params[k]))
    num_round = tune_num_boost_round(params,dtrain_whole,params_other['num_round'],log,watchlist_whole,eval_metric=params_other['eval_metric'],feval=custom_feval,ascend=params_other['ascend'])
    model = xgb.train(params,dtrain_whole,num_round,watchlist_whole,feval=custom_feval)
    pprint.pprint(params)
    time_str = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    if not os.path.isdir('./models'):
        os.mkdir('./models')
    dataname_model_path = os.path.join('./models',params_other['log_dir'])
    if not os.path.isdir(dataname_model_path):
        os.mkdir(dataname_model_path)
    model.save_model(dataname_model_path + '/' + time_str + '.xgmodel')
    print('saved : %s' % (dataname_model_path + '/' + time_str + '.xgmodel'))
    predict_test(model,test_x,test_y,log)