In [1]:
!pip install catboost

Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Collecting catboost
[?25l  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/05/82/6c3581379caa6b5f50214d210d9dfe648e3fa771892f348cbb4bba8c29a8/catboost-0.19.1-cp37-none-manylinux1_x86_64.whl (63.0MB)
[K     |████████████████████████████████| 63.0MB 4.2MB/s eta 0:00:01
Collecting pandas>=0.24.0 (from catboost)
[?25l  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/63/e0/a1b39cdcb2c391f087a1538bc8a6d62a82d0439693192aef541d7b123769/pandas-0.25.3-cp37-cp37m-manylinux1_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 329kB/s eta 0:00:01
[?25hCollecting plotly (from catboost)
[?25l  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/e3/67/eb2b2be7a63a66548abea92447fc04d9abf363520f1af6145c5f033cd1b3/plotly-4.3.0-py2.py3-none-any.whl (7.3MB)
[K     |████████████████████████████████| 7.3MB 677kB/s eta 0:00:01
Collecting retrying>=1.3.3 (from plotly->catboo

In [2]:
import pandas as pd
import numpy as np
import catboost as cat
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
import time
from collections import Counter

SEED = 2019

#### 加载数据

In [3]:
train_data = pd.read_csv('train.csv')
label_data = pd.read_csv('train_label.csv')
test_data = pd.read_csv('test.csv')
submit = pd.read_csv('submission.csv')

In [4]:
data = train_data.append(test_data)

#### 数据分析

In [5]:
data.head()

Unnamed: 0,ID,date,A1,A2,A3,B1,B2,B3,C1,C2,...,E20,E21,E22,E23,E24,E25,E26,E27,E28,E29
0,0,1900-07-01 23:30:00,-7834936860748470404,2998835353230292833,-7494990137288550401,-8639208079192601888,8626319289109649330,-5533056078568352733,-2379289153703677865,-6705654019294684257,...,0,0.266158,-1.60032,19,14,0,10,8,10,22
1,2,1900-07-01 23:30:00,5078873087020104664,544983809622051205,5835254758531463848,-8639208079192601888,8626319289109649330,-5533056078568352733,2501940309998358162,-6705654019294684257,...,0,0.266158,-1.60032,17,14,0,10,6,10,20
2,3,1900-07-01 23:30:00,-6322717615964561015,5363506900520348449,-1424055534205623346,-8639208079192601888,8626319289109649330,-5533056078568352733,1444408531941919970,-6705654019294684257,...,0,0.266158,0.619259,17,6,0,4,6,4,20
3,5,1900-07-01 23:30:00,4650039790794677128,2774817855788439768,9040931202870639290,-8639208079192601888,8626319289109649330,-5533056078568352733,3524367011807253962,-6705654019294684257,...,1,0.266158,0.619259,17,6,0,4,2,4,20
4,7,1900-07-01 23:30:00,-6322717615964561015,5363506900520348449,-1424055534205623346,-8639208079192601888,8626319289109649330,-5533056078568352733,2472798206889362914,-6705654019294684257,...,0,0.266158,0.619259,17,6,0,4,6,4,20


##### 抽取时间特征更多信息

In [6]:
def twosplit(x):
    times = int(x.split(' ')[-1].split(':')[0])
    if times >= 6 and times < 18: #白天
        return 0
    else: # 晚上
        return 1
def foursplit(x):
    times = int(x.split(' ')[-1].split(':')[0])
    if times >= 0 and times < 6: # 凌晨
        return 0
    elif times >= 6 and times < 12: # 上午
        return 1
    elif times >= 12 and times < 18: # 下午
        return 2
    else: # 晚上
        return 3
def countTime(x):
    day = int(x.split(' ')[0].split('-')[-1])
    hour = int(x.split(' ')[1].split(':')[0])
    minute = int(x.split(' ')[1].split(':')[1])
    return (day-1)*24*60 + hour*60 + minute
def genData(train_data=train_data):
    train_data['hours'] = train_data['date'].apply(lambda x:int(x.split(' ')[1].split(':')[0])*60+int(x.split(' ')[1].split(':')[1]))
    train_data['twosplit'] = train_data['date'].apply(twosplit)
    train_data['foursplit'] = train_data['date'].apply(foursplit)
    train_data['tobegin'] = train_data['date'].apply(lambda x:int(x.split(' ')[0].split('-')[-1]))
    train_data['toend'] = train_data['date'].apply(lambda x:12-int(x.split(' ')[0].split('-')[-1]))
    train_data['nowhour'] = train_data['date'].apply(lambda x:int(x.split(' ')[1].split(':')[0]))
    train_data['time'] = train_data['date'].apply(countTime)
    train_data['date'] = pd.to_datetime(train_data['date'])
    train_data['dayofweek'] = train_data['date'].dt.dayofweek
    train_data = train_data.drop(columns=['date'])
    
    return train_data

In [7]:
def newIdea(dataset,data=data):
    items = {}
    for col in ['A1','A2','A3','B1','B2','B3','C1','C2','C3']:
        items[col] = dict(Counter(data[col]))
    for col in ['A1','A2','A3','B1','B2','B3','C1','C2','C3']:
        dataset[col+'_rate'] = dataset[col].apply(lambda x: items[col][x]/dataset.shape[0])
        
    return dataset
    

In [8]:
def newFeature(data):
    dataset = data.copy()
#     dataset = genLabelCode(dataset)
#     dataset = minMaxCode(dataset)
    dataset = genData(dataset)
    return dataset

In [9]:
train = newIdea(newFeature(train_data))
test = newIdea(newFeature(test_data))

#### 建模及调优

In [10]:
sample = train.sample(1)
for col in list(train.columns):
    print(col, len(set(train[col])), sample[col].values)
    print('- - - - - - - - - - - - - - - - - - - -')

ID 60000 [37572]
- - - - - - - - - - - - - - - - - - - -
A1 18 [-7834936860748470404]
- - - - - - - - - - - - - - - - - - - -
A2 1262 [-5113814731114783375]
- - - - - - - - - - - - - - - - - - - -
A3 1102 [4726371201102088294]
- - - - - - - - - - - - - - - - - - - -
B1 23 [-8639208079192601888]
- - - - - - - - - - - - - - - - - - - -
B2 1049 [8626319289109649330]
- - - - - - - - - - - - - - - - - - - -
B3 74 [-5533056078568352733]
- - - - - - - - - - - - - - - - - - - -
C1 2702 [-2947615628225792381]
- - - - - - - - - - - - - - - - - - - -
C2 10191 [-6705654019294684257]
- - - - - - - - - - - - - - - - - - - -
C3 48997 [-770589883107948110]
- - - - - - - - - - - - - - - - - - - -
D1 4 [0]
- - - - - - - - - - - - - - - - - - - -
D2 3 [0]
- - - - - - - - - - - - - - - - - - - -
E1 391 [3]
- - - - - - - - - - - - - - - - - - - -
E2 23 [0.23124198]
- - - - - - - - - - - - - - - - - - - -
E3 8 [0.27176194]
- - - - - - - - - - - - - - - - - - - -
E4 4 [1]
- - - - - - - - - - - - - - - - - - 

In [13]:
cat_features = ['ID','A1','A2','A3','B1','B2','B3','C1','C2','C3','D1','D2','E4','E23','E24','E25','E26','E6','E8','E11','E12','E15','E18','E27','E28','E29','twosplit','foursplit','dayofweek']
# cat_features = ['ID','A1','A2','A3','B1','B2','B3','C1','C2','C3','D1','D2','twosplit','foursplit','dayofweek']

In [14]:
def KfoldResult(train=train,label=label_data.values[:,1],cat_features=cat_features):
    fold = StratifiedKFold(n_splits=20,shuffle=True)
    best_score = 0
    results = []
    best_result = []
    scores = []
    good_result = []
    preds = []
    models = []
    i = 0
    for train_index, valid_index in fold.split(train,label_data.values[:,1]):
        print('第',str(i),'个Fold')
        i += 1
        train_x,valid_x,train_y,valid_y = train.iloc[train_index],train.iloc[valid_index],label[train_index],label[valid_index]
        eval_set = cat.Pool(data=valid_x,label=valid_y,cat_features=cat_features)
        model = cat.CatBoostClassifier(iterations=5000, learning_rate=0.01, depth=9, loss_function='Logloss', eval_metric='AUC')
        model.fit(train_x,train_y,use_best_model=True,eval_set=eval_set,cat_features=cat_features,early_stopping_rounds=50)
        pred = model.predict_proba(test)[:,1]
        results.append(pred)
        preds.append(pred)
        scores.append(model.best_score_['validation']['AUC'])
        models.append(model)
        if model.best_score_['validation']['AUC'] > best_score:
            best_result = pred
        if model.best_score_['validation']['AUC'] > 0.730:
            good_result.append(pred)
    results = np.array(results).mean(axis=0)
    good_result = np.array(good_result).mean(axis=0)
    return models,scores,preds,results,best_result,good_result

In [15]:
model,scores,preds,results,best_result,good_result = KfoldResult()

Shrink model to first 809 iterations.

In [16]:
# model,scores,preds,results,best_result,good_result
scores

[0.7179721924496845,
 0.7466051472016948,
 0.7318653905216775,
 0.7379503488087851,
 0.7431142138514871,
 0.7433222436444935,
 0.7446194860329582,
 0.7173856385651125,
 0.7072883434234325,
 0.7375590333601286,
 0.7419892829330788,
 0.7168322008139068,
 0.7390525302702975,
 0.7424128001909164,
 0.737493091840836,
 0.7126327465584807,
 0.7400199788902292,
 0.7261961418124245,
 0.7253872480649377,
 0.7433674639374749]

In [18]:
best = []
best.append(preds[1])
best.append(preds[4])
best.append(preds[5])
best.append(preds[6])
best.append(preds[10])
best.append(preds[13])
best.append(preds[16])
best.append(preds[19])
best = np.array(best).mean(axis=0)

In [19]:
best

array([0.20812583, 0.08974861, 0.21713247, ..., 0.20918973, 0.15091132,
       0.24041759])

In [20]:
submit['label'] = best
submit.to_csv('result_'+time.strftime('%Y%m%d%H%M%S_best.csv',time.localtime(time.time())),index=False)

In [27]:
submit['label'] = good_result
submit.to_csv('result_'+time.strftime('%Y%m%d%H%M%S_good.csv',time.localtime(time.time())),index=False)

In [17]:
submit['label'] = results
submit.to_csv('result_'+time.strftime('%Y%m%d%H%M%S_fold.csv',time.localtime(time.time())),index=False)

In [None]:
best_result

array([0.21496435, 0.07887437, 0.28513194, ..., 0.23130214, 0.14291014,
       0.24300127])

In [None]:
train_x,valid_x,train_y,valid_y = train_test_split(train,label_data.values[:,1],test_size=0.1,stratify=label_data.values[:,1],random_state=SEED)

#### 建模及调优

In [None]:
model = cat.CatBoostClassifier(iterations=100, learning_rate=0.01, depth=9, loss_function='Logloss', eval_metric='AUC', random_state=SEED)

In [None]:
eval_set = cat.Pool(data=valid_x,label=valid_y,cat_features=cat_features)

In [None]:
model.fit(train,label_data.values[:,1],use_best_model=True,eval_set=eval_set, cat_features=cat_features)

0:	test: 0.6247346	best: 0.6247346 (0)	total: 129ms	remaining: 12.8s
1:	test: 0.6490826	best: 0.6490826 (1)	total: 305ms	remaining: 14.9s
2:	test: 0.6503736	best: 0.6503736 (2)	total: 414ms	remaining: 13.4s
3:	test: 0.6504741	best: 0.6504741 (3)	total: 508ms	remaining: 12.2s
4:	test: 0.6501860	best: 0.6504741 (3)	total: 621ms	remaining: 11.8s
5:	test: 0.6574598	best: 0.6574598 (5)	total: 726ms	remaining: 11.4s
6:	test: 0.6598538	best: 0.6598538 (6)	total: 825ms	remaining: 11s
7:	test: 0.6649978	best: 0.6649978 (7)	total: 998ms	remaining: 11.5s
8:	test: 0.6648806	best: 0.6649978 (7)	total: 1.1s	remaining: 11.1s
9:	test: 0.6861583	best: 0.6861583 (9)	total: 1.3s	remaining: 11.7s
10:	test: 0.6890962	best: 0.6890962 (10)	total: 1.41s	remaining: 11.4s
11:	test: 0.6886454	best: 0.6890962 (10)	total: 1.51s	remaining: 11.1s
12:	test: 0.6923022	best: 0.6923022 (12)	total: 1.7s	remaining: 11.4s
13:	test: 0.6933240	best: 0.6933240 (13)	total: 1.81s	remaining: 11.1s
14:	test: 0.6935182	best: 0.693

<catboost.core.CatBoostClassifier at 0x7fd8d4e1bc90>

In [None]:
model.best_score_

{'learn': {'Logloss': 0.4507086915475451},
 'validation': {'Logloss': 0.4461290284739399, 'AUC': 0.7297208240177854}}

In [None]:
roc_auc_score(valid_y,model.predict_proba(valid_x)[:,1])

0.8525890092757737

In [None]:
submit['label'] = best_result
submit.to_csv('result_'+time.strftime('%Y%m%d%H%M%S.csv',time.localtime(time.time())),index=False)