In [1]:
import numpy as np
import pandas as pd

df_train = pd.read_csv(
    "round1_ijcai_18_train_20180301.txt", index_col='instance_id', sep=' ')
df_test = pd.read_csv(
    "round1_ijcai_18_test_a_20180301.txt", index_col='instance_id', sep=' ')
df_train_y = df_train['is_trade']

numeric_feats = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_age_level',
                 'user_star_level', 'context_timestamp', 'context_page_id', 'shop_review_num_level',
                 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description']
# nominal_feats = ['item_brand_id','item_city_id', 'user_gender_id','user_occupation_id']
nominal_feats = ['user_gender_id', 'user_occupation_id']
no_use_feats = ['item_id', 'user_id', 'context_id', 'shop_id', 'item_category_list',
                'item_property_list', 'predict_category_property', 'item_brand_id', 'item_city_id']

df_train_X = df_train[numeric_feats + nominal_feats]
df_test_X = df_test[numeric_feats + nominal_feats]

num_train = df_train_X.shape[0]

df_all_X = pd.concat([df_train_X, df_test_X])
df_all_X = df_all_X.replace(to_replace=[-1], value=np.nan)

# 标称属性转换为one-hot
df_all_X = pd.get_dummies(df_all_X, dummy_na=True, columns=nominal_feats)

# 填充空值
df_all_X[numeric_feats] = df_all_X[numeric_feats].fillna(df_all_X.mean())

# 归一化
# df_all_X[numeric_feats] = df_all_X[numeric_feats].apply(
#     lambda x: (x - x.mean()) / (x.std()))
df_all_X[numeric_feats] = df_all_X[numeric_feats].apply(
    lambda x: (x - x.min()) / (x.max() - x.min()))

df_train_X = df_all_X[:num_train]
df_test_X = df_all_X[num_train:]


df_train_X.head()

Unnamed: 0_level_0,item_price_level,item_sales_level,item_collected_level,item_pv_level,user_age_level,user_star_level,context_timestamp,context_page_id,shop_review_num_level,shop_review_positive_rate,...,shop_score_description,user_gender_id_0.0,user_gender_id_1.0,user_gender_id_2.0,user_gender_id_nan,user_occupation_id_2002.0,user_occupation_id_2003.0,user_occupation_id_2004.0,user_occupation_id_2005.0,user_occupation_id_nan
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108641074714126964,0.176471,0.125,0.235294,0.666667,0.428571,0.3,0.052872,0.263158,0.16,1.0,...,1.0,0,1,0,0,0,0,0,1,0
5754713551599725161,0.176471,0.125,0.235294,0.666667,0.285714,0.6,0.062548,0.0,0.16,1.0,...,1.0,1,0,0,0,0,0,0,1,0
842679481291040981,0.176471,0.125,0.235294,0.666667,0.428571,0.4,0.015989,0.0,0.16,1.0,...,1.0,1,0,0,0,0,0,0,1,0
937088850059189027,0.176471,0.125,0.235294,0.666667,0.571429,0.6,0.032798,0.789474,0.16,1.0,...,1.0,0,1,0,0,0,0,0,1,0
7975697065017708072,0.176471,0.125,0.235294,0.666667,0.285714,0.1,0.103187,0.0,0.16,1.0,...,1.0,1,0,0,0,0,0,0,1,0


In [3]:
# 正负样本个数
num_p = np.sum(df_train_y == 1)
num_f = np.sum(df_train_y == 0)
num_p, num_f

(9021, 469117)

### 计算样本中正负类的权重，以及抽样的权重，使用默认的balanced策略

In [302]:
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

class_weight = compute_class_weight('balanced', [0, 1], df_train_y)
sample_weight = compute_sample_weight('balanced', df_train_y)

### 构建评估函数，
采用sample_weight，使测试数据正负样本比例为1:1

In [303]:
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, log_loss


def log_loss_func(y_true, y_pred):
    sample_weight = compute_sample_weight('balanced', y_true)
    return log_loss(y_true, y_pred, sample_weight=sample_weight)


def accuracy_score_func(y_true, y_pred):
    sample_weight = compute_sample_weight('balanced', y_true)
    return accuracy_score(y_true, y_pred, sample_weight=sample_weight)


loss_scorer_balanced = make_scorer(
    log_loss_func, greater_is_better=False, needs_proba=True)
acc_scorer_balanced = make_scorer(
    accuracy_score_func, greater_is_better=True)

loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

### 定义几种模型
class_weight='balanced'使得训练时自动平衡类的权重

In [298]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty="l2", class_weight='balanced', solver='liblinear', max_iter=1000, verbose=1)

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(class_weight='balanced', max_features='auto', max_depth=10)

from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(dt, max_samples=0.5, max_features=0.9, n_estimators=10, n_jobs=-1)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, class_weight='balanced', n_jobs=-1, max_depth=5, bootstrap=False)

from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier()

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(n_estimators=59, max_features='auto')

### K-Fold 交叉验证，默认使用Stratified

In [299]:
from sklearn.model_selection import cross_val_score
k = 5
fit_params = {'sample_weight':sample_weight}

# scores = cross_val_score(lr, df_train_X, df_train_y, cv=k, scoring=loss_scorer, fit_params=fit_params)

lr = LogisticRegression(penalty="l2", class_weight='balanced', solver='liblinear', max_iter=1000, verbose=1)
scores = cross_val_score(lr, df_train_X, df_train_y, cv=k, scoring=acc_scorer)

sum(scores)/5

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

0.50000213167347041

### 使用Grid Search找到模型最优参数

In [None]:
from sklearn.model_selection import GridSearchCV

k = 5

# 参数的组合
gbdt_param_grad = {'n_estimators': (20, 50, 100), 'max_depth': (3, 5), 'subsample': (0.7, 1.0), 'max_features': ('auto', None)}

gbdt = GradientBoostingClassifier()
clf = GridSearchCV(gbdt, param_grid=gbdt_param_grad, scoring=acc_scorer,
                   cv=k, n_jobs=-1, verbose=1, return_train_score=False)

clf.fit(df_train_X, df_train_y, sample_weight=sample_weight)

print('=====')
print("Best parameters set found on development set:")
print(clf.best_params_)

print('=====')
print("Best parameters set found on development set:")
print(clf.best_score_)

pd.DataFrame(data=clf.cv_results_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 59.4min finished


=====
Best parameters set found on development set:
{'max_depth': 3, 'max_features': 'auto', 'n_estimators': 20, 'subsample': 1.0}
=====
Best parameters set found on development set:
0.602684439099


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_max_depth,param_max_features,param_n_estimators,param_subsample,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
0,27.347675,0.161238,0.599426,3,auto,20,0.7,"{'max_depth': 3, 'max_features': 'auto', 'n_es...",4,0.603187,0.606404,0.604682,0.617751,0.565108,0.599868,0.012619,0.017911
1,29.844955,0.157116,0.602684,3,auto,20,1.0,"{'max_depth': 3, 'max_features': 'auto', 'n_es...",1,0.606494,0.602915,0.608839,0.618209,0.576965,0.646679,0.008724,0.01382
2,73.308001,0.234287,0.599736,3,auto,50,0.7,"{'max_depth': 3, 'max_features': 'auto', 'n_es...",3,0.618271,0.602885,0.599938,0.621707,0.55588,1.060026,0.012323,0.023493
3,78.824671,0.232808,0.595443,3,auto,50,1.0,"{'max_depth': 3, 'max_features': 'auto', 'n_es...",9,0.612661,0.601169,0.585569,0.624291,0.553524,1.270162,0.005754,0.024561
4,143.668622,0.335666,0.588888,3,auto,100,0.7,"{'max_depth': 3, 'max_features': 'auto', 'n_es...",13,0.614961,0.599681,0.559338,0.618415,0.552047,1.498259,0.010367,0.027923
5,155.048932,0.369166,0.58594,3,auto,100,1.0,"{'max_depth': 3, 'max_features': 'auto', 'n_es...",17,0.611536,0.601079,0.536327,0.62271,0.558049,2.586257,0.048182,0.033093
6,30.374936,0.157997,0.599426,3,,20,0.7,"{'max_depth': 3, 'max_features': None, 'n_esti...",4,0.603187,0.606404,0.604682,0.617751,0.565108,0.5711,0.005482,0.017911
7,32.620562,0.166414,0.602684,3,,20,1.0,"{'max_depth': 3, 'max_features': None, 'n_esti...",1,0.606494,0.602915,0.608839,0.618209,0.576965,0.712616,0.013696,0.01382
8,73.450979,0.224496,0.593537,3,,50,0.7,"{'max_depth': 3, 'max_features': None, 'n_esti...",12,0.619304,0.604448,0.561521,0.619126,0.563284,0.584505,0.015384,0.025992
9,77.77945,0.219487,0.595443,3,,50,1.0,"{'max_depth': 3, 'max_features': None, 'n_esti...",9,0.612661,0.601169,0.585569,0.624291,0.553524,0.775339,0.003979,0.024561


### 利用iter，手动计算score

In [292]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, log_loss


losses = []
skf = StratifiedKFold(n_splits=5)


for train, test in skf.split(df_train_X, df_train_y):
    X_train, X_test, y_train, y_test = df_train_X.iloc[train], df_train_X.iloc[
        test], df_train_y.iloc[train], df_train_y.iloc[test]

    clf = LogisticRegression(penalty="l2", class_weight='balanced', solver='liblinear', max_iter=1000, verbose=1)
    
    clf.fit(X_train, y_train)

    predicted = clf.predict(X_test)

    sample_weight = compute_sample_weight('balanced', y_test)

    cur_loss = accuracy_score(y_test, predicted, sample_weight=sample_weight)
    losses.append(cur_loss)

print('==========')
print(sum(losses)/5)

0.6141557154
