In [1]:
from tqdm import tqdm
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('max_columns',300)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score,auc,confusion_matrix,make_scorer,recall_score,roc_curve
from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score,train_test_split
from sklearn.ensemble import GradientBoostingRegressor

### Load Data

In [3]:
train_normal = pd.read_csv('./train_normal_label.csv')
train_normal.set_index([train_normal.columns.values[0]], inplace=True)
train_normal.index.names = [None]
train_others = pd.read_csv('./train_onehot_others.csv')
train_others.set_index([train_others.columns.values[0]], inplace=True)
train_others.index.names = [None]

### Feature Engineering

In [4]:
# remove some columns with high proportion of one value
drop_col_list_others = ['cate_3_167','cate_3_58','cate_3_39','cate_3_5','cate_3_4',
                        'cate_1_10','cate_1_3','cate_1_2','cate_1_7','cate_1_8','cate_1_9','cate_1_12',
                        'cate_1_13','cate_1_14','cate_1_15','level_6.0',
                    'dtype_3','dtype_4','dtype_15','dtype_11','dtype_12','dtype_14','dtype_1',
                   'dtype_17','dtype_19','dtype_20','dtype_21','dtype_22']
drop_col_list_others += ['cate_1_'+str(i) for i in range(20,30)]
drop_col_list_others += ['cate_1_'+str(i) for i in range(0,4)]

# remove device
for c in train_others.columns:
    if 'mobile_type_' in c:
        drop_col_list_others.append(c)
train_others.drop(drop_col_list_others,axis = 1,inplace = True)

# merge similar characteristics
for c in train_others.columns:
    if 'cityid_' in c:
        if 'loc_'+c in train_others.columns:
            train_others[c+'loc'] =  (train_others[c]==train_others['loc_'+c]).astype(int)
loc_city_fea = []
for c in train_others.columns:
    if 'loc_cityid_' in c:
        loc_city_fea.append(c)
train_others_new = train_others.drop(loc_city_fea,axis = 1)

# remove day
day_list = []
for c in train_others.columns:
    if 'day_' in c and 'week' not in c:
        day_list.append(c)
train_others_new.drop(day_list,axis = 1,inplace = True)  
# add is_weekend
train_others_new['is_weekend'] = (train_others[['weekday_1.0','weekday_7.0']].sum(axis = 1) > 0).astype(int)

In [5]:
# new feature generation
for c in tqdm(train_others.columns):
    if 'dtype_' in c:
        cross_list = list(zip(train_others[c].tolist(),train_normal['distance'].tolist()))
        p = lambda x: x[1]==-1 and -1 or x[0]*x[1]
        train_others_new[c+'distance'] = list(map(p,cross_list))
        
#
cate1_fea,cate2_fea,cate3_fea = [],[],[]
for c in tqdm(train_others.columns):
    if 'cate_1' in c:
        cate1_fea.append(c)
    elif 'cate_2' in c:
        cate2_fea.append(c)
    elif 'cate_3' in c:
        cate3_fea.append(c)

for i in range(1,4):
    cate_list_name = eval('cate'+str(i)+'_fea')
    cross_cate_list = list(zip(train_others[cate_list_name].sum(axis = 1).tolist(),train_normal['distance'].tolist()))
    p = lambda x: x[1]==-1 and -1 or x[0]*x[1]
    train_others_new['cate'+str(i)+'_fea' +'_distance'] = list(map(p,cross_list))

for i in range(1,4):
    cate_list_name = eval('cate'+str(i)+'_fea')
    cross_cate_list = list(zip(train_others[cate_list_name].sum(axis = 1).tolist(),train_normal['user_home_dis'].tolist()))
    p = lambda x: x[1]==-1 and -1 or x[0]*x[1]
    train_others_new['cate'+str(i)+'_fea' +'_user_home_dis'] = list(map(p,cross_list))
    
for i in range(1,4):
    cate_list_name = eval('cate'+str(i)+'_fea')
    cross_cate_list = list(zip(train_others[cate_list_name].sum(axis = 1).tolist(),train_normal['user_work_dis'].tolist()))
    p = lambda x: x[1]==-1 and -1 or x[0]*x[1]
    train_others_new['cate'+str(i)+'_fea' +'_user_work_dis'] = list(map(p,cross_list))

100%|██████████| 187/187 [00:22<00:00,  8.45it/s]
100%|██████████| 187/187 [00:00<00:00, 281123.60it/s]


In [6]:
# remove columns with high proportion of unique value
drop_fea_list_new = ['cate1_fea_distance','cate2_fea_distance','cate3_fea_distance',
                     'dtype_23distance','dtype_18distance','dtype_13distance']
train_others_new.drop(drop_fea_list_new,axis = 1,inplace = True)

### Features

In [7]:
# check the dimensions
print(train_others_new.shape)
print(train_normal.shape)

(1724711, 175)
(1724711, 10)


In [8]:
train_fea = np.concatenate([np.array(train_others_new),np.array(train_normal.drop(['label'],axis = 1))],axis = 1)
train_label = train_normal['label']
fea_names = train_others_new.columns.tolist() + train_normal.drop(['label'],axis = 1).columns.tolist()

In [9]:
pd.DataFrame(fea_names,columns = ['feature']).to_csv('train_fea_names_v7.csv')

### Training

In [10]:
train_X,valid_X,train_y,valid_y = train_test_split(train_fea,train_label,test_size = 0.2,random_state = 1212)

In [11]:
def rank_auc_cal(true_label,pred_prob):
    n_pos = np.sum(true_label)
    n_neg = len(true_label) - np.sum(true_label)
    rank_pred_prob = np.argsort(-pred_prob)
    rank_pos_ins = []
    for i in tqdm(range(len(true_label))):
        if true_label[i] == 1:
            rank_pos_ins.append((n_pos + n_neg) - (np.where(rank_pred_prob == i)))
    auc = (np.sum(rank_pos_ins) - n_pos*(n_pos+1)/2)/(n_pos*n_neg)
    return auc

rank_auc = make_scorer(rank_auc_cal,greater_is_better = True,needs_proba = True) #自定义scoring

In [12]:
gbdt = GradientBoostingRegressor(random_state = 1212,max_depth = 10,min_samples_leaf = 200,subsample = 0.8)
gbdt_model = gbdt.fit(train_X,train_y)
gbdt_pred_valid = gbdt_model.predict(valid_X)
gbdt_valid_auc = rank_auc_cal(np.array(valid_y),gbdt_pred_valid)
print(gbdt_valid_auc)

KeyboardInterrupt: 

In [None]:
import pickle
with open('GBDT_v7.pickle', 'wb') as f:
    pickle.dump(gbdt_model, f)

In [None]:
train_new_feature.reshape(-1, gbdt_model.n_estimators_)
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(train_new_feature)
train_new_feature2 = np.array(enc.transform(train_new_feature).toarray())
LRLasso  = LogisticRegression(penalty = 'l1',solver = 'liblinear',max_iter = 1000,C = 0.6,random_state = 1220)
LRLasso.fit(train_new_feature2,train_y)

In [None]:
with open('LRlasso_v7.pickle', 'wb') as f:
    pickle.dump(LRLasso, f)

In [None]:
new_valid_X = np.array(enc.transform(gbdt_model.apply(valid_X)).toarray())
pred_lasso_valid_prob = LRLasso.predict_proba(new_valid_X)[:,1]
valid_auc_lr = rank_auc_cal(np.array(valid_y),pred_lasso_valid_prob)
print(valid_auc_lr)

In [None]:
features_importance_gbdt = pd.DataFrame(fea_names,columns = ['features'])
index = gbdt_model.feature_importances_.argsort()
features_importance_gbdt = features_importance_gbdt.iloc[index.tolist(),:]
features_importance_gbdt.to_csv('./Feature_Selection_GBDT_v7.csv',encoding = 'gbk') 

In [None]:
features_importance_gbdt

### Test Result

In [13]:
#test数据
test_demo = pd.read_csv('test_data_preprocessed.csv')
test_demo.set_index([test_demo.columns.values[0]], inplace=True)
test_demo.index.names = [None]
test_demo.head()

Unnamed: 0,day_17,day_18,day_19,hour_11,hour_00,hour_01,hour_02,hour_03,hour_04,hour_05,hour_06,hour_07,hour_08,hour_09,hour_10,hour_11.1,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,weekday_1.0,weekday_2.0,weekday_3.0,level_-1.0,level_1.0,level_2.0,level_3.0,level_4.0,level_5.0,level_6.0,col_1_-1.0,col_1_0.0,col_1_1.0,col_3_-1.0,col_3_0.0,col_3_1.0,col_2_-1.0,col_2_0.0,col_2_1.0,col_2_2.0,gender_-1.0,gender_0.0,gender_1.0,mobile_os_-1.0,mobile_os_0.0,mobile_os_1.0,age_-1.0,age_0.0,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,dtype_0,dtype_1,dtype_2,dtype_3,dtype_4,dtype_5,dtype_6,dtype_7,dtype_8,dtype_9,dtype_10,dtype_11,dtype_12,dtype_13,dtype_14,dtype_15,dtype_16,dtype_17,dtype_18,dtype_19,dtype_20,dtype_22,dtype_23,cate_1_0,cate_1_1,cate_1_2,cate_1_3,cate_1_4,cate_1_5,cate_1_6,cate_1_7,cate_1_8,cate_1_9,cate_1_10,cate_1_11,cate_1_12,cate_1_13,cate_1_14,cate_1_15,cate_1_16,cate_1_17,cate_1_18,cate_1_19,cate_1_20,cate_1_21,cate_1_22,cate_1_23,cate_1_24,cate_1_25,cate_1_26,cate_1_27,cate_1_28,cate_1_29,cate_1_30,geo_cat_[0 40],geo_cat_[0 41],geo_cat_[0 42],geo_cat_[0 43],geo_cat_[0 45],geo_cat_[0 46],geo_cat_[0 47],geo_cat_[0 48],geo_cat_[0 49],geo_cat_[0 50],geo_cat_[0 51],geo_cat_[0 52],geo_cat_[0 53],geo_cat_[0 54],geo_cat_[0 56],geo_cat_[0 57],geo_cat_[0 58],geo_cat_[0 59],geo_cat_[0 81],geo_cat_[0 82],geo_cat_[0 83],geo_cat_[0 84],geo_cat_[0 86],geo_cat_[0 87],geo_cat_[0 88],geo_cat_[0 89],geo_cat_[0 91],geo_cat_[0 92],geo_cat_[0 93],geo_cat_[0 94],geo_cat_[0 96],geo_cat_[0 97],geo_cat_[0 98],geo_cat_[0 99],geo_cat_[1 40],...,geo_cat_[97 43],geo_cat_[97 45],geo_cat_[97 46],geo_cat_[97 47],geo_cat_[97 48],geo_cat_[97 49],geo_cat_[97 5],geo_cat_[97 6],geo_cat_[97 7],geo_cat_[97 8],geo_cat_[98 0],geo_cat_[98 1],geo_cat_[98 2],geo_cat_[98 3],geo_cat_[98 40],geo_cat_[98 41],geo_cat_[98 42],geo_cat_[98 43],geo_cat_[98 45],geo_cat_[98 46],geo_cat_[98 47],geo_cat_[98 48],geo_cat_[98 5],geo_cat_[98 6],geo_cat_[98 7],geo_cat_[98 8],geo_cat_[99 0],geo_cat_[99 1],geo_cat_[99 40],geo_cat_[99 41],geo_cat_[99 42],geo_cat_[99 45],geo_cat_[99 46],geo_cat_[99 47],geo_cat_[99 5],geo_cat_[99 6],cityid_1.0,cityid_10.0,cityid_20.0,cityid_30.0,cityid_40.0,cityid_42.0,cityid_45.0,cityid_50.0,cityid_55.0,cityid_56.0,cityid_57.0,cityid_59.0,cityid_66.0,cityid_70.0,cityid_73.0,cityid_91.0,cityid_92.0,cityid_96.0,cityid_99.0,cityid_114.0,cityid_others,loc_cityid_1.0,loc_cityid_10.0,loc_cityid_20.0,loc_cityid_30.0,loc_cityid_40.0,loc_cityid_42.0,loc_cityid_45.0,loc_cityid_50.0,loc_cityid_55.0,loc_cityid_56.0,loc_cityid_57.0,loc_cityid_59.0,loc_cityid_66.0,loc_cityid_70.0,loc_cityid_73.0,loc_cityid_80.0,loc_cityid_91.0,loc_cityid_92.0,loc_cityid_99.0,loc_cityid_114.0,loc_cityid_others,mobile_type_394.0,mobile_type_758.0,mobile_type_1338.0,mobile_type_1344.0,mobile_type_1360.0,mobile_type_1361.0,mobile_type_1391.0,mobile_type_2179.0,mobile_type_2180.0,mobile_type_2181.0,mobile_type_2182.0,mobile_type_2183.0,mobile_type_2184.0,mobile_type_2185.0,mobile_type_2186.0,mobile_type_2188.0,mobile_type_2285.0,mobile_type_2290.0,mobile_type_2328.0,mobile_type_2334.0,mobile_type_others,cate_2_2,cate_2_3,cate_2_4,cate_2_10,cate_2_11,cate_2_50,cate_2_59,cate_2_61,cate_2_70,cate_2_76,cate_2_89,cate_2_94,cate_2_234,cate_2_277,cate_2_313,cate_2_361,cate_2_363,cate_2_374,cate_2_375,cate_2_675,cate_2_others,cate_3_1,cate_3_2,cate_3_4,cate_3_5,cate_3_37,cate_3_39,cate_3_42,cate_3_43,cate_3_44,cate_3_53,cate_3_54,cate_3_58,cate_3_59,cate_3_69,cate_3_71,cate_3_72,cate_3_137,cate_3_138,cate_3_167,cate_3_329,cate_3_others,distance,item_ave_price,price,user_home_dis,user_work_dis,user_displayed_item_num,online_days,col_4,delta_price
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.66,0.565657,0.565657,0.161616,0.131313,0.0,0.0,0.328824,0.0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.72,0.626263,0.949495,0.212121,0.888889,0.48,0.64,0.554193,0.663317
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.52,0.565657,0.565657,0.89899,0.888889,0.0,0.0,0.747384,0.0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.45,0.666667,0.686869,0.59596,0.575758,0.0,0.12,0.412129,0.512563
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.43,0.666667,0.686869,0.10101,0.444444,0.0,0.12,0.343069,0.512563


In [14]:
#test_id
mapping_id = pd.read_csv('id_mapping_4.csv')
mapping_id.drop('id',axis = 1,inplace = True)
test_id = pd.read_csv('test_data_preprocessed_without_onehot_tail_not_merged.csv')[['global_id','user_id','item_id']]

In [15]:
mapping_id.head()

Unnamed: 0,global_id,user_id,item_id
0,63cba9f5b2597e90297f49e45a72166721139640885030...,53a19797c92909993026284684561c51e55c9675b96daa...,3778fec4a76d70df10870d6e66867ae768ad7217c0a5dc...
1,93b58966a2efcc70265d2ee1c9540ab143e2d742f515b2...,95072f95c7af9e22bbb678e843d98407c84a793c37e7ff...,88b81f1a46090be2b9e79d2da602aa65a01c7f0875aa0b...
2,dd7f646f80fd4b85d095c00332c224f47819cf965e4aa9...,4cc6901ee804e3417958511167381d480e2fff24ff9d4b...,dc504742203a9deee31c10c7d2bd5b9f793db6e547bc46...
3,982d06e5b69af6a5d470a79fa8bf38d398b4ba634ae3a1...,f34955c11e4bbb8188ba779d6a3e44386b790442199e11...,5693a14b3098805d96901f6c2b5ba0400cd3895f49c916...
4,224e40ed1e6aafab4e767ba97eb73ab9695c28853083a1...,8da07f7c9275826f2bae21be89d152b8b89cd1a49c3214...,5693a14b3098805d96901f6c2b5ba0400cd3895f49c916...


In [16]:
test_id.head()

Unnamed: 0,global_id,user_id,item_id
0,63cba9f5b2597e90297f49e45a72166721139640885030...,53a19797c92909993026284684561c51e55c9675b96daa...,3778fec4a76d70df10870d6e66867ae768ad7217c0a5dc...
1,93b58966a2efcc70265d2ee1c9540ab143e2d742f515b2...,95072f95c7af9e22bbb678e843d98407c84a793c37e7ff...,88b81f1a46090be2b9e79d2da602aa65a01c7f0875aa0b...
2,dd7f646f80fd4b85d095c00332c224f47819cf965e4aa9...,4cc6901ee804e3417958511167381d480e2fff24ff9d4b...,dc504742203a9deee31c10c7d2bd5b9f793db6e547bc46...
3,982d06e5b69af6a5d470a79fa8bf38d398b4ba634ae3a1...,f34955c11e4bbb8188ba779d6a3e44386b790442199e11...,5693a14b3098805d96901f6c2b5ba0400cd3895f49c916...
4,224e40ed1e6aafab4e767ba97eb73ab9695c28853083a1...,8da07f7c9275826f2bae21be89d152b8b89cd1a49c3214...,5693a14b3098805d96901f6c2b5ba0400cd3895f49c916...


In [17]:
geo_list = []
for c in test_demo.columns:
    if 'geo_cat' in c:
        geo_list.append(c)
test_demo.drop(geo_list,axis = 1,inplace = True)

In [18]:
test_demo.drop(['day_17','day_18','day_19','hour_11.1'],axis = 1,inplace = True)

In [19]:
drop_col_list_others = ['cate_3_167','cate_3_58','cate_3_39','cate_3_5','cate_3_4',
                        'cate_1_10','cate_1_3','cate_1_2','cate_1_7','cate_1_8','cate_1_9','cate_1_12',
                        'cate_1_13','cate_1_14','cate_1_15','level_6.0','dtype_17',
                    'dtype_3','dtype_4','dtype_15','dtype_11','dtype_12','dtype_14','dtype_1','dtype_19','dtype_20','dtype_22']
drop_col_list_others += ['cate_1_'+str(i) for i in range(20,30)]
drop_col_list_others += ['cate_1_'+str(i) for i in range(0,4)]

for c in test_demo.columns:
    if 'mobile_type_' in c:
        drop_col_list_others.append(c)
test_demo.drop(drop_col_list_others,axis = 1,inplace = True)

for c in test_demo.columns:
    if 'cityid_' in c:
        if 'loc_'+c in test_demo.columns:
            test_demo[c+'loc'] =  (test_demo[c]==test_demo['loc_'+c]).astype(int)
loc_city_fea = []
for c in test_demo.columns:
    if 'loc_cityid_' in c:
        loc_city_fea.append(c)
test_demo = test_demo.drop(loc_city_fea,axis = 1)

for c in tqdm(test_demo.columns):
    if 'dtype_' in c:
        cross_list = list(zip(test_demo[c].tolist(),test_demo['distance'].tolist()))
        p = lambda x: x[1]==-1 and -1 or x[0]*x[1]
        test_demo[c+'distance'] = list(map(p,cross_list))

cate1_fea,cate2_fea,cate3_fea = [],[],[]
for c in tqdm(test_demo.columns):
    if 'cate_1' in c:
        cate1_fea.append(c)
    elif 'cate_2' in c:
        cate2_fea.append(c)
    elif 'cate_3' in c:
        cate3_fea.append(c)

for i in range(1,4):
    cate_list_name = eval('cate'+str(i)+'_fea')
    cross_cate_list = list(zip(test_demo[cate_list_name].sum(axis = 1).tolist(),test_demo['distance'].tolist()))
    p = lambda x: x[1]==-1 and -1 or x[0]*x[1]
    test_demo['cate'+str(i)+'_fea' +'_distance'] = list(map(p,cross_list))

for i in range(1,4):
    cate_list_name = eval('cate'+str(i)+'_fea')
    cross_cate_list = list(zip(test_demo[cate_list_name].sum(axis = 1).tolist(),test_demo['user_home_dis'].tolist()))
    p = lambda x: x[1]==-1 and -1 or x[0]*x[1]
    test_demo['cate'+str(i)+'_fea' +'_user_home_dis'] = list(map(p,cross_list))
    
for i in range(1,4):
    cate_list_name = eval('cate'+str(i)+'_fea')
    cross_cate_list = list(zip(test_demo[cate_list_name].sum(axis = 1).tolist(),test_demo['user_work_dis'].tolist()))
    p = lambda x: x[1]==-1 and -1 or x[0]*x[1]
    test_demo['cate'+str(i)+'_fea' +'_user_work_dis'] = list(map(p,cross_list))

drop_fea_list_new = ['cate1_fea_distance','cate2_fea_distance','cate3_fea_distance',
                     'dtype_23distance','dtype_18distance','dtype_13distance']
test_demo.drop(drop_fea_list_new,axis = 1,inplace = True)
test_demo['is_weekend'] = (test_demo['weekday_1.0'] > 0).astype(int)
day_list = []
for c in test_demo.columns:
    if 'day_' in c and 'week' not in c:
        day_list.append(c)
test_demo.drop(day_list,axis = 1,inplace = True)

100%|██████████| 164/164 [00:08<00:00, 18.83it/s]
100%|██████████| 176/176 [00:00<00:00, 75805.86it/s]


In [20]:
weekday_df = pd.DataFrame(np.zeros((test_demo.shape[0],4)),columns = ['weekday_4.0','weekday_5.0','weekday_6.0','weekday_7.0'])
test_demo = pd.concat([test_demo,weekday_df],axis = 1)

In [21]:
#检查特征数目
print(len(test_demo.columns))
print(len(fea_names))

184
184


In [22]:
#调整特征顺序
test_fea = np.zeros(test_demo.shape)
for i in range(len(fea_names)):
    test_fea[:,i] = test_demo[fea_names[i]] 

In [23]:
import pickle
with open('GBDT_v7.pickle','rb') as f:
    gbdt = pickle.load(f)

In [24]:
test_pred = gbdt.predict(test_fea)
result_df = pd.concat([test_id,pd.DataFrame(test_pred,columns = ['result'])],axis = 1)

In [25]:
result_df.head()

Unnamed: 0,global_id,user_id,item_id,result
0,63cba9f5b2597e90297f49e45a72166721139640885030...,53a19797c92909993026284684561c51e55c9675b96daa...,3778fec4a76d70df10870d6e66867ae768ad7217c0a5dc...,0.344959
1,93b58966a2efcc70265d2ee1c9540ab143e2d742f515b2...,95072f95c7af9e22bbb678e843d98407c84a793c37e7ff...,88b81f1a46090be2b9e79d2da602aa65a01c7f0875aa0b...,0.282436
2,dd7f646f80fd4b85d095c00332c224f47819cf965e4aa9...,4cc6901ee804e3417958511167381d480e2fff24ff9d4b...,dc504742203a9deee31c10c7d2bd5b9f793db6e547bc46...,0.321509
3,982d06e5b69af6a5d470a79fa8bf38d398b4ba634ae3a1...,f34955c11e4bbb8188ba779d6a3e44386b790442199e11...,5693a14b3098805d96901f6c2b5ba0400cd3895f49c916...,0.377434
4,224e40ed1e6aafab4e767ba97eb73ab9695c28853083a1...,8da07f7c9275826f2bae21be89d152b8b89cd1a49c3214...,5693a14b3098805d96901f6c2b5ba0400cd3895f49c916...,0.362271


In [26]:
result_df_merged = pd.merge(mapping_id,result_df,how = 'left',on = ['global_id','user_id','item_id'])
result_df_merged[['result']].to_csv('result_v8_gbdt.csv')

In [27]:
result_df_merged

Unnamed: 0,global_id,user_id,item_id,result
0,63cba9f5b2597e90297f49e45a72166721139640885030...,53a19797c92909993026284684561c51e55c9675b96daa...,3778fec4a76d70df10870d6e66867ae768ad7217c0a5dc...,0.344959
1,93b58966a2efcc70265d2ee1c9540ab143e2d742f515b2...,95072f95c7af9e22bbb678e843d98407c84a793c37e7ff...,88b81f1a46090be2b9e79d2da602aa65a01c7f0875aa0b...,0.282436
2,dd7f646f80fd4b85d095c00332c224f47819cf965e4aa9...,4cc6901ee804e3417958511167381d480e2fff24ff9d4b...,dc504742203a9deee31c10c7d2bd5b9f793db6e547bc46...,0.321509
3,982d06e5b69af6a5d470a79fa8bf38d398b4ba634ae3a1...,f34955c11e4bbb8188ba779d6a3e44386b790442199e11...,5693a14b3098805d96901f6c2b5ba0400cd3895f49c916...,0.377434
4,224e40ed1e6aafab4e767ba97eb73ab9695c28853083a1...,8da07f7c9275826f2bae21be89d152b8b89cd1a49c3214...,5693a14b3098805d96901f6c2b5ba0400cd3895f49c916...,0.362271
...,...,...,...,...
670326,94559aecffaf3024fc633fea798767d5ff30f84c8e54ac...,56cbc9083353c03fa02e7093cdd3117e348656ce111101...,6490741f9f93602b405f721570d9f5b4f93a8d5e35bf20...,0.360521
670327,e014187e919226d20f65a5935b69c318c076873e0e3d77...,a0e93786bd6dfe770ec0c5f9d7be25bea0e07d1aa0cf29...,5d88ad0a662ef4d7a27356322bcd1d448eeb9fd967d172...,0.350781
670328,7bf061edde1498db2ccff973f4c4be0f94f9aa37b97e96...,2733aa6b7695e96618c94c96056ba727a553f8eb3d6a71...,5d88ad0a662ef4d7a27356322bcd1d448eeb9fd967d172...,0.309581
670329,c784eea960c179b1b17b07017d417d220b198720c786a3...,0e1903cb3285e15221db65d5a6b0a81cb86974ef5f9f5d...,9190e52a1c7e6c1bca1d3b6f3037daf28b539d53463ade...,0.214958
