# 2018-04-05 特征提取

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

sns.set()
%matplotlib inline

train = pd.read_csv('round1_ijcai_18_train_20180301.txt',delimiter=' ')
example = train.copy()

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478138 entries, 0 to 478137
Data columns (total 27 columns):
instance_id                  478138 non-null int64
item_id                      478138 non-null int64
item_category_list           478138 non-null object
item_property_list           478138 non-null object
item_brand_id                478138 non-null int64
item_city_id                 478138 non-null int64
item_price_level             478138 non-null int64
item_sales_level             478138 non-null int64
item_collected_level         478138 non-null int64
item_pv_level                478138 non-null int64
user_id                      478138 non-null int64
user_gender_id               478138 non-null int64
user_age_level               478138 non-null int64
user_occupation_id           478138 non-null int64
user_star_level              478138 non-null int64
context_id                   478138 non-null int64
context_timestamp            478138 non-null int64
context_page_id     

1. instance_id 样本编号
2. item_id 广告商品编号
3. item_category_list 广告商品的的类目列表，String类型；从根类目（最粗略的一级类目）向叶子类目（最精细的类目）依次排列，数据拼接格式为 "category_0;category_1;category_2"，其中 category_1 是 category_0 的子类目，category_2 是 category_1 的子类目
4. item_property_list 广告商品的属性列表，String类型；数据拼接格式为 "property_0;property_1;property_2"，各个属性没有从属关系
5. item_brand_id 广告商品的品牌编号
6. item_city_id 广告商品的城市编号
7. item_price_level 广告商品的价格等级
8. item_sales_level 广告商品的销量等级
9. item_collected_level  广告商品被收藏次数的等级
10. item_pv_level 广告商品被展示次数的等级
11. user_id 用户的编号
12. user_gender_id 用户的预测性别编号
13. user_age_level 用户的预测年龄等级
14. user_occupation_id 用户的预测职业编号
15. user_star_level 用户的星级编号
16. context_id 上下文信息的编号
17. context_timestamp 广告商品的展示时间
18. context_page_id  广告商品的展示页面编号
19. predict_category_property 根据查询词预测的类目属性列表
20. shop_id 店铺的编号
21. shop_review_num_level 店铺的评价数量等级
22. shop_review_positive_rate  店铺的好评率
23. shop_star_level 店铺的星级编号
24. shop_score_service 店铺的服务态度评分
25. shop_score_delivery 店铺的物流服务评分 
26. shop_score_description 店铺的描述相符评分
27. is_trade 是否被转化

In [67]:
example['shop_score_description'].describe()

count    478138.000000
mean          0.974863
std           0.025024
min          -1.000000
25%           0.969268
50%           0.978493
75%           0.983640
max           1.000000
Name: shop_score_description, dtype: float64

In [76]:
example['shop_score_description_int'] = np.where(example['shop_score_description']>=0.5,1,0)

In [77]:
percent = example.groupby(['shop_review_positive_rate_int','is_trade']).size().transform(lambda x: x/sum(x)).reset_index().rename(columns={0:'percent'})
percent

Unnamed: 0,shop_review_positive_rate_int,is_trade,percent
0,0,0,0.429418
1,0,1,0.00915
2,1,0,0.551715
3,1,1,0.009717


In [87]:
np.where(example['user_star_level']==3002|3003|3004|3005|3006,1,0)

array([0, 0, 0, ..., 0, 0, 0])

### 特征处理函数

In [98]:
import time
def timestamp_datetime(value):
    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(value)
    dt = time.strftime(format,value)
    return dt

def process(data):
    data['item_id_hash'] = data['item_id'].apply(lambda x:hash(x)%1000000)
    data['item_brand_id_hash'] = data['item_brand_id'].apply(lambda x:hash(x)%1000000)
    data['item_city_id_hash'] = data['item_city_id'].apply(lambda x:hash(x)%1000000)
    data['item_price_level_7'] = np.where(data['item_price_level']==7,1,0)
    data['item_sales_level_11_12_13'] = np.where(data['item_sales_level']==11|12|13,1,0)
    data['item_collected_level_12_13_14'] = np.where(data['item_collected_level']==14|12|13,1,0)
    data['item_pv_level_17_18'] = np.where(data['item_pv_level']==17|18,1,0)
    data['user_id_hash'] = data['user_id'].apply(lambda x:hash(x)%1000000)
    data['user_gender_id_0'] = np.where(data['user_gender_id']==0,1,0)
    data['user_age_level_1003_1004'] = np.where(data['user_age_level']==1003|1004,1,0)
    data['user_occupation_id_2005'] = np.where(data['user_occupation_id']==2005,1,0)
    data['user_star_level_3002_3007'] = np.where(data['user_star_level']==3002|3003|3004|3005|3006|3007,1,0)
    data['context_id_hash'] = data['context_id'].apply(lambda x:hash(x)%1000000)
    
    
    
    data['time'] = data['context_timestamp'].apply(timestamp_datetime)
    data['day'] = data['time'].apply(lambda x: int(x[8:10]))
    data['hour'] = data['time'].apply(lambda x: int(x[11:13]))
    user_query_day = data.groupby(['user_id','day']).size().reset_index().rename(columns={0:'user_query_day'})
    data = pd.merge(data, user_query_day,'left',on=['user_id', 'day'])
    
    user_query_day_hour = data.groupby(['user_id', 'day', 'hour']).size().reset_index().rename(
        columns={0: 'user_query_day_hour'})
    
    data = pd.merge(data, user_query_day_hour, 'left',
                    on=['user_id', 'day', 'hour'])
    
    
    data['context_page_id_hash'] = data['context_page_id'].apply(lambda x:hash(x)%1000000)
    data['shop_id_hash'] = data['shop_id'].apply(lambda x:hash(x)%1000000)
    
    data['shop_review_num_level_16_17'] = np.where(data['shop_review_num_level']==16|17,1,0)
    
    data['shop_review_positive_rate_1'] = np.where(data['shop_review_positive_rate']>=1., 1, 0)
    
    data['shop_star_level_5013_5014'] = np.where(data['shop_review_num_level']==5013|5014,1,0)
    
    data['shop_score_service_1'] = np.where(data['shop_score_service']>=1., 1, 0)
    data['shop_score_delivery_1'] = np.where(data['shop_score_delivery']>=1., 1, 0)
    data['shop_score_description_1'] = np.where(data['shop_score_description']>=1., 1, 0)
    
    return data

In [99]:
import lightgbm as lgb
from sklearn.metrics import log_loss

In [100]:
data = process(train)

In [122]:
train_ = data.loc[data.day < 24]
test_ = data.loc[data.day == 24]

In [123]:
feature = ['item_id_hash','item_brand_id_hash','item_city_id_hash','item_price_level_7','item_sales_level_11_12_13',
              'item_collected_level_12_13_14','item_pv_level_17_18','user_id_hash','user_gender_id_0','user_age_level_1003_1004',
              'user_occupation_id_2005','user_star_level_3002_3007','context_id_hash','user_query_day',
              'user_query_day_hour','context_page_id_hash','shop_id_hash','shop_review_num_level_16_17',
              'shop_review_positive_rate_1','shop_star_level_5013_5014','shop_score_service_1',
              'shop_score_delivery_1','shop_score_description_1']
target = ['is_trade']

In [136]:
clf = lgb.LGBMClassifier(num_leaves=31, max_depth=7, n_estimators=80, n_jobs=-1)

In [137]:
clf.fit(train_[feature],train_[target])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=7, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=80,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

In [138]:
res = clf.predict_proba(test_[feature],)[:, 1]

In [139]:
print(log_loss(test_[target], res))

0.0835065551289


In [140]:
test = pd.read_csv('round1_ijcai_18_test_a_20180301.txt',sep=' ')

In [142]:
test = process(test)

In [144]:
test['predicted_score'] = clf.predict_proba(test[feature],)[:, 1]

In [145]:
test[['instance_id', 'predicted_score']].to_csv('2018-04-05.csv',sep=' ', index=False)