In [2]:
import time
import pandas as pd
import numpy as np

# 时间戳转字符串
def timestamp_datetime(value):
    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(value)
    dt = time.strftime(format, value)
    return dt

### 提取特征

In [3]:
numeric_feats = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_age_level',
                 'user_star_level', 'shop_review_num_level',
                 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description',
                 'hour', 'day',
                 'user_query_day', 'user_query_day_hour',
                 'item_CTR', 'shop_CTR', 
                 ]

nominal_feats = ['item_id', 'user_id', 'shop_id', 'item_brand_id',
                 'item_city_id', 'user_gender_id', 'user_occupation_id', 'context_page_id']

features = numeric_feats + nominal_feats

target = 'is_trade'

def extract_date(data):
    data['time'] = data.context_timestamp.apply(timestamp_datetime)
    data['day'] = data.time.apply(lambda x: int(x[8:10]))
    data['hour'] = data.time.apply(lambda x: int(x[11:13]))
    del data['time']
    return data

def extract_ctr(data, feature, alias):
    '''统计给定数据的转化率
    
    '''
    ctr_feat = alias + '_CTR'
    query_cnt_feat = alias + '_query_cnt_history'
    conversion_cnt_feat = alias + '_conversion_cnt_history'
    
    query_cnt = data.groupby([feature]).size().reset_index().rename(columns={0: query_cnt_feat})
    conversion_cnt = data[data['is_trade'] == 1].groupby([feature]).size().reset_index().rename(columns={0: conversion_cnt_feat})
    ctr = pd.merge(query_cnt, conversion_cnt, how='left', on=[feature])
    ctr[[conversion_cnt_feat]] = ctr[[conversion_cnt_feat]].fillna(0)
    ctr[ctr_feat] = ctr[conversion_cnt_feat] / ctr[query_cnt_feat]
    return ctr

def extract_history_ctr(data, feature, alias):
    '''统计每一天之前的历史转化率
    
    '''
    history_ctr = pd.DataFrame()
    
    for day in range(18, 26):
        # 每一天之前的历史数据
        history_data = data[data['day'] < day]
        ctr = extract_ctr(history_data, feature, alias)        
    
        # 添加date字段，方便merge
        ctr['day'] = day
        history_ctr = history_ctr.append(ctr)
        
    return history_ctr

def extract_last_day_ctr(data):
    '''统计前一天的总体转化率
    
    '''
    days_ctr = pd.DataFrame()
    ctr_feat = 'last_day_CTR'
    query_cnt_feat = 'last_day_query_cnt'
    conversion_cnt_feat = 'last_day_conversion_cnt'
    
    for day in range(19, 26):
        # 每一天之前的历史数据
        history_data = data[data['day'] == day - 1]

        query_cnt = history_data.shape[0]
        conversion_cnt = history_data[history_data['is_trade'] == 1].shape[0]
        ctr = conversion_cnt / query_cnt 
    
        # 添加date字段，方便merge
        days_ctr = days_ctr.append({ctr_feat: ctr, query_cnt_feat:query_cnt, conversion_cnt_feat:conversion_cnt, 'day':day}, ignore_index=True)
        
    return days_ctr
    

def extract_manual_features(data):
    
    user_query_day = data.groupby(['user_id', 'day']).size().reset_index().rename(columns={0: 'user_query_day'})
    data = pd.merge(data, user_query_day, how='left',on=['user_id', 'day'])

    user_query_day_hour = data.groupby(['user_id', 'day', 'hour']).size().reset_index().rename(columns={0: 'user_query_day_hour'})
    data = pd.merge(data, user_query_day_hour, how='left',on=['user_id', 'day', 'hour'])
    
    item_query_day = data.groupby(['item_id', 'day']).size().reset_index().rename(columns={0: 'item_query_day'})
    data = pd.merge(data, item_query_day, 'left', on=['item_id', 'day'])
    
    item_query_day_hour = data.groupby(['item_id', 'day', 'hour']).size().reset_index().rename(columns={0: 'item_query_day_hour'})
    data = pd.merge(data, item_query_day_hour, 'left',on=['item_id', 'day', 'hour'])
    
    shop_query_day = data.groupby(['shop_id', 'day']).size().reset_index().rename(columns={0: 'shop_query_day'})
    data = pd.merge(data, shop_query_day, 'left', on=['shop_id', 'day'])
    
    shop_query_day_hour = data.groupby(['shop_id', 'day', 'hour']).size().reset_index().rename(columns={0: 'shop_query_day_hour'})
    data = pd.merge(data, shop_query_day_hour, 'left',on=['shop_id', 'day', 'hour'])
       
    history_item_ctr = extract_history_ctr(data, 'item_id', 'item')
    data = pd.merge(data, history_item_ctr, how='left', on=['item_id', 'day'])
    data[['item_CTR']] = data[['item_CTR']].fillna(-1)
        
    history_shop_ctr = extract_history_ctr(data, 'shop_id', 'shop')
    data = pd.merge(data, history_shop_ctr, how='left', on=['shop_id', 'day'])
    data[['shop_CTR']] = data[['shop_CTR']].fillna(-1)
    
    history_user_ctr = extract_history_ctr(data, 'user_id', 'user')
    data = pd.merge(data, history_user_ctr, how='left', on=['user_id', 'day'])
    data[['user_CTR']] = data[['user_CTR']].fillna(-1)
    
    history_day_ctr = extract_last_day_ctr(data)
    data = pd.merge(data, history_day_ctr, how='left', on=['day'])
    data[['last_day_CTR']] = data[['last_day_CTR']].fillna(-1)
    
    
    feature_with_CTR = ['shop_id', 'item_id', 'day', 'user_id']
    data[feature_with_CTR] = data[feature_with_CTR].astype(np.int64)
    
#     data = pd.get_dummies(data, dummy_na=True, columns=['user_gender_id', 'user_occupation_id', 'context_page_id'])

    
#     data = data.replace(to_replace=[-1], value=np.NaN)
#     data[numeric_feats] = data[numeric_feats].fillna(data.mean())
#     data[numeric_feats] = data[numeric_feats].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

    return data

### 读取训练样本并转换

In [4]:
df_train = pd.read_csv("round1_ijcai_18_train_20180301.txt", sep=' ')
df_test = pd.read_csv("round1_ijcai_18_test_a_20180301.txt", sep=' ')

df_train.drop_duplicates(inplace=True)
df_train = extract_date(df_train)
df_test = extract_date(df_test)

num_train = df_train.shape[0]
all_data = pd.concat([df_train, df_test])
all_data = extract_manual_features(all_data)
df_train = all_data[:num_train]
df_test = all_data[num_train:]
del df_test['is_trade']


# 根据日期，取24号为验证集
train_data = df_train.loc[df_train.day < df_train['day'].max()]
val_data = df_train.loc[df_train.day == df_train['day'].max()]

train_data.shape, val_data.shape

((420693, 47), (57418, 47))

In [118]:
train_data.columns

Index(['context_id', 'context_page_id', 'context_timestamp', 'day', 'hour',
       'instance_id', 'is_trade', 'item_brand_id', 'item_category_list',
       'item_city_id', 'item_collected_level', 'item_id', 'item_price_level',
       'item_property_list', 'item_pv_level', 'item_sales_level',
       'predict_category_property', 'shop_id', 'shop_review_num_level',
       'shop_review_positive_rate', 'shop_score_delivery',
       'shop_score_description', 'shop_score_service', 'shop_star_level',
       'user_age_level', 'user_gender_id', 'user_id', 'user_occupation_id',
       'user_star_level', 'user_query_day', 'user_query_day_hour',
       'item_query_day', 'item_query_day_hour', 'shop_query_day',
       'shop_query_day_hour', 'item_CTR', 'item_conversion_cnt_history',
       'item_query_cnt_history', 'shop_CTR', 'shop_conversion_cnt_history',
       'shop_query_cnt_history', 'user_CTR', 'user_conversion_cnt_history',
       'user_query_cnt_history', 'last_day_CTR', 'last_day_conversio

In [144]:
# extract_ctr(df_train, feature='day', alias='page')
extract_days_ctr(df_train)

Unnamed: 0,day,day_CTR,day_conversion_cnt_history,day_query_cnt_history
0,19.0,0.019933,1560.0,78261.0
1,20.0,0.019626,1392.0,70927.0
2,21.0,0.019317,1321.0,68384.0
3,22.0,0.019257,1371.0,71195.0
4,23.0,0.018825,1286.0,68315.0
5,24.0,0.01723,1096.0,63611.0
6,25.0,0.016859,968.0,57418.0


In [5]:
features = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'item_city_id', 'item_brand_id',
            'user_age_level', 'user_star_level', 'user_gender_id', 'user_occupation_id',
            'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service',
            'shop_score_delivery', 'shop_score_description', 'context_page_id',
            'item_id', 'user_id', 'shop_id',
            'day', 'hour',
            'user_query_day',
            'user_query_day_hour',
            'item_query_day',
            'item_query_day_hour',
            'shop_query_day',
            'shop_query_day_hour',
            'item_CTR',
            'shop_CTR',
            'user_CTR',
            
            'item_conversion_cnt_history', 
            'user_conversion_cnt_history',
            
            'last_day_CTR',
            'last_day_conversion_cnt',
            'last_day_query_cnt'
            
            ]

nominal_feats = ['user_gender_id', ]


target = 'is_trade'

train_data.columns

Index(['context_id', 'context_page_id', 'context_timestamp', 'day', 'hour',
       'instance_id', 'is_trade', 'item_brand_id', 'item_category_list',
       'item_city_id', 'item_collected_level', 'item_id', 'item_price_level',
       'item_property_list', 'item_pv_level', 'item_sales_level',
       'predict_category_property', 'shop_id', 'shop_review_num_level',
       'shop_review_positive_rate', 'shop_score_delivery',
       'shop_score_description', 'shop_score_service', 'shop_star_level',
       'user_age_level', 'user_gender_id', 'user_id', 'user_occupation_id',
       'user_star_level', 'user_query_day', 'user_query_day_hour',
       'item_query_day', 'item_query_day_hour', 'shop_query_day',
       'shop_query_day_hour', 'item_CTR', 'item_conversion_cnt_history',
       'item_query_cnt_history', 'shop_CTR', 'shop_conversion_cnt_history',
       'shop_query_cnt_history', 'user_CTR', 'user_conversion_cnt_history',
       'user_query_cnt_history', 'last_day_CTR', 'last_day_conversio

### lightgbm

In [6]:
import lightgbm as lgb
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, log_loss
clf = lgb.LGBMClassifier(max_depth=4, n_estimators=100, n_jobs=-1)

clf.fit(train_data[features], train_data[target], feature_name=features, categorical_feature=nominal_feats)

loss_train = log_loss(train_data[target], clf.predict_proba(train_data[features]))
loss_val = log_loss(val_data[target], clf.predict_proba(val_data[features]))

loss_train, loss_val



(0.087909701732396081, 0.081300234297209242)

### 训练和测试

In [294]:
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty="l2", solver='liblinear',
                        max_iter=1000, verbose=1)

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(n_estimators=100, max_features='auto')

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=5, bootstrap=True, max_features='auto')

clf = gbdt
clf.fit(train_data[features], train_data[target])

loss_train = log_loss(train_data[target], clf.predict_proba(train_data[features]))
predicted = clf.predict_proba(val_data[features])
loss_val = log_loss(val_data[target], predicted)

loss_train, loss_val

(0.088690767298507736, 0.081663202197175475)

# 提交测试数据

In [34]:
sample_weight = val_data.shape[0] / df_test.shape[0]


<bound method NDFrame.head of                  context_id  context_page_id  context_timestamp  day  hour  \
478111  2858926263178884522             4005         1537885700   25    22   
478112  4014843148303402290             4001         1537862843   25    16   
478113  8141706531104182980             4011         1537858222   25    14   
478114  5878932623083097102             4001         1537860915   25    15   
478115  7218739953079190510             4001         1537859074   25    15   
478116  8694995196932279401             4001         1537862577   25    16   
478117  7376997640064438002             4001         1537846509   25    11   
478118  5628496166637712141             4001         1537852269   25    13   
478119  5099728009197593980             4001         1537846083   25    11   
478120  7027494912964581382             4001         1537855748   25    14   
478121  7060719899999167791             4001         1537866277   25    17   
478122  4637022885768521432       

In [35]:
feats = ['user_query_day', 'user_query_day_hour', 
             'item_query_day', 'item_query_day_hour', 
             'shop_query_day', 'shop_query_day_hour']
    
df_test[feats] = df_test[feats] * sample_weight
  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


<bound method NDFrame.head of                  context_id  context_page_id  context_timestamp  day  hour  \
478111  2858926263178884522             4005         1537885700   25    22   
478112  4014843148303402290             4001         1537862843   25    16   
478113  8141706531104182980             4011         1537858222   25    14   
478114  5878932623083097102             4001         1537860915   25    15   
478115  7218739953079190510             4001         1537859074   25    15   
478116  8694995196932279401             4001         1537862577   25    16   
478117  7376997640064438002             4001         1537846509   25    11   
478118  5628496166637712141             4001         1537852269   25    13   
478119  5099728009197593980             4001         1537846083   25    11   
478120  7027494912964581382             4001         1537855748   25    14   
478121  7060719899999167791             4001         1537866277   25    17   
478122  4637022885768521432       

In [36]:
df_train.head

<bound method NDFrame.head of                  context_id  context_page_id  context_timestamp  day  hour  \
0        282924576738839389             4006         1537236544   18    10   
1       4007979028023783431             4001         1537243232   18    12   
2       4884875192608989870             4001         1537211052   18     3   
3        840119421106178602             4016         1537222670   18     6   
4       1736769971710354684             4001         1537271320   18    19   
5       4434980272230296456             4003         1537282855   18    23   
6       3622211816051289512             4001         1537280317   18    22   
7       7851031132945961016             4001         1537261120   18    16   
8       8388974876851097582             4001         1537208871   18     2   
9       1138535512266486347             4003         1537285390   18    23   
10      1321094267098227649             4001         1537263036   18    17   
11      3293182224165193298       

In [325]:

clf = lgb.LGBMClassifier(max_depth=4, n_estimators=100, n_jobs=-1)
clf.fit(df_train[features], df_train[target], feature_name=features, categorical_feature=['user_gender_id'])

loss_train = log_loss(df_train[target], clf.predict_proba(df_train[features]))

df_test['predicted_score'] = clf.predict_proba(df_test[features])[:, 1]

df_test[['instance_id', 'predicted_score']].to_csv(
    '20180326.txt', index=False, sep=' ')

loss_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.087222402067231086