In [242]:
import time
import pandas as pd
import numpy as np

# 时间戳转字符串
def timestamp_datetime(value):
    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(value)
    dt = time.strftime(format, value)
    return dt

### 提取特征

In [245]:
numeric_feats = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_age_level',
                 'user_star_level', 'shop_review_num_level',
                 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description',
                 'hour', 'day',
                 'user_query_day', 'user_query_day_hour',
                 'item_CTR', 'shop_CTR', 
                 ]

nominal_feats = ['item_id', 'user_id', 'shop_id', 'item_brand_id',
                 'item_city_id', 'user_gender_id', 'user_occupation_id', 'context_page_id']

features = numeric_feats + nominal_feats

target = 'is_trade'

def extract_date(data):
    data['time'] = data.context_timestamp.apply(timestamp_datetime)
    data['day'] = data.time.apply(lambda x: int(x[8:10]))
    data['hour'] = data.time.apply(lambda x: int(x[11:13]))
    data['minute'] = data.time.apply(lambda x: int(x[14:15]))
    del data['time']
    return data

def extract_ctr(data, feature, alias):
    '''统计给定数据的转化率
    
    '''
    ctr_feat = alias + '_CTR'
    query_cnt_feat = alias + '_query_cnt_history'
    conversion_cnt_feat = alias + '_conversion_cnt_history'
    
    
    query_cnt = data.groupby([feature]).size().reset_index().rename(columns={0: query_cnt_feat})
    conversion_cnt = data[data['is_trade'] == 1].groupby([feature]).size().reset_index().rename(columns={0: conversion_cnt_feat})
    ctr = pd.merge(query_cnt, conversion_cnt, how='left', on=[feature])
    ctr[[conversion_cnt_feat]] = ctr[[conversion_cnt_feat]].fillna(0)
    ctr[ctr_feat] = ctr[conversion_cnt_feat] / ctr[query_cnt_feat]
    return ctr

def extract_history_ctr(data, feature, alias):
    '''统计每一天之前的历史转化率
    
    '''
    history_ctr = pd.DataFrame()
    
    for day in range(18, 26):
        # 每一天之前的历史数据
        history_data = data[data['day'] < day]
        ctr = extract_ctr(history_data, feature, alias)        
    
        # 添加date字段，方便merge
        ctr['day'] = day
        history_ctr = history_ctr.append(ctr)
        
    return history_ctr

def extract_last_day_ctr(data):
    '''统计前一天的总体转化率
    
    '''
    days_ctr = pd.DataFrame()
    ctr_feat = 'last_day_CTR'
    query_cnt_feat = 'last_day_query_cnt'
    conversion_cnt_feat = 'last_day_conversion_cnt'
    
    for day in range(19, 26):
        # 每一天之前的历史数据
        history_data = data[data['day'] == day - 1]

        query_cnt = history_data.shape[0]
        conversion_cnt = history_data[history_data['is_trade'] == 1].shape[0]
        ctr = conversion_cnt / query_cnt 
    
        # 添加date字段，方便merge
        days_ctr = days_ctr.append({ctr_feat: ctr, query_cnt_feat:query_cnt, conversion_cnt_feat:conversion_cnt, 'day':day}, ignore_index=True)
        
    return days_ctr

def extract_user_item_conversion(data):
    '''统计已经转化的user-item组合
    
    '''
    history_user_item_conversion = pd.DataFrame()
    
    for day in range(19, 26):
        # 每一天之前的历史数据
        user_item_conversion = data[(data['day'] < day)][['user_id', 'item_id']]
        user_item_conversion['converted'] = 1
        user_item_conversion['day'] = day
        user_item_conversion.drop_duplicates(inplace=True)
        
        # 添加date字段，方便merge
        history_user_item_conversion = history_user_item_conversion.append(user_item_conversion, ignore_index=True)
        
    return history_user_item_conversion

def extract_user_item_click(data):
    '''统计历史的user-item组合次数
    
    '''
    history_user_item_click = pd.DataFrame()
    feat = 'user_item_click'
    
    for day in range(19, 26):
        # 每一天之前的历史数据
        user_item_click = data[(data['day'] < day)][['user_id', 'item_id']]
        user_item_click_cnt = user_item_click.groupby(['user_id', 'item_id']).size().reset_index().rename(columns={0: 'user_item_click_cnt'})
        user_item_click_cnt['day'] = day
        
        # 添加date字段，方便merge
        history_user_item_click = history_user_item_click.append(user_item_click_cnt, ignore_index=True)
        
    return history_user_item_click
    

def extract_manual_features(data):
    
    user_item_query_day = data.groupby(['user_id', 'item_id', 'day']).size().reset_index().rename(columns={0: 'user_item_query_day'})
    data = pd.merge(data, user_item_query_day, how='left',on=['user_id', 'item_id', 'day'])
    
    user_item_query_day_hour = data.groupby(['user_id', 'item_id', 'day', 'hour']).size().reset_index().rename(columns={0: 'user_item_query_day_hour'})
    data = pd.merge(data, user_item_query_day_hour, how='left',on=['user_id', 'item_id', 'day', 'hour'])

        
#     ###############
    
    user_query_day = data.groupby(['user_id', 'day']).size().reset_index().rename(columns={0: 'user_query_day'})
    data = pd.merge(data, user_query_day, how='left',on=['user_id', 'day'])

    user_query_day_hour = data.groupby(['user_id', 'day', 'hour']).size().reset_index().rename(columns={0: 'user_query_day_hour'})
    data = pd.merge(data, user_query_day_hour, how='left',on=['user_id', 'day', 'hour'])
    
    item_query_day = data.groupby(['item_id', 'day']).size().reset_index().rename(columns={0: 'item_query_day'})
    data = pd.merge(data, item_query_day, 'left', on=['item_id', 'day'])
    
    item_query_day_hour = data.groupby(['item_id', 'day', 'hour']).size().reset_index().rename(columns={0: 'item_query_day_hour'})
    data = pd.merge(data, item_query_day_hour, 'left',on=['item_id', 'day', 'hour'])
    
    shop_query_day = data.groupby(['shop_id', 'day']).size().reset_index().rename(columns={0: 'shop_query_day'})
    data = pd.merge(data, shop_query_day, 'left', on=['shop_id', 'day'])
    
    shop_query_day_hour = data.groupby(['shop_id', 'day', 'hour']).size().reset_index().rename(columns={0: 'shop_query_day_hour'})
    data = pd.merge(data, shop_query_day_hour, 'left',on=['shop_id', 'day', 'hour'])
    
    
#     ======= brand =======
#     item_in_brand = data.groupby(['item_id', 'item_brand_id']).size().reset_index().rename(columns={0: 'item_in_brand'})
#     data = pd.merge(data, user_brand_query_day, how='left',on=['item_id', 'item_brand_id'])

    user_brand_query_day = data.groupby(['user_id', 'item_brand_id', 'day']).size().reset_index().rename(columns={0: 'user_brand_query_day'})
    data = pd.merge(data, user_brand_query_day, how='left',on=['user_id', 'item_brand_id', 'day'])
    
    user_brand_query_day_hour = data.groupby(['user_id', 'item_brand_id', 'day', 'hour']).size().reset_index().rename(columns={0: 'user_brand_query_day_hour'})
    data = pd.merge(data, user_brand_query_day_hour, how='left',on=['user_id', 'item_brand_id', 'day', 'hour'])
    
    brand_query_day = data.groupby(['item_brand_id', 'day']).size().reset_index().rename(columns={0: 'brand_query_day'})
    data = pd.merge(data, brand_query_day, 'left', on=['item_brand_id', 'day'])
    
    brand_query_day_hour = data.groupby(['item_brand_id', 'day', 'hour']).size().reset_index().rename(columns={0: 'brand_query_day_hour'})
    data = pd.merge(data, brand_query_day_hour, 'left',on=['item_brand_id', 'day', 'hour'])
       
    data['shop_to_user'] = data.user_brand_query_day / data.brand_query_day
    
        
    history_item_ctr = extract_history_ctr(data, 'item_id', 'item')
    data = pd.merge(data, history_item_ctr, how='left', on=['item_id', 'day'])
    data[['item_CTR']] = data[['item_CTR']].fillna(-1)
        
    history_shop_ctr = extract_history_ctr(data, 'shop_id', 'shop')
    data = pd.merge(data, history_shop_ctr, how='left', on=['shop_id', 'day'])
    data[['shop_CTR']] = data[['shop_CTR']].fillna(-1)
    
    history_user_ctr = extract_history_ctr(data, 'user_id', 'user')
    data = pd.merge(data, history_user_ctr, how='left', on=['user_id', 'day'])
    data[['user_CTR']] = data[['user_CTR']].fillna(-1)
    
    history_day_ctr = extract_last_day_ctr(data)
    data = pd.merge(data, history_day_ctr, how='left', on=['day'])
    data[['last_day_CTR']] = data[['last_day_CTR']].fillna(-1)
    
    history_user_item_conversion = extract_user_item_conversion(data)
    data = pd.merge(data, history_user_item_conversion, how='left', on=['user_id', 'item_id', 'day'])
    data[['converted']] = data[['converted']].fillna(0)
    
#     history_user_item_click = extract_user_item_click(data)
#     data = pd.merge(data, history_user_item_click, how='left', on=['user_id', 'item_id', 'day'])
#     data[['user_item_click_cnt']] = data[['user_item_click_cnt']].fillna(0)

    user_item_std = data.groupby(['user_id', 'item_id', 'day']).hour.std().reset_index().rename(columns={'hour': 'user_item_std'})
    data = pd.merge(data, user_item_std, how='left', on=['user_id','item_id', 'day'])
    
    
    feature_with_CTR = ['shop_id', 'item_id', 'day', 'user_id']
    data[feature_with_CTR] = data[feature_with_CTR].astype(np.int64)
    
#     data = pd.get_dummies(data, dummy_na=True, columns=['user_gender_id', 'user_occupation_id', 'context_page_id'])

    
#     data = data.replace(to_replace=[-1], value=np.NaN)
#     data[numeric_feats] = data[numeric_feats].fillna(data.mean())
#     data[numeric_feats] = data[numeric_feats].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

    return data

### 读取训练样本并转换

In [246]:
df_train = pd.read_csv("round1_ijcai_18_train_20180301.txt", sep=' ')
df_test = pd.read_csv("round1_ijcai_18_test_a_20180301.txt", sep=' ')

df_train.drop_duplicates(inplace=True)
df_train = extract_date(df_train)
df_test = extract_date(df_test)

num_train = df_train.shape[0]
all_data = pd.concat([df_train, df_test])
all_data = extract_manual_features(all_data)
df_train = all_data[:num_train]
df_test = all_data[num_train:]
del df_test['is_trade']

In [247]:
# 根据日期，取24号为验证集
train_data = df_train.loc[df_train.day < df_train['day'].max()]
val_data = df_train.loc[df_train.day == df_train['day'].max()]

train_data.shape, val_data.shape

# train_data.columns

((420693, 57), (57418, 57))

In [248]:
# user_item_std = df_train.groupby(['user_id', 'user_item_query_day']).context_timestamp.count().reset_index().rename(columns={'context_timestamp': 'user_item_query_day_m'})

# user_item_std[user_item_std['user_id'] == 228793539864019462]

a = df_train[['item_brand_id']]

# a.groupby(['item_brand_id'], as_index=False).count().rename(columns={0: 'item_brand_query_cnt_feat'})
# df_train.groupby(['item_brand_id'], as_index=False).size().reset_index().rename(columns={0: 'item_brand_query_cnt_feat'})

In [29]:
# extract_ctr(df_train, feature='day', alias='page')
extract_last_day_ctr(df_train)
history_user_item_conversion = extract_user_item_conversion(df_train)
# history_user_item_conversion[history_user_item_conversion['day']==23]

a = df_train[(df_train['converted'] == 1) & (df_train['day'] == 23)][['user_id','item_id','is_trade','converted']]
a[a['is_trade'] == 1].shape[0] / a.shape[0]

# df_test[(df_test['converted'] == 1) & (df_test['day'] == 25)][['user_id','item_id','converted']]

0.018811531213462983

In [249]:
features = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'item_city_id', 'item_brand_id',
            'user_age_level', 'user_star_level', 'user_gender_id', 'user_occupation_id',
            'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service',
            'shop_score_delivery', 'shop_score_description', 'context_page_id',
            'item_id', 'user_id', 'shop_id',
            'day', 'hour',
            'user_query_day',
            'user_query_day_hour',
            'item_query_day',
            'item_query_day_hour',
            'shop_query_day',
            'shop_query_day_hour',
            'item_CTR',
            'shop_CTR',
            'user_CTR',
            
#             'user_item_click_cnt',
            
            'item_conversion_cnt_history', 
            'user_conversion_cnt_history',
            
            'user_item_query_day',
#             'user_item_query_day_hour',
            
            'user_brand_query_day_hour',
            'user_brand_query_day',
            
#             'user_item_std',
            
#             'brand_query_day',
#             'brand_query_day_hour',
            'shop_to_user',
            
            
            'last_day_CTR',
            'last_day_conversion_cnt',
            'last_day_query_cnt',
            
            'converted'
            ]

nominal_feats = ['user_gender_id', ]


target = 'is_trade'

# train_data.columns

### lightgbm

In [250]:
import lightgbm as lgb
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, log_loss
clf = lgb.LGBMClassifier(max_depth=3, n_estimators=200, n_jobs=-1)

clf.fit(train_data[features], train_data[target], feature_name=features, categorical_feature=nominal_feats)

loss_train = log_loss(train_data[target], clf.predict_proba(train_data[features]))

predict_proba = clf.predict_proba(val_data[features])
    
loss_val = log_loss(val_data[target], predict_proba)

loss_train, loss_val




(0.087448212455262769, 0.080831573469185056)

In [253]:
# clf = lgb.LGBMClassifier(max_depth=3, n_estimators=200, n_jobs=-1)
# clf.fit(df_train[features], df_train[target], feature_name=features, categorical_feature=['user_gender_id'])

loss_train = log_loss(val_data[target], clf.predict_proba(val_data[features]))

df_test['predicted_score'] = clf.predict_proba(df_test[features])[:, 1]

df_test[['instance_id', 'predicted_score']].to_csv(
    '20180328.txt', index=False, sep=' ')

loss_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.080831573469185056