In [1]:
import time
import pandas as pd
import numpy as np

# 时间戳转字符串
def timestamp_datetime(value):
    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(value)
    dt = time.strftime(format, value)
    return dt

### 提取特征

In [4]:
numeric_feats = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_age_level',
                 'user_star_level', 'shop_review_num_level',
                 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description',
                 'hour', 'day',
                 'user_query_day', 'user_query_day_hour',
                 'item_CTR', 'shop_CTR', 
                 ]

nominal_feats = ['item_id', 'user_id', 'shop_id', 'item_brand_id',
                 'item_city_id', 'user_gender_id', 'user_occupation_id', 'context_page_id']

features = numeric_feats + nominal_feats

target = 'is_trade'

def extract_date(data):
    data['time'] = data.context_timestamp.apply(timestamp_datetime)
    data['day'] = data.time.apply(lambda x: int(x[8:10]))
    data['hour'] = data.time.apply(lambda x: int(x[11:13]))
    del data['time']
    return data

def extract_ctr(data, feature, alias):
    '''统计给定数据的转化率
    
    '''
    query_cnt = data.groupby([feature]).size().reset_index().rename(columns={0: 'query_cnt'})
    conversion_cnt = data[data['is_trade'] == 1].groupby([feature]).size().reset_index().rename(columns={0: 'conversion_cnt'})
    ctr = pd.merge(query_cnt, conversion_cnt, how='left', on=[feature])
    ctr[['conversion_cnt']] = ctr[['conversion_cnt']].fillna(0)
    ctr[alias] = ctr['conversion_cnt'] / ctr['query_cnt']
    del ctr['query_cnt']
    del ctr['conversion_cnt']
    return ctr

def extract_history_ctr(data, feature, alias):
    '''统计每一天之前的历史转化率
    
    '''
    history_ctr = pd.DataFrame(columns = [feature, alias, 'day'])
    
    for day in range(18, 26):
        # 每一天之前的历史数据
        history_data = data[data['day'] < day]
        ctr = extract_ctr(history_data, feature, alias)        
    
        # 添加date字段，方便merge
        ctr['day'] = day
        history_ctr = history_ctr.append(ctr)
        
    return history_ctr
    

def extract_manual_features(data):
    
    user_query_day = data.groupby(['user_id', 'day']).size(
    ).reset_index().rename(columns={0: 'user_query_day'})
    data = pd.merge(data, user_query_day, how='left',
                    on=['user_id', 'day'])

    user_query_day_hour = data.groupby(['user_id', 'day', 'hour']).size(
    ).reset_index().rename(columns={0: 'user_query_day_hour'})
    data = pd.merge(data, user_query_day_hour, how='left',
                    on=['user_id', 'day', 'hour'])
    
    history_item_ctr = extract_history_ctr(data, 'item_id', 'item_CTR')
    data = pd.merge(data, history_item_ctr, how='left', on=['item_id', 'day'])
    data[['item_CTR']] = data[['item_CTR']].fillna(-1)
        
    history_shop_ctr = extract_history_ctr(data, 'shop_id', 'shop_CTR')
    data = pd.merge(data, history_shop_ctr, how='left', on=['shop_id', 'day'])
    data[['shop_CTR']] = data[['shop_CTR']].fillna(-1)
    
    feature_with_CTR = ['shop_id', 'item_id', 'day']
    data[feature_with_CTR] = data[feature_with_CTR].astype(np.int64)
    
#     data = pd.get_dummies(data, dummy_na=True, columns=['user_gender_id', 'user_occupation_id', 'context_page_id'])

    
#     data = data.replace(to_replace=[-1], value=np.NaN)
#     data[numeric_feats] = data[numeric_feats].fillna(data.mean())
#     data[numeric_feats] = data[numeric_feats].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

    return data

### 读取训练样本并转换

In [5]:
df_train = pd.read_csv("round1_ijcai_18_train_20180301.txt", sep=' ')
df_test = pd.read_csv("round1_ijcai_18_test_a_20180301.txt", sep=' ')

df_train.drop_duplicates(inplace=True)
df_train = extract_date(df_train)
df_test = extract_date(df_test)

num_train = df_train.shape[0]
all_data = pd.concat([df_train, df_test])
all_data = extract_manual_features(all_data)
df_train = all_data[:num_train]
df_test = all_data[num_train:]
del df_test['is_trade']


# 根据日期，取24号为验证集
train_data = df_train.loc[df_train.day < df_train['day'].max()]
val_data = df_train.loc[df_train.day == df_train['day'].max()]

train_data.shape, val_data.shape

AttributeError: 'Series' object has no attribute 'columns'

In [301]:
extract_ctr(df_train, feature='user_gender_id', alias='page_CTR')

Unnamed: 0,user_gender_id,page_CTR
0,-1,0.012866
1,0,0.01842
2,1,0.021124
3,2,0.018842


In [324]:
numeric_feats = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_age_level',
                 'user_star_level', 'shop_review_num_level',
                 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description',
                 'hour', 'day',
                 'user_query_day', 'user_query_day_hour',
                 'item_CTR', 'shop_CTR', 
                 ]

nominal_feats = ['item_id', 'user_id', 'shop_id', 'item_brand_id',
                 'item_city_id', 'user_gender_id', 'user_occupation_id', 'context_page_id']

features = numeric_feats + nominal_feats

target = 'is_trade'

train_data.columns

Index(['context_id', 'context_page_id', 'context_timestamp', 'day', 'hour',
       'instance_id', 'is_trade', 'item_brand_id', 'item_category_list',
       'item_city_id', 'item_collected_level', 'item_id', 'item_price_level',
       'item_property_list', 'item_pv_level', 'item_sales_level',
       'predict_category_property', 'shop_id', 'shop_review_num_level',
       'shop_review_positive_rate', 'shop_score_delivery',
       'shop_score_description', 'shop_score_service', 'shop_star_level',
       'user_age_level', 'user_gender_id', 'user_id', 'user_occupation_id',
       'user_star_level', 'user_query_day', 'user_query_day_hour', 'item_CTR',
       'shop_CTR'],
      dtype='object')

### lightgbm

In [326]:
import lightgbm as lgb
clf = lgb.LGBMClassifier(max_depth=4, n_estimators=100, n_jobs=-1)

clf.fit(train_data[features], train_data[target], feature_name=features, categorical_feature=['user_gender_id'])

loss_train = log_loss(train_data[target], clf.predict_proba(train_data[features]))
loss_val = log_loss(val_data[target], clf.predict_proba(val_data[features]))

loss_train, loss_val



(0.088003286862289073, 0.081456150703436425)

### 训练和测试

In [294]:
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty="l2", solver='liblinear',
                        max_iter=1000, verbose=1)

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(n_estimators=100, max_features='auto')

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=5, bootstrap=True, max_features='auto')

clf = gbdt
clf.fit(train_data[features], train_data[target])

loss_train = log_loss(train_data[target], clf.predict_proba(train_data[features]))
predicted = clf.predict_proba(val_data[features])
loss_val = log_loss(val_data[target], predicted)

loss_train, loss_val

(0.088690767298507736, 0.081663202197175475)

# 提交测试数据

In [325]:


clf = lgb.LGBMClassifier(max_depth=4, n_estimators=100, n_jobs=-1)
clf.fit(df_train[features], df_train[target], feature_name=features, categorical_feature=['user_gender_id'])

loss_train = log_loss(df_train[target], clf.predict_proba(df_train[features]))

df_test['predicted_score'] = clf.predict_proba(df_test[features])[:, 1]

df_test[['instance_id', 'predicted_score']].to_csv(
    '20180325.txt', index=False, sep=' ')

loss_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.087222402067231086