In [1]:
import numpy as np
import pandas as pd
import time
from datetime import datetime

def time2str(t):
    """Render a POSIX timestamp as a 'YYYY-MM-DD HH:MM:SS' string.

    The timestamp is converted with ``datetime.fromtimestamp`` without a
    timezone argument, so the result is expressed in the machine's local time.
    """
    moment = datetime.fromtimestamp(t)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

def str2time(s):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a POSIX timestamp (float).

    The parsed datetime is naive, so ``timestamp()`` interprets it as local
    time. Raises ``ValueError`` if ``s`` does not match the format.
    """
    parsed = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    return parsed.timestamp()

def min_max_scale(col):
    """Linearly rescale a Series onto [0, 1] using its own min and max.

    A constant column (max == min) is mapped to all zeros instead of the
    NaN that a 0/0 division would otherwise produce.
    """
    lo, hi = col.min(), col.max()
    if hi == lo:
        return col - lo
    return (col - lo) / (hi - lo)


def add_group_count(frame, keys, name):
    """Return ``frame`` with a new column ``name`` holding the number of rows
    that share the same values in ``keys``.

    The result comes from ``pd.merge`` and therefore carries a fresh
    0..n-1 index; the row order of ``frame`` is preserved (left join).
    """
    counts = frame.groupby(keys).size().reset_index(name=name)
    return pd.merge(frame, counts, how='left', on=keys)


# --- Load raw data ----------------------------------------------------------
df_train = pd.read_csv(
    "../data/round1_ijcai_18_train_20180301.txt", index_col='instance_id', sep=' ')
df_test = pd.read_csv(
    "../data/round1_ijcai_18_test_a_20180301.txt", index_col='instance_id', sep=' ')

# The test file carries no label. Use NaN rather than None so the label column
# stays numeric after concatenation instead of degrading to object dtype.
df_test['is_trade'] = np.nan

# --- Column groups ----------------------------------------------------------
numeric_feats = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_age_level',
                 'user_star_level', 'context_page_id', 'shop_review_num_level',
                 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description']

# nominal_feats = ['item_brand_id','item_city_id', 'user_gender_id','user_occupation_id']
nominal_feats = ['user_gender_id', 'user_occupation_id']
no_use_feats = ['item_id', 'user_id', 'context_id', 'shop_id', 'item_category_list',
                'item_property_list', 'predict_category_property', 'item_brand_id', 'item_city_id']
time_feats = ['context_timestamp']
label_feats = ['is_trade']

# NOTE(review): user_id is needed for the per-user counts below, but it also
# stays in the frame afterwards, so it ends up among the model inputs unless a
# later cell drops it -- confirm that is intended.
user = ['user_id']

selected_cols = numeric_feats + nominal_feats + time_feats + label_feats + user
df_train = df_train[selected_cols]
df_test = df_test[selected_cols]

num_train = df_train.shape[0]
num_test = df_test.shape[0]

# --- Joint preprocessing of train + test ------------------------------------
df_all = pd.concat([df_train, df_test])
# -1 is treated as the missing-value marker in every column.
df_all = df_all.replace(to_replace=[-1], value=np.nan)

# One-hot encode the nominal attributes (with an explicit NaN indicator).
cols_before = set(df_all.columns)
df_all = pd.get_dummies(df_all, dummy_na=True, columns=nominal_feats)
# Newer pandas emits bool dummies; cast to uint8 so the frame stays purely
# numeric when it is later converted to a NumPy / SciPy matrix.
dummy_cols = [c for c in df_all.columns if c not in cols_before]
df_all = df_all.astype({c: np.uint8 for c in dummy_cols})

# Fill missing numeric values with the column mean (means are computed on the
# numeric features only, not on the whole frame).
df_all[numeric_feats] = df_all[numeric_feats].fillna(df_all[numeric_feats].mean())

# Add day-of-month and hour-of-day, taken from the local-time string.
# The timestamp column is formatted once and sliced twice.
local_time = df_all['context_timestamp'].apply(time2str)
df_all['day'] = local_time.str[8:10].astype(float)
df_all['hour'] = local_time.str[11:13].astype(float)

# Normalisation: min-max scaling (z-score alternative kept for reference).
# df_all_X[numeric_feats] = df_all_X[numeric_feats].apply(
#     lambda x: (x - x.mean()) / (x.std()))
df_all[numeric_feats] = df_all[numeric_feats].apply(min_max_scale)

# Per-user activity counts: rows per (user, day) and per (user, day, hour).
# NOTE: these merges replace the instance_id index with a positional
# 0..n-1 index; row order (train rows first, then test rows) is unchanged.
df_all = add_group_count(df_all, ['user_id', 'day'], 'user_query_day')
df_all = add_group_count(df_all, ['user_id', 'day', 'hour'], 'user_query_day_hour')

for col in ['user_query_day', 'user_query_day_hour', 'day', 'hour']:
    df_all[col] = min_max_scale(df_all[col])

# The positional split below is only valid if no rows were added or dropped.
assert len(df_all) == num_train + num_test

df_train = df_all[:num_train]
df_test = df_all[num_train:]

df_train.head()

Unnamed: 0,item_price_level,item_sales_level,item_collected_level,item_pv_level,user_age_level,user_star_level,context_page_id,shop_review_num_level,shop_review_positive_rate,shop_star_level,...,user_gender_id_nan,user_occupation_id_2002.0,user_occupation_id_2003.0,user_occupation_id_2004.0,user_occupation_id_2005.0,user_occupation_id_nan,day,hour,user_query_day,user_query_day_hour
0,0.176471,0.125,0.235294,0.666667,0.428571,0.3,0.263158,0.16,1.0,0.142857,...,0,0,0,0,1,0,0.0,0.434783,0.04878,0.08
1,0.176471,0.125,0.235294,0.666667,0.285714,0.6,0.0,0.16,1.0,0.142857,...,0,0,0,0,1,0,0.0,0.521739,0.170732,0.16
2,0.176471,0.125,0.235294,0.666667,0.428571,0.4,0.0,0.16,1.0,0.142857,...,0,0,0,0,1,0,0.0,0.130435,0.02439,0.04
3,0.176471,0.125,0.235294,0.666667,0.571429,0.6,0.789474,0.16,1.0,0.142857,...,0,0,0,0,1,0,0.0,0.26087,0.0,0.0
4,0.176471,0.125,0.235294,0.666667,0.285714,0.1,0.0,0.16,1.0,0.142857,...,0,0,0,0,1,0,0.0,0.826087,0.02439,0.04


In [3]:
def time2str(t):
    """Render a POSIX timestamp as a 'YYYY-MM-DD HH:MM:SS' local-time string.

    NOTE: this repeats the definition from the first cell of the notebook.
    """
    moment = datetime.fromtimestamp(t)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

def str2time(s):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string (taken as local time) into a
    POSIX timestamp in float seconds.

    NOTE: this repeats the definition from the first cell of the notebook.
    """
    parsed = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    return parsed.timestamp()

# Timestamps of the first and last training rows in storage order
# (not necessarily the earliest / latest ones).
first_row, last_row = df_train.iloc[0], df_train.iloc[-1]
start_timestamp = first_row['context_timestamp']
end_timestamp = last_row['context_timestamp']

# Earliest and latest timestamps in the training frame, as readable strings.
ts_column = df_train['context_timestamp']
start_time = time2str(ts_column.min())
end_time = time2str(ts_column.max())

start_time, end_time

('2018-09-18 00:00:01', '2018-09-24 23:59:47')

### 构建评估函数

In [4]:
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, log_loss
# Scorer built on log loss: it is fed predicted probabilities, and because a
# lower log loss is better the scorer is flagged as greater_is_better=False.
loss_scorer = make_scorer(
    log_loss,
    needs_proba=True,
    greater_is_better=False,
)

### 根据时间划分训练集和测试集

In [5]:
from sklearn.model_selection import train_test_split

# Time-based hold-out: fit on 2018-09-22 .. 2018-09-23, validate on everything
# from 2018-09-24 00:00:00 onwards.
train_start_time = str2time('2018-09-22 00:00:00')
train_end_time = str2time('2018-09-23 23:59:59')
val_start_time = str2time('2018-09-24 00:00:00')
val_end_time = str2time(end_time)  # computed for reference; no upper bound is applied below

timestamps = df_train['context_timestamp']
in_train_window = (timestamps >= train_start_time) & (timestamps <= train_end_time)
train_data = df_train[in_train_window]
val_data = df_train[timestamps >= val_start_time]

# Everything except the timestamp and the label is used as model input.
non_feature_cols = ['context_timestamp', 'is_trade']
y_train, y_val = train_data['is_trade'], val_data['is_trade']
X_train = train_data.drop(non_feature_cols, axis=1)
X_val = val_data.drop(non_feature_cols, axis=1)

X_train.shape, y_train.shape

((131932, 27), (131932,))

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier

# Fixed seed so the stochastic estimators below give repeatable scores.
RANDOM_STATE = 42

# Candidate classifiers. max_features='sqrt' is what the former 'auto' setting
# resolved to for these classifiers; 'auto' itself is rejected by recent
# scikit-learn releases, while 'sqrt' is accepted by old and new ones alike.
lr = LogisticRegression(penalty="l2", solver='liblinear',
                        max_iter=1000, verbose=1)

dt = DecisionTreeClassifier(max_features='sqrt', max_depth=5,
                            random_state=RANDOM_STATE)

rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=5, bootstrap=True,
                            max_features='sqrt', random_state=RANDOM_STATE)

ab = AdaBoostClassifier(random_state=RANDOM_STATE)

gbdt = GradientBoostingClassifier(n_estimators=50, max_features='sqrt', max_depth=3,
                                  random_state=RANDOM_STATE)

# Bagged logistic regression: each of the 10 members sees 70% of the samples
# and all of the features.
bagging = BaggingClassifier(
    lr, max_samples=0.7, max_features=1.0, n_estimators=10, n_jobs=-1,
    random_state=RANDOM_STATE)

# MLP variants kept for reference (alpha is a tunable regularisation term):
# mlp = MLPClassifier(hidden_layer_sizes=(64,64), solver='adam', tol=1e-5, alpha=1e-3, verbose=1)
# mlp = MLPClassifier(hidden_layer_sizes=(64, 64), solver='sgd', tol=1e-5,
#                     learning_rate='adaptive', learning_rate_init=0.1, alpha=1e-3, verbose=1)

# Fit the chosen model on the training window and report validation log loss.
clf = gbdt
clf.fit(X_train, y_train)
predicted = clf.predict_proba(X_val)
log_loss(y_val, predicted)

0.084260266459771971

In [9]:
from fastFM import sgd, mcmc
from scipy import sparse

# Factorization machine trained with MCMC. SGD alternative kept for reference:
# fm = sgd.FMClassification(n_iter=1000, init_stdev=0.1, l2_reg_w=0,
#                           l2_reg_V=0, rank=2, step_size=0.1)
fm = mcmc.FMClassification(n_iter=100, rank=2, init_stdev=0.1)

# Recode label 0 as -1 (presumably because fastFM's classifier expects -1/+1
# labels -- confirm against the fastFM documentation).
y_train_fm, y_val_fm = (
    np.array(labels.replace(to_replace=[0], value=-1))
    for labels in (y_train, y_val)
)

# The feature frames are passed to fastFM as SciPy CSR matrices.
X_train_fm, X_val_fm = (
    sparse.csr_matrix(features)
    for features in (X_train, X_val)
)

# Two-step alternative (fit, then predict) kept for reference:
# clf = fm
# clf.fit(X_train_fm, y_train_fm)
# predicted = clf.predict_proba(X_val_fm)
# y_pred = fm.fit_predict(X_train_fm, y_train_fm, X_val_fm)

# Fit on the training matrix and score the validation matrix in one call.
y_pred_proba = fm.fit_predict_proba(X_train_fm, y_train_fm, X_val_fm)

log_loss(y_val_fm, y_pred_proba)


0.12576895487803319

In [377]:
# Refit the selected classifier on the whole 2018-09-18 .. 2018-09-24 window
# and write the submission file for the test set.
train_start_time, train_end_time = (str2time('2018-09-18 00:00:00'), str2time('2018-09-24 23:59:59'))

# Both bounds are inclusive, matching the validation-time split (which uses
# >= for the lower bound); a strict > would drop rows stamped exactly 00:00:00.
in_window = df_train['context_timestamp'].between(train_start_time, train_end_time)
train_data = df_train[in_window]
y_train = train_data['is_trade']
X_train = train_data.drop(['context_timestamp', 'is_trade'], axis=1)

clf.fit(X_train, y_train)

X_test = df_test.drop(['context_timestamp', 'is_trade'], axis=1)
# The model must see the same columns, in the same order, as it was fitted on.
assert list(X_test.columns) == list(X_train.columns), "train/test feature columns differ"
preds = clf.predict_proba(X_test)[:, 1]

# df_test comes out of pd.merge calls during preprocessing, so its index is a
# positional range, NOT instance_id. Re-read the ids from the raw test file;
# preprocessing keeps the test rows in file order, so they line up by position.
test_instance_ids = pd.read_csv(
    "../data/round1_ijcai_18_test_a_20180301.txt", sep=' ', usecols=['instance_id'])['instance_id'].values
assert len(test_instance_ids) == len(preds), "test id count does not match prediction count"

sub = pd.DataFrame({'instance_id': test_instance_ids, 'predicted_score': preds})

# pandas renamed `line_terminator` to `lineterminator` and later removed the
# old spelling; try the new keyword first and fall back for older pandas.
try:
    sub.to_csv('20180322.txt', sep=" ", index=False, lineterminator='\n')
except TypeError:
    sub.to_csv('20180322.txt', sep=" ", index=False, line_terminator='\n')

In [194]:
import pandas as pd
# Baseline submission: score every test row with the mean of the second
# selected training column within its item_price_level group.
# NOTE(review): columns are selected by position -- presumably 0 is
# instance_id (test) and 26 is is_trade (train); confirm against the file header.
test_path = '../data/round1_ijcai_18_test_a_20180301.txt'
train_path = '../data/round1_ijcai_18_train_20180301.txt'

test_cols = pd.read_csv(test_path, sep=' ', usecols=[0, 6])
train_cols = pd.read_csv(train_path, sep=' ', usecols=[6, 26])

# One row per price level, holding the per-level mean of the other column.
level_means = train_cols.groupby('item_price_level', as_index=False).mean()

# A left join keeps every test row; the join key is dropped before writing.
baseline = pd.merge(test_cols, level_means, on='item_price_level', how='left')
baseline = baseline.drop('item_price_level', axis=1)
baseline.to_csv('baseline.csv', index=False, sep=' ', header=['instance_id', 'predicted_score'])