In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb



In [2]:
def _read_tsv(path):
    """Read a tab-separated file that has no header row into a DataFrame."""
    return pd.read_csv(path, sep="\t", header=None)


# All three raw inputs share the same header-less, tab-separated layout.
train = _read_tsv('../input/user_action_train.txt')
goods = _read_tsv('../input/goods_train.txt')
test = _read_tsv('../input/user_action_test_items.txt')

In [3]:
# Keep only the first two columns of user_action_test_items.txt; the third
# column is not used anywhere below.
# A positional slice (rather than dropping column index 2) keeps this cell
# re-runnable: the original drop raised IndexError on a second execution
# because the frame then has only two columns left.
# .copy() yields an independent frame so later in-place operations on `test`
# do not act on a slice of the raw frame.
test = test.iloc[:, :2].copy()
test.head()

Unnamed: 0,0,1
0,117570,771214
1,60750,1580520
2,595361,484220
3,45427,326736
4,443345,1049603


In [4]:
# Labelled preview: file name first, then the first five rows of `test`.
print("user_action_test_items.txt: ")
test.head(n=5)

user_action_test_items.txt: 


Unnamed: 0,0,1
0,117570,771214
1,60750,1580520
2,595361,484220
3,45427,326736
4,443345,1049603


In [5]:
# Row counts, one per line, in the order: train, goods, test.
for frame in (train, goods, test):
    print(len(frame))

10000000
2004581
5761092


In [6]:
def _name_columns(frame, names):
    """Rename the integer column labels 0..len(names)-1 of `frame` to `names`, in place."""
    frame.rename(columns=dict(enumerate(names)), inplace=True)


# The files were read without headers, so columns are labelled 0, 1, 2, ...
_name_columns(train, ['uid', 'spu_id', 'action_type', 'date'])
_name_columns(goods, ['spu_id', 'brand_id', 'cat_id'])
_name_columns(test, ['uid', 'spu_id'])

In [7]:
# Labelled preview of the renamed training actions (first five rows).
print("user_action_train.txt: ")
train.head(n=5)

user_action_train.txt: 


Unnamed: 0,uid,spu_id,action_type,date
0,522945,338312,0,03-28
1,320510,839213,0,03-10
2,314210,369282,0,01-05
3,381441,730484,0,01-05
4,168858,902515,0,03-10


In [8]:
# Labelled preview of the renamed goods table (first five rows).
print("goods_train.txt: ")
goods.head(n=5)

goods_train.txt: 


Unnamed: 0,spu_id,brand_id,cat_id
0,621837,10010304,297
1,1698431,10012546,271
2,653495,10026906,1056
3,1426380,10012968,297
4,200496,10004565,1056


In [9]:
# Labelled preview of the renamed test pairs (first five rows).
print("user_action_test_items.txt: ")
test.head(n=5)

user_action_test_items.txt: 


Unnamed: 0,uid,spu_id
0,117570,771214
1,60750,1580520
2,595361,484220
3,45427,326736
4,443345,1049603


In [10]:
# Attach brand_id / cat_id to each row by joining on spu_id. A left join keeps
# every row of the left-hand frame even if its spu_id is missing from `goods`.
train_goods = train.merge(goods, on='spu_id', how='left')
test_goods = test.merge(goods, on='spu_id', how='left')

In [11]:
# First five rows of the merged training frame.
train_goods.head(n=5)

Unnamed: 0,uid,spu_id,action_type,date,brand_id,cat_id
0,522945,338312,0,03-28,10005188,1012
1,320510,839213,0,03-10,10010631,271
2,314210,369282,0,01-05,10001351,311
3,381441,730484,0,01-05,10000866,311
4,168858,902515,0,03-10,10028088,297


In [12]:
# First five rows of the merged test frame.
test_goods.head(n=5)

Unnamed: 0,uid,spu_id,brand_id,cat_id
0,117570,771214,10000223,1056
1,60750,1580520,10012892,28008
2,595361,484220,10005367,311
3,45427,326736,10004119,28006
4,443345,1049603,10012721,680


In [13]:
# Training set: action_type is the target; the remaining columns, minus the
# date string, are the features.
train_label = train_goods['action_type']
train_goods = train_goods.drop(labels=['action_type', 'date'], axis='columns')

In [14]:
# First five rows of the feature frame after removing target and date.
train_goods.head(n=5)

Unnamed: 0,uid,spu_id,brand_id,cat_id
0,522945,338312,10005188,1012
1,320510,839213,10010631,271
2,314210,369282,10001351,311
3,381441,730484,10000866,311
4,168858,902515,10028088,297


In [15]:
# First five target values.
train_label.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: action_type, dtype: int64

In [16]:
# Number of boosting rounds shared by cross-validation and the final fit.
num_boost_rounds = 20

# Booster configuration passed to both xgb.cv and xgb.train.
xgb_params = dict(
    eta=0.05,                 # shrinkage applied to each boosting step
    max_depth=5,
    subsample=0.7,            # row sampling ratio per tree
    colsample_bytree=0.7,     # column sampling ratio per tree
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Wrap the frames in xgboost's DMatrix container; only the training matrix
# carries labels.
dtrain = xgb.DMatrix(data=train_goods, label=train_label)
dtest = xgb.DMatrix(data=test_goods)

In [17]:
# Cross-validate the configured booster for num_boost_rounds rounds.
# verbose_eval=50 is larger than the round count, so only round 0 is logged.
cv_output = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=num_boost_rounds,
    show_stdv=False,
    verbose_eval=50)

# Show the mean train / held-out RMSE per boosting round.
cv_output.loc[:, ['train-rmse-mean', 'test-rmse-mean']]

[0]	train-rmse:0.476038	test-rmse:0.476038


Unnamed: 0,train-rmse-mean,test-rmse-mean
0,0.476038,0.476038
1,0.453317,0.453317
2,0.43179,0.43179
3,0.411391,0.411391
4,0.392079,0.392079
5,0.373789,0.373789
6,0.356478,0.356478
7,0.340091,0.340091
8,0.324599,0.324599
9,0.309949,0.30995


In [18]:
# Fit the final booster on the full training matrix, using the shared
# parameters with `silent` overridden to 0 for this run.
model = xgb.train(
    params=dict(xgb_params, silent=0),
    dtrain=dtrain,
    num_boost_round=num_boost_rounds)

# Score the test matrix and wrap the predictions in a single 'weight' column.
y_predict = model.predict(dtest)
model_output = pd.DataFrame(y_predict, columns=['weight'])

model_output

Unnamed: 0,weight
0,0.183392
1,0.199333
2,0.183428
3,0.187926
4,0.183903
5,0.188002
6,0.183475
7,0.187549
8,0.183432
9,0.183394


In [19]:
# Write the predictions as CSV without the row index.
# NOTE(review): the file name is spelled 'submisstion' - kept unchanged because
# anything that consumes this file may rely on it; confirm before renaming.
model_output.to_csv(path_or_buf='submisstion.txt', index=False)