In [1]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import config
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Key combinations used for group-count features: every entity id is paired
# with the day, and with the (day, hour) slot.
timeFeatList = [
    [entity] + window
    for entity in ('user_id', 'item_id', 'shop_id', 'item_brand_id', 'item_city_id')
    for window in (['day'], ['day', 'hour'])
]

In [3]:
def timestamp_datetime(value):
    """Format a Unix timestamp as a 'YYYY-mm-dd HH:MM:SS' string.

    The conversion goes through ``time.localtime``, so the result depends on
    the timezone of the machine running the notebook.
    """
    local_struct = time.localtime(value)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)

def time_feat(df,featList,featName):
    """Attach a group-size column to ``df``.

    Rows are grouped by the columns in ``featList``; the number of rows in
    each group is left-merged back onto ``df`` as a new column ``featName``.
    Returns the merged frame (the input frame is not modified).
    """
    group_sizes = df.groupby(featList).size().reset_index(name=featName)
    return df.merge(group_sizes, how='left', on=featList)

def process(df):
    """Derive time, count, property and missing-value features for ``df``.

    Adds ``time``/``day``/``hour`` columns (written onto the frame that was
    passed in), then one group-count column per entry of ``timeFeatList``,
    a de-duplicated and sorted ``item_property_list_clean`` string, and
    ``missing_feat`` = number of cells equal to -1 in each row.
    Returns the resulting frame.
    """
    df['time'] = df.context_timestamp.apply(timestamp_datetime)
    # 'YYYY-mm-dd HH:MM:SS': characters 8:10 are the day of month, 11:13 the hour.
    df['day'] = df['time'].map(lambda stamp: int(stamp[8:10]))
    df['hour'] = df['time'].map(lambda stamp: int(stamp[11:13]))

    for keys in timeFeatList:
        df = time_feat(df, keys, '_'.join(keys))

    def _unique_sorted_properties(raw):
        # Split on ';', drop repeats, and re-join in sorted order.
        return ';'.join(sorted(set(str(raw).split(';'))))

    df['item_property_list_clean'] = df['item_property_list'].apply(_unique_sorted_properties)

    # Count the -1 cells per row across every column present at this point.
    is_missing = (df == -1).values
    df["missing_feat"] = is_missing.sum(axis=1)
    return df

In [4]:
# Load the space-separated raw files; exact duplicate rows are dropped from
# the training set only.
dfTrain = pd.read_table(config.TRAIN_FILE, sep=' ').drop_duplicates()
dfTest = pd.read_table(config.TEST_FILE, sep=' ')

# Feature engineering is applied to each split separately.
dfTrain, dfTest = process(dfTrain), process(dfTest)

# Train rows come first in the stacked frame, so the first `trainNum` rows of
# anything built from dfAll belong to the training set.
trainNum = len(dfTrain)
dfAll = pd.concat([dfTrain, dfTest], axis=0)
dfAll.shape

(496482, 42)

In [5]:
# Bag-of-words (unigram) encoding of the cleaned item property strings; this
# matrix is the starting point that later cells append more columns to.
property_text = dfAll['item_property_list_clean']
count = CountVectorizer(ngram_range=(1, 1))
sparse_merge = count.fit_transform(property_text)

In [6]:
def get_onehot(df,field):
    """One-hot encode a single column of ``df``.

    The column values are first mapped to integer codes with a
    ``LabelEncoder`` and those codes are then expanded by a
    ``OneHotEncoder`` (default settings). Both encoders are fitted on the
    column itself, so the output has one row per row of ``df``.

    Parameters
    ----------
    df : DataFrame containing ``field``.
    field : name of the column to encode.

    Returns
    -------
    The matrix produced by ``OneHotEncoder.fit_transform`` on the integer
    codes (presumably a scipy sparse matrix under the encoder's defaults --
    confirm for the installed scikit-learn version).
    """
    # Encode once: the original label-encoded the same column twice and ran
    # separate fit/transform passes over the identical array.
    codes = LabelEncoder().fit_transform(list(df[field]))
    return OneHotEncoder().fit_transform(codes.reshape(-1, 1))

In [7]:
# One-hot encode every categorical column and append the blocks to the
# existing feature matrix in a single horizontal stack.
onehot_blocks = [get_onehot(dfAll, field) for field in config.CATEGORICAL_COLS]
try:
    feature_blocks = [sparse_merge] + onehot_blocks
except NameError:
    # The bag-of-words cell was not run, so there is no matrix to extend yet.
    # Only this case is tolerated: a genuine stacking error (e.g. mismatched
    # row counts) must surface instead of silently discarding earlier columns.
    feature_blocks = onehot_blocks
sparse_merge = hstack(feature_blocks).tocsr()

# Keep only the columns that are non-zero in at least two training rows
# (training rows are the first dfTrain.shape[0] rows of the stacked matrix).
train_nnz = sparse_merge[:dfTrain.shape[0], :].getnnz(axis=0)
sparse_merge = sparse_merge[:, train_nnz > 1]

In [8]:
# Numeric columns are appended unchanged to the right of the sparse columns.
# The group-count columns could be added as well by extending the list with
# ['_'.join(lst) for lst in timeFeatList]; that is currently switched off.
denseFeatList = config.NUMERIC_COLS
denseFeat = dfAll[denseFeatList].values
sparse_merge = hstack([sparse_merge, denseFeat], format='csr')

In [9]:
# Time-based hold-out: days before the 24th train the model, the 24th is used
# for validation.
train_mask = (dfTrain['day'] < 24).values
valid_mask = (dfTrain['day'] == 24).values
train_idx = dfTrain.index[train_mask]
valid_idx = dfTrain.index[valid_mask]

# Rows of sparse_merge are addressed by POSITION, so select them from the
# boolean masks rather than from index labels; this stays correct even if
# dfTrain does not carry a default 0..n-1 index.
Xi_train_ = sparse_merge[np.flatnonzero(train_mask), :]
y_train_ = dfTrain.loc[train_mask, 'is_trade']
Xi_valid_ = sparse_merge[np.flatnonzero(valid_mask), :]
y_valid_ = dfTrain.loc[valid_mask, 'is_trade']

# Everything after the training rows belongs to the test set.
Xi_test_ = sparse_merge[trainNum:, :]

In [10]:
# Positional feature names f0, f1, ... -- one per column of the feature matrix.
feature_names = ['f%d' % col for col in range(sparse_merge.shape[1])]

clf = lgb.LGBMClassifier(num_leaves=120, max_depth=9, n_estimators=80, n_jobs=20)
# No column is declared categorical: the categoricals were one-hot encoded above.
clf.fit(
    Xi_train_,
    y_train_,
    feature_name=feature_names,
    categorical_feature=[],
)

# Validation log-loss on the positive-class probability.
y_score_ = clf.predict_proba(Xi_valid_)[:, 1]
print(log_loss(y_valid_, y_score_))



0.0828573611996


In [11]:
# Score the test rows with the positive-class PROBABILITY. clf.predict would
# return hard class labels, which is not what the 'predicted_score' column is
# meant to hold and does not match the predict_proba-based log-loss used for
# validation above.
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_)[:, 1]

# Space-separated submission file: one row per test instance.
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})
submit.to_csv('../../Submission/advertisement/gbm_sparse_0327.txt', sep=" ", index=False, line_terminator='\n')