In [1]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import config
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Grouping keys for the count features: every entity column is paired with
# the day, and with day + hour, in that order.
timeFeatList = [
    [entity, 'day'] + extra
    for entity in ('user_id', 'item_id', 'shop_id', 'item_brand_id', 'item_city_id')
    for extra in ([], ['hour'])
]

In [14]:
def timestamp_datetime(value):
    """Format a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string.

    The conversion goes through ``time.localtime``, so the result depends on
    the time zone of the machine running the notebook.
    """
    local_struct = time.localtime(value)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)

def time_feat(df,featList,featName):
    """Attach a standardized group-size feature to ``df``.

    Rows are grouped by the columns in ``featList``; the size of each group is
    standardized with a ``StandardScaler`` fit on the per-group sizes (one
    value per group, not one per row) and left-joined back onto ``df`` as a
    new column called ``featName``.

    Returns the merged frame; ``df`` itself is not modified.
    """
    groupSizes = df.groupby(featList).size()
    # reset_index() names the unnamed size column 0; give it the feature name
    sizeTable = groupSizes.reset_index().rename(columns={0:featName})
    rawCounts = sizeTable[featName].values.reshape(-1,1)
    sizeTable[featName] = StandardScaler().fit_transform(rawCounts)
    return df.merge(sizeTable,'left',on=featList)

def process(df):
    """Derive the time, count, property-list and missing-count features.

    Steps, in order:
      1. ``time``: ``context_timestamp`` formatted by ``timestamp_datetime``;
         ``day`` and ``hour`` are parsed back out of that string.
      2. One standardized group-size column per entry of ``timeFeatList``
         (named by joining the key columns with '_').
      3. ``item_property_list_clean``: the ';'-separated property list with
         duplicates removed and tokens sorted.
      4. ``missing_feat``: per-row number of cells equal to -1, counted over
         every column present at that point (derived columns included).

    The ``time``/``day``/``hour`` columns are written onto the frame that was
    passed in; the returned frame is the result of the subsequent merges.
    """
    def _dedupe_properties(raw):
        # split on ';', drop duplicates, sort, and join back together
        return ';'.join(sorted(set(str(raw).split(';'))))

    df['time'] = df.context_timestamp.apply(timestamp_datetime)
    # 'YYYY-MM-DD HH:MM:SS' -> characters 8-9 are the day, 11-12 the hour
    df['day'] = df.time.apply(lambda stamp: int(stamp[8:10]))
    df['hour'] = df.time.apply(lambda stamp: int(stamp[11:13]))
    for keyCols in timeFeatList:
        countName = '_'.join(keyCols)
        df = time_feat(df,keyCols,countName)
    df['item_property_list_clean'] = df['item_property_list'].apply(_dedupe_properties)
    isMinusOne = (df == -1).values
    df["missing_feat"] = isMinusOne.sum(axis=1)
    return df
def get_onehot(df,field):
    """One-hot encode a single column of ``df``.

    Parameters
    ----------
    df : DataFrame holding the column.
    field : name of the column to encode.

    Returns
    -------
    The output of ``OneHotEncoder`` (default settings) with one row per row
    of ``df`` and one column per distinct value of ``df[field]``.
    """
    # Map the raw values to consecutive integer codes first, then expand the
    # codes to indicator columns. The column is encoded once and the one-hot
    # encoder is fit and applied in a single pass; the previous version
    # label-encoded the same data twice and ran a separate fit + transform.
    codes = LabelEncoder().fit_transform(list(df[field])).reshape(-1,1)
    return OneHotEncoder().fit_transform(codes)

In [15]:
# Load the space-separated raw logs; exact duplicate rows are dropped from
# the training file only.
dfTrain = pd.read_table(config.TRAIN_FILE,sep=' ').drop_duplicates()
dfTest = pd.read_table(config.TEST_FILE,sep=' ')

# Feature engineering is applied to each file separately, so the count
# features are computed and standardized within train and within test.
dfTrain, dfTest = process(dfTrain), process(dfTest)

# Stack train on top of test; rows [0, trainNum) of dfAll are the training rows.
trainNum = len(dfTrain)
dfAll = pd.concat([dfTrain,dfTest],axis=0)
dfAll.shape



(496482, 42)

<font color=#0099ff size=5 face="黑体">OneHot稀疏矩阵</font>

In [22]:
# A sparse column is kept only if it is non-zero in MORE than `cutoff`
# training rows.
cutoff = 1

# Token counts over the cleaned item property list.
count = CountVectorizer(ngram_range=(1,1))
sparse_merge = count.fit_transform(dfAll['item_property_list_clean'])

# Append one one-hot block per categorical column, in config order.
# `sparse_merge` always exists at this point, so a failing hstack is a real
# error (e.g. mismatched row counts) and must surface; the previous bare
# `except:` silently replaced the whole matrix with the latest block.
onehot_blocks = [get_onehot(dfAll,field) for field in config.CATEGORICAL_COLS]
sparse_merge = hstack([sparse_merge] + onehot_blocks).tocsr()

# Per-column count of stored non-zeros within the training rows only, so the
# test rows do not influence which columns survive.
train_support = sparse_merge[:dfTrain.shape[0],:].getnnz(axis=0)
sparse_merge = sparse_merge[:, train_support > cutoff]

In [23]:
# Dense block: the configured numeric columns followed by the standardized
# count features (named by joining each key list with '_').
timeFeatNames = ['_'.join(lst) for lst in timeFeatList]
denseFeatList = config.NUMERIC_COLS + timeFeatNames
denseFeat = dfAll[denseFeatList].values
# NOTE: this cell appends to sparse_merge, so re-running it without
# rebuilding sparse_merge first adds the dense columns a second time.
sparse_merge = hstack([sparse_merge, denseFeat]).tocsr()

In [24]:
# Time-based hold-out: days before the 24th train, the 24th validates.
# Rows of sparse_merge are addressed by POSITION, so the split uses
# positions derived from boolean masks rather than dfTrain's index labels;
# labels equal positions only while dfTrain carries a 0..n-1 index.
train_idx = np.flatnonzero((dfTrain['day'] < 24).values)
valid_idx = np.flatnonzero((dfTrain['day'] == 24).values)
Xi_train_, y_train_ = sparse_merge[train_idx,:], dfTrain['is_trade'].iloc[train_idx]
Xi_valid_, y_valid_ = sparse_merge[valid_idx,:], dfTrain['is_trade'].iloc[valid_idx]
# Test rows were stacked after the trainNum training rows in dfAll.
Xi_test_ = sparse_merge[trainNum:,:]

<font color=#0099ff size=5 face="黑体">WOE筛选变量</font>

In [None]:
from woe import calc_nominal_woe
from itertools import chain, combinations

In [None]:
def powerset(iterable):
    """Return every subset of ``iterable`` as a list of tuples.

    Subsets are ordered by size first and, within one size, in the order
    produced by ``itertools.combinations``:
    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    items = list(iterable)
    subsets = []
    for size in range(len(items) + 1):
        subsets.extend(combinations(items, size))
    return subsets

def feat_select(df,featList,order=3,cutoff=0.1):
    """Screen features and feature crosses with ``calc_nominal_woe``.

    Every combination of 1..``order`` columns from ``featList`` is scored
    against the ``is_trade`` column. For a cross of several columns the values
    are joined with '_' into one string key per row and that key is scored.

    Parameters
    ----------
    df : DataFrame containing the feature columns and ``is_trade``. It is not
        modified; all scratch work happens on a copy.
    featList : iterable of column names to combine.
    order : largest number of columns in one combination.
    cutoff : a combination is kept when the last element of the
        ``calc_nominal_woe`` result is >= this value
        (presumably the information value -- confirm against the woe module).

    Returns
    -------
    dict mapping '_'.join(columns) to the first element of the
    ``calc_nominal_woe`` result for each kept combination.
    """
    # Work on a private copy so the scratch column 'test' never leaks into
    # the caller's frame (the copy used to be made but then ignored).
    work = df.copy()
    feats = list(featList)
    selected = {}
    # Enumerate only the sizes that can pass the `order` limit instead of
    # building all 2**n subsets and discarding most of them. The visiting
    # order (by size, then itertools.combinations order) is unchanged.
    for size in range(1, len(feats) + 1):
        if size > order:
            break
        for sub in combinations(feats, size):
            sub = list(sub)
            if size == 1:
                column = sub[0]
            else:
                # Vectorized string join of the crossed columns; replaces a
                # row-wise apply that relied on positional Series indexing.
                key = work[sub[0]].astype(str)
                for name in sub[1:]:
                    key = key + '_' + work[name].astype(str)
                work['test'] = key
                column = 'test'
            tmp_woe = calc_nominal_woe(work,column,'is_trade',bins=10,small=0.001)
            if tmp_woe[-1]>=cutoff:
                print(sub,tmp_woe[-1])
                selected['_'.join(sub)] = tmp_woe[0]
    return selected
            


In [None]:
# Screen the categorical columns (singly and crossed, up to feat_select's
# default order) on the training frame; `tt` maps each kept combination name
# to the first element of its calc_nominal_woe result.
tt = feat_select(dfTrain,config.CATEGORICAL_COLS)

<font color=#0099ff size=5 face="黑体">根据比值生成特征</font>

<font color=#0099ff size=5 face="黑体">模型</font>

In [25]:
# Synthetic column names f0..f{n-1}: the stacked matrix has no header.
feature_names = ['f%d'%i for i in range(sparse_merge.shape[1])]

clf = lgb.LGBMClassifier(num_leaves=100, max_depth=7, n_estimators=80, n_jobs=20)
# An empty categorical_feature list makes LightGBM treat every column as numeric.
clf.fit(Xi_train_, y_train_, feature_name=feature_names, categorical_feature=[])

# Score the hold-out day with the probability of the positive class.
valid_proba = clf.predict_proba(Xi_valid_)
y_score_ = valid_proba[:, 1]
print(log_loss(y_valid_, y_score_))



0.0819671914194


In [26]:
# Positive-class probability for every test row, accumulated into an (n_test, 1) buffer.
test_proba = clf.predict_proba(Xi_test_)[:,1]
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += test_proba

submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})
# NOTE(review): pandas 1.5 renamed `line_terminator` to `lineterminator`
# and 2.0 removed the old spelling -- confirm the pandas version in use.
submit.to_csv(
    '../../Submission/advertisement/gbm_sparse_0328.txt',
    sep=" ",
    index=False,
    line_terminator='\n',
)

In [27]:
# Sanity check: column-wise maxima of the submission frame.
submit.max()

instance_id        9.222350e+18
predicted_score    2.756152e-01
dtype: float64