In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import os
import gc
from scipy.sparse import hstack

In [2]:
data_dir = '../../../data/raw/preliminary_contest_data/'
ad_cnt_dir = '../../../data/nlp_count/preliminary_contest_data/byAdFeatureName/'
user_cnt_dir = '../../../data/nlp_count/preliminary_contest_data/byUserFeatureName/'
user_tfidf_dir = '../../../data/nlp_tfidf/preliminary_contest_data/byUserFeatureName/'

In [3]:
def load(filename, **kw):
    return pd.read_csv(os.path.join(data_dir, filename), **kw)

In [4]:
def load_pickle(filepath):
    obj = None
    with open(filepath, "rb") as f:
        obj = pickle.load(f)
    return obj

In [5]:
def load_ad_cnt(feat_name):
    filename = "adFeature.[featureName='{}'].pkl".format(feat_name)
    filepath = os.path.join(ad_cnt_dir, filename)
    index, matrix = load_pickle(filepath)
    
    filename = "aid.pkl".format(feat_name)
    filepath = os.path.join(ad_cnt_dir, filename)
    uid = load_pickle(filepath)
    
    return uid, (index, matrix)

In [6]:
def load_user_cnt(feat_name):
    filename = "userFeature.[featureName='{}'].pkl".format(feat_name)
    filepath = os.path.join(user_cnt_dir, filename)
    index, matrix = load_pickle(filepath)
    
    filename = "uid.pkl".format(feat_name)
    filepath = os.path.join(user_cnt_dir, filename)
    uid = load_pickle(filepath)
    
    return uid, (index, matrix)

In [7]:
def load_user_tfidf(feat_name):
    filename = "userFeature.[featureName='{}'].pkl".format(feat_name)
    filepath = os.path.join(user_tfidf_dir, filename)
    index, idf, matrix = load_pickle(filepath)
    
    filename = "uid.pkl".format(feat_name)
    filepath = os.path.join(user_tfidf_dir, filename)
    uid = load_pickle(filepath)
    
    return uid, (index, idf, matrix)

In [8]:
df_train = load("train.csv")

In [9]:
uid, (ufeat_index, uvec) = load_user_cnt("interest1")
aid, (afeat_index, avec) = load_ad_cnt('aid')
uid_to_index = dict(zip(uid, list(range(len(uid)))))  # mapping from uids to distinct indices
aid_to_index = dict(zip(aid, list(range(len(aid)))))  # mapping from aids to distinct indices

a_index = df_train['aid'].map(aid_to_index).values  # list of indices for matrix joining
u_index = df_train['uid'].map(uid_to_index).values  # list of indices for matrix joining

X = hstack((avec[a_index,:], uvec[u_index,:])).tocsr()  # joined user and advertise matrix
y = (df_train['label'].values + 1) / 2

X_train, y_train = X[:6000000], y[:6000000]
X_valid, y_valid = X[6000000:], y[6000000:]

lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)

In [10]:
lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 5,
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=200,
                valid_sets=[lgb_train, lgb_valid], 
                valid_names=['train', 'valid1'])

[1]	train's auc: 0.551884	valid1's auc: 0.551446
[2]	train's auc: 0.552501	valid1's auc: 0.552079
[3]	train's auc: 0.554068	valid1's auc: 0.553465
[4]	train's auc: 0.554547	valid1's auc: 0.553624
[5]	train's auc: 0.55593	valid1's auc: 0.555189
[6]	train's auc: 0.556199	valid1's auc: 0.555471
[7]	train's auc: 0.556544	valid1's auc: 0.555728
[8]	train's auc: 0.556614	valid1's auc: 0.555803
[9]	train's auc: 0.556687	valid1's auc: 0.555789
[10]	train's auc: 0.557061	valid1's auc: 0.556237
[11]	train's auc: 0.557105	valid1's auc: 0.556319
[12]	train's auc: 0.558209	valid1's auc: 0.557438
[13]	train's auc: 0.558242	valid1's auc: 0.55746
[14]	train's auc: 0.558856	valid1's auc: 0.557981
[15]	train's auc: 0.559374	valid1's auc: 0.558604
[16]	train's auc: 0.559669	valid1's auc: 0.558859
[17]	train's auc: 0.559779	valid1's auc: 0.559009
[18]	train's auc: 0.560642	valid1's auc: 0.559699
[19]	train's auc: 0.561354	valid1's auc: 0.560359
[20]	train's auc: 0.561735	valid1's auc: 0.560718
[21]	train'

KeyboardInterrupt: 

In [None]:
uid, (ufeat_index, uidf, uvec) = load_user_tfidf("interest1")
uid_to_index = dict(zip(uid, list(range(len(uid)))))

u_index = df_train['uid'].map(uid_to_index).values

X = hstack((avec[a_index,:], uvec[u_index,:])).tocsr()

X_train = X[:6000000]
X_valid = X[6000000:]

lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)

In [None]:
lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 4,
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=300,
                valid_sets=[lgb_train, lgb_valid], 
                valid_names=['train', 'valid1'])

In [108]:
def get_set(dataframe, test, features_u_want):
    
    aid, (afeat_index, avec) = load_ad_cnt('aid')
    aid_to_index = dict(zip(aid, list(range(len(aid)))))  # mapping from aids to distinct indices
    a_index = dataframe['aid'].map(aid_to_index).values  # list of indices for matrix joining
    
    id_index_vec = []                        
    for each in features_u_want:
        id_index_vec.append(load_user_cnt(each))           #eid, (efeat_index, evec) = load_user_cnt("education")
    print(1)
    id2index = []                                        # mapping from uids to distinct indices
    for each in id_index_vec:
        id2index.append(dict(zip(each[0], list(range(len(each[0]))))))  # eid_to_index = dict(zip(eid, list(range(len(eid)))))
    print(2)
    # list of indices for matrix joining
    
    index_mapper = []
    for each in id2index:
        index_mapper.append(dataframe['uid'].map(each).values)     # e_index = dataframe['uid'].map(eid_to_index).values
    print(3)
    X = avec[a_index,:]
    temp = hstack([id_index_vec[i][1][1][index_mapper[i],:] for i in range((len(id_index_vec)))])
    X = hstack([X,temp]).tocsr()
    """
    X = hstack((avec[a_index,:], evec[e_index,:], i1vec[i1_index, :], i2vec[i2_index, :], i3vec[i3_index, :],
               i4vec[i4_index, :], i5vec[i5_index, :], k1vec[k1_index, :], k2vec[k2_index, :], k3vec[k3_index, :], 
               appvec[app_index, :], apivec[api_index, :])).tocsr()  # joined user and advertise matrix"""
    if test==True:
        return X
    else:
        y = (dataframe['label'].values + 1) / 2
    
    return X, y


In [109]:
gc.collect()
X, y = get_set(df_train, test = False, features_u_want = ['education','interest1','interest2','interest3','interest4','interest5','kw1','kw2',
                                       'kw3', 'appIdAction',  'appIdInstall'])
X_train, y_train = X[:6000000], y[:6000000]
X_valid, y_valid = X[6000000:], y[6000000:]

lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)
gc.collect()

1
2
3


21

In [110]:
lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 5,
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_valid], 
                valid_names=['train', 'valid1'])

[1]	train's auc: 0.538001	valid1's auc: 0.502577
[2]	train's auc: 0.541933	valid1's auc: 0.501816
[3]	train's auc: 0.545326	valid1's auc: 0.501699
[4]	train's auc: 0.545784	valid1's auc: 0.501627
[5]	train's auc: 0.551716	valid1's auc: 0.501393
[6]	train's auc: 0.556275	valid1's auc: 0.501179
[7]	train's auc: 0.55499	valid1's auc: 0.501474
[8]	train's auc: 0.555671	valid1's auc: 0.501438
[9]	train's auc: 0.560689	valid1's auc: 0.501415
[10]	train's auc: 0.563348	valid1's auc: 0.500891
[11]	train's auc: 0.564758	valid1's auc: 0.500698
[12]	train's auc: 0.564237	valid1's auc: 0.500907
[13]	train's auc: 0.563782	valid1's auc: 0.500833
[14]	train's auc: 0.564892	valid1's auc: 0.500825
[15]	train's auc: 0.564607	valid1's auc: 0.500949
[16]	train's auc: 0.564372	valid1's auc: 0.500875
[17]	train's auc: 0.564333	valid1's auc: 0.501014
[18]	train's auc: 0.565519	valid1's auc: 0.500895
[19]	train's auc: 0.565102	valid1's auc: 0.500953
[20]	train's auc: 0.564574	valid1's auc: 0.500987
[21]	train

In [13]:
from sklearn.linear_model import LogisticRegression   
from sklearn import metrics
lr = LogisticRegression('l2', solver = 'sag', class_weight = 'balanced', verbose = 1)
logisticR = lr.fit(X_train, y_train)
lr_pred = logisticR.predict_proba(X_valid)
gc.collect()

max_iter reached after 1066 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 17.8min finished


6

In [28]:
from sklearn.externals import joblib
# save model
joblib.dump(logisticR, 'lr.pkl')
gbm.save_model('model.txt')
# load model
# bst = lgb.Booster(model_file='model.txt')
# lr_pickle = joblib.load('lr.pkl')

In [40]:
# build X_test
gc.collect()
df_test = load("test1.csv")
X = get_set(df_test, True)

In [41]:
lr_predict = logisticR.predict_proba(X)
gbm_predict = gbm.predict(X.astype(np.float32))
emsembled = 0.7*gbm_predict+0.3*lr_predict[:,1]z

In [50]:
df_test['score'] = [round(each, 8) for each in emsembled]
df_test.to_csv("submission.csv", index = False)

In [48]:
df_test.shape

(2265989, 3)