In [13]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb

from scipy import sparse

from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,roc_curve,auc

In [2]:
def encoder(df,varList):
    lbl = LabelEncoder()
    for var in varList:
        df[var] = lbl.fit_transform(df[var].astype(str))
        df[var] = pd.to_numeric(df[var],downcast = 'unsigned')
    return df
def score_performance(target,score):
    fpr, tpr, thresholds = roc_curve(target,score)
    ks = max(tpr-fpr)
    auc_ = auc(fpr, tpr)
    return ks,auc_

In [4]:
def lgbmodel(argsDict,XTrain,yTrain,bst_estimators = 20000,(XValid,yValid)=(None,None),features=None):
    leaf = argsDict['leaf']*5 + 5
    learning_rate = argsDict['learning_rate']*0.02 + 0.01
    colsample_bytree = argsDict['colsample_bytree']*0.1 +0.7
    subsample = argsDict['subsample']*0.1 +0.7
    max_bin = argsDict['max_bin']*5 + 5
    
    features = [i for i in range(XTrain.shape[1])] if features==None else features
    
    clf = lgb.LGBMClassifier(
        num_leaves=leaf,
        n_estimators=bst_estimators,
        n_jobs=20,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        max_bin=max_bin
    )
    if bst_estimators == 20000:
        clf.fit(XTrain, yTrain, eval_set=[(XValid, yValid)],feature_name = features,
                early_stopping_rounds=100,verbose = 200)
        return clf,clf.best_iteration_
    else:
        clf.fit(XTrain, yTrain,feature_name = features,)
        return clf

def lgbtuning(argsDict):
    print(argsDict)
    clf,bst_estimators = lgbmodel(argsDict)
    y_score_ = clf.predict_proba(Xi_valid_[features],num_iteration=bst_estimators)[:, 1]
    ks,auc = ks_metric(y_valid_, y_score_)
    print(auc)
    print('-------------------------------------------------------------------------------------------------------------------------------\n')
    return -auc


def feat_select(argsDict,feat,target,cutoff=0):
    features = [i for i in range(feat.shape[1])]
    XTrain,XValid,yTrain,yValid = train_test_split(feat,target,test_size = 0.15, random_state=42)
    clf,clf.best_iteration_ = lgbmodel(argsDict)

In [6]:
def str_process(df,var):
    cnt = CountVectorizer()
    cnt.vocabulary_ = dict([(str(list(set.union(*df[var].apply(lambda x:[int(i) for i in x.split(' ') if not i=='']).apply(set)))[i]),i) for i in range(len(tmp2))])
    result = cnt.transform(df[var])
    return result
    
def feat_engineer(df,trainNum,cutoff = 20 ,strList=['uid','appIdAction','appIdInstall','ct','interest1','interest2','interest3','interest4','interest5','kw1','kw2','kw3','topic1','topic2','topic3','marriageStatus','os'],
                  lbeList=['LBS','carrier'],
                  nochangeList=['age','consumptionAbility','education','gender','house']):
    feat = sparse.csr_matrix(df[nochangeList].values)
    for var in strList:
        try:
            feat = sparse.hstack(feat,str_process(df,var))
        except:
            feat = str_process(df,var)
    for var in lbeList:
        lbe = LabelEncoder()
        oh = OneHotEncoder()
        featTmp = oh.fit_transform(lbe.fit_transform(df[var]))
        try:
            feat = sparse.hstack(feat,featTmp)
        except:
            feat = featTmp
    return feat
    

In [7]:
dfTrain = pd.read_csv('../../Data/tengAD/Raw/train.csv')
dfTrain.label.replace({-1:0},inplace =True)
dfTest = pd.read_csv('../../Data/tengAD/Raw/test1.csv')

In [10]:
dfAd = pd.read_csv('../../Data/tengAD/Raw/adFeature.csv')
#dfAd = encoder(dfAd,[i for i in dfAd if i!='aid'])
dfUser = pd.read_csv('../../Data/tengAD/Raw/userFeature.csv')
#dfUser.fillna('',inplace =True)
#dfUser = encoder(dfUser,[i for i in dfUser if i!='uid'])

In [14]:
for aid in dfTrain['aid'].unique():
    dfTrainTmp = dfUser.loc[dfUser['uid'].isin(dfTrain.loc[dfTrain['aid']==aid,'uid'].unique())]
    dfTestTmp = dfUser.loc[dfUser['uid'].isin(dfTest.loc[dfTest['aid']==aid,'uid'].unique())]
    dfAll = pd.concat([dfTrainTmp,dfTestTmp])
    trainNum = dfTrainTmp.shape[0]
    del dfTrainTmp
    del dfTestTmp
    dfAll.reset_index(drop=True,inplace=True)
    dfAll.house.fillna(0,inplace=True)
    dfAll.fillna('',inplace=True)
    
    tmpfeat = feat_engineer(dfAll,['appIdAction'],['LBS'],['age'])
    break
    
    

NameError: name 'tmp2' is not defined

In [None]:
dfTrain = dfTrain.merge(dfAd,'left','aid')
dfTrain = dfTrain.merge(dfUser,'left','uid')

In [None]:
y = dfTrain['label']
del dfTrain['label']
XTrain,XValid,yTrain,yValid = train_test_split(dfTrain,y,test_size = 0.8, random_state=42)
#XValid1,XValid2,yValid1,yValid2 = train_test_split(XValid,yValid,test_size = 0.5, random_state=42)

In [None]:
clf = lgb.LGBMClassifier(
    learning_rate =0.1,
    leaf = 25,
    max_bin = 20,
    subsample = 0.8,
    n_estimators=20000,
    n_jobs=20,
    colsample_bytree = 0.8,
)
clf.fit(XTrain,yTrain,eval_set = (XValid,yValid),
        categorical_feature =XTrain.columns.tolist(),
        early_stopping_rounds = 100,
        eval_metric = 'auc')

In [None]:
oh = OneHotEncoder()
XValidLeaf1 = oh.fit_transform(clf.apply(XValid1))
XValidLeaf2 = oh.fit_transform(clf.apply(XValid2))

In [None]:
lr = LogisticRegression(penalty ='l1',C=1.)
lr.fit(XValidLeaf1, yValid1)

In [None]:
scoreValid1 = lr.predict(XValidLeaf1)
scoreValid2 = lr.predict(XValidLeaf2)

In [None]:
score_performance(yValid1,scoreValid1)

In [None]:
score_performance(yValid2,scoreValid2)

In [None]:
lr

In [None]:
dfTest = dfTest.merge(dfAd,'left','aid')
dfTest = dfTest.merge(dfUser,'left','uid')

In [None]:
scoreTest = clf.predict_proba(dfTest,num_iteration=clf.best_iteration_)[:,1]

In [None]:
submit = dfTest[['aid','uid']]
submit['score'] = scoreTest

In [None]:
submit.to_csv('../../Submission/tengAD/submission.csv',index=False)