In [1]:
import os
import numpy as np
import pandas as pd
import warnings
from gensim.models import Word2Vec
from tqdm import tqdm
import random
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from keras import backend as K
from keras.preprocessing import text, sequence
from keras import Model
from keras.layers import Conv1D, Embedding, Input, Bidirectional, CuDNNLSTM, Dense, Concatenate, Masking, LSTM, SpatialDropout1D
from keras.layers import BatchNormalization, Dropout, Activation, Flatten
from keras.layers import GlobalMaxPool1D, GlobalAveragePooling1D, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, Callback
from keras.utils import to_categorical
from keras_radam import RAdam
from keras_lookahead import Lookahead
import gc

# Do not truncate columns or rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Suppress every warning emitted through the `warnings` module.
warnings.filterwarnings('ignore')

In [2]:
def fix_seed(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's built-in `random` module, NumPy's global generator and
    TensorFlow's graph-level seed with the same value.

    Args:
        seed: Integer seed applied to all three generators.
    """
    # `random` is imported by the notebook, so seed it too; otherwise any use of
    # it stays non-deterministic even though NumPy and TensorFlow are seeded.
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

seed = 2020
fix_seed(seed)

# 数据集处理

In [3]:
%%time
df_train = pd.read_csv('/home/kesci/input/finaltrain1235/competition_train_data.csv', encoding='GB18030')
df_test = pd.read_csv('/home/kesci/input/finaltest2443/competition_test_data.csv', encoding='GB18030')
df_feature = df_train.append(df_test)
df_feature.head()

df_log = pd.read_pickle('/home/kesci/work/scy/data/md.pkl')
df_log.sort_values(['aopsid', 'eventtime'], inplace=True)
    
for col in tqdm(['eventname', 'title', 'page_name']):
    df_log[col]  = df_log[col].astype(str)
    print(col, df_log[col].nunique())

    tmp = df_log.groupby('aopsid')[col].agg(list).reset_index()
    tmp[col] = tmp[col].map(lambda x: ' '.join(x))
    df_feature = df_feature.merge(tmp, how='left')
    del tmp
    gc.collect()
    
df_feature.to_pickle('/home/kesci/work/scy/data/data.pkl')

In [4]:
# Reload the merged feature table from the pickle path the loading cell writes to.
df_data = pd.read_pickle('/home/kesci/work/scy/data/data.pkl')

In [6]:
# Replace each listed column by integer codes; values are cast to str before encoding,
# so missing values become their own category.
categorical_cols = ['xz', 'xb', 'x_year', 'aopsid']
for cat_col in categorical_cols:
    n_distinct = df_data[cat_col].nunique()
    print(n_distinct)

    encoder = LabelEncoder()
    as_text = df_data[cat_col].astype(str)
    df_data[cat_col] = encoder.fit_transform(as_text)

In [7]:
from sklearn.preprocessing import MinMaxScaler


# Rescale the `aopsid` column with a min-max scaler so it can be fed to the model as a
# single numeric input.
# NOTE(review): this overwrites `aopsid` in place, and later cells export `aopsid`
# straight from df_data -- confirm the exported ids are meant to be these scaled values.
id_scaler = MinMaxScaler()
id_values = df_data[['aopsid']].values
df_data['aopsid'] = id_scaler.fit_transform(id_values)

In [8]:
# Rows with no value in a text column (e.g. no match in the log merge) get an empty
# string, so the tokenizer always receives str input.
text_cols = ['eventname', 'title', 'page_name']
df_data[text_cols] = df_data[text_cols].fillna('')
df_data.head()

In [9]:
def sequence_processing(df_data_, col, seq_len, embedding_dim, max_words_num):
    """Turn a whitespace-separated text column into fixed-length token-id sequences.

    Results are cached under ``seqs/``. The cache key contains the column name,
    ``max_words_num`` and ``seq_len``; if both cache files exist they are loaded
    instead of re-fitting the tokenizer.

    Args:
        df_data_: DataFrame holding the text column (only read, never modified).
        col: Name of the text column to tokenize.
        seq_len: Length every sequence is padded (at the end) or truncated
            (from the front) to.
        embedding_dim: Width of the returned embedding matrix.
        max_words_num: Passed to the tokenizer as ``num_words``; ``None`` means
            no vocabulary cap.

    Returns:
        (seqs, embedding): ``seqs`` is the padded id array for the column and
        ``embedding`` is an all-zero array of shape
        ``(len(word_index) + 1, embedding_dim)``.
    """
    print('Generate seqs')
    os.makedirs('seqs', exist_ok=True)

    # The column name must be part of the cache key: two columns processed with the
    # same (max_words_num, seq_len) would otherwise share one cache file, and the
    # second call would silently load the first column's sequences.
    cache_tag = '{}_{}_{}'.format(col, max_words_num, seq_len)
    seq_path = 'seqs/seqs_{}.npy'.format(cache_tag)
    word_index_path = 'seqs/word_index_{}.npy'.format(cache_tag)

    if os.path.exists(seq_path) and os.path.exists(word_index_path):
        seqs = np.load(seq_path)
        # word_index is a dict stored as a 0-d object array, hence allow_pickle + item().
        word_index = np.load(word_index_path, allow_pickle=True).item()
    else:
        texts = df_data_[col].values.tolist()
        # filters='' and lower=False keep every whitespace-separated token verbatim.
        tokenizer = text.Tokenizer(num_words=max_words_num, lower=False, filters='')
        tokenizer.fit_on_texts(texts)
        seqs = sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=seq_len,
                                      padding='post', truncating='pre')
        word_index = tokenizer.word_index

        np.save(seq_path, seqs)
        np.save(word_index_path, word_index)

    # Row 0 is reserved for padding, hence the +1. The Word2Vec-based initialisation of
    # this matrix is disabled, so it stays all zeros and only carries its shape.
    embedding = np.zeros((len(word_index) + 1, embedding_dim))

    return seqs, embedding

In [10]:
# All three behaviour columns use the same sequence length and embedding width;
# only `eventname` caps its vocabulary.
seq_len = 200
eventname_seqs, eventname_embedding = sequence_processing(
    df_data, col='eventname', seq_len=seq_len, embedding_dim=128, max_words_num=4200)
title_seqs, title_embedding = sequence_processing(
    df_data, col='title', seq_len=seq_len, embedding_dim=128, max_words_num=None)
page_name_seqs, page_name_embedding = sequence_processing(
    df_data, col='page_name', seq_len=seq_len, embedding_dim=128, max_words_num=None)

# 模型训练

In [11]:
# Output locations for model weights, submission files and saved probabilities.
for out_dir in ('model', '/home/kesci/work/sub', '/home/kesci/work/prob/'):
    os.makedirs(out_dir, exist_ok=True)

In [12]:
# Rows with a label form the training pool; rows without one are the test set.
has_label = df_data['is_xb'].notnull()
all_index = df_data.index[has_label.values].tolist()
test_index = df_data.index[~has_label.values].tolist()

len(all_index), len(test_index)

In [13]:
def build_model(eventname_emb, title_emb, page_name_emb, seq_len):
    """Build and compile the LSTM classifier over three token sequences plus four scalar inputs.

    Architecture: each sequence input goes through its own trainable embedding and
    a SpatialDropout1D(0.2); the three embedded sequences are concatenated and fed
    to a single LSTM(200) whose per-step outputs are average- and max-pooled. The
    pooled vectors are concatenated with 64-d embeddings of `year`, `xb`, `xz` and
    a Dense(64) projection of `userid`, then passed through
    Dense(300) -> BatchNormalization -> ReLU -> Dropout(0.2) and a sigmoid unit.

    Args:
        eventname_emb, title_emb, page_name_emb: 2-D arrays; only their shapes are
            used (rows -> embedding input_dim, columns -> output_dim). Their values
            are not loaded into the layers because the `weights=` lines are
            commented out.
        seq_len: Length of each of the three sequence inputs.

    Returns:
        A compiled Keras Model taking inputs in the order
        [seq1, seq2, seq3, year, xb, xz, userid] and producing one sigmoid output.
    """
    # Token-id sequences, one input per behaviour column.
    seq1 = Input(shape=(seq_len,))
    seq2 = Input(shape=(seq_len,))
    seq3 = Input(shape=(seq_len,))
    
    # Scalar side inputs.
    year = Input(shape=(1,))
    xb = Input(shape=(1,))
    xz = Input(shape=(1,))
    userid = Input(shape=(1,))

    # NOTE(review): the input_dim values 10 / 5 / 3 below are hard-coded; they presumably
    # match the number of label-encoded classes of each column -- verify against the data.
    emb_year = Embedding(
        input_dim=10,
        output_dim=64,
        input_length=1,
    )
    
    emb_xb = Embedding(
        input_dim=5,
        output_dim=64,
        input_length=1,
    )
    
    emb_xz = Embedding(
        input_dim=3,
        output_dim=64,
        input_length=1,
    )
    
    # Sequence embeddings are sized from the passed matrices and trained from scratch.
    emb_layer1 = Embedding(
        input_dim=eventname_emb.shape[0],
        output_dim=eventname_emb.shape[1],
#         weights=[eventname_emb],
        input_length=seq_len,
#         trainable=False
    )
    
    emb_layer2 = Embedding(
        input_dim=title_emb.shape[0],
        output_dim=title_emb.shape[1],
#         weights=[title_emb],
        input_length=seq_len,
#         trainable=False
    )
   
    emb_layer3 = Embedding(
        input_dim=page_name_emb.shape[0],
        output_dim=page_name_emb.shape[1],
#         weights=[page_name_emb],
        input_length=seq_len,
#         trainable=False
    )

    seq_emb1 = emb_layer1(seq1)
    seq_emb2 = emb_layer2(seq2)
    seq_emb3 = emb_layer3(seq3)

    seq_emb1 = SpatialDropout1D(rate=0.2)(seq_emb1)
    seq_emb2 = SpatialDropout1D(rate=0.2)(seq_emb2)
    seq_emb3 = SpatialDropout1D(rate=0.2)(seq_emb3)

    # Concatenate along the feature axis so every time step carries all three embeddings.
    seq_emb = Concatenate()([seq_emb1, seq_emb2, seq_emb3])

#     lstm = Bidirectional(CuDNNLSTM(200, return_sequences=True))(seq_emb)
    lstm = LSTM(200, return_sequences=True)(seq_emb)

    # Summarise the per-step LSTM outputs by both their mean and their maximum.
    lstm_avg_pool = GlobalAveragePooling1D()(lstm)
    lstm_max_pool = GlobalMaxPooling1D()(lstm)
    
    # Flatten drops the length-1 axis of each scalar embedding.
    year_emb = emb_year(year)
    year_emb = Flatten()(year_emb)
    
    xb_emb = emb_xb(xb)
    xb_emb = Flatten()(xb_emb)
    
    xz_emb = emb_xz(xz)
    xz_emb = Flatten()(xz_emb)
    
    # `userid` is projected linearly (no activation) rather than embedded.
    userid_emb = Dense(64)(userid)
    
    x = Concatenate()([lstm_avg_pool, lstm_max_pool, year_emb, xb_emb, xz_emb, userid_emb])
    
    # Dense -> BatchNorm -> ReLU -> Dropout head.
    x = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(300)(x))))

    out = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs=[seq1, seq2, seq3, year, xb, xz, userid], outputs=out)
    model.compile(loss='binary_crossentropy', optimizer=Lookahead(RAdam()), metrics=['accuracy'])

    return model

In [14]:
class Evaluator(Callback):
    """Keras callback that scores a fixed validation set with ROC AUC after each epoch.

    The score is stored in the epoch `logs` under the key 'val_auc', so callbacks
    that run after this one can monitor it. The best score seen so far is kept in
    `best_val_auc`.
    """

    def __init__(self, validation_data):
        """Store the validation inputs (`validation_data[0]`) and labels (`validation_data[1]`)."""
        super().__init__()
        self.x_val, self.y_val = validation_data[0], validation_data[1]
        self.best_val_auc = 0.

    def evaluate(self):
        """Return the ROC AUC of the current model on the stored validation data."""
        predictions = self.model.predict(self.x_val)
        return roc_auc_score(self.y_val, predictions)

    def on_epoch_end(self, epoch, logs=None):
        """Compute the validation AUC, update the running best, and publish it in `logs`."""
        val_auc = self.evaluate()
        self.best_val_auc = max(self.best_val_auc, val_auc)
        logs['val_auc'] = val_auc
        print(f'val_auc: {val_auc:.5f}, best_val_auc: {self.best_val_auc:.5f}')

In [15]:
bs = 1000
monitor = 'val_auc'

oof_pred = np.zeros((len(all_index), 1))
test_pred = np.zeros((len(test_index), 1))

# StratifiedKFold.split returns positions *within* the array it is given. They are
# mapped through `all_rows` before indexing df_data or the sequence arrays, so the
# loop stays correct even if the labelled rows are not the first rows of df_data.
all_rows = np.asarray(all_index)
label = df_data['is_xb'].values


def _model_inputs(rows):
    """Return the model inputs for the given row positions, in build_model's input order
    (seq1, seq2, seq3, year, xb, xz, userid)."""
    meta = df_data.iloc[rows]
    return [eventname_seqs[rows], title_seqs[rows], page_name_seqs[rows],
            meta['x_year'].values, meta['xb'].values, meta['xz'].values, meta['aopsid'].values]


# The test inputs do not depend on the fold, so build them once.
test_x = _model_inputs(test_index)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (train_index, val_index) in enumerate(kfold.split(all_index, df_data.iloc[all_index]['is_xb'])):
    # This banner marks the start of a cross-validation fold (the text says "EPOCH").
    print('\n****************************************EPOCH {}****************************************\n'.format(fold_id))

    train_rows = all_rows[train_index]
    val_rows = all_rows[val_index]

    train_x = _model_inputs(train_rows)
    val_x = _model_inputs(val_rows)
    train_y = label[train_rows]
    val_y = label[val_rows]

    model_path = 'model/lstm_{}.h5'.format(fold_id)
    checkpoint = ModelCheckpoint(model_path, monitor=monitor, verbose=1, save_best_only=True, mode='max', save_weights_only=True)
    earlystopping = EarlyStopping(monitor=monitor, patience=5, verbose=1, mode='max')
    reduce_lr = ReduceLROnPlateau(monitor=monitor, factor=0.5, patience=2, mode='max', verbose=1)

    model = build_model(eventname_embedding, title_embedding, page_name_embedding, seq_len)
    # Evaluator must come first: it writes 'val_auc' into the epoch logs, which the
    # checkpoint / LR-schedule / early-stopping callbacks then monitor.
    model.fit(train_x, train_y, batch_size=bs, epochs=30,
              validation_data=(val_x, val_y),
              callbacks=[Evaluator(validation_data=(val_x, val_y)), checkpoint, reduce_lr, earlystopping], verbose=1, shuffle=True)

    # Restore the best checkpoint of this fold before predicting.
    model.load_weights(model_path)

    prob = model.predict(val_x, batch_size=bs, verbose=1)
    # oof_pred is aligned with all_index, so it is indexed by the fold-local positions.
    oof_pred[val_index] = prob
    auc = roc_auc_score(val_y, prob)
    print(auc)

    prob = model.predict(test_x, batch_size=bs, verbose=1)
    # Average the test predictions over however many folds the splitter actually has.
    test_pred += prob / kfold.n_splits

In [None]:
# Print the layer summary of `model`, i.e. the model built in the last cross-validation fold.
model.summary()

In [None]:
# Out-of-fold predictions for the labelled rows and the overall out-of-fold AUC.
# .copy() makes df_oof an independent frame, so adding a column is not an assignment
# on a slice of df_data (which raises SettingWithCopyWarning when warnings are on).
df_oof = df_data.iloc[all_index][['aopsid', 'is_xb']].copy()
df_oof['pred'] = oof_pred
auc = roc_auc_score(df_oof['is_xb'], df_oof['pred'])
print(auc)

In [None]:
# Averaged test predictions, as an independent frame (see .copy()).
df_sub = df_data.iloc[test_index][['aopsid']].copy()
df_sub['pred'] = test_pred
# Stack out-of-fold rows followed by test rows. pd.concat replaces DataFrame.append
# (removed in pandas 2.0) and keeps the same row order and index labels.
df_prob = pd.concat([df_oof[['aopsid', 'pred']], df_sub])

# Saved twice: once under a fixed name and once tagged with the out-of-fold AUC.
df_prob.to_pickle('/home/kesci/work/prob/lstm_prob.pkl')
df_prob.to_pickle('/home/kesci/work/prob/lstm_prob_{}.pkl'.format(auc))

In [None]:
# Preview the first rows of the stacked out-of-fold + test predictions.
df_prob.head()

In [None]:
# Write the tab-separated submission file for the test rows.
# .copy() avoids assigning a new column onto a slice of df_data.
# NOTE(review): `aopsid` in df_data was label-encoded and then min-max scaled in earlier
# cells, so the ids written here are those transformed values -- confirm that this is
# what the submission format expects.
prediction = df_data.iloc[test_index][['aopsid']].copy()
prediction['pred'] = test_pred
prediction[['aopsid', 'pred']].to_csv('/home/kesci/work/sub/sub_lstm.txt', index=False, encoding='utf-8', sep='\t')

In [None]:
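# Submission via the Kesci command-line tool (disabled). Pass the API token through the
# KESCI_TOKEN environment variable instead of hard-coding it in the notebook.
# !wget -nv -O kesci_submit https://cdn.kesci.com/submit_tool/v4/kesci_submit&&chmod +x kesci_submit
# !./kesci_submit -token $KESCI_TOKEN -file sub/sub_lstm.txt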