# age|gender预测-BiLSTM版

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
import gc

In [4]:
import tensorflow as tf

In [5]:
from tensorflow import keras
from tensorflow.keras import layers

In [6]:
tf.__version__

'2.1.0'

In [7]:
tf.config.experimental.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

设置显存按需增长，防止程序独占gpu显存

In [8]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
if not tf.__version__.startswith('1.'):  # TensorFlow 2.x API
    gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
else:  # TensorFlow 1.x API: the option lives on the session config
    config = tf.ConfigProto()  # allow_soft_placement=True
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

In [9]:
%load_ext autoreload
%autoreload 2

## 载入数据

### 载入点击日志

In [10]:
pd_user_ad_list = pd.read_csv("./data/pd_user_ad_id_seg_all_click_time.csv")

In [11]:
pd_user_ad_list.shape

(1900000, 9)

In [11]:
pd_user_ad_list.dtypes

user_id              int64
time                object
creative_id         object
click_times         object
ad_id               object
product_id          object
product_category    object
advertiser_id       object
industry            object
dtype: object

In [12]:
pd_user_ad_list.head()

Unnamed: 0,user_id,time,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,1,20 20 20 39 40 43 46 52 60 64 64 73 76,877468 209778 821396 1683713 122032 71691 1940...,1 1 1 1 1 1 1 1 1 1 1 1 2,773445 188507 724607 1458878 109959 66210 1678...,0 136 0 0 1334 0 0 0 0 64 1454 64 1261,5 2 5 5 2 18 5 5 18 2 2 2 2,29455 9702 7293 14668 11411 14681 17189 367 44...,106 6 326 326 0 326 73 217 64 245 238 245 6
1,2,10 11 14 17 28 28 28 38 38 39 41 42 42 42 44 4...,63441 155822 39714 609050 13069 1266180 441462...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,58788 139702 38066 541125 14495 1107111 392680...,87 80 129 129 1400 0 87 0 1261 0 111 129 0 0 0...,2 2 2 2 2 18 2 18 2 18 2 2 18 18 18 18 2 2 2 2...,22885 10686 18562 25932 768 34505 22885 26006 ...,318 238 6 6 317 47 318 47 6 47 242 6 47 6 47 4...
2,3,12 13 14 14 14 17 19 22 31 36 37 44 47 47 50 5...,661347 808612 593522 825434 710859 726940 3920...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,586668 713448 527601 728308 629278 643108 3502...,36256 40905 1674 35985 1674 0 0 1031 1786 2258...,17 17 2 17 2 18 18 2 2 2 18 12 2 2 18 18 8 4 2...,32974 9877 17018 14186 18492 9058 8371 2336 39...,0 0 322 0 322 6 54 6 322 322 205 302 322 322 2...
3,4,8 15 41 44 48 48 48 48 49 52 58 58 59 61 62 62...,39588 589886 574787 1892854 1230094 31070 2264...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,37966 524312 511235 1638619 1076286 30773 1953...,1862 0 2625 38743 39904 39422 37758 41265 0 15...,2 18 2 17 17 17 4 17 18 2 2 2 2 2 2 2 2 2 5 2 ...,19451 7976 13084 12130 10172 13299 811 23664 1...,238 25 248 0 0 0 0 0 88 319 6 238 319 238 319 ...
4,5,3 13 14 15 20 21 24 25 27 28 29 30 32 32 35 35...,296145 350759 24333 43235 852327 1054434 12964...,1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,265971 314795 24966 41148 751113 925792 113378...,0 0 87 136 0 136 0 0 0 1064 0 2620 0 0 0 136 2...,5 8 2 2 18 2 5 18 18 2 18 2 5 5 18 2 2 18 18 5...,11882 992 22885 9706 38760 2862 17745 31552 22...,297 0 318 6 322 6 288 322 319 238 322 322 203 ...


In [13]:
pd_user_ad_list['word_count'] = pd_user_ad_list['click_times'].apply(lambda x:len(x.split(" ")))

In [14]:
pd_user_ad_list['word_count'].describe(percentiles=[0.05,0.25,0.5,0.75,0.95,0.99,0.999,0.9999])

count     1.900000e+06
mean      3.350962e+01
std       9.479383e+01
min       2.000000e+00
5%        1.000000e+01
25%       1.500000e+01
50%       2.400000e+01
75%       4.000000e+01
95%       8.900000e+01
99%       1.570000e+02
99.9%     2.940000e+02
99.99%    5.540002e+02
max       1.139740e+05
Name: word_count, dtype: float64

### 载入标签数据

In [12]:
df_user_label = pd.read_csv("./data/train_preliminary/user.csv")

In [16]:
df_user_label.shape

(900000, 3)

In [17]:
df_user_label.head()

Unnamed: 0,user_id,age,gender
0,1,4,1
1,2,10,1
2,3,7,2
3,4,5,1
4,5,4,1


In [18]:
df_user_label.dtypes

user_id    int64
age        int64
gender     int64
dtype: object

In [19]:
df_user_label['gender'].value_counts()

1    602610
2    297390
Name: gender, dtype: int64

In [20]:
df_user_label['age'].value_counts().sort_index()

1      35195
2     149271
3     202909
4     150578
5     130667
6     101720
7      66711
8      31967
9      19474
10     11508
Name: age, dtype: int64

从0开始编码

In [13]:
df_user_label['gender_c'] = df_user_label['gender']-1
df_user_label['age_c'] = df_user_label['age']-1

In [22]:
df_user_label['gender_c'].value_counts()

0    602610
1    297390
Name: gender_c, dtype: int64

In [23]:
df_user_label['age_c'].value_counts().sort_index()

0     35195
1    149271
2    202909
3    150578
4    130667
5    101720
6     66711
7     31967
8     19474
9     11508
Name: age_c, dtype: int64

#### 分离出测试集用户id

In [14]:
user_ids_test = set(pd_user_ad_list['user_id'].unique())-set(df_user_label['user_id'].values)

In [15]:
len(user_ids_test)

1000000

In [16]:
# A set has no defined order and newer pandas versions refuse it as DataFrame data,
# so materialise the test-user ids as a sorted list first (deterministic row order).
df_test_user_id = pd.DataFrame(data=sorted(user_ids_test),columns=['user_id'])

In [17]:
df_test_user_id.shape

(1000000, 1)

In [28]:
df_test_user_id.head()

Unnamed: 0,user_id
0,3000001
1,3000002
2,3000003
3,3000004
4,3000005


In [18]:
df_test_user_id['user_id'].min(),df_test_user_id['user_id'].max()

(3000001, 4000000)

In [19]:
df_user_label['user_id'].min(),df_user_label['user_id'].max()

(1, 900000)

## 构建词典

### 素材id

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [22]:
# Vocabulary builder for creative_id tokens; ids seen in fewer than 5 users' sequences are dropped (min_df=5).
# NOTE(review): the default token_pattern (see the fit output below) only matches tokens of
# two or more word characters, so one-character ids never enter the vocabulary - confirm this is intended.
tf_vectorizer_cr = CountVectorizer(min_df=5)

In [23]:
tf_vectorizer_cr.fit(pd_user_ad_list['creative_id'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [245]:
gc.collect()

123197

min_df=30

In [35]:
min(tf_vectorizer_cr.vocabulary_.values()),max(tf_vectorizer_cr.vocabulary_.values()),len(tf_vectorizer_cr.vocabulary_.values())

(0, 222543, 222544)

min_df=10

In [246]:
min(tf_vectorizer_cr.vocabulary_.values()),max(tf_vectorizer_cr.vocabulary_.values()),len(tf_vectorizer_cr.vocabulary_.values())

(0, 578523, 578524)

min_df=5

In [37]:
min(tf_vectorizer_cr.vocabulary_.values()),max(tf_vectorizer_cr.vocabulary_.values()),len(tf_vectorizer_cr.vocabulary_.values())

(0, 1020346, 1020347)

min_df=2

In [27]:
min(tf_vectorizer_cr.vocabulary_.values()),max(tf_vectorizer_cr.vocabulary_.values()),len(tf_vectorizer_cr.vocabulary_.values())

(0, 2083852, 2083853)

min_df=1

In [326]:
min(tf_vectorizer_cr.vocabulary_.values()),max(tf_vectorizer_cr.vocabulary_.values()),len(tf_vectorizer_cr.vocabulary_.values())

(0, 3412763, 3412764)

生成词典,下同

In [42]:
# word_to_id: 形如{"word_1":id_1,"word_2":id_2,...}
dict_cr = tf_vectorizer_cr.vocabulary_ 
# id_to_word: 形如{"id_1":word_1,"id_2":word_2,...}
dict_cr_reverse = dict(zip(tf_vectorizer_cr.vocabulary_.values(), tf_vectorizer_cr.vocabulary_.keys()))

### 广告id

In [24]:
tf_vectorizer_ad = CountVectorizer(min_df=5)

In [25]:
tf_vectorizer_ad.fit(pd_user_ad_list['ad_id'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [38]:
min(tf_vectorizer_ad.vocabulary_.values()),max(tf_vectorizer_ad.vocabulary_.values()),len(tf_vectorizer_ad.vocabulary_.values())

(0, 1006031, 1006032)

In [27]:
dict_ad = tf_vectorizer_ad.vocabulary_
dict_ad_reverse = dict(zip(tf_vectorizer_ad.vocabulary_.values(), tf_vectorizer_ad.vocabulary_.keys()))

### 广告主id

In [33]:
tf_vectorizer_av = CountVectorizer(min_df=5)

In [34]:
tf_vectorizer_av.fit(pd_user_ad_list['advertiser_id'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [40]:
min(tf_vectorizer_av.vocabulary_.values()),max(tf_vectorizer_av.vocabulary_.values()),len(tf_vectorizer_av.vocabulary_.values())

(0, 40941, 40942)

In [36]:
dict_av = tf_vectorizer_av.vocabulary_
dict_av_reverse = dict(zip(tf_vectorizer_av.vocabulary_.values(), tf_vectorizer_av.vocabulary_.keys()))

### 产品id

In [29]:
# Vocabulary builder for product_id tokens; ids seen in fewer than 5 users' sequences are dropped (min_df=5).
# NOTE(review): the default token_pattern (see the fit output below) only matches tokens of
# two or more word characters, so one-character values such as the frequent `0` in product_id
# never enter the vocabulary - confirm this is intended.
tf_vectorizer_pr = CountVectorizer(min_df=5)

In [30]:
tf_vectorizer_pr.fit(pd_user_ad_list['product_id'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [87]:
min(tf_vectorizer_pr.vocabulary_.values()),max(tf_vectorizer_pr.vocabulary_.values()),len(tf_vectorizer_pr.vocabulary_.values())

(0, 22949, 22950)

In [32]:
dict_pr = tf_vectorizer_pr.vocabulary_
dict_pr_reverse = dict(zip(tf_vectorizer_pr.vocabulary_.values(), tf_vectorizer_pr.vocabulary_.keys()))

## 载入预训练的词向量

In [41]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import word2vec

In [126]:
def get_embedding_initial(dict_id_to_word,wv):
    """
    Build the pretrained weight matrix used to initialise an embedding layer.

    Parameters
    ----------
    dict_id_to_word: dict, maps the integer ids 0..len-1 to words, e.g. {0: word_0, 1: word_1, ...}
    wv: trained word vectors; must expose `vector_size` and lookup by word (`wv[word]`)

    Returns
    ----------
    numpy.ndarray of shape (len(dict_id_to_word) + 1, wv.vector_size), dtype float32.
    Row 0 stays all-zero (reserved for the padding item); row i + 1 holds the vector of word id i.
    """
    vocab_size = len(dict_id_to_word)
    # One extra leading row: index 0 is reserved for padding, so every word id is shifted by one.
    weights = np.zeros((vocab_size + 1, wv.vector_size), dtype=np.float32)
    for row in range(1, vocab_size + 1):
        weights[row] = np.array(wv[dict_id_to_word[row - 1]])
    return weights

### 素材id

In [157]:
# del wv_cr

In [43]:
wv_cr = KeyedVectors.load("./model/word2vec/kv_cr/tx_gender_age_log_cr_sg_win100_neg5_dim128_iter10_minc1_w2v.kv", mmap='r')

In [44]:
type(wv_cr)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [38]:
wv_cr.vector_size

128

In [46]:
len(wv_cr.vocab.keys())

3412772

In [161]:
# del embedding_cr

In [47]:
embedding_cr = get_embedding_initial(dict_cr_reverse,wv_cr)

In [83]:
embedding_cr.shape

(1020348, 128)

In [50]:
embedding_cr.dtype

dtype('float32')

In [51]:
gc.collect()

10

### 广告id

In [185]:
wv_ad = KeyedVectors.load("./model/word2vec/kv_ad/tx_gender_age_log_sg_win10_neg5_dim128_iter10_minc1_w2v_ad.kv", mmap='r')

In [186]:
wv_ad.vector_size

128

In [187]:
len(wv_ad.vocab.keys())

3027360

In [188]:
embedding_ad = get_embedding_initial(dict_ad_reverse,wv_ad)

In [189]:
embedding_ad.shape

(1006033, 128)

In [190]:
embedding_ad.dtype

dtype('float32')

### 广告主id

In [69]:
wv_av = KeyedVectors.load("./model/word2vec/kv_adv/tx_gender_age_log_sg_win10_neg5_dim64_iter10_minc1_w2v_adv.kv", mmap='r')

In [70]:
wv_av.vector_size

64

In [71]:
len(wv_av.vocab.keys())

57870

In [72]:
embedding_av = get_embedding_initial(dict_av_reverse,wv_av)

In [85]:
embedding_av.shape

(40943, 64)

In [74]:
embedding_av.dtype

dtype('float32')

### 产品id

In [75]:
wv_pr = KeyedVectors.load("./model/word2vec/kv_pr/tx_gender_age_log_sg_win10_neg5_dim64_iter10_minc1_w2v_pr.kv", mmap='r')

In [76]:
wv_pr.vector_size

64

In [77]:
len(wv_pr.vocab.keys())

39057

In [78]:
embedding_pr = get_embedding_initial(dict_pr_reverse,wv_pr)

In [86]:
embedding_pr.shape

(22951, 64)

In [80]:
embedding_pr.dtype

dtype('float32')

## 构造序列特征

### 训练集

In [81]:
pd_user_ad_list_train = pd.merge(pd_user_ad_list,df_user_label,on="user_id")

In [82]:
pd_user_ad_list_train.shape

(900000, 13)

In [125]:
def seq_list(df_data,col_name,dict_id,pad_len=100):
    """
    Encode each row's space-separated token string as a fixed-length sequence of integer ids.

    Parameters
    ----------
    df_data: pandas.DataFrame, data source
    col_name: str, name of the column holding the space-separated tokens
    dict_id: dict, word-to-id mapping, e.g. {"word_1": id_1, "word_2": id_2, ...}
    pad_len: int, length every sequence is padded / truncated to

    Returns
    ----------
    numpy.ndarray, one row of ids per input row, as produced by
    keras.preprocessing.sequence.pad_sequences with its default settings
    """
    def encode(text):
        # Tokens missing from dict_id are skipped; known ids are shifted by +1,
        # so 0 is never produced for a real token.
        return [dict_id[token] + 1 for token in text.split(" ") if token in dict_id]

    encoded_rows = [encode(text) for text in df_data[col_name].values]
    return keras.preprocessing.sequence.pad_sequences(encoded_rows, maxlen=pad_len)

In [90]:
seq_list_train_cr = seq_list(pd_user_ad_list_train,col_name='creative_id',dict_id=dict_cr)

In [91]:
seq_list_train_cr.shape

(900000, 100)

In [92]:
seq_list_train_pr = seq_list(pd_user_ad_list_train,col_name='product_id',dict_id=dict_pr)

In [93]:
seq_list_train_pr.shape

(900000, 100)

In [94]:
seq_list_train_av = seq_list(pd_user_ad_list_train,col_name='advertiser_id',dict_id=dict_av)

In [95]:
seq_list_train_av.shape

(900000, 100)

In [96]:
seq_list_train_ad = seq_list(pd_user_ad_list_train,col_name='ad_id',dict_id=dict_ad)

In [97]:
seq_list_train_ad.shape

(900000, 100)

### 测试集

In [99]:
pd_user_ad_list_test = pd.merge(pd_user_ad_list,df_test_user_id,on="user_id")

In [100]:
pd_user_ad_list_test.shape

(1000000, 9)

In [101]:
seq_list_test_cr = seq_list(pd_user_ad_list_test,col_name='creative_id',dict_id=dict_cr)

In [102]:
seq_list_test_cr.shape

(1000000, 100)

In [103]:
seq_list_test_pr = seq_list(pd_user_ad_list_test,col_name='product_id',dict_id=dict_pr)

In [104]:
seq_list_test_av = seq_list(pd_user_ad_list_test,col_name='advertiser_id',dict_id=dict_av)

In [105]:
seq_list_test_ad = seq_list(pd_user_ad_list_test,col_name='ad_id',dict_id=dict_ad)

## 构造词向量特征

根据用户的点击序列，以点击次数为权重对词向量做加权平均，得到用户向量；主要用作DNN的输入，以测试词向量加权平均的模型效果。  
**若只测试BiLSTM或TextCNN，可跳过此步骤。**

In [115]:
def user_w2v(df_data,wv,col_name):
    """
    Build one click-weighted average word vector per row (user).

    For every row, the tokens in `col_name` that are present in `wv` are averaged,
    each weighted by the click count at the same position of the row's
    `click_times` string.

    Parameters
    ----------
    df_data: pandas.DataFrame, data source; must contain `col_name` and a `click_times`
        column, both holding space-separated strings
    wv: trained word vectors; must expose `vector_size`, membership tests (`word in wv`)
        and lookup by word (`wv[word]`)
    col_name: str, name of the column holding the space-separated tokens

    Returns
    ----------
    numpy.ndarray, one vector of length wv.vector_size per row. A row whose total
    weight is zero (no token found in `wv`, or all click counts 0) yields an
    all-zero vector rather than NaN.
    """
    dim = wv.vector_size
    user_vectors = []
    # Walk the two raw columns directly; iterrows() would build a Series per row.
    rows = zip(df_data[col_name].values, df_data['click_times'].values)
    for count, (id_segcontent, click_times) in enumerate(rows, start=1):
        user_vec = np.zeros(dim)

        ad_list = id_segcontent.split(" ")
        click_time_list = click_times.split(" ")

        total_clicks = 0
        for pos, ad in enumerate(ad_list):
            if ad in wv:
                clicks = int(click_time_list[pos])
                user_vec += wv[ad] * clicks
                total_clicks += clicks

        # Guard against 0/0: without it such rows become NaN vectors.
        if total_clicks > 0:
            user_vec = user_vec / total_clicks
        user_vectors.append(user_vec)

        # Coarse progress report for the long-running full-data pass.
        if count % 100000 == 0:
            print(count)
    return np.array(user_vectors)

### 训练集

In [90]:
user_w2v_cr_train = user_w2v(pd_user_ad_list_train,wv_cr,col_name='creative_id')

100000
200000
300000
400000
500000
600000
700000
800000
900000


### 测试集

In [91]:
user_w2v_cr_test = user_w2v(pd_user_ad_list_test,wv_cr,col_name='creative_id')

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000


## 标签编码

独热编码

In [107]:
label_gender_onehot_train = keras.utils.to_categorical(pd_user_ad_list_train['gender_c'],num_classes=2)
label_age_onehot_train = keras.utils.to_categorical(pd_user_ad_list_train['age_c'],num_classes=10)

In [110]:
label_gender_onehot_train[0]

array([1., 0.], dtype=float32)

In [111]:
label_age_onehot_train[0]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.], dtype=float32)

## 建模

### 定义模型

#### 单输入

In [124]:
def DNN(feature_dim=128,deep_layers=[128],dropout_deep=[0.2],deep_layers_activation='relu',l1_reg=0.0,l2_reg=0.05,
        batch_norm=0,batch_norm_decay=0.995,num_class=2,verbose=True,random_seed=2020):
    """
    Build a fully connected softmax classifier over a dense feature vector.

    Parameters
    ----------
    feature_dim: int, width of the input feature vector
    deep_layers: list, number of units of each hidden layer
    dropout_deep: list, dropout rate applied after each hidden layer (indexed like deep_layers)
    deep_layers_activation: str, activation of the hidden layers
    l1_reg: float, L1 coefficient of the hidden-layer kernel regulariser
    l2_reg: float, L2 coefficient of the hidden-layer kernel regulariser
    batch_norm: int, reserved, currently unused
    batch_norm_decay: float, reserved, currently unused
    num_class: int, number of output classes
    verbose: boolean, reserved, currently unused (the summary is always printed)
    random_seed: int, reserved, currently unused

    Returns
    ----------
    keras.Model, not compiled
    """
    inputs = keras.Input(shape=(feature_dim,), dtype="float32")
    hidden = inputs
    for layer_idx, units in enumerate(deep_layers):
        dense = layers.Dense(
            units,
            activation=deep_layers_activation,
            kernel_regularizer=keras.regularizers.l1_l2(l1=l1_reg, l2=l2_reg),
        )
        hidden = dense(hidden)
        hidden = layers.Dropout(dropout_deep[layer_idx])(hidden)
    outputs = layers.Dense(num_class, activation="softmax")(hidden)
    model = keras.Model(inputs, outputs)
    model.summary()
    # Compilation (optimizer / loss / metrics) is left to the caller.
    return model

In [150]:
def BiLSTM(embedding_feature_size,embedding_size,embedding_initial_weight,embedding_trainable,
           mask_zero=True,dropout_embedding=0,seq_length=100,
           lstm_kernel='lstm',lstm_num_hidden=128,lstm_num_layer=2,dropout_lstm=0.2,
           deep_layers=[128],dropout_deep=[0.2],deep_layers_activation='relu',l1_reg=0.0,l2_reg=0.05,
           batch_norm=0,batch_norm_decay=0.995,
           #epoch=10,batch_size=128,optimizer= keras.optimizers.Adam(),loss_type='categorical_crossentropy',eval_metric= 'accuracy',
           num_class=2,
           verbose=True,
           random_seed=2020
          ):
    """
    Build a single-input BiLSTM classifier:
    embedding -> stacked bidirectional LSTMs -> global max pooling -> dense layers -> softmax.

    Parameters
    ----------
    embedding_feature_size: int, number of rows of the embedding matrix
    embedding_size: int, dimension of each embedding vector
    embedding_initial_weight: np.ndarray, initial embedding matrix
    embedding_trainable: boolean, whether the embedding weights are trainable
    mask_zero: boolean, passed to the embedding layer's mask_zero option
    dropout_embedding: int, reserved, currently unused
    seq_length: int, reserved, currently unused (the input length is left unspecified)
    lstm_kernel: str, reserved, currently unused (an LSTM cell is always used)
    lstm_num_hidden: int, number of units of each LSTM direction
    lstm_num_layer: int, number of stacked bidirectional LSTM layers
    dropout_lstm: float, dropout rate of the LSTM layers
    deep_layers: list, number of units of each dense layer after pooling
    dropout_deep: list, dropout rate applied after each dense layer (indexed like deep_layers)
    deep_layers_activation: str, activation of the dense layers
    l1_reg: float, L1 coefficient of the dense-layer kernel regulariser
    l2_reg: float, L2 coefficient of the dense-layer kernel regulariser
    batch_norm: int, reserved, currently unused
    batch_norm_decay: float, reserved, currently unused
    num_class: int, number of output classes
    verbose: boolean, reserved, currently unused (the summary is always printed)
    random_seed: int, reserved, currently unused

    Returns
    ----------
    keras.Model, not compiled
    """
    inputs = keras.Input(shape=(None,), dtype="int32")
    embedding = layers.Embedding(
        input_dim=embedding_feature_size,
        output_dim=embedding_size,
        mask_zero=mask_zero,
        weights=[embedding_initial_weight],
        trainable=embedding_trainable,
    )
    seq = embedding(inputs)

    # Every recurrent layer returns the full sequence so the layers can be stacked
    # and the last one can be max-pooled over time.
    for _ in range(lstm_num_layer):
        recurrent = layers.LSTM(lstm_num_hidden, dropout=dropout_lstm, return_sequences=True)
        seq = layers.Bidirectional(recurrent)(seq)
    hidden = layers.GlobalMaxPooling1D()(seq)

    # Dense head on top of the pooled sequence representation.
    for layer_idx, units in enumerate(deep_layers):
        dense = layers.Dense(
            units,
            activation=deep_layers_activation,
            kernel_regularizer=keras.regularizers.l1_l2(l1=l1_reg, l2=l2_reg),
        )
        hidden = dense(hidden)
        hidden = layers.Dropout(dropout_deep[layer_idx])(hidden)
    outputs = layers.Dense(num_class, activation="softmax")(hidden)
    model = keras.Model(inputs, outputs)
    model.summary()
    # Compilation (optimizer / loss / metrics) is left to the caller.
    return model

#### 多输入

In [151]:
def BiLSTM_MutilInput(embedding_feature_size=[],embedding_size=[],embedding_initial_weight=[],embedding_trainable=False,
           mask_zero=True,dropout_embedding=0,seq_length=100,
           lstm_kernel='lstm',lstm_num_hidden=128,lstm_num_layer=2,dropout_lstm=0.2,
           deep_layers=[128],dropout_deep=[0.2],deep_layers_activation='relu',l1_reg=0.0,l2_reg=0.05,
           batch_norm=0,batch_norm_decay=0.995,
           epoch=10,batch_size=128,optimizer= keras.optimizers.Adam(),loss_type='categorical_crossentropy',eval_metric= 'accuracy',
           num_class=2,
           verbose=True,
           random_seed=2020
          ):
    """
    Build a multi-input BiLSTM classifier (a single input is also supported).

    Each input sequence gets its own embedding + stacked bidirectional LSTM +
    global max pooling branch; the branch outputs are concatenated and fed to a
    dense head ending in a softmax layer.

    Parameters
    ----------
    embedding_feature_size: list[int], number of rows of each embedding matrix (one entry per input)
    embedding_size: list[int], vector dimension of each embedding matrix
    embedding_initial_weight: list[np.ndarray], initial matrix of each embedding
    embedding_trainable: boolean, whether the embedding weights are trainable
    mask_zero: boolean, passed to every embedding layer's mask_zero option
    dropout_embedding: int, reserved, currently unused
    seq_length: int, reserved, currently unused (the input length is left unspecified)
    lstm_kernel: str, reserved, currently unused (an LSTM cell is always used)
    lstm_num_hidden: int, number of units of each LSTM direction
    lstm_num_layer: int, number of stacked bidirectional LSTM layers per branch
    dropout_lstm: float, dropout rate of the LSTM layers
    deep_layers: list, number of units of each dense layer after the branches
    dropout_deep: list, dropout rate applied after each dense layer (indexed like deep_layers)
    deep_layers_activation: str, activation of the dense layers
    l1_reg: float, L1 coefficient of the dense-layer kernel regulariser
    l2_reg: float, L2 coefficient of the dense-layer kernel regulariser
    batch_norm: int, reserved, currently unused
    batch_norm_decay: float, reserved, currently unused
    epoch, batch_size, optimizer, loss_type, eval_metric: accepted but not used here
        (the model is returned uncompiled)
    num_class: int, number of output classes
    verbose: boolean, reserved, currently unused (the summary is always printed)
    random_seed: int, reserved, currently unused

    Returns
    ----------
    keras.Model, not compiled; its inputs are the branch inputs in the order of the lists
    """
    input_nums = len(embedding_feature_size)

    # One independent embedding + BiLSTM branch per input sequence.
    branches = []
    for idx in range(input_nums):
        branch_input = keras.Input(shape=(None,), dtype="int32")
        embedding = layers.Embedding(
            input_dim=embedding_feature_size[idx],
            output_dim=embedding_size[idx],
            mask_zero=mask_zero,
            weights=[embedding_initial_weight[idx]],
            trainable=embedding_trainable,
        )
        seq = embedding(branch_input)
        for _ in range(lstm_num_layer):
            recurrent = layers.LSTM(lstm_num_hidden, dropout=dropout_lstm, return_sequences=True)
            seq = layers.Bidirectional(recurrent)(seq)
        pooled = layers.GlobalMaxPooling1D()(seq)
        branches.append(keras.Model(branch_input, outputs=pooled))

    # A single branch is used as-is; two or more are concatenated.
    branch_outputs = [branch.output for branch in branches]
    if input_nums >= 2:
        hidden = layers.concatenate(branch_outputs)
    else:
        hidden = branch_outputs[0]

    # Dense head on top of the merged branch features.
    for layer_idx, units in enumerate(deep_layers):
        dense = layers.Dense(
            units,
            activation=deep_layers_activation,
            kernel_regularizer=keras.regularizers.l1_l2(l1=l1_reg, l2=l2_reg),
        )
        hidden = dense(hidden)
        hidden = layers.Dropout(dropout_deep[layer_idx])(hidden)
    outputs = layers.Dense(num_class, activation="softmax")(hidden)
    model = keras.Model([branch.input for branch in branches], outputs)
    model.summary()
    # Compilation (optimizer / loss / metrics) is left to the caller.
    return model

In [162]:
def TextCNN_MutilInput(embedding_feature_size=[],embedding_size=[],embedding_initial_weight=[],embedding_trainable=False,
           mask_zero=True,dropout_embedding=0,seq_length=100,
           filters=128,kernel_size=[2,5,10],dropout_cnn=[0.2,0.2,0.2],
           deep_layers=[128],dropout_deep=[0.2],deep_layers_activation='relu',l1_reg=0.0,l2_reg=0.05,
           batch_norm=0,batch_norm_decay=0.995,
           epoch=10,batch_size=128,optimizer= keras.optimizers.Adam(),loss_type='categorical_crossentropy',eval_metric= 'accuracy',
           num_class=2,
           verbose=True,
           random_seed=2020
          ):
    """
    Build a multi-input TextCNN classifier (a single input is also supported).

    Each input sequence gets its own embedding followed by one Conv1D + global max
    pooling + dropout path per kernel size; the paths of a branch are concatenated,
    the branches are concatenated, and a dense head ending in softmax is applied.

    Parameters
    ----------
    embedding_feature_size: list[int], number of rows of each embedding matrix (one entry per input)
    embedding_size: list[int], vector dimension of each embedding matrix
    embedding_initial_weight: list[np.ndarray], initial matrix of each embedding
    embedding_trainable: boolean, whether the embedding weights are trainable
    mask_zero: boolean, passed to every embedding layer's mask_zero option
    dropout_embedding: int, reserved, currently unused
    seq_length: int, reserved, currently unused (the input length is left unspecified)
    filters: int, number of filters of every convolution
    kernel_size: list, convolution window sizes (one convolution per entry)
    dropout_cnn: list, dropout rate applied to each pooled convolution output (indexed like kernel_size)
    deep_layers: list, number of units of each dense layer after the branches
    dropout_deep: list, dropout rate applied after each dense layer (indexed like deep_layers)
    deep_layers_activation: str, activation of the dense layers; also used by the convolutions
    l1_reg: float, L1 coefficient of the dense-layer kernel regulariser
    l2_reg: float, L2 coefficient of the dense-layer kernel regulariser
    batch_norm: int, reserved, currently unused
    batch_norm_decay: float, reserved, currently unused
    epoch, batch_size, optimizer, loss_type, eval_metric: accepted but not used here
        (the model is returned uncompiled)
    num_class: int, number of output classes
    verbose: boolean, reserved, currently unused (the summary is always printed)
    random_seed: int, reserved, currently unused

    Returns
    ----------
    keras.Model, not compiled; its inputs are the branch inputs in the order of the lists
    """
    input_nums = len(embedding_feature_size)

    # One independent embedding + multi-width CNN branch per input sequence.
    branches = []
    for idx in range(input_nums):
        branch_input = keras.Input(shape=(None,), dtype="int32")
        embedding = layers.Embedding(
            input_dim=embedding_feature_size[idx],
            output_dim=embedding_size[idx],
            mask_zero=mask_zero,
            weights=[embedding_initial_weight[idx]],
            trainable=embedding_trainable,
        )
        emb = embedding(branch_input)

        # Every kernel size reads the same embedded sequence.
        pooled_features = []
        for k_idx in range(len(kernel_size)):
            conv = layers.Conv1D(
                filters=filters,
                kernel_size=kernel_size[k_idx],
                activation=deep_layers_activation,
            )
            feature = conv(emb)
            feature = layers.GlobalMaxPooling1D()(feature)
            feature = layers.Dropout(dropout_cnn[k_idx])(feature)
            pooled_features.append(feature)
        branch_output = layers.concatenate(pooled_features)
        branches.append(keras.Model(branch_input, outputs=branch_output))

    # A single branch is used as-is; two or more are concatenated.
    branch_outputs = [branch.output for branch in branches]
    if input_nums >= 2:
        hidden = layers.concatenate(branch_outputs)
    else:
        hidden = branch_outputs[0]

    # Dense head on top of the merged branch features.
    for layer_idx, units in enumerate(deep_layers):
        dense = layers.Dense(
            units,
            activation=deep_layers_activation,
            kernel_regularizer=keras.regularizers.l1_l2(l1=l1_reg, l2=l2_reg),
        )
        hidden = dense(hidden)
        hidden = layers.Dropout(dropout_deep[layer_idx])(hidden)
    outputs = layers.Dense(num_class, activation="softmax")(hidden)
    model = keras.Model([branch.input for branch in branches], outputs)
    model.summary()
    # Compilation (optimizer / loss / metrics) is left to the caller.
    return model

In [163]:
def BiLSTM_DNN_MutilInput(dnn_feature_size=128,embedding_feature_size=[],embedding_size=[],embedding_initial_weight=[],embedding_trainable=False,
           mask_zero=True,dropout_embedding=0,seq_length=100,
           lstm_kernel='lstm',lstm_num_hidden=128,lstm_num_layer=2,dropout_lstm=0.2,
           deep_layers=[128],dropout_deep=[0.2],deep_layers_activation='relu',l1_reg=0.0,l2_reg=0.05,
           batch_norm=0,batch_norm_decay=0.995,
           epoch=10,batch_size=128,optimizer= keras.optimizers.Adam(),loss_type='categorical_crossentropy',eval_metric= 'accuracy',
           num_class=2,
           verbose=True,
           random_seed=2020
          ):
    """
    Build a multi-input BiLSTM classifier with an extra dense-feature input
    (a single sequence input is also supported).

    Each input sequence gets its own embedding + stacked bidirectional LSTM +
    global max pooling branch; the branch outputs are concatenated with a float
    feature vector of width `dnn_feature_size` and fed to a dense head ending in
    a softmax layer.

    Parameters
    ----------
    dnn_feature_size: int, width of the additional dense feature input
    embedding_feature_size: list[int], number of rows of each embedding matrix (one entry per input)
    embedding_size: list[int], vector dimension of each embedding matrix
    embedding_initial_weight: list[np.ndarray], initial matrix of each embedding
    embedding_trainable: boolean, whether the embedding weights are trainable
    mask_zero: boolean, passed to every embedding layer's mask_zero option
    dropout_embedding: int, reserved, currently unused
    seq_length: int, reserved, currently unused (the input length is left unspecified)
    lstm_kernel: str, reserved, currently unused (an LSTM cell is always used)
    lstm_num_hidden: int, number of units of each LSTM direction
    lstm_num_layer: int, number of stacked bidirectional LSTM layers per branch
    dropout_lstm: float, dropout rate of the LSTM layers
    deep_layers: list, number of units of each dense layer after the merge
    dropout_deep: list, dropout rate applied after each dense layer (indexed like deep_layers)
    deep_layers_activation: str, activation of the dense layers
    l1_reg: float, L1 coefficient of the dense-layer kernel regulariser
    l2_reg: float, L2 coefficient of the dense-layer kernel regulariser
    batch_norm: int, reserved, currently unused
    batch_norm_decay: float, reserved, currently unused
    epoch, batch_size, optimizer, loss_type, eval_metric: accepted but not used here
        (the model is returned uncompiled)
    num_class: int, number of output classes
    verbose: boolean, reserved, currently unused (the summary is always printed)
    random_seed: int, reserved, currently unused

    Returns
    ----------
    keras.Model, not compiled; its inputs are the sequence inputs in the order of
    the lists, followed by the dense feature input
    """
    input_nums = len(embedding_feature_size)

    # One independent embedding + BiLSTM branch per input sequence.
    branches = []
    for idx in range(input_nums):
        branch_input = keras.Input(shape=(None,), dtype="int32")
        embedding = layers.Embedding(
            input_dim=embedding_feature_size[idx],
            output_dim=embedding_size[idx],
            mask_zero=mask_zero,
            weights=[embedding_initial_weight[idx]],
            trainable=embedding_trainable,
        )
        seq = embedding(branch_input)
        for _ in range(lstm_num_layer):
            recurrent = layers.LSTM(lstm_num_hidden, dropout=dropout_lstm, return_sequences=True)
            seq = layers.Bidirectional(recurrent)(seq)
        pooled = layers.GlobalMaxPooling1D()(seq)
        branches.append(keras.Model(branch_input, outputs=pooled))

    # A single branch is used as-is; two or more are concatenated.
    branch_outputs = [branch.output for branch in branches]
    if input_nums >= 2:
        merged = layers.concatenate(branch_outputs)
    else:
        merged = branch_outputs[0]

    # Append the extra dense feature vector to the pooled sequence features.
    dnn_inputs = keras.Input(shape=(dnn_feature_size,), dtype="float32")
    hidden = layers.concatenate([merged, dnn_inputs])

    # Dense head on top of the combined features.
    for layer_idx, units in enumerate(deep_layers):
        dense = layers.Dense(
            units,
            activation=deep_layers_activation,
            kernel_regularizer=keras.regularizers.l1_l2(l1=l1_reg, l2=l2_reg),
        )
        hidden = dense(hidden)
        hidden = layers.Dropout(dropout_deep[layer_idx])(hidden)
    outputs = layers.Dense(num_class, activation="softmax")(hidden)

    model_inputs = [branch.input for branch in branches] + [dnn_inputs]
    model = keras.Model(model_inputs, outputs)
    model.summary()
    # Compilation (optimizer / loss / metrics) is left to the caller.
    return model

In [164]:
def BiLSTM_TextCNN_MutilInput(embedding_feature_size=[],embedding_size=[],embedding_initial_weight=[],embedding_trainable=False,
           mask_zero=True,dropout_embedding=0,seq_length=100,
           lstm_kernel='lstm',lstm_num_hidden=128,lstm_num_layer=2,dropout_lstm=0.2,
           filters=128,kernel_size=[2,5,10],dropout_cnn=[0.2,0.2,0.2],
           deep_layers=[128],dropout_deep=[0.2],deep_layers_activation='relu',l1_reg=0.0,l2_reg=0.05,
           batch_norm=0,batch_norm_decay=0.995,
           epoch=10,batch_size=128,optimizer= keras.optimizers.Adam(),loss_type='categorical_crossentropy',eval_metric= 'accuracy',
           num_class=2,
           verbose=True,
           random_seed=2020
          ):
    """
    Build a multi-input BiLSTM + TextCNN classifier; a single input is also supported.

    Every entry of ``embedding_feature_size`` gets its own integer sequence input
    and Embedding layer. The embedded sequence feeds two branches: a stack of
    bidirectional LSTMs followed by global max pooling, and one
    Conv1D -> global max pooling -> dropout path per kernel size. The branch
    outputs of all inputs are concatenated, passed through the dense layers and
    classified with a softmax layer. The returned model is NOT compiled; the
    caller is expected to call ``model.compile`` itself.

    Parameters
    ----------
    embedding_feature_size: list[int], number of rows of each embedding matrix
    embedding_size: list[int], vector dimension of each embedding matrix
    embedding_initial_weight: list[np.ndarray], initial weights of each embedding
    embedding_trainable: boolean, whether the embedding weights are trainable
    mask_zero: boolean, forwarded to the Embedding layers
    dropout_embedding: int, embedding dropout rate (currently unused, reserved)
    seq_length: int, user sequence length (currently unused, reserved)
    lstm_kernel: str, recurrent cell type, e.g. lstm or gru (currently unused, reserved)
    lstm_num_hidden: int, units of each LSTM wrapped by a Bidirectional layer
    lstm_num_layer: int, number of stacked bidirectional LSTM layers
    dropout_lstm: float, dropout rate passed to every LSTM
    filters: int, number of convolution filters per kernel size
    kernel_size: list[int], convolution kernel sizes (one Conv1D per entry)
    dropout_cnn: list[float], dropout rate applied after each pooled convolution
    deep_layers: list[int], units of each fully connected layer
    dropout_deep: list[float], dropout rate applied after each fully connected layer
    deep_layers_activation: str, activation of the Conv1D and fully connected layers
    l1_reg: float, l1 regularization factor of the fully connected layers
    l2_reg: float, l2 regularization factor of the fully connected layers
    batch_norm: int, batch normalization switch (currently unused, reserved)
    batch_norm_decay: float, batch normalization parameter (currently unused, reserved)
    epoch, batch_size, optimizer, loss_type, eval_metric: not used by this builder
    num_class: int, number of output classes
    verbose: boolean, logging switch (currently unused, reserved)
    random_seed: int, random seed (currently unused, reserved)

    Returns
    ----------
    keras.Model

    Raises
    ----------
    ValueError
        If no input is configured, or a per-input / per-layer list is shorter
        than the list it has to be paired with.
    """
    input_nums = len(embedding_feature_size)
    # Fail early with a readable message instead of an IndexError deep inside the loops.
    if input_nums == 0:
        raise ValueError("embedding_feature_size must contain at least one entry")
    if len(embedding_size) < input_nums or len(embedding_initial_weight) < input_nums:
        raise ValueError("embedding_size and embedding_initial_weight need one entry per input")
    if len(kernel_size) == 0:
        raise ValueError("kernel_size must contain at least one entry")
    if len(dropout_cnn) < len(kernel_size):
        raise ValueError("dropout_cnn needs one rate per kernel_size entry")
    if len(dropout_deep) < len(deep_layers):
        raise ValueError("dropout_deep needs one rate per deep_layers entry")

    # One BiLSTM + TextCNN sub-model per input. zip() stops at the shortest list,
    # i.e. at input_nums, because the other two lists were validated above.
    sub_models = []
    for vocab_size, emb_dim, emb_weight in zip(embedding_feature_size, embedding_size, embedding_initial_weight):
        inputs = keras.Input(shape=(None,), dtype="int32")
        emb = layers.Embedding(input_dim=vocab_size, output_dim=emb_dim, mask_zero=mask_zero,
                               weights=[emb_weight], trainable=embedding_trainable)(inputs)

        # BiLSTM branch: stacked bidirectional LSTMs, then max over the time axis.
        lstm_part = emb
        for _ in range(lstm_num_layer):
            lstm_part = layers.Bidirectional(
                layers.LSTM(lstm_num_hidden, dropout=dropout_lstm, return_sequences=True))(lstm_part)
        lstm_part = layers.GlobalMaxPooling1D()(lstm_part)

        # TextCNN branch: one convolution per kernel size, applied to the raw embedding.
        convs = []
        for width, rate in zip(kernel_size, dropout_cnn):
            conv = layers.Conv1D(filters=filters, kernel_size=width, activation=deep_layers_activation)(emb)
            conv = layers.GlobalMaxPooling1D()(conv)
            conv = layers.Dropout(rate)(conv)
            convs.append(conv)
        # layers.concatenate rejects a single tensor, so a lone kernel size is used as is.
        cnn_part = convs[0] if len(convs) < 2 else layers.concatenate(convs)

        features = layers.concatenate([lstm_part, cnn_part])
        sub_models.append(keras.Model(inputs, outputs=features))

    # Single-input support: nothing to concatenate when there is only one sub-model.
    if input_nums < 2:
        combined_input = sub_models[0].output
    else:
        combined_input = layers.concatenate([sub.output for sub in sub_models])

    # Fully connected head.
    x = combined_input
    for units, rate in zip(deep_layers, dropout_deep):
        x = layers.Dense(units, activation=deep_layers_activation,
                         kernel_regularizer=keras.regularizers.l1_l2(l1=l1_reg, l2=l2_reg))(x)
        x = layers.Dropout(rate)(x)
    outputs = layers.Dense(num_class, activation="softmax")(x)

    model = keras.Model([sub.input for sub in sub_models], outputs)
    model.summary()
    return model

### 性别

In [88]:
keras.backend.clear_session()

In [89]:
gc.collect()

1466

#### 单输入DNN

In [119]:
# Hyper-parameters passed to DNN(...) for the single-input gender model
dnn_params_gender = dict(
    feature_dim=128,
    deep_layers=[256, 128],
    dropout_deep=[0.2, 0.2],
    deep_layers_activation='relu',
    l1_reg=0.0,
    l2_reg=0.001,
    batch_norm=0,
    batch_norm_decay=0.995,
    num_class=2,
    verbose=True,
    random_seed=2020,
)

In [120]:
dnn_model_gender = DNN(**dnn_params_gender)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
dense (Dense)                (None, 256)               33024     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 66,178
Trainable params: 66,178
Non-trainable params: 0
_________________________________________________________

In [101]:
# Inspect the element type of the DNN input features
# (astype() without a target dtype raises a TypeError).
user_w2v_cr_train.dtype

dtype('float64')

In [102]:
dnn_model_gender.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
dnn_model_gender.fit(user_w2v_cr_train, label_gender_onehot_train, batch_size=128, epochs=5,validation_split=0.2)

Train on 720000 samples, validate on 180000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fcfd752eda0>

#### 单输入bilstm

In [193]:
keras.backend.clear_session()

In [92]:
gc.collect()

0

In [137]:
# Hyper-parameters passed to BiLSTM(...) for the single-input gender model
dnn_params_gender = dict(
    embedding_feature_size=len(dict_cr.values())+1,
    embedding_size=128,
    embedding_initial_weight=embedding_cr,
    embedding_trainable=False,
    mask_zero=False,
    dropout_embedding=0,
    seq_length=100,
    lstm_kernel='lstm',
    lstm_num_hidden=128,
    lstm_num_layer=2,
    dropout_lstm=0.2,
    deep_layers=[128],
    dropout_deep=[0.2],
    deep_layers_activation='relu',
    l1_reg=0.0,
    l2_reg=0.001,
    batch_norm=0,
    batch_norm_decay=0.995,
    num_class=2,
    verbose=True,
    random_seed=2020,
)

In [91]:
#del lstm_model_gender

In [136]:
lstm_model_gender = BiLSTM(**dnn_params_gender)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         130604544 
_________________________________________________________________
bidirectional (Bidirectional (None, None, 256)         263168    
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 256)         394240    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0   

test

In [165]:
# Quick smoke test: train on the first 100k rows and validate on the next 100k.
# NOTE(review): `lstm_model` is not defined in the visible cells of this section
# (the model built just above is `lstm_model_gender`) — confirm which model this
# cell is meant to train, otherwise it fails on a fresh kernel.
lstm_model.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(seq_list_train_cr[0:100000], label_gender_onehot_train[0:100000], batch_size=512, epochs=5, validation_data=(seq_list_train_cr[100000:200000], label_gender_onehot_train[100000:200000]))

Train on 100000 samples, validate on 100000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff1fc8d13c8>

手动构造验证集

In [166]:
Xi_train_gender,Xi_val_gender,y_train_gender,y_val_gender = train_test_split(seq_list_train_cr,pd_user_ad_list_train['gender_c'],test_size=0.2,random_state=0)

In [167]:
Xi_train_gender.shape,Xi_val_gender.shape

((720000, 100), (180000, 100))

In [168]:
y_train_gender_one_hot = keras.utils.to_categorical(y_train_gender,num_classes=2)
y_val_gender_one_hot = keras.utils.to_categorical(y_val_gender,num_classes=2)

In [190]:
lstm_model_gender.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_gender.fit(Xi_train_gender, y_train_gender_one_hot, batch_size=512, epochs=5, validation_data=(Xi_val_gender, y_val_gender_one_hot))

Train on 720000 samples, validate on 180000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f763cc59940>

利用api参数构造验证集

In [142]:
lstm_model_gender.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_gender.fit(seq_list_train_cr, label_gender_onehot_train, batch_size=512, epochs=5,validation_split=0.2)

Train on 720000 samples, validate on 180000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9f4805b518>

refit

In [287]:
# Refit on the full training set (no validation split). A fresh model is built
# so that the model being compiled is the one being fitted, and so the refit
# does not continue from weights trained in the validation runs above.
lstm_model_gender_all = BiLSTM(**dnn_params_gender)
lstm_model_gender_all.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_gender_all.fit(seq_list_train_cr, label_gender_onehot_train, batch_size=512, epochs=4)

Train on 900000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7ff22474b7b8>

**显存空间释放，可选**

In [140]:
#keras.backend.clear_session()

In [141]:
#del lstm_model_gender

In [187]:
gc.collect()

2061

#### 多输入bilstm

In [191]:
# del lstm_model_gender

In [192]:
del dnn_params_gender

In [193]:
keras.backend.clear_session()

In [194]:
gc.collect()

3

In [195]:
# Hyper-parameters passed to BiLSTM_MutilInput(...) for the gender model:
# three sequence inputs (cr, ad, av), one embedding each
dnn_params_gender = dict(
    embedding_feature_size=[len(dict_cr.values())+1,len(dict_ad.values())+1,len(dict_av.values())+1],
    embedding_size=[128,128,64],
    embedding_initial_weight=[embedding_cr,embedding_ad,embedding_av],
    embedding_trainable=False,
    mask_zero=False,
    dropout_embedding=0,
    seq_length=100,
    lstm_kernel='lstm',
    lstm_num_hidden=128,
    lstm_num_layer=2,
    dropout_lstm=0.2,
    deep_layers=[128],
    dropout_deep=[0.2],
    deep_layers_activation='relu',
    l1_reg=0.0,
    l2_reg=0.001,
    batch_norm=0,
    batch_norm_decay=0.995,
    num_class=2,
    verbose=True,
    random_seed=2020,
)

In [196]:
lstm_model_gender = BiLSTM_MutilInput(**dnn_params_gender)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 128)    130604544   input_1[0][0]                    
____________________________________________________________________________________________

3输入

In [197]:
lstm_model_gender.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_gender.fit([seq_list_train_cr,seq_list_train_ad,seq_list_train_av], label_gender_onehot_train, batch_size=512, epochs=5,validation_split=0.2)

Train on 720000 samples, validate on 180000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9f523bd0b8>

refit

In [116]:
lstm_model_gender.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_gender.fit([seq_list_train_cr,seq_list_train_ad,seq_list_train_av], label_gender_onehot_train, batch_size=512, epochs=4)

Train on 900000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f40e15cc828>

4 输入

In [182]:
# Four-input experiment (cr, ad, av, pr sequences).
# NOTE(review): the multi-input params cell of this section lists only three
# embeddings (cr, ad, av), so `lstm_model_gender` as built there has three
# inputs; this call passes four arrays. Presumably the model was rebuilt with
# an extra `pr` embedding before this cell was run — confirm and rebuild first.
lstm_model_gender.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_gender.fit([seq_list_train_cr,seq_list_train_ad,seq_list_train_av,seq_list_train_pr], label_gender_onehot_train, batch_size=512, epochs=5,validation_split=0.2)

Train on 720000 samples, validate on 180000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f40b278aba8>

#### 多输入bilstm+dnn

In [159]:
keras.backend.clear_session()

In [160]:
gc.collect()

1820

In [165]:
# Hyper-parameters passed to BiLSTM_DNN_MutilInput(...) for the gender model:
# three sequence inputs (cr, ad, av) plus a dense feature input of width 128
dnn_params_gender = dict(
    dnn_feature_size=128,
    embedding_feature_size=[len(dict_cr.values())+1,len(dict_ad.values())+1,len(dict_av.values())+1],
    embedding_size=[128,128,64],
    embedding_initial_weight=[embedding_cr,embedding_ad,embedding_av],
    embedding_trainable=False,
    mask_zero=False,
    dropout_embedding=0,
    seq_length=100,
    lstm_kernel='lstm',
    lstm_num_hidden=128,
    lstm_num_layer=2,
    dropout_lstm=0.2,
    deep_layers=[128],
    dropout_deep=[0.2],
    deep_layers_activation='relu',
    l1_reg=0.0,
    l2_reg=0.001,
    batch_norm=0,
    batch_norm_decay=0.995,
    num_class=2,
    verbose=True,
    random_seed=2020,
)

In [166]:
lstm_dnn_model_gender = BiLSTM_DNN_MutilInput(**dnn_params_gender)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 128)    130604544   input_1[0][0]                    
____________________________________________________________________________________________

In [118]:
lstm_dnn_model_gender.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_dnn_model_gender.fit([seq_list_train_cr,seq_list_train_ad,seq_list_train_av,user_w2v_cr_train], label_gender_onehot_train, batch_size=512, epochs=5,validation_split=0.2)

Train on 720000 samples, validate on 180000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fd00bcab160>

#### 多输入bilstm+textcnn

In [180]:
keras.backend.clear_session()

In [181]:
gc.collect()

116

In [182]:
# Hyper-parameters passed to BiLSTM_TextCNN_MutilInput(...) for the gender model:
# three sequence inputs (cr, ad, av), each with a BiLSTM and a TextCNN branch
dnn_params_gender = dict(
    embedding_feature_size=[len(dict_cr.values())+1,len(dict_ad.values())+1,len(dict_av.values())+1],
    embedding_size=[128,128,64],
    embedding_initial_weight=[embedding_cr,embedding_ad,embedding_av],
    embedding_trainable=False,
    mask_zero=False,
    dropout_embedding=0,
    seq_length=100,
    lstm_kernel='lstm',
    lstm_num_hidden=128,
    lstm_num_layer=2,
    dropout_lstm=0.2,
    filters=128,
    kernel_size=[2,5,10],
    dropout_cnn=[0.2,0.2,0.2],
    deep_layers=[128],
    dropout_deep=[0.2],
    deep_layers_activation='relu',
    l1_reg=0.0,
    l2_reg=0.001,
    batch_norm=0,
    batch_norm_decay=0.995,
    num_class=2,
    verbose=True,
    random_seed=2020,
)

In [183]:
lstm_cnn_model_gender = BiLSTM_TextCNN_MutilInput(**dnn_params_gender)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 128)    130604544   input_1[0][0]                    
____________________________________________________________________________________________

In [None]:
lstm_cnn_model_gender.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_cnn_model_gender.fit([seq_list_train_cr,seq_list_train_ad,seq_list_train_av], label_gender_onehot_train, batch_size=512, epochs=5,validation_split=0.2)

#### 多输入textcnn

In [177]:
keras.backend.clear_session()

In [178]:
gc.collect()

1157

In [176]:
# Hyper-parameters passed to TextCNN_MutilInput(...) for the gender model:
# a single sequence input (cr) with three convolution kernel sizes
dnn_params_gender = dict(
    embedding_feature_size=[len(dict_cr.values())+1],
    embedding_size=[128],
    embedding_initial_weight=[embedding_cr],
    embedding_trainable=False,
    mask_zero=False,
    dropout_embedding=0,
    seq_length=100,
    filters=128,
    kernel_size=[2,5,10],
    dropout_cnn=[0.2,0.2,0.2],
    deep_layers=[128],
    dropout_deep=[0.2],
    deep_layers_activation='relu',
    l1_reg=0.0,
    l2_reg=0.001,
    batch_norm=0,
    batch_norm_decay=0.995,
    num_class=2,
    verbose=True,
    random_seed=2020,
)

In [179]:
cnn_model_gender = TextCNN_MutilInput(**dnn_params_gender)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 128)    130604544   input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, None, 128)    32896       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, None, 128)    82048       embedding[0][0]                  
____________________________________________________________________________________________

In [268]:
cnn_model_gender.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
cnn_model_gender.fit([seq_list_train_cr], label_gender_onehot_train, batch_size=512, epochs=5,validation_split=0.2)

Train on 720000 samples, validate on 180000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f75eb063748>

### 年龄

#### 单输入bilstm

In [177]:
keras.backend.clear_session()

In [178]:
gc.collect()

1157

In [371]:
# Hyper-parameters passed to BiLSTM(...) for the single-input age model (10 classes)
dnn_params_age = dict(
    embedding_feature_size=len(dict_cr.values())+1,
    embedding_size=128,
    embedding_initial_weight=embedding_cr,
    embedding_trainable=False,
    mask_zero=False,
    dropout_embedding=0,
    seq_length=100,
    lstm_kernel='lstm',
    lstm_num_hidden=128,
    lstm_num_layer=2,
    dropout_lstm=0.2,
    deep_layers=[128],
    dropout_deep=[0.2],
    deep_layers_activation='relu',
    l1_reg=0.0,
    l2_reg=0.001,
    batch_norm=0,
    batch_norm_decay=0.995,
    num_class=10,
    verbose=True,
    random_seed=2020,
)

In [372]:
lstm_model_age = BiLSTM(**dnn_params_age)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         130604544 
_________________________________________________________________
bidirectional (Bidirectional (None, None, 256)         263168    
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 256)         394240    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0     

In [373]:
Xi_train_age,Xi_val_age,y_train_age,y_val_age = train_test_split(seq_list_train_cr,pd_user_ad_list_train['age_c'],test_size=0.2,random_state=0)

In [374]:
# One-hot encode the 10 age classes for categorical cross-entropy.
# Uses the `keras` module imported at the top of the notebook (there is no `kr` alias).
y_train_age_one_hot = keras.utils.to_categorical(y_train_age,num_classes=10)
y_val_age_one_hot = keras.utils.to_categorical(y_val_age,num_classes=10)

min-df=10

In [266]:
lstm_model_age.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_age.fit(Xi_train_age, y_train_age_one_hot, batch_size=128, epochs=10, validation_data=(Xi_val_age, y_val_age_one_hot))

Train on 720000 samples, validate on 180000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff1a6f070b8>

min-df=5

In [375]:
lstm_model_age.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_age.fit(Xi_train_age, y_train_age_one_hot, batch_size=128, epochs=10, validation_data=(Xi_val_age, y_val_age_one_hot))

Train on 720000 samples, validate on 180000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fed8f706f60>

refit

In [289]:
# Refit on the full training set (no validation split). The model is built and
# compiled here so the cell does not depend on a leftover `lstm_model_age_all`
# object; optimizer settings mirror the validation run above (Adam, lr 1e-3).
lstm_model_age_all = BiLSTM(**dnn_params_age)
lstm_model_age_all.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_age_all.fit(seq_list_train_cr, label_age_onehot_train, batch_size=128, epochs=9)

Train on 900000 samples
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<tensorflow.python.keras.callbacks.History at 0x7ff1bf0ee358>

#### 多输入bilstm

In [163]:
keras.backend.clear_session()

In [164]:
gc.collect()

1812

In [187]:
# Hyper-parameters passed to BiLSTM_MutilInput(...) for the age model (10 classes):
# four sequence inputs (cr, ad, av, pr), one embedding each
dnn_params_age = dict(
    embedding_feature_size=[len(dict_cr.values())+1,len(dict_ad.values())+1,len(dict_av.values())+1,len(dict_pr.values())+1],
    embedding_size=[128,128,64,64],
    embedding_initial_weight=[embedding_cr,embedding_ad,embedding_av,embedding_pr],
    embedding_trainable=False,
    mask_zero=False,
    dropout_embedding=0,
    seq_length=100,
    lstm_kernel='lstm',
    lstm_num_hidden=128,
    lstm_num_layer=2,
    dropout_lstm=0.2,
    deep_layers=[128],
    dropout_deep=[0.2],
    deep_layers_activation='relu',
    l1_reg=0.0,
    l2_reg=0.001,
    batch_norm=0,
    batch_norm_decay=0.995,
    num_class=10,
    verbose=True,
    random_seed=2020,
)

In [188]:
lstm_model_age = BiLSTM_MutilInput(**dnn_params_age)

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
____________________________________________________________________________________________

3输入

In [203]:
# Three-input experiment (cr, ad, av sequences) with a 20% validation split.
# NOTE(review): the params cell of this section lists four embeddings
# (cr, ad, av, pr) and the printed summary shows input_1..input_4, while this
# call passes three arrays. Presumably the model was rebuilt with three
# embeddings before this run — confirm and rebuild to match.
lstm_model_age.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_age.fit([seq_list_train_cr,seq_list_train_ad,seq_list_train_av], label_age_onehot_train, batch_size=512, epochs=10,validation_split=0.2)

Train on 720000 samples, validate on 180000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7643a695c0>

refit

In [105]:
# Refit of the three-input age model on the full training set (no validation split).
# NOTE(review): compile() does not rebuild the network, so if `lstm_model_age`
# was already fitted in the cell above this presumably continues from those
# weights rather than training from scratch — confirm the model is rebuilt first.
# NOTE(review): three arrays are passed while the params cell of this section
# lists four embeddings — confirm the model was built with three inputs.
lstm_model_age.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_age.fit([seq_list_train_cr,seq_list_train_ad,seq_list_train_av], label_age_onehot_train, batch_size=512, epochs=9)

Train on 900000 samples
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<tensorflow.python.keras.callbacks.History at 0x7f409c68bcf8>

4输入

In [None]:
lstm_model_age.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model_age.fit([seq_list_train_cr,seq_list_train_ad,seq_list_train_av,seq_list_train_pr], label_age_onehot_train, batch_size=128, epochs=10,validation_split=0.2)

#### 多输入bilstm+textcnn

In [163]:
keras.backend.clear_session()

In [164]:
gc.collect()

1812

In [186]:
# Hyper-parameters passed to BiLSTM_TextCNN_MutilInput(...) for the age model
# (10 classes): four sequence inputs (cr, ad, av, pr), one embedding each
dnn_params_age = dict(
    embedding_feature_size=[len(dict_cr.values())+1,len(dict_ad.values())+1,len(dict_av.values())+1,len(dict_pr.values())+1],
    embedding_size=[128,128,64,64],
    embedding_initial_weight=[embedding_cr,embedding_ad,embedding_av,embedding_pr],
    embedding_trainable=False,
    mask_zero=False,
    dropout_embedding=0,
    seq_length=100,
    lstm_kernel='lstm',
    lstm_num_hidden=128,
    lstm_num_layer=2,
    dropout_lstm=0.2,
    filters=128,
    kernel_size=[2,5,10],
    dropout_cnn=[0.2,0.2,0.2],
    deep_layers=[256],
    dropout_deep=[0.2],
    deep_layers_activation='relu',
    l1_reg=0.0,
    l2_reg=0.001,
    batch_norm=0,
    batch_norm_decay=0.995,
    num_class=10,
    verbose=True,
    random_seed=2020,
)

In [150]:
lstm_cnn_model_age = BiLSTM_TextCNN_MutilInput(**dnn_params_age)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 128)    130604544   input_1[0][0]                    
____________________________________________________________________________________________

In [None]:
# Train the BiLSTM+TextCNN age model with a 20% validation split.
# NOTE(review): BiLSTM_TextCNN_MutilInput creates one input per entry of
# embedding_feature_size, and the params cell of this section lists four
# embeddings (cr, ad, av, pr); this call passes three arrays. The printed
# summary shows three inputs, so the model was presumably built from an
# earlier three-embedding version of the params — confirm and align the two.
lstm_cnn_model_age.compile(optimizer=keras.optimizers.Adam(3e-3), loss='categorical_crossentropy', metrics=['accuracy'])
lstm_cnn_model_age.fit([seq_list_train_cr,seq_list_train_ad,seq_list_train_av], label_age_onehot_train, batch_size=512, epochs=10,validation_split=0.2)

In [139]:
del lstm_cnn_model_age
gc.collect()

674

### 保存和载入模型

In [129]:
lstm_model_age.save('./model/age_model_lstm_mindf5_3input.tf')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./model/age_model_lstm_mindf5_3input.tf/assets


In [130]:
del lstm_model_age

In [131]:
lstm_model_age = keras.models.load_model("./model/age_model_lstm_mindf5_3input.tf")

In [134]:
lstm_model_gender.save('./model/gender_model_lstm_mindf5_3input.tf')

INFO:tensorflow:Assets written to: ./model/gender_model_lstm_mindf5_3input.tf/assets


In [185]:
'''
keras.backend.clear_session()
del lstm_model_gender
del lstm_model_age
'''

1984

In [None]:
gc.collect()

### 导出结果

预测

In [169]:
y_predict_gender_prob_test = lstm_model_gender.predict([seq_list_test_cr,seq_list_test_ad,seq_list_test_av],batch_size=2048)

In [170]:
y_predict_gender_prob_train = lstm_model_gender.predict([seq_list_train_cr,seq_list_train_ad,seq_list_train_av],batch_size=2048)

In [106]:
y_predict_age_prob_test = lstm_model_age.predict([seq_list_test_cr,seq_list_test_ad,seq_list_test_av],batch_size=2048)

In [107]:
y_predict_age_prob_train = lstm_model_age.predict([seq_list_train_cr,seq_list_train_ad,seq_list_train_av],batch_size=2048)

概率转标签

In [171]:
pd_user_ad_list_train['predicted_gender'] = np.argmax(y_predict_gender_prob_train,axis=1)+1

In [109]:
pd_user_ad_list_train['predicted_age'] = np.argmax(y_predict_age_prob_train,axis=1)+1

In [121]:
pd_user_ad_list_test['predicted_gender'] = np.argmax(y_predict_gender_prob_test,axis=1)+1

In [122]:
pd_user_ad_list_test['predicted_age'] = np.argmax(y_predict_age_prob_test,axis=1)+1

查看训练集效果和测试集预测标签分布

In [120]:
accuracy_score(pd_user_ad_list_train['gender'],pd_user_ad_list_train['predicted_gender'])

0.9482466666666667

In [110]:
accuracy_score(pd_user_ad_list_train['age'],pd_user_ad_list_train['predicted_age'])

0.49612333333333336

In [123]:
pd_user_ad_list_train['age'].value_counts()/pd_user_ad_list_train['age'].value_counts().sum()

3     0.225454
4     0.167309
2     0.165857
5     0.145186
6     0.113022
7     0.074123
1     0.039106
8     0.035519
9     0.021638
10    0.012787
Name: age, dtype: float64

In [124]:
pd_user_ad_list_test['predicted_age'].value_counts()/pd_user_ad_list_test['predicted_age'].value_counts().sum()

3     0.247647
4     0.187156
2     0.169764
6     0.147429
5     0.112628
7     0.054154
8     0.025360
1     0.023323
9     0.019223
10    0.013316
Name: predicted_age, dtype: float64

In [125]:
pd_user_ad_list_test[['user_id','predicted_age','predicted_gender']].to_csv("./result/submission.csv",index=False)

测试

In [146]:
test_result = pd.read_csv("./result/submission.csv")

In [147]:
test_result['predicted_age'].value_counts()/test_result['predicted_age'].value_counts().sum()

3     0.278535
4     0.186520
2     0.173963
6     0.148147
5     0.093333
7     0.043736
1     0.024179
8     0.022073
9     0.017949
10    0.011565
Name: predicted_age, dtype: float64

In [241]:
test_result_new = pd.merge(pd_user_ad_list_test[['user_id','predicted_age']],test_result[['user_id','predicted_gender']],on='user_id')

In [242]:
test_result_new[['user_id','predicted_age','predicted_gender']].to_csv("./result/submission.csv",index=False)