In [86]:
import os

# 数据处理工具
import pandas as pd
import numpy as np
import random as rnd
from collections import Counter

# 可视化工具
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 用于测试集的划分
from sklearn.model_selection import train_test_split

# 进度条工具
from tqdm import tqdm

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
ad_feature = pd.read_csv('data/adFeature.csv')

In [27]:
train_df.head()

Unnamed: 0,aid,uid,label
0,699,78508957,-1
1,1991,3637295,-1
2,1119,19229018,-1
3,2013,79277120,-1
4,692,41528441,-1


In [41]:
test_df.head()

Unnamed: 0,aid,uid
0,2118,64355836
1,692,45051997
2,692,10869198
3,1918,75929554
4,1596,5790162


In [38]:
train_df['uid'].drop_duplicates().describe()

count    7.883466e+06
mean     4.127145e+07
std      2.382770e+07
min      2.000000e+00
25%      2.063347e+07
50%      4.128304e+07
75%      6.190386e+07
max      8.254290e+07
Name: uid, dtype: float64

In [42]:
test_df['uid'].drop_duplicates().describe()

count    2.195951e+06
mean     4.127717e+07
std      2.383508e+07
min      8.000000e+00
25%      2.064691e+07
50%      4.128092e+07
75%      6.192491e+07
max      8.254288e+07
Name: uid, dtype: float64

In [39]:
train_df['aid'].drop_duplicates().describe()

count     173.000000
mean     1140.364162
std       658.957025
min         6.000000
25%       562.000000
50%      1171.000000
75%      1728.000000
max      2216.000000
Name: aid, dtype: float64

In [43]:
test_df['aid'].drop_duplicates().describe()

count     173.000000
mean     1140.364162
std       658.957025
min         6.000000
25%       562.000000
50%      1171.000000
75%      1728.000000
max      2216.000000
Name: aid, dtype: float64

从正负例比例来看，存在样本分布不均衡问题

In [40]:
train_df['label'].drop_duplicates().describe()

count    2.000000
mean     0.000000
std      1.414214
min     -1.000000
25%     -0.500000
50%      0.000000
75%      0.500000
max      1.000000
Name: label, dtype: float64

In [16]:
print('total length: ', len(train_df))
pos_len = len(train_df[train_df['label'] == 1])
neg_len = len(train_df[train_df['label'] == -1])
print('positive: ', pos_len)
print('negative: ', neg_len)
print('pos/neg:: ', pos_len/neg_len)

total length: 

 

8798814




positive: 

 

421961




negative: 

 

8376853




pos/neg:: 

 

0.05037225793505031




In [18]:
print('test_size: ', len(test_df))

test_size: 

 

2265989




In [5]:
ad_feature.head()

Unnamed: 0,aid,advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType
0,177,8203,76104,1500666,59,282,0,6
1,2050,19441,178687,245165,53,1,0,6
2,1716,5552,158101,1080850,35,27,113,9
3,336,370,4833,119845,22,67,113,9
4,671,45705,352827,660519,42,67,0,4


In [21]:
ad_feature.describe()

Unnamed: 0,aid,advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType
count,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0
mean,1140.364162,13229.202312,159044.612717,938856.7,50.364162,57.33526,2559.277457,7.346821
std,658.957025,23033.243589,184836.397583,521112.3,24.827317,62.197012,5570.602765,2.827595
min,6.0,60.0,80.0,5977.0,20.0,1.0,0.0,4.0
25%,562.0,702.0,31020.0,492484.0,35.0,21.0,0.0,4.0
50%,1171.0,7229.0,76104.0,981822.0,42.0,27.0,0.0,6.0
75%,1728.0,11487.0,209098.0,1383456.0,59.0,67.0,3733.0,11.0
max,2216.0,158679.0,766460.0,1806760.0,109.0,282.0,28986.0,11.0


In [81]:
print('advertiserId number: ', ad_feature['advertiserId'].drop_duplicates().count())
print('campaignId number: ', ad_feature['campaignId'].drop_duplicates().count())

advertiserId number: 

 

79




campaignId number: 

 

138

In [83]:
print('creativeId number: ', ad_feature['creativeId'].drop_duplicates().count())
print('creativeSize number: ', ad_feature['creativeSize'].drop_duplicates().count())

creativeId number: 

 

173




creativeSize number: 

 

15

In [85]:
print('adCategoryId number: ', ad_feature['adCategoryId'].drop_duplicates().count())
print('productId number: ', ad_feature['productId'].drop_duplicates().count())
print('productType number: ', ad_feature['productType'].drop_duplicates().count())

adCategoryId number: 

 

40




productId number: 

 

33




productType number: 

 

4




In [92]:
def create_lookup_tables(words):
    word_counts = Counter(words)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    vocab_to_int = {word: ii for ii, word in enumerate(sorted_vocab)}

    return vocab_to_int

In [103]:
advertiserId_to_int = create_lookup_tables(ad_feature['advertiserId'])
print('advertiserId_to_int:\n', advertiserId_to_int)
campaignId_to_int = create_lookup_tables(ad_feature['campaignId'])
print('campaignId_to_int:\n', campaignId_to_int)
creativeId_to_int = create_lookup_tables(ad_feature['creativeId'])
print('creativeId_to_int:\n', creativeId_to_int)

advertiserId_to_int:


 

{24704: 33, 1025: 34, 16770: 35, 388: 4, 104071: 36, 29704: 37, 45705: 18, 10122: 38, 8203: 0, 25485: 20, 2775: 15, 9106: 6, 915: 3, 149: 78, 21017: 41, 6937: 17, 21359: 71, 285: 21, 8350: 22, 8864: 23, 6946: 24, 13915: 43, 18630: 53, 9619: 69, 19441: 72, 7300: 48, 10055: 26, 133292: 45, 11437: 27, 8494: 9, 5552: 5, 101662: 42, 6841: 47, 1082: 7, 11195: 19, 60: 49, 18621: 28, 702: 1, 821: 46, 58643: 66, 11459: 52, 452: 10, 15174: 25, 327: 14, 43189: 50, 9571: 63, 25420: 55, 2509: 56, 41806: 57, 79: 8, 71505: 75, 5459: 12, 20943: 58, 158679: 40, 728: 59, 47823: 29, 8668: 30, 990: 61, 11487: 16, 14818: 44, 3387: 60, 75748: 64, 27367: 65, 44008: 39, 66025: 67, 14315: 68, 49772: 51, 22802: 70, 7229: 54, 17597: 11, 3993: 13, 83042: 62, 370: 2, 243: 31, 2676: 73, 7926: 74, 7565: 32, 23805: 76, 5758: 77}




campaignId_to_int:


 

{280832: 26, 60929: 27, 38402: 28, 295940: 7, 7686: 29, 23303: 30, 63752: 8, 3372: 9, 154634: 31, 131853: 33, 47118: 10, 19215: 11, 15634: 2, 33813: 34, 668182: 35, 378648: 36, 692763: 37, 111645: 32, 52258: 12, 159012: 39, 49189: 40, 10460: 122, 31020: 0, 115759: 42, 163120: 43, 132657: 44, 95990: 133, 67127: 45, 134068: 13, 352827: 14, 18237: 15, 487541: 105, 178687: 137, 141893: 47, 174407: 49, 76104: 3, 353610: 50, 90700: 51, 741453: 52, 20048: 17, 18296: 54, 199508: 55, 110094: 56, 51385: 1, 42625: 79, 204378: 59, 645468: 60, 12128: 61, 1123: 62, 475236: 63, 42104: 116, 734054: 64, 7527: 65, 48236: 67, 98158: 68, 286065: 69, 51315: 70, 169332: 71, 163957: 72, 42614: 73, 696695: 74, 888: 75, 244601: 76, 420987: 77, 68476: 16, 745599: 78, 13953: 48, 331396: 80, 27030: 91, 308103: 82, 358536: 83, 25739: 4, 135565: 84, 159118: 85, 295567: 86, 531344: 87, 163352: 88, 26003: 18, 404: 90, 158101: 19, 662422: 81, 50305: 66, 36763: 93, 219802: 92, 17378: 95, 12711: 20, 59293: 94, 241577: 9




In [107]:
creativeSize_to_int = create_lookup_tables(ad_feature['creativeSize'])
print('creativeSize_to_int:\n', creativeSize_to_int)
adCategoryId_to_int = create_lookup_tables(ad_feature['adCategoryId'])
print('adCategoryId_to_int:\n', adCategoryId_to_int)
productId_to_int = create_lookup_tables(ad_feature['productId'])
print('productId_to_int:\n', productId_to_int)
productType_to_int = create_lookup_tables(ad_feature['productType'])
print('productType_to_int:\n', productType_to_int)

creativeSize_to_int:


 

{35: 0, 100: 5, 105: 12, 42: 3, 109: 7, 77: 9, 59: 2, 79: 6, 20: 13, 53: 4, 22: 1, 91: 8, 60: 10, 93: 14, 95: 11}




adCategoryId_to_int:


 

{192: 22, 1: 15, 67: 3, 4: 9, 179: 23, 70: 24, 8: 16, 265: 25, 10: 1, 140: 20, 13: 8, 142: 6, 77: 27, 74: 38, 81: 28, 121: 37, 21: 2, 22: 30, 89: 13, 24: 4, 25: 17, 218: 7, 27: 0, 282: 14, 94: 10, 34: 11, 100: 31, 102: 32, 40: 33, 43: 18, 108: 19, 48: 35, 51: 5, 204: 26, 30: 21, 9: 29, 137: 36, 59: 12, 125: 34, 149: 39}




productId_to_int:


 

{0: 0, 25730: 13, 70: 14, 1313: 15, 28986: 31, 439: 32, 17614: 17, 3791: 12, 7992: 19, 3794: 7, 3733: 8, 16791: 20, 5336: 21, 27855: 18, 3194: 23, 15454: 9, 13727: 24, 9760: 25, 12193: 5, 4772: 26, 38: 10, 14314: 27, 1455: 6, 113: 1, 3826: 28, 24947: 16, 11636: 30, 542: 22, 5615: 2, 19256: 11, 4666: 4, 4669: 3, 6131: 29}




productType_to_int:


 

{9: 2, 11: 1, 4: 0, 6: 3}




In [146]:
ad_feature_mapped = ad_feature.copy()
ad_feature_mapped['advertiserId'] = ad_feature['advertiserId'].map(advertiserId_to_int)
ad_feature_mapped['campaignId'] = ad_feature['campaignId'].map(campaignId_to_int)
ad_feature_mapped['creativeId'] = ad_feature['creativeId'].map(creativeId_to_int)
ad_feature_mapped['creativeSize'] = ad_feature['creativeSize'].map(creativeSize_to_int)
ad_feature_mapped['adCategoryId'] = ad_feature['adCategoryId'].map(adCategoryId_to_int)
ad_feature_mapped['productId'] = ad_feature['productId'].map(productId_to_int)
ad_feature_mapped['productType'] = ad_feature['productType'].map(productType_to_int)
ad_feature_mapped.describe()

Unnamed: 0,aid,advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType
count,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0
mean,1140.364162,22.479769,56.647399,86.0,2.895954,7.947977,3.884393,1.300578
std,658.957025,22.578954,42.930277,50.084928,3.306202,9.431063,7.489406,1.111175
min,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,562.0,4.0,17.0,43.0,0.0,1.0,0.0,0.0
50%,1171.0,14.0,51.0,86.0,2.0,4.0,0.0,1.0
75%,1728.0,35.0,94.0,129.0,4.0,11.0,3.0,2.0
max,2216.0,78.0,137.0,172.0,14.0,39.0,32.0,3.0


In [147]:
for k, v in ad_feature_mapped.iteritems():
    if not k == 'aid':
        ad_feature_mapped[k] = ad_feature_mapped[k] / ad_feature_mapped[k].max()
ad_feature_mapped.describe()

Unnamed: 0,aid,advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType
count,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0
mean,1140.364162,0.288202,0.413485,0.5,0.206854,0.203794,0.121387,0.433526
std,658.957025,0.289474,0.31336,0.291191,0.236157,0.241822,0.234044,0.370392
min,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,562.0,0.051282,0.124088,0.25,0.0,0.025641,0.0,0.0
50%,1171.0,0.179487,0.372263,0.5,0.142857,0.102564,0.0,0.333333
75%,1728.0,0.448718,0.686131,0.75,0.285714,0.282051,0.09375,0.666667
max,2216.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [150]:
columns = ['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId', 'productId', 'productType']
ad_feature_mapped.to_csv('data/ad_feature_mapped.csv', columns=columns, index=False)

In [155]:
ad_feature_reload = pd.read_csv('data/ad_feature_mapped.csv')
ad_feature_reload.describe()

Unnamed: 0,aid,advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType
count,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0
mean,1140.364162,0.288202,0.413485,0.5,0.206854,0.203794,0.121387,0.433526
std,658.957025,0.289474,0.31336,0.291191,0.236157,0.241822,0.234044,0.370392
min,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,562.0,0.051282,0.124088,0.25,0.0,0.025641,0.0,0.0
50%,1171.0,0.179487,0.372263,0.5,0.142857,0.102564,0.0,0.333333
75%,1728.0,0.448718,0.686131,0.75,0.285714,0.282051,0.09375,0.666667
max,2216.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### 根据数据分析的结果，对特征进行处理（归一化，独热编码，合并，删除等等），得到最终的特征向量

In [None]:
"""
此处进行各种处理
"""

# 得到最终的数字特征
final_user_feature = pd.DataFrame()
final_ad_feature = pd.DataFrame()


### 将广告特征与用户特征合并到train_df和test_df中，生成训练数据

In [31]:
train_df.head()

Unnamed: 0,aid,uid,label
0,1,1,1
1,2,2,1
2,3,3,-1
3,4,4,-1
4,5,5,1


In [75]:
# 这里要使用处理后的final_user_feature和final_ad_feature
train_merged_1 = pd.merge(train_df, user_feature, on='uid')
test_merged_1 = pd.merge(test_df, user_feature, on='uid')
train_merged_1

Unnamed: 0,aid,uid,label,age,gender,marriageStatus,education,consumptionAbility,LBS,interest1,...,kw3,topic1,topic2,topic3,appIdInstall,appIdAction,ct,os,carrier,house
0,1,1,1,7,1,0,3,1,6,1 2,...,2,34,35,65,23 23 29 43,43 29,3,1,2,1
1,2,2,1,7,1,0,3,1,6,1 2,...,2,34,35,65,23 23 29 43,43 29,3,1,2,1
2,3,3,-1,7,1,0,3,1,6,1 2,...,2,34,35,65,23 23 29 43,43 29,3,1,2,1
3,4,4,-1,2,1,0,3,1,6,1 2,...,2,34,35,65,23 23 29 43,43 29,3,1,2,1
4,5,5,1,7,1,0,3,1,6,1 2,...,2,34,35,65,23 23 29 43,43 29,3,1,2,1
5,6,6,-1,8,1,0,3,1,6,1 2,...,2,34,35,65,23 23 29 43,43 29,3,1,2,1
6,7,7,1,7,1,0,3,1,6,1 2,...,2,34,35,65,23 23 29 43,43 29,3,1,2,1
7,10,10,-1,7,1,0,3,1,6,1 2,...,2,34,35,65,23 23 29 43,43 29,3,1,2,1


In [76]:
train_merged_2 = pd.merge(train_merged_1, ad_feature, on='aid')
test_merged_2 = pd.merge(test_merged_1, ad_feature, on='aid')
train_merged_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 33 columns):
aid                   4 non-null int64
uid                   4 non-null int64
label                 4 non-null int64
age                   4 non-null int64
gender                4 non-null int64
marriageStatus        4 non-null int64
education             4 non-null int64
consumptionAbility    4 non-null int64
LBS                   4 non-null int64
interest1             4 non-null object
interest2             4 non-null int64
interest3             4 non-null int64
interest4             4 non-null int64
interest5             4 non-null int64
kw1                   4 non-null int64
kw2                   4 non-null int64
kw3                   4 non-null int64
topic1                4 non-null int64
topic2                4 non-null int64
topic3                4 non-null int64
appIdInstall          4 non-null object
appIdAction           4 non-null object
ct                    4 non-null int64

In [80]:
train_feature = train_merged_2.drop('label', axis=1)
train_label = train_merged_2['label']

# 为了防止过拟合，需要从训练集中划分出一个验证集来
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label, test_size=0.1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# 此数据集的测试集中没有label，仅用于提交结果，所以需要在训练完毕后对此集做推理运算，得出label
test_feature = test_merged_2

(3, 32)
(1, 32)
(3,)
(1,)


## 通过得到的特征向量与标签，用tensorflow搭建神经网络进行训练

In [43]:
import tensorflow as tf

In [None]:
# 设计神经网络结构
def build_arch(inputs):
    layer1 = tf.layers.dense(inputs, 128, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 128, activation=tf.nn.relu)
    layer3 = tf.layers.dense(layer2, 128, activation=tf.nn.relu)
    outputs = tf.layers.dense(layer3, 1, activation=tf.nn.relu)
    
    return outputs

In [None]:
# 设计损失函数
def build_loss(outputs, labels):
    loss = tf.nn.softmax_cross_entropy_with_logits(outputs, labels)
    return loss

In [None]:
# 设计分数统计函数
def build_score(outputs, labels):
    score = 0
    return score

In [None]:
# 获取上一次保存的模型
def get_last_state(logdir, num_batch):
    ckpt = tf.train.get_checkpoint_state(logdir)
    if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint
        ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
        global_step = int(ckpt_name.split('-')[-1])
        last_epoch = global_step // num_batch
        last_step = global_step % num_batch
    else:
        global_step = 0
        last_epoch = 0
        last_step = 0
    return ckpt, global_step, last_epoch, last_step

### 构建计算图

In [None]:
# 调试超参
lr = 0.1
batch_size = 128
epoch = 1
logdir = 'log/'
save_checkpoint_steps = 20
save_summaries_steps = 10

In [None]:
tf.reset_default_graph()
outputs = build_arch(X_train)
loss = build_loss(outputs, y_train)
score = build_score(outputs, y_train)
opt = tf.train.AdamOptimizer(lr).minimize(loss)

### 训练

In [None]:
num_batch = X_train.shape[0] // batch_size
ckpt, global_step, last_epoch, last_step = get_last_state(logdir, num_batch)
init_op = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(init_op)
    train_writer = tf.summary.FileWriter(logdir + '/train', sess.graph)
    
    if ckpt and ckpt.model_checkpoint_path:
        # 加载上次保存的模型
        saver.restore(sess, ckpt.model_checkpoint_path)
    # 计算图结构分析
    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            tfprof_options=tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    print('total_params: %d\n' % param_stats.total_parameters)
    
    for e in range(last_epoch, epoch):
        print('Training for epoch ' + str(epoch+1) + '/' + str(epoch) + ':')
        
        bar = tqdm(range(last_step, num_batch), initial=last_step, total=num_batch, ncols=100, leave=False,
                       unit='b')
        for _ in bar:
            if global_step % save_summaries_steps == 0:
                # train
                _, train_score, summary_str = sess.run(
                        [opt, score, summary])
                train_writer.add_summary(summary_str, global_step)
                bar.set_description('tr_acc:{}'.format(train_score))
            else:
                sess.run(opt)

            global_step += 1
            if global_step % save_checkpoint_steps == 0:
                saver.save(sess, logdir + '/model.ckpt', global_step=global_step)

    train_writer.close()