In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from sklearn.metrics import f1_score, classification_report
import gc
import xgboost as xgb
from scipy import stats
import datetime
import time
from scipy.stats import entropy, kurtosis
import multiprocessing
from gensim.models.word2vec import LineSentence
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
# Register tqdm's progress_apply helpers on pandas objects.
tqdm.pandas()

# Do not truncate columns or rows when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Base directory; the raw_data/ and feature/ paths below are joined onto it.
current_path = './'
# Seed constant. NOTE(review): not referenced by the visible cells, which
# hard-code 2019 where they seed something.
seed = 2019

In [None]:
# Load the train and test tables from <current_path>/raw_data/.
df_train = pd.read_csv(os.path.join(current_path, 'raw_data', 'train.csv'))
df_test = pd.read_csv(os.path.join(current_path, 'raw_data', 'test.csv'))

In [None]:
# Preview the first rows of the training table.
df_train.head()

In [None]:
# Stack train and test so every feature is computed over both, then order the
# rows by device and exposure time. drop=True discards the old index instead
# of materialising it as a column and deleting it again; the resulting
# 0..n-1 index is what the session-length cell below uses as row positions.
df_feature = pd.concat([df_train, df_test], sort=False)
df_feature = df_feature.sort_values(
    ['deviceid', 'ts']).reset_index(drop=True)

In [None]:
# Store newsid as a string (it is used as a text token further down).
df_feature['newsid'] = df_feature['newsid'].map(str)

In [None]:
# Time features
# `ts` is decoded as epoch milliseconds after adding 8 hours, so the calendar
# fields below are read from a clock that is 8 hours ahead of the raw value.
df_feature['ts_datetime'] = pd.to_datetime(
    df_feature['ts'] + 8 * 60 * 60 * 1000, unit='ms')
for field in ('day', 'hour', 'minute'):
    df_feature[field] = getattr(df_feature['ts_datetime'].dt, field)
# Minute rounded down to the start of its 10-minute bucket.
df_feature['minute10'] = df_feature['minute'] // 10 * 10

# Running hour index, rebased so the earliest hour present is 0.
# NOTE(review): `day` is the day of the month, so this presumably assumes the
# data stays inside one calendar month - confirm.
df_feature['hourl'] = df_feature['day'] * 24 + df_feature['hour']
df_feature['hourl'] -= df_feature['hourl'].min()

# 基本特征

In [None]:
# Session features. A session boundary is a gap of at least 3 minutes between
# two consecutive exposures of the same device; the first / last exposure of a
# device always opens / closes a session because its missing gap is filled
# with exactly 3 minutes.
group = df_feature.groupby('deviceid')
df_feature['ts_before'] = group['ts'].shift(0) - group['ts'].shift(1)
df_feature['ts_before'] = df_feature['ts_before'].fillna(3 * 60 * 1000)
# Index labels of session-opening rows. df_feature carries a 0..n-1 index at
# this point, so the labels double as row positions.
INDEX = df_feature[df_feature['ts_before'] > (3 * 60 * 1000 - 1)].index
# Gap to the previous exposure as log(whole seconds + 1).
df_feature['ts_before'] = np.log(df_feature['ts_before'] // 1000 + 1)
LENGTH = len(INDEX)
# Session i covers rows [INDEX[i], INDEX[i + 1]); the last one runs to the end
# of the frame. Repeating each size over its own rows replaces the former
# Python list-building loop.
session_sizes = np.diff(np.append(INDEX.values, len(df_feature)))
df_feature['ts_before_len'] = np.repeat(session_sizes, session_sizes)
# 1-based running session id.
df_feature['group'] = np.repeat(
    np.arange(1, LENGTH + 1, dtype=np.int64), session_sizes)

group = df_feature.groupby('deviceid')
df_feature['ts_after'] = group['ts'].shift(-1) - group['ts'].shift(0)
df_feature['ts_after'] = df_feature['ts_after'].fillna(3 * 60 * 1000)
# Index labels of session-closing rows.
INDEX = df_feature[df_feature['ts_after'] > (3 * 60 * 1000 - 1)].index
df_feature['ts_after'] = np.log(df_feature['ts_after'] // 1000 + 1)
LENGTH = len(INDEX)
# Session i covers rows (INDEX[i - 1], INDEX[i]]. Prepending -1 gives the first
# session its true size INDEX[0] + 1; the previous code stored INDEX[0] there,
# one less than the size used for every other session.
session_sizes = np.diff(np.insert(INDEX.values, 0, -1))
df_feature['ts_after_len'] = np.repeat(session_sizes, session_sizes)

del session_sizes

In [None]:
# Categorical cross feature: device vendor joined with OS version by '_'.
df_feature['devicevendor_osv'] = df_feature['device_vendor'].astype(
    'str') + '_' + df_feature['osversion'].astype('str')

In [None]:
by_device = df_feature.groupby(['deviceid'])

# pos of the same device's previous / next exposure, and the step to the next.
df_feature['before_pos'] = by_device['pos'].shift(1)
df_feature['next_pos'] = by_device['pos'].shift(-1)
df_feature['diff_pos'] = df_feature['next_pos'] - df_feature['pos']

# Squared coordinate distance to the same device's next exposure.
next_lat = by_device['lat'].shift(-1)
next_lng = by_device['lng'].shift(-1)
df_feature['dist_diff'] = (
    next_lat - df_feature['lat']) ** 2 + (df_feature['lng'] - next_lng) ** 2

# netmodel of the same device's next exposure.
df_feature['next_netmodel'] = by_device['netmodel'].shift(-1)

del next_lat, next_lng, by_device

In [None]:
# Preview the frame after the basic features.
df_feature.head()

# 历史特征

## day 为单位 

In [None]:
# Per device and day, summarise the reaction time (click `timestamp` minus
# exposure `ts`) of the clicked rows, then attach the summary to the rows of
# the following day.
# .copy() makes the filtered frame independent of df_feature, so adding a
# column to it is not a chained assignment on a slice.
df_temp = df_feature[df_feature['target'] == 1].copy()
df_temp['click_minus'] = df_temp['timestamp'] - df_temp['ts']

col = 'deviceid'
col2 = 'click_minus'

df_temp = df_temp.groupby([col, 'day'], as_index=False)[col2].agg({
    'yesterday_{}_{}_max'.format(col, col2): 'max',
    'yesterday_{}_{}_mean'.format(col, col2): 'mean',
    'yesterday_{}_{}_min'.format(col, col2): 'min',
    'yesterday_{}_{}_std'.format(col, col2): 'std',
    'yesterday_{}_{}_median'.format(col, col2): 'median',
    'yesterday_{}_{}_kurt'.format(col, col2): kurtosis,
    'yesterday_{}_{}_q3'.format(col, col2): lambda x: np.quantile(x, q=0.75),
})
# Move the key forward one day so the merge pairs day d rows with day d-1 stats.
# NOTE(review): `day` is the day of the month; a month boundary would not match.
df_temp['day'] += 1

df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')

del df_temp
gc.collect()

In [None]:
# Previous day's click count and click-through rate per deviceid.
col = 'deviceid'
# 'sum' of target = clicks, 'count' of target = rows with a non-null target.
df_temp = df_feature.groupby([col, 'day'], as_index=False)['target'].agg({
    'yesterday_{}_click_count'.format(col): 'sum',
    'yesterday_{}_count'.format(col): 'count',
})
df_temp['yesterday_{}_ctr'.format(col)] = df_temp['yesterday_{}_click_count'.format(col)] \
    / df_temp['yesterday_{}_count'.format(col)]
# Shift the key one day forward so the merge attaches day d-1 stats to day d.
df_temp['day'] += 1
del df_temp['yesterday_{}_count'.format(col)]

df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')

del df_temp
gc.collect()

In [None]:
# Previous day's click-through rate per (deviceid, hour of day).
groups = ['deviceid', 'hour']
df_temp = df_feature.groupby(groups + ['day'], as_index=False)['target'].agg({
    'yesterday_{}_click_count'.format('_'.join(groups)): 'sum',
    'yesterday_{}_count'.format('_'.join(groups)): 'count',
})

df_temp['yesterday_{}_ctr'.format('_'.join(groups))] = df_temp['yesterday_{}_click_count'.format('_'.join(groups))] \
    / df_temp['yesterday_{}_count'.format('_'.join(groups))]
# Shift the key one day forward so the merge attaches day d-1 stats to day d.
df_temp['day'] += 1

# Only the ratio is kept; the two counts are dropped before merging.
del df_temp['yesterday_{}_click_count'.format('_'.join(groups))]
del df_temp['yesterday_{}_count'.format('_'.join(groups))]

df_feature = df_feature.merge(df_temp, on=groups + ['day'], how='left')

del df_temp
gc.collect()

In [None]:
# Previous day's mean exposure `pos` per deviceid.
col = 'deviceid'
df_temp = df_feature.groupby([col, 'day'], as_index=False)['pos'].agg({
    'yesterday_{}_pos_mean'.format(col): 'mean',
})
# Shift the key one day forward so the merge attaches day d-1 stats to day d.
df_temp['day'] += 1

df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')

del df_temp
gc.collect()

In [None]:
# Previous day's click-through rate per (deviceid, netmodel), plus the share of
# the device's clicks that happened on that netmodel.
groups = ['deviceid', 'netmodel']
df_temp = df_feature.groupby(groups + ['day'], as_index=False)['target'].agg({
    'yesterday_{}_click_count'.format('_'.join(groups)): 'sum',
    'yesterday_{}_count'.format('_'.join(groups)): 'count',
})

df_temp['yesterday_{}_ctr'.format('_'.join(groups))] = df_temp['yesterday_{}_click_count'.format('_'.join(groups))] \
    / df_temp['yesterday_{}_count'.format('_'.join(groups))]

# Shift the key one day forward so the merge attaches day d-1 stats to day d.
df_temp['day'] += 1

df_feature = df_feature.merge(df_temp, on=groups + ['day'], how='left')
# Needs `yesterday_deviceid_click_count`, created by the per-deviceid cell above.
df_feature['yesterday_deviceid_netmodel_click_ratio'] = df_feature['yesterday_deviceid_netmodel_click_count'] / \
    df_feature['yesterday_deviceid_click_count']

# The raw counts were only needed for the ratio.
del df_feature['yesterday_{}_click_count'.format('_'.join(groups))]
del df_feature['yesterday_{}_count'.format('_'.join(groups))]

del df_temp
gc.collect()

In [None]:
# Per newsid and day, standard deviation of the reaction time (click
# `timestamp` minus exposure `ts`) of the clicked rows, attached to the rows
# of the following day.
# .copy() makes the filtered frame independent of df_feature, so adding a
# column to it is not a chained assignment on a slice.
df_temp = df_feature[df_feature['target'] == 1].copy()
df_temp['click_minus'] = df_temp['timestamp'] - df_temp['ts']

col = 'newsid'
col2 = 'click_minus'

df_temp = df_temp.groupby([col, 'day'], as_index=False)[col2].agg({
    'yesterday_{}_{}_std'.format(col, col2): 'std',
})
# Move the key forward one day so the merge pairs day d rows with day d-1 stats.
df_temp['day'] += 1

df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')

del df_temp
gc.collect()

In [None]:
# Previous day's click count and click-through rate per newsid.
col = 'newsid'
df_temp = df_feature.groupby([col, 'day'], as_index=False)['target'].agg({
    'yesterday_{}_click_count'.format(col): 'sum',
    'yesterday_{}_count'.format(col): 'count',
})
df_temp['yesterday_{}_ctr'.format(col)] = df_temp['yesterday_{}_click_count'.format(col)] \
    / df_temp['yesterday_{}_count'.format(col)]

# Shift the key one day forward so the merge attaches day d-1 stats to day d.
df_temp['day'] += 1
del df_temp['yesterday_{}_count'.format(col)]

df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')

del df_temp
gc.collect()

In [None]:
# Previous day's click-through rate per next_pos value.
col = 'next_pos'
df_temp = df_feature.groupby([col, 'day'], as_index=False)['target'].agg({
    'yesterday_{}_click_count'.format(col): 'sum',
    'yesterday_{}_count'.format(col): 'count',
})
df_temp['yesterday_{}_ctr'.format(col)] = df_temp['yesterday_{}_click_count'.format(col)] \
    / df_temp['yesterday_{}_count'.format(col)]

# Shift the key one day forward so the merge attaches day d-1 stats to day d.
df_temp['day'] += 1

# Only the ratio is kept; both counts are dropped before merging.
del df_temp['yesterday_{}_count'.format(col)]
del df_temp['yesterday_{}_click_count'.format(col)]

df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')

del df_temp
gc.collect()

In [None]:
# Previous day's co-occurrence ratio of each (f1, f2) pair:
#     count(f1, day) * count(f2, day) / count(f1, f2, day)
# The ratio is constant inside a (f1, f2, day) cell, so the former
# groupby([f1, f2, 'day']).shift() only ever returned the same day's value
# (NaN on the first row of each cell). The value is now attached through the
# day + 1 merge used by the other yesterday_* features.
cat_list = tqdm([['deviceid', 'netmodel']])
for f1, f2 in cat_list:
    pair_keys = [f1, f2, 'day']
    feat = 'yesterday_{}_coratio'.format('_'.join([f1, f2]))

    f1_count = df_feature.groupby([f1, 'day'])['id'].transform('count')
    f2_count = df_feature.groupby([f2, 'day'])['id'].transform('count')
    pair_count = df_feature.groupby(pair_keys)['id'].transform('count')

    # One row per (f1, f2, day) cell is enough for the lookup table.
    df_temp = df_feature[pair_keys].copy()
    df_temp[feat] = f1_count * f2_count / pair_count
    df_temp = df_temp.drop_duplicates(pair_keys)
    # Shift the key one day forward so day d rows receive day d-1's ratio.
    df_temp['day'] += 1

    df_feature = df_feature.merge(df_temp, on=pair_keys, how='left')

    del df_temp, f1_count, f2_count, pair_count
    gc.collect()

In [None]:
# Preview the frame after the day-level history features.
df_feature.head()

## 以 hour 为单位

In [None]:
# Number of rows (non-null `id`) each deviceid had in the previous hour.
col = 'deviceid'
df_temp = df_feature.groupby([col, 'hourl'], as_index=False)['id'].agg({
    'pre_hour_{}_count'.format(col): 'count',
})
# Shift the key one hour forward so hour h rows receive hour h-1's count.
df_temp['hourl'] += 1

df_feature = df_feature.merge(df_temp, on=[col, 'hourl'], how='left')

del df_temp
gc.collect()

In [None]:
# Preview the frame after the hour-level history feature.
df_feature.head()

# 统计特征

In [None]:
# Row counts (non-null `id`) of several key combinations inside different
# scopes. Each spec is (column-name suffix, scope columns, key combinations);
# the specs are processed in order so the new columns keep their order.
count_specs = [
    ('day_count', ['day'],
     [['deviceid'], ['guid'], ['newsid'], ['deviceid', 'pos'], ['newsid', 'pos'],
      ['deviceid', 'guid', 'newsid'], ['deviceid', 'next_pos']]),
    ('minute10_count', ['day', 'hour', 'minute10'],
     [['deviceid'], ['guid'], ['deviceid', 'pos'], ['deviceid', 'netmodel']]),
    ('hour_count', ['hourl'],
     [['deviceid', 'netmodel']]),
    # No scope columns: counted over the whole frame.
    ('count', [],
     [['deviceid', 'group', 'pos']]),
]
for suffix, scope, cat_list in count_specs:
    for f in tqdm(cat_list):
        df_feature['{}_{}'.format('_'.join(f), suffix)] = df_feature.groupby(
            scope + f)['id'].transform('count')

del count_specs

In [None]:
# Mean and standard deviation of `ts_before` inside each session (`group`).
col = 'group'
df_temp = df_feature.groupby([col], as_index=False)['ts_before'].agg({
    '{}_ts_before_mean'.format(col): 'mean',
    '{}_ts_before_std'.format(col): 'std'
})
df_feature = df_feature.merge(df_temp, on=col, how='left')

del df_temp
gc.collect()

In [None]:
# Mean, std, median and skew of `ts_after` over all rows of each deviceid.
col = 'deviceid'
df_temp = df_feature.groupby([col], as_index=False)['ts_after'].agg({
    '{}_ts_after_mean'.format('deviceid'): 'mean',
    '{}_ts_after_std'.format('deviceid'): 'std',
    '{}_ts_after_median'.format('deviceid'): 'median',
    '{}_ts_after_skew'.format('deviceid'): 'skew',
})
df_feature = df_feature.merge(df_temp, on=col, how='left')

del df_temp
gc.collect()

In [None]:
# Average number of rows per active hour for each deviceid: first the group
# size of every (deviceid, hourl) pair, then the mean of those sizes per device.
df_temp = df_feature.groupby(['deviceid', 'hourl'], as_index=False)[
    'target'].agg({'hour_count': 'size'})
df_temp = df_temp.groupby(['deviceid'], as_index=False)['hour_count'].agg({
    '{}_hour_count_mean'.format('deviceid'): 'mean'
})

# No `on=` given: pandas joins on the columns the two frames share.
df_feature = df_feature.merge(df_temp, how='left')

del df_temp
gc.collect()

In [None]:
# 0-based running position of each row inside its (deviceid, hourl) group.
# Despite the column name this is cumcount(), not a cumulative sum.
df_feature['deviceid_hour_cumsum'] = df_feature.groupby(['deviceid', 'hourl'])[
    'ts'].cumcount()

In [None]:
# Change of each device's daily row count relative to the device's previous
# day present in the frame (rows are already ordered by deviceid, ts).
df_temp = df_feature[
    ['deviceid', 'day', 'deviceid_day_count']].drop_duplicates().copy()
df_temp['deviceid_day_count_diff_1'] = df_temp.groupby(
    ['deviceid'])['deviceid_day_count'].diff()
df_temp = df_temp.drop('deviceid_day_count', axis=1)

# No `on=` given: the join uses the shared columns, here deviceid and day.
df_feature = df_feature.merge(df_temp, how='left')

del df_temp
gc.collect()

In [None]:
# Intended as the (deviceid, netmodel) exposure count of the coming hour.
# NOTE(review): `temp` is the count of the row's own (hourl, deviceid, netmodel)
# group and shift(-1) moves by one row, not one hour, so the feature is the
# next row's hourly count - confirm this is what is wanted.
cat_list = [['deviceid', 'netmodel']]
for f in tqdm(cat_list):
    df_feature['temp'] = df_feature.groupby(
        ['hourl'] + f)['id'].transform('count')
    df_feature['next_{}_hour_count'.format('_'.join(f))] = df_feature.groupby(f)[
        'temp'].shift(-1)

    del df_feature['temp']

In [None]:
# Preview the frame after the statistical features.
df_feature.head()

# ts 相关特征

In [None]:
# Time elapsed since the gap-th previous exposure of the same deviceid.
f = ['deviceid']
by_ts = df_feature.sort_values('ts').reset_index(drop=True)
ts_in_group = by_ts.groupby(f)['ts']
for gap in tqdm([2, 3, 4, 5, 8, 10, 20, 30]):
    gap_col = '{}_prev{}_exposure_ts_gap'.format('_'.join(f), gap)
    by_ts[gap_col] = ts_in_group.shift(0) - ts_in_group.shift(gap)
    # Keep one row per (keys, ts) so the join cannot multiply rows.
    lookup = by_ts[f + ['ts', gap_col]].drop_duplicates(
        f + ['ts']).reset_index(drop=True)
    df_feature = df_feature.merge(lookup, on=f + ['ts'], how='left')

del lookup, by_ts, ts_in_group, gap_col
gc.collect()

In [None]:
# Time elapsed since the gap-th previous exposure of the same
# (netmodel, deviceid) pair.
f = ['netmodel', 'deviceid']
by_ts = df_feature.sort_values('ts').reset_index(drop=True)
ts_in_group = by_ts.groupby(f)['ts']
for gap in tqdm([2, 3]):
    gap_col = '{}_prev{}_exposure_ts_gap'.format('_'.join(f), gap)
    by_ts[gap_col] = ts_in_group.shift(0) - ts_in_group.shift(gap)
    # Keep one row per (keys, ts) so the join cannot multiply rows.
    lookup = by_ts[f + ['ts', gap_col]].drop_duplicates(
        f + ['ts']).reset_index(drop=True)
    df_feature = df_feature.merge(lookup, on=f + ['ts'], how='left')

del lookup, by_ts, ts_in_group, gap_col
gc.collect()

In [None]:
# Time remaining until the gap-th next exposure of the same deviceid.
f = ['deviceid']
by_ts = df_feature.sort_values('ts').reset_index(drop=True)
ts_in_group = by_ts.groupby(f)['ts']
for gap in tqdm([2, 3, 4, 5, 8, 10, 20, 30, 50]):
    gap_col = '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)
    by_ts[gap_col] = ts_in_group.shift(-gap) - ts_in_group.shift(0)
    # Keep one row per (keys, ts) so the join cannot multiply rows.
    lookup = by_ts[f + ['ts', gap_col]].drop_duplicates(
        f + ['ts']).reset_index(drop=True)
    df_feature = df_feature.merge(lookup, on=f + ['ts'], how='left')

del lookup, by_ts, ts_in_group, gap_col
gc.collect()

In [None]:
# Time remaining until the gap-th next exposure of the same (pos, deviceid) pair.
f = ['pos', 'deviceid']
by_ts = df_feature.sort_values('ts').reset_index(drop=True)
ts_in_group = by_ts.groupby(f)['ts']
for gap in tqdm([1, 2]):
    gap_col = '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)
    by_ts[gap_col] = ts_in_group.shift(-gap) - ts_in_group.shift(0)
    # Keep one row per (keys, ts) so the join cannot multiply rows.
    lookup = by_ts[f + ['ts', gap_col]].drop_duplicates(
        f + ['ts']).reset_index(drop=True)
    df_feature = df_feature.merge(lookup, on=f + ['ts'], how='left')

del lookup, by_ts, ts_in_group, gap_col
gc.collect()

In [None]:
# Time remaining until the gap-th next exposure of the same
# (netmodel, deviceid) pair.
f = ['netmodel', 'deviceid']
by_ts = df_feature.sort_values('ts').reset_index(drop=True)
ts_in_group = by_ts.groupby(f)['ts']
for gap in tqdm([1, 2]):
    gap_col = '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)
    by_ts[gap_col] = ts_in_group.shift(-gap) - ts_in_group.shift(0)
    # Keep one row per (keys, ts) so the join cannot multiply rows.
    lookup = by_ts[f + ['ts', gap_col]].drop_duplicates(
        f + ['ts']).reset_index(drop=True)
    df_feature = df_feature.merge(lookup, on=f + ['ts'], how='left')

del lookup, by_ts, ts_in_group, gap_col
gc.collect()

In [None]:
# Time remaining until the next exposure of the same
# (pos, netmodel, deviceid) triple.
f = ['pos', 'netmodel', 'deviceid']
by_ts = df_feature.sort_values('ts').reset_index(drop=True)
ts_in_group = by_ts.groupby(f)['ts']
for gap in tqdm([1]):
    gap_col = '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)
    by_ts[gap_col] = ts_in_group.shift(-gap) - ts_in_group.shift(0)
    # Keep one row per (keys, ts) so the join cannot multiply rows.
    lookup = by_ts[f + ['ts', gap_col]].drop_duplicates(
        f + ['ts']).reset_index(drop=True)
    df_feature = df_feature.merge(lookup, on=f + ['ts'], how='left')

del lookup, by_ts, ts_in_group, gap_col
gc.collect()

In [None]:
# Location key: longitude and latitude rendered as text and joined by '_'.
df_feature['lng_lat'] = df_feature['lng'].astype(
    'str') + '_' + df_feature['lat'].astype('str')

# Time remaining until the next exposure of the same (deviceid, lng_lat) pair.
f = ['deviceid', 'lng_lat']
by_ts = df_feature.sort_values('ts').reset_index(drop=True)
ts_in_group = by_ts.groupby(f)['ts']
for gap in tqdm([1]):
    gap_col = '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)
    by_ts[gap_col] = ts_in_group.shift(-gap) - ts_in_group.shift(0)
    # Keep one row per (keys, ts) so the join cannot multiply rows.
    lookup = by_ts[f + ['ts', gap_col]].drop_duplicates(
        f + ['ts']).reset_index(drop=True)
    df_feature = df_feature.merge(lookup, on=f + ['ts'], how='left')

del lookup, by_ts, ts_in_group, gap_col
gc.collect()

In [None]:
# Time remaining until the next exposure of the same
# (pos, deviceid, lng_lat) triple.
f = ['pos', 'deviceid', 'lng_lat']
by_ts = df_feature.sort_values('ts').reset_index(drop=True)
ts_in_group = by_ts.groupby(f)['ts']
for gap in tqdm([1]):
    gap_col = '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)
    by_ts[gap_col] = ts_in_group.shift(-gap) - ts_in_group.shift(0)
    # Keep one row per (keys, ts) so the join cannot multiply rows.
    lookup = by_ts[f + ['ts', gap_col]].drop_duplicates(
        f + ['ts']).reset_index(drop=True)
    df_feature = df_feature.merge(lookup, on=f + ['ts'], how='left')

del lookup, by_ts, ts_in_group, gap_col
gc.collect()

In [None]:
# pos of the same device's 2nd to 7th following exposure.
pos_by_device = df_feature.groupby(['deviceid'])['pos']
for gap in tqdm([2, 3, 4, 5, 6, 7]):
    df_feature['next_pos{}'.format(gap)] = pos_by_device.shift(-gap)

del pos_by_device

In [None]:
# Combine next_pos and ts_after into one number: next_pos scaled by 100 plus
# the log-seconds gap to the next exposure.
df_feature['next_pos_ts'] = df_feature['next_pos'] * \
    100 + df_feature['ts_after']

In [None]:
# Preview the frame after the ts-based features.
df_feature.head()

# user 表

In [None]:
# Load the user table and attach its `level` column by (deviceid, guid).
df_user = pd.read_csv(os.path.join(current_path, 'raw_data', 'user.csv'))
df_feature = df_feature.merge(
    df_user[['deviceid', 'guid', 'level']], how='left', on=['deviceid', 'guid'])

In [None]:
# Split each user's `tag` string ('|'-separated entries of the form key:value)
# into one (deviceid, tag, score) row per entry, then summarise the scores per
# deviceid.
node_pairs = []
for item in tqdm(df_user[['deviceid', 'tag']].values):
    deviceid = str(item[0])
    tags = item[1]

    # A missing tag arrives as a float (NaN); only strings are parsed.
    if type(tags) != float:
        for tag in tags.split('|'):
            try:
                key, value = tag.split(':')
            except ValueError:
                # Entry without exactly one ':' - skip it. Falling through
                # here used to re-append the previous entry's key/value.
                continue
            node_pairs.append([deviceid, key, value])

df_tag = pd.DataFrame(node_pairs, columns=['deviceid', 'tag', 'score'])
df_tag['score'] = df_tag['score'].astype('float')

df_temp = df_tag.groupby(['deviceid'])['score'].agg({'tag_score_mean': 'mean',
                                                     'tag_score_std': 'std',
                                                     'tag_score_count': 'count',
                                                     'tag_score_q2': lambda x: np.quantile(x, q=0.5),
                                                     'tag_score_q3': lambda x: np.quantile(x, q=0.75),
                                                     }).reset_index()

# deviceid is the only column the two frames share; name it explicitly.
df_feature = df_feature.merge(df_temp, on='deviceid', how='left')

del df_temp
del df_tag

gc.collect()

# embedding

In [None]:
from gensim.models import Word2Vec


def emb(df, f1, f2):
    """Embed every `f1` value as the mean Word2Vec vector of its `f2` values.

    The `f2` values of each `f1` group are collected into one "sentence" of
    string tokens, a 16-dimensional Word2Vec model is trained on those
    sentences, and each group is represented by the average vector of its
    in-vocabulary tokens (all zeros when none of them is in the vocabulary).

    Args:
        df: frame containing the columns `f1` and `f2`.
        f1: name of the column that defines the groups.
        f2: name of the column whose values become the tokens.

    Returns:
        DataFrame with the `f1` column followed by the columns
        '<f1>_<f2>_emb_0' ... '<f1>_<f2>_emb_15', one row per group.
    """
    emb_size = 16
    print('====================================== {} {} ======================================'.format(f1, f2))
    list_col = '{}_{}_list'.format(f1, f2)
    tmp = df.groupby(f1, as_index=False)[f2].agg({list_col: list})
    sentences = [[str(x) for x in seq] for seq in tmp[list_col].values.tolist()]
    del tmp[list_col]
    model = Word2Vec(sentences, size=emb_size, window=5,
                     min_count=5, sg=0, hs=1, seed=2019)
    # Look tokens up through the KeyedVectors object; membership tests and
    # indexing on the model itself are deprecated.
    word_vectors = model.wv
    emb_matrix = []
    for seq in sentences:
        vec = [word_vectors[w] for w in seq if w in word_vectors]
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            # No token of this group met min_count.
            emb_matrix.append([0] * emb_size)

    df_emb = pd.DataFrame(emb_matrix)
    df_emb.columns = ['{}_{}_emb_{}'.format(
        f1, f2, i) for i in range(emb_size)]

    # tmp and df_emb are row-aligned: both follow the order of `sentences`.
    tmp = pd.concat([tmp, df_emb], axis=1)

    del model, word_vectors, emb_matrix, sentences
    return tmp

In [None]:
# Attach the emb() vectors of two groupings: deviceid tokens grouped by newsid
# and deviceid tokens grouped by lng_lat.
for f1, f2 in [['newsid', 'deviceid'], ['lng_lat', 'deviceid']]:
    df_feature = df_feature.merge(emb(df_feature, f1, f2), on=f1, how='left')

In [None]:
# Pair token "<deviceid>_<newsid>".
df_feature['o_d'] = df_feature['deviceid'].astype(
    str)+'_'+df_feature['newsid'].astype(str)

# One three-token sentence per row: deviceid, newsid and the pair token.
# NOTE(review): fillna('-1') runs after astype(str), when missing values are
# already the text 'nan', so it has no effect.
sentence = df_feature[['deviceid', 'newsid', 'o_d']].astype(
    str).fillna('-1').astype(str).values
sentence = sentence.tolist()
print('training...')
np.random.seed(2019)

L = 5
model = Word2Vec(sentence, size=L, window=20, min_count=3,
                 workers=multiprocessing.cpu_count(), iter=10)
print('outputing...')

# Look tokens up through the KeyedVectors object; membership tests and
# indexing on the model itself are deprecated.
word_vectors = model.wv
for fea in tqdm(['deviceid', 'newsid', 'o_d']):
    values = df_feature[fea].unique()
    print(len(values))
    w2v = []
    for i in values:
        a = [i]
        if str(i) in word_vectors:
            a.extend(word_vectors[str(i)])
        else:
            # Token seen fewer than min_count times: constant -10 vector.
            a.extend(np.ones(L) * -10)
        w2v.append(a)
    # Column names follow the vector size L instead of a fixed list of five.
    w2v = pd.DataFrame(w2v, columns=[fea] + [
        '{}_w2v_{}'.format(fea, k) for k in range(1, L + 1)])
    df_feature = df_feature.merge(w2v, on=fea, how='left')

del word_vectors

In [None]:
# Pair token "<lng>_<lat>".
df_feature['o_d1'] = df_feature['lng'].astype(
    str)+'_'+df_feature['lat'].astype(str)

# One three-token sentence per row: lng, lat and the pair token.
# NOTE(review): fillna('-1') runs after astype(str), when missing values are
# already the text 'nan', so it has no effect.
sentence = df_feature[['lng', 'lat', 'o_d1']].astype(
    str).fillna('-1').astype(str).values
sentence = sentence.tolist()
print('training...')
np.random.seed(2019)

L = 5
model = Word2Vec(sentence, size=L, window=20, min_count=3,
                 workers=multiprocessing.cpu_count(), iter=10)
print('outputing...')

# Look tokens up through the KeyedVectors object; membership tests and
# indexing on the model itself are deprecated.
word_vectors = model.wv
for fea in tqdm(['lng', 'lat', 'o_d1']):
    values = df_feature[fea].unique()
    print(len(values))
    w2v = []
    for i in values:
        a = [i]
        if str(i) in word_vectors:
            a.extend(word_vectors[str(i)])
        else:
            # Token seen fewer than min_count times: constant -10 vector.
            a.extend(np.ones(L) * -10)
        w2v.append(a)
    # Column names follow the vector size L instead of a fixed list of five.
    w2v = pd.DataFrame(w2v, columns=[fea] + [
        '{}_w2v_{}'.format(fea, k) for k in range(1, L + 1)])
    df_feature = df_feature.merge(w2v, on=fea, how='left')

del word_vectors

# 减少内存

In [None]:
# Shrink numeric columns to the narrowest dtype whose range holds their values.
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of `df` in place and return `df`.

    Integer columns move to the first of int8/int16/int32/int64 whose open
    range (min, max) contains the column's minimum and maximum. Float columns
    move to float16 or float32 on the same range test and to float64
    otherwise. Columns whose dtype is not one of int16/int32/int64/
    float16/float32/float64 are left untouched. Only the value range is
    checked, so a float downcast can lose precision.

    Args:
        df: frame to shrink; its columns are reassigned in place.
        verbose: when true, print the final size and the percentage saved.

    Returns:
        The same `df` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in tqdm(df.columns):
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type).startswith('int'):
            # First integer type that fits wins; if none fits, keep the dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback (also reached when min/max are NaN).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Downcast the numeric feature columns before saving.
df_feature = reduce_mem_usage(df_feature)

In [None]:
# Write the feature frame to <current_path>/feature/feature.pickle.
df_feature.to_pickle(os.path.join(current_path, 'feature', 'feature.pickle'))