In [25]:
#coding:utf-8
import pandas as pd
import numpy as np
import time
import datetime
import gc
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import lightgbm as lgb

In [26]:
#删除Instance_id和day两列
def pre_process(data):

    cols = data.columns.tolist()
    keys = ['instance_id', 'day']

    #修改1
    for k in keys:
        if k in cols:
            cols.remove(k)

    return data, cols

In [27]:

def zuhe(data):

    for col in ['user_gender_id','user_age_level','user_occupation_id','user_star_level']:
        #修改3
        if col not in data.columns:
            print(f"Column '{col}' not found in the dataset.")
            continue
        data[col] = data[col].apply(lambda x: 0 if x == -1 else x)

    for col in ['item_sales_level', 'item_price_level', 'item_collected_level',
                'user_gender_id','user_age_level','user_occupation_id','user_star_level',
                'shop_review_num_level', 'shop_star_level']:
        data[col] = data[col].astype(str)

    print('item两两组合')
    data['sale_price'] = data['item_sales_level'] + data['item_price_level'] #新指标，衡量item的sales和price水平
    data['sale_collect'] = data['item_sales_level'] + data['item_collected_level'] #结合销量和收藏
    data['price_collect'] = data['item_price_level'] + data['item_collected_level'] #结合价格和收藏

    print('user两两组合')
    data['gender_star'] = data['user_gender_id'] + data['user_star_level'] #结合用户性别和星级

    print('shop两两组合')
    data['review_star'] = data['shop_review_num_level'] + data['shop_star_level'] #结合店铺评论数量和星级
#修改2
#     # 将字符串 "nan" 替换为 NaN
#     data.replace("nan", np.nan, inplace=True)

#     # 填充缺失值为零
#     data.fillna(0, inplace=True)
    
    for col in ['item_sales_level', 'item_price_level', 'item_collected_level',  'sale_price','sale_collect', 'price_collect',
                'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']:
        data[col] = data[col].astype(int)

    del data['review_star'] 

    return data

In [28]:
#对商品（item）相关的特征进行一些统计和概率计算

def item(data):

    print('一个item有多少brand,price salse collected level……')

    #统计每个商品（item）的数量，新列名：item_cnt
    itemcnt = data.groupby(['item_id'], as_index=False)['instance_id'].agg({'item_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_id'], how='left')
    
    # 去除 'item_brand_id','item_city_id'
    #对每个item的价格，销量，收藏，pv统计，并计算相对总量的概率（item_prob, 
    #eg: 对 item_price_level 进行了统计，计算了每个商品在不同 item_price_level 下的数量以及相对于总数量的概率。
    #item1*2, price3price2, forprice3 prob=1/2。就是看item的price/sales...分布
    for col in ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']:
        itemcnt = data.groupby([col, 'item_id'], as_index=False)['instance_id'].agg({str(col) + '_item_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_id'], how='left')
        data[str(col) + '_item_prob']=data[str(col) + '_item_cnt']/data['item_cnt']
        del data[str(col) + '_item_cnt']
    del data['item_cnt']

    print('一个brand有多少price salse collected level……')

    itemcnt = data.groupby(['item_brand_id'], as_index=False)['instance_id'].agg({'item_brand_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_brand_id'], how='left')

    for col in ['item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']:
        itemcnt = data.groupby([col, 'item_brand_id'], as_index=False)['instance_id'].agg({str(col) + '_brand_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_brand_id'], how='left')
        data[str(col) + '_brand_prob'] = data[str(col) + '_brand_cnt'] / data['item_brand_cnt']
        del data[str(col) + '_brand_cnt']
    del data['item_brand_cnt']

    print('一个city有多少item_price_level，item_sales_level，item_collected_level，item_pv_level')

    itemcnt = data.groupby(['item_city_id'], as_index=False)['instance_id'].agg({'item_city_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_city_id'], how='left')
    for col in ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']:
        itemcnt = data.groupby([col, 'item_city_id'], as_index=False)['instance_id'].agg({str(col) + '_city_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_city_id'], how='left')
        data[str(col) + '_city_prob'] = data[str(col) + '_city_cnt'] / data['item_city_cnt']
        del data[str(col) + '_city_cnt']
    del data['item_city_cnt']

    print('一个price有多少item_sales_level，item_collected_level，item_pv_level')

    itemcnt = data.groupby(['item_price_level'], as_index=False)['instance_id'].agg({'item_price_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_price_level'], how='left')
    for col in ['item_sales_level', 'item_collected_level', 'item_pv_level']:
        itemcnt = data.groupby([col, 'item_city_id'], as_index=False)['instance_id'].agg({str(col) + '_price_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_city_id'], how='left')
        data[str(col) + '_price_prob'] = data[str(col) + '_price_cnt'] / data['item_price_cnt']
        del data[str(col) + '_price_cnt']
    del data['item_price_cnt']

    print('一个item_sales_level有多少item_collected_level，item_pv_level')

    itemcnt = data.groupby(['item_sales_level'], as_index=False)['instance_id'].agg({'item_salse_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_sales_level'], how='left')
    for col in ['item_collected_level', 'item_pv_level']:
        itemcnt = data.groupby([col, 'item_sales_level'], as_index=False)['instance_id'].agg({str(col) + '_salse_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_sales_level'], how='left')
        data[str(col) + '_salse_prob'] = data[str(col) + '_salse_cnt'] / data['item_salse_cnt']
        del data[str(col) + '_salse_cnt']
    del data['item_salse_cnt']

    print('一个item_collected_level有多少item_pv_level')

    itemcnt = data.groupby(['item_collected_level'], as_index=False)['instance_id'].agg({'item_coll_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_collected_level'], how='left')
    for col in ['item_pv_level']:
        itemcnt = data.groupby([col, 'item_collected_level'], as_index=False)['instance_id'].agg({str(col) + '_coll_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_collected_level'], how='left')
        data[str(col) + '_coll_prob'] = data[str(col) + '_coll_cnt'] / data['item_coll_cnt']
        del data[str(col) + '_coll_cnt']
    del data['item_coll_cnt']
 
    return data
#这样的统计信息可以帮助理解每个商品的销售情况，以及与其他特征的关系。

In [29]:
#对用户和商品之间的关系进行一些统计和概率计算
def user_item(data):

    itemcnt = data.groupby(['user_id'], as_index=False)['instance_id'].agg({'user_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_id'], how='left')

    print('一个user有多少item_id,item_brand_id……')
    for col in ['item_id','item_category_list',
                'item_brand_id','item_city_id','item_price_level',
                'item_sales_level','item_collected_level','item_pv_level']:
        item_shop_cnt = data.groupby([col, 'user_id'], as_index=False)['instance_id'].agg({str(col)+'_user_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_id'], how='left')
        data[str(col) + '_user_prob'] = data[str(col) + '_user_cnt'] / data['user_cnt']
        del data[str(col) + '_user_cnt']

    print('一个user_gender有多少item_id,item_brand_id……')
    itemcnt = data.groupby(['user_gender_id'], as_index=False)['instance_id'].agg({'user_gender_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_gender_id'], how='left')
    for col in ['item_id', 'item_category_list',
                'item_brand_id','item_city_id','item_price_level',
                'item_sales_level','item_collected_level','item_pv_level']:
        item_shop_cnt = data.groupby([col, 'user_gender_id'], as_index=False)['instance_id'].agg({str(col)+'_user_gender_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_gender_id'], how='left')
        data[str(col) + '_user_gender_prob'] = data[str(col) + '_user_gender_cnt'] / data['user_gender_cnt']
        del data[str(col) + '_user_gender_cnt']

    print('一个user_age_level有多少item_id,item_brand_id……')
    itemcnt = data.groupby(['user_age_level'], as_index=False)['instance_id'].agg({'user_age_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_age_level'], how='left')
    for col in ['item_id', 'item_category_list',
                'item_brand_id','item_city_id','item_price_level',
                'item_sales_level','item_collected_level','item_pv_level']:
        item_shop_cnt = data.groupby([col, 'user_age_level'], as_index=False)['instance_id'].agg({str(col)+'_user_age_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_age_level'], how='left')
        data[str(col) + '_user_age_prob'] = data[str(col) + '_user_age_cnt'] / data['user_age_cnt']
        del data[str(col) + '_user_age_cnt']

    print('一个user_occupation_id有多少item_id,item_brand_id…')
    itemcnt = data.groupby(['user_occupation_id'], as_index=False)['instance_id'].agg({'user_occ_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_occupation_id'], how='left')
    for col in ['item_id', 'item_category_list',
                'item_brand_id','item_city_id','item_price_level',
                'item_sales_level','item_collected_level','item_pv_level']:
        item_shop_cnt = data.groupby([col, 'user_occupation_id'], as_index=False)['instance_id'].agg({str(col)+'_user_occ_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_occupation_id'], how='left')
        data[str(col) + '_user_occ_prob'] = data[str(col) + '_user_occ_cnt'] / data['user_occ_cnt']
        del data[str(col) + '_user_occ_cnt']

    return data

#这个函数的目的是通过统计用户与商品的关系，为每个用户创建一些统计信息和概率信息，
#这样的统计信息可以帮助理解每个用户与不同商品特征之间的关系。

In [30]:
def user_shop(data):

    print('一个user有多少shop_id,shop_review_num_level……')
    for col in ['shop_id', 'shop_review_num_level', 'shop_star_level']:
        item_shop_cnt = data.groupby([col, 'user_id'], as_index=False)['instance_id'].agg(
            {str(col) + '_user_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_id'], how='left')
        data[str(col) + '_user_prob'] = data[str(col) + '_user_cnt'] / data['user_cnt']
        del data[str(col) + '_user_cnt']
    del data['user_cnt']

    print('一个user_gender有多少shop_id,shop_review_num_level……')
    for col in ['shop_id', 'shop_review_num_level', 'shop_star_level']:
        item_shop_cnt = data.groupby([col, 'user_gender_id'], as_index=False)['instance_id'].agg(
            {str(col) + '_user_gender_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_gender_id'], how='left')
        data[str(col) + '_user_gender_prob'] = data[str(col) + '_user_gender_cnt'] / data['user_gender_cnt']
        del data[str(col) + '_user_gender_cnt']
    del data['user_gender_cnt']

    print('一个user_age_level有多少shop_id,shop_review_num_level……')
    for col in ['shop_id', 'shop_review_num_level', 'shop_star_level']:
        item_shop_cnt = data.groupby([col, 'user_age_level'], as_index=False)['instance_id'].agg(
            {str(col) + '_user_age_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_age_level'], how='left')
        data[str(col) + '_user_age_prob'] = data[str(col) + '_user_age_cnt'] / data['user_age_cnt']
        del data[str(col) + '_user_age_cnt']
    del data['user_age_cnt']

    print('一个user_occupation_id有多少shop_id,shop_review_num_level……')
    for col in ['shop_id', 'shop_review_num_level', 'shop_star_level']:
        item_shop_cnt = data.groupby([col, 'user_occupation_id'], as_index=False)['instance_id'].agg(
            {str(col) + '_user_occ_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_occupation_id'], how='left')
        data[str(col) + '_user_occ_prob'] = data[str(col) + '_user_occ_cnt'] / data['user_occ_cnt']
        del data[str(col) + '_user_occ_cnt']
    del data['user_occ_cnt']

    return data


In [31]:
def shop_item(data):
    
    print('一个shop有多少item_id,item_brand_id,item_city_id,item_price_level……')
    itemcnt = data.groupby(['shop_id'], as_index=False)['instance_id'].agg({'shop_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['shop_id'], how='left')
    for col in ['item_id',
                'item_brand_id','item_city_id','item_price_level',
                'item_sales_level','item_collected_level','item_pv_level']:
        item_shop_cnt = data.groupby([col, 'shop_id'], as_index=False)['instance_id'].agg({str(col)+'_shop_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'shop_id'], how='left')
        data[str(col) + '_shop_prob'] = data[str(col) + '_shop_cnt'] / data['shop_cnt']
        del data[str(col) + '_shop_cnt']
    del data['shop_cnt']

    print('一个shop_review_num_level有多少item_id,item_brand_id,item_city_id,item_price_level……')
    itemcnt = data.groupby(['shop_review_num_level'], as_index=False)['instance_id'].agg({'shop_rev_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['shop_review_num_level'], how='left')
    for col in ['item_id',
                'item_brand_id','item_city_id','item_price_level',
                'item_sales_level','item_collected_level','item_pv_level']:
        item_shop_cnt = data.groupby([col, 'shop_review_num_level'], as_index=False)['instance_id'].agg({str(col)+'_shop_rev_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'shop_review_num_level'], how='left')
        data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']
        del data[str(col) + '_shop_rev_cnt']
    del data['shop_rev_cnt']

    return data

# 读取数据

In [36]:
path = './data/'

train = pd.read_csv(path+'train_all.csv')
test = pd.read_csv(path+'test_all.csv')
#     train = pd.read_csv(path+'02_newfeature_result_all.csv',sep='\t')
#     test = pd.read_csv(path+'02_newfeature_result_day7.csv',sep='\t')

# data = pd.concat([train, test])
data = train.append(test, ignore_index=True)
print(data.columns.tolist())

['instance_id', 'item_id', 'item_category_list', 'item_property_list', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id', 'context_timestamp', 'context_page_id', 'predict_category_property', 'shop_id', 'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description', 'is_trade', 'time', 'day', 'hour', 'minute', 'maphour', 'mapmin', 'item_category_1', 'item_category_2']


  data = train.append(test, ignore_index=True)


In [37]:
data

Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,shop_score_description,is_trade,time,day,hour,minute,maphour,mapmin,item_category_1,item_category_2
0,7548377597191725106,9406,836752724084922533;3613783563199627217;1495388...,6241534230954727302;367082587220462692;2072967...,4492,84,8,11,11,17,...,0.978,0.0,2018-09-03 19:13:02,3,19,13,4,14,16,8
1,5975219932893529301,8719,836752724084922533;5685690139879409547;7497531...,6241534230954727302;5131280576272319091;263639...,2821,111,7,9,13,16,...,0.969,0.0,2018-09-06 12:09:32,6,12,9,3,10,19,44
2,7640080258618597885,917,836752724084922533;3613783563199627217;1036082...,5977512434884267894;2636395404473730413;314574...,3812,111,5,10,13,16,...,0.952,0.0,2018-09-03 21:53:39,3,21,53,4,9,16,4
3,3961824130940365274,6764,836752724084922533;1852600517265062354,6241534230954727302;367082587220462692;2072967...,2121,2,7,9,10,18,...,0.985,0.0,2018-08-31 15:25:51,31,15,25,3,11,7,0
4,9169996568220897747,489,836752724084922533;4911723539855588624,6241534230954727302;367082587220462692;5131280...,3474,111,7,11,12,19,...,0.956,1.0,2018-09-01 18:18:47,1,18,18,4,4,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121612,5026363925379972351,15099,836752724084922533;3613783563199627217;1036082...,5977512434884267894;2072967855524022579;207296...,4326,167,7,12,15,19,...,0.965,,2018-09-07 22:20:53,7,22,20,4,6,16,4
121613,45836528932802192,15799,836752724084922533;2871729383671301763;3492642...,9148482949976129397;6491818071284064879;325738...,5397,111,5,16,14,20,...,0.960,,2018-09-07 20:43:35,7,20,43,4,14,12,20
121614,5774287585256625409,4776,836752724084922533;2871729383671301763;8123435...,6241534230954727302;2636395404473730413;248870...,1559,160,5,8,8,15,...,0.987,,2018-09-07 23:24:52,7,23,24,4,10,12,52
121615,3606899985219981461,15264,836752724084922533;6693726201323251689,5977512434884267894;4621934203383159480;187732...,998,150,8,11,14,18,...,0.985,,2018-09-07 16:11:30,7,16,11,3,12,22,0


In [38]:
del train
del test
gc.collect()

print('初始维度:', data.shape)

data, cols = pre_process(data)
print('pre_process:', data.shape)


初始维度: (121617, 35)
pre_process: (121617, 35)


In [39]:
#############################
print(data.columns.tolist())
data = zuhe(data)
print('zuhe:', data.shape)

# 均为比率
data = item(data)
print('item:', data.shape)

data = user_item(data)
print('user_item:', data.shape)

data = user_shop(data)
print('user_shop:', data.shape)

data = shop_item(data)
print('shop_item:', data.shape)
###############################

['instance_id', 'item_id', 'item_category_list', 'item_property_list', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id', 'context_timestamp', 'context_page_id', 'predict_category_property', 'shop_id', 'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description', 'is_trade', 'time', 'day', 'hour', 'minute', 'maphour', 'mapmin', 'item_category_1', 'item_category_2']
item两两组合
user两两组合
shop两两组合
zuhe: (121617, 39)
一个item有多少brand,price salse collected level……
一个brand有多少price salse collected level……
一个city有多少item_price_level，item_sales_level，item_collected_level，item_pv_level
一个price有多少item_sales_level，item_collected_level，item_pv_level
一个item_sales_level有多少item_collected_level，item_pv_level
一个item_collected_level有多少item_pv_level
item: (121617, 58)


  data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']
  data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']


shop_item: (121617, 116)


  data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']


In [40]:
data = data.drop(cols, axis=1)

# 得到全部训练集
print('经过处理后,全部训练集最终维度:', data.shape)
data.to_csv(path+'201_meng_feat_all.csv', index=False)

# 得到7号训练集
data = data.loc[data.day == 7]
data = data.drop('day', axis=1)
print('经过处理后,7号训练集最终维度:', data.shape)
print(data.columns.tolist())
data.to_csv(path+'201_meng_feat.csv', index=False)

经过处理后,全部训练集最终维度: (121617, 83)
经过处理后,7号训练集最终维度: (28054, 82)
['instance_id', 'sale_price', 'sale_collect', 'price_collect', 'gender_star', 'item_price_level_item_prob', 'item_sales_level_item_prob', 'item_collected_level_item_prob', 'item_pv_level_item_prob', 'item_city_id_brand_prob', 'item_price_level_brand_prob', 'item_sales_level_brand_prob', 'item_collected_level_brand_prob', 'item_pv_level_brand_prob', 'item_price_level_city_prob', 'item_sales_level_city_prob', 'item_collected_level_city_prob', 'item_pv_level_city_prob', 'item_sales_level_price_prob', 'item_collected_level_price_prob', 'item_pv_level_price_prob', 'item_collected_level_salse_prob', 'item_pv_level_salse_prob', 'item_pv_level_coll_prob', 'item_id_user_prob', 'item_category_list_user_prob', 'item_brand_id_user_prob', 'item_city_id_user_prob', 'item_price_level_user_prob', 'item_sales_level_user_prob', 'item_collected_level_user_prob', 'item_pv_level_user_prob', 'item_id_user_gender_prob', 'item_category_list_user_gende

In [41]:
path = './data/'

train = pd.read_csv(path+'201_meng_feat_all.csv')
test = pd.read_csv(path+'201_meng_feat.csv')

In [42]:
train.head(5)

Unnamed: 0,instance_id,day,sale_price,sale_collect,price_collect,gender_star,item_price_level_item_prob,item_sales_level_item_prob,item_collected_level_item_prob,item_pv_level_item_prob,...,item_sales_level_shop_prob,item_collected_level_shop_prob,item_pv_level_shop_prob,item_id_shop_rev_prob,item_brand_id_shop_rev_prob,item_city_id_shop_rev_prob,item_price_level_shop_rev_prob,item_sales_level_shop_rev_prob,item_collected_level_shop_rev_prob,item_pv_level_shop_rev_prob
0,7548377597191725106,3,118,1111,811,6,1.0,1.0,1.0,0.75,...,0.794118,0.588235,0.823529,0.001289,0.002192,0.044221,0.188616,0.176691,0.140656,0.208277
1,5975219932893529301,6,97,913,713,3,1.0,0.5,0.777778,0.944444,...,0.089109,0.178218,0.168317,0.002559,0.002559,0.042087,0.311674,0.089293,0.235888,0.129817
2,7640080258618597885,3,105,1013,513,4,1.0,0.5,1.0,0.5,...,0.375,0.75,0.375,0.000387,0.000387,0.208664,0.117772,0.138142,0.177851,0.187971
3,3961824130940365274,31,97,910,710,2,1.0,0.315789,1.0,1.0,...,0.137255,0.72549,0.411765,0.001136,0.003048,0.135497,0.385333,0.1141,0.105792,0.222402
4,9169996568220897747,1,117,1112,712,8,1.0,0.911111,1.0,1.0,...,0.256637,0.374631,0.39528,0.00269,0.020262,0.245174,0.385333,0.169565,0.17351,0.140936
