In [18]:
#coding:utf-8
import pandas as pd
import numpy as np
import time
import datetime
import gc
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import lightgbm as lgb

In [19]:
def pre_process(data):
    """Return the frame unchanged together with a list of its non-key columns.

    No column is removed from ``data`` here. The function only builds a list
    of every column name except ``instance_id`` and ``day``; later in this
    notebook that list is handed to ``DataFrame.drop`` so that only the two
    key columns and the engineered features remain.

    Args:
        data: DataFrame whose column names are inspected.

    Returns:
        tuple: ``(data, cols)`` where ``data`` is the very same object that was
        passed in and ``cols`` is the list of column names without the keys.
    """
    protected = ('instance_id', 'day')
    cols = data.columns.tolist()

    for name in protected:
        # A key column that is absent is simply skipped.
        try:
            cols.remove(name)
        except ValueError:
            pass

    return data, cols

In [20]:

def zuhe(data):
    """Add pairwise "combination" features built by concatenating level codes.

    The frame is modified in place and also returned. Steps:

    1. In the four user columns, the value ``-1`` is replaced by ``0``
       (a missing user column only triggers a printed message here).
    2. Item, user and shop level columns are converted to text.
    3. New columns are created by string concatenation of two level columns:
       ``sale_price``, ``sale_collect``, ``price_collect``, ``gender_star``
       and a temporary ``review_star``.
    4. The item/user level columns and the three item combinations are cast
       to ``int``; ``review_star`` is removed again.

    ``gender_star``, ``shop_review_num_level`` and ``shop_star_level`` are
    left as text because they are not part of the integer cast.

    NOTE(review): the item level columns are not remapped from -1, so a -1 in
    the right-hand operand of a pair would produce text such as "3-1" that the
    integer cast rejects; concatenation is also ambiguous ("1"+"12" equals
    "11"+"2"). Confirm the value ranges of the inputs.

    Args:
        data: DataFrame containing the item, user and shop level columns.

    Returns:
        The same DataFrame object with the four combination columns appended.
    """
    item_levels = ['item_sales_level', 'item_price_level', 'item_collected_level']
    user_levels = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
    shop_levels = ['shop_review_num_level', 'shop_star_level']

    # Step 1: fold the -1 code into 0 for the user attributes.
    for name in user_levels:
        if name in data.columns:
            data[name] = data[name].mask(data[name] == -1, 0)
        else:
            print(f"Column '{name}' not found in the dataset.")

    # Step 2: text form so that "+" below concatenates instead of adding.
    for name in item_levels + user_levels + shop_levels:
        data[name] = data[name].astype(str)

    # Step 3: (banner, [(new column, left operand, right operand), ...])
    combination_plan = [
        ('item两两组合', [
            ('sale_price', 'item_sales_level', 'item_price_level'),
            ('sale_collect', 'item_sales_level', 'item_collected_level'),
            ('price_collect', 'item_price_level', 'item_collected_level'),
        ]),
        ('user两两组合', [
            ('gender_star', 'user_gender_id', 'user_star_level'),
        ]),
        ('shop两两组合', [
            ('review_star', 'shop_review_num_level', 'shop_star_level'),
        ]),
    ]
    for banner, combos in combination_plan:
        print(banner)
        for new_name, left, right in combos:
            data[new_name] = data[left] + data[right]

    # Step 4: back to integers for the item/user levels and item combinations.
    numeric_again = item_levels + ['sale_price', 'sale_collect', 'price_collect'] + user_levels
    for name in numeric_again:
        data[name] = data[name].astype(int)

    # The shop combination is not kept as a feature.
    data.pop('review_star')

    return data

In [21]:
def item(data):
    """Add item-side conditional-frequency ("prob") features.

    For a grouping column ``G`` and a level column ``C`` the feature
    ``<C>_<tag>_prob`` is::

        rows sharing the same (C, G) pair / rows sharing the same G

    where rows are counted through non-null ``instance_id`` values. Six
    groupings are processed in order (tag in parentheses): ``item_id`` (item),
    ``item_brand_id`` (brand), ``item_city_id`` (city), ``item_price_level``
    (price), ``item_sales_level`` (salse) and ``item_collected_level`` (coll).
    The misspelt ``salse`` tag is kept because it is part of the output
    column names.

    Changes compared with the previous implementation:

    * The "price" section grouped and merged on ``item_city_id`` while
      dividing by the per-``item_price_level`` total, so ``*_price_prob`` was
      not a share and could exceed 1. It now groups on ``item_price_level``.
    * Counts are built with ``count().rename().reset_index()`` instead of the
      dict-renaming form of ``SeriesGroupBy.agg``, which newer pandas
      releases deprecate.

    Args:
        data: DataFrame with ``instance_id`` and the item columns used below.
            It is not modified; every step works on a merged copy.

    Returns:
        A new DataFrame with 19 ``*_prob`` columns appended (left-merge row
        order, fresh default index).
    """

    def count_rows(frame, keys, name):
        # One row per key combination with the number of non-null instance_id.
        return frame.groupby(keys)['instance_id'].count().rename(name).reset_index()

    def add_share_features(frame, group_col, tag, level_cols):
        # Append <col>_<tag>_prob for every col in level_cols.
        total = '_' + tag + '_total_cnt'
        frame = pd.merge(frame, count_rows(frame, [group_col], total),
                         on=[group_col], how='left')
        for col in level_cols:
            joint = col + '_' + tag + '_cnt'
            keys = [col, group_col]
            frame = pd.merge(frame, count_rows(frame, keys, joint), on=keys, how='left')
            frame[col + '_' + tag + '_prob'] = frame[joint] / frame[total]
            del frame[joint]
        del frame[total]
        return frame

    price = 'item_price_level'
    sales = 'item_sales_level'
    collected = 'item_collected_level'
    pv = 'item_pv_level'

    # (progress message, grouping column, tag, level columns)
    sections = [
        ('一个item有多少brand,price salse collected level……',
         'item_id', 'item', [price, sales, collected, pv]),
        ('一个brand有多少price salse collected level……',
         'item_brand_id', 'brand', ['item_city_id', price, sales, collected, pv]),
        ('一个city有多少item_price_level，item_sales_level，item_collected_level，item_pv_level',
         'item_city_id', 'city', [price, sales, collected, pv]),
        ('一个price有多少item_sales_level，item_collected_level，item_pv_level',
         price, 'price', [sales, collected, pv]),
        ('一个item_sales_level有多少item_collected_level，item_pv_level',
         sales, 'salse', [collected, pv]),
        ('一个item_collected_level有多少item_pv_level',
         collected, 'coll', [pv]),
    ]

    for banner, group_col, tag, level_cols in sections:
        print(banner)
        data = add_share_features(data, group_col, tag, level_cols)

    return data

In [22]:
def user_item(data):
    """Add user-versus-item conditional-frequency features.

    For each user-side grouping column ``G`` (``user_id``, ``user_gender_id``,
    ``user_age_level``, ``user_occupation_id``) and each item-side column
    ``C`` the feature ``<C>_<tag>_prob`` is::

        rows sharing the same (C, G) pair / rows sharing the same G

    with rows counted through non-null ``instance_id`` values and ``tag``
    being ``user``, ``user_gender``, ``user_age`` or ``user_occ``.

    The per-group totals ``user_cnt``, ``user_gender_cnt``, ``user_age_cnt``
    and ``user_occ_cnt`` are deliberately left in the returned frame:
    ``user_shop`` reads them and removes them afterwards.

    Counts are built with ``count().rename().reset_index()`` instead of the
    dict-renaming form of ``SeriesGroupBy.agg``, which newer pandas releases
    deprecate; the four formerly copy-pasted sections are now one loop.

    Args:
        data: DataFrame with ``instance_id``, the four user columns and the
            eight item columns used below. It is not modified.

    Returns:
        A new DataFrame with 4 total columns and 32 ``*_prob`` columns added.
    """

    def count_rows(frame, keys, name):
        # One row per key combination with the number of non-null instance_id.
        return frame.groupby(keys)['instance_id'].count().rename(name).reset_index()

    item_side_cols = ['item_id', 'item_category_list',
                      'item_brand_id', 'item_city_id', 'item_price_level',
                      'item_sales_level', 'item_collected_level', 'item_pv_level']

    # (grouping column, tag, progress message)
    user_sides = [
        ('user_id', 'user', '一个user有多少item_id,item_brand_id……'),
        ('user_gender_id', 'user_gender', '一个user_gender有多少item_id,item_brand_id……'),
        ('user_age_level', 'user_age', '一个user_age_level有多少item_id,item_brand_id……'),
        ('user_occupation_id', 'user_occ', '一个user_occupation_id有多少item_id,item_brand_id…'),
    ]

    for group_col, tag, banner in user_sides:
        print(banner)
        total = tag + '_cnt'
        data = pd.merge(data, count_rows(data, [group_col], total),
                        on=[group_col], how='left')
        for col in item_side_cols:
            joint = col + '_' + tag + '_cnt'
            keys = [col, group_col]
            data = pd.merge(data, count_rows(data, keys, joint), on=keys, how='left')
            data[col + '_' + tag + '_prob'] = data[joint] / data[total]
            del data[joint]
        # `total` stays in the frame on purpose (consumed by user_shop).

    return data

In [23]:
def user_shop(data):
    """Add user-versus-shop conditional-frequency features.

    For each user-side grouping column ``G`` (``user_id``, ``user_gender_id``,
    ``user_age_level``, ``user_occupation_id``) and each shop column ``C``
    (``shop_id``, ``shop_review_num_level``, ``shop_star_level``) the feature
    ``<C>_<tag>_prob`` is::

        rows sharing the same (C, G) pair / total for G

    The per-group totals ``user_cnt``, ``user_gender_cnt``, ``user_age_cnt``
    and ``user_occ_cnt`` are used as found when the frame already carries them
    (``user_item`` leaves them behind). Previously their absence caused a
    ``KeyError``; a missing total is now computed here as the number of
    non-null ``instance_id`` values per group. Either way the total columns
    are removed from the result.

    Counts are built with ``count().rename().reset_index()`` instead of the
    dict-renaming form of ``SeriesGroupBy.agg``, which newer pandas releases
    deprecate.

    Args:
        data: DataFrame with ``instance_id``, the four user columns and the
            three shop columns; the total columns are optional. Not modified.

    Returns:
        A new DataFrame with 12 ``*_prob`` columns added and no total columns.
    """

    def count_rows(frame, keys, name):
        # One row per key combination with the number of non-null instance_id.
        return frame.groupby(keys)['instance_id'].count().rename(name).reset_index()

    shop_side_cols = ['shop_id', 'shop_review_num_level', 'shop_star_level']

    # (grouping column, tag, progress message)
    user_sides = [
        ('user_id', 'user', '一个user有多少shop_id,shop_review_num_level……'),
        ('user_gender_id', 'user_gender', '一个user_gender有多少shop_id,shop_review_num_level……'),
        ('user_age_level', 'user_age', '一个user_age_level有多少shop_id,shop_review_num_level……'),
        ('user_occupation_id', 'user_occ', '一个user_occupation_id有多少shop_id,shop_review_num_level……'),
    ]

    for group_col, tag, banner in user_sides:
        print(banner)
        total = tag + '_cnt'
        if total not in data.columns:
            # Stand-alone use: derive the denominator instead of failing.
            data = pd.merge(data, count_rows(data, [group_col], total),
                            on=[group_col], how='left')
        for col in shop_side_cols:
            joint = col + '_' + tag + '_cnt'
            keys = [col, group_col]
            data = pd.merge(data, count_rows(data, keys, joint), on=keys, how='left')
            data[col + '_' + tag + '_prob'] = data[joint] / data[total]
            del data[joint]
        del data[total]

    return data


In [24]:
def shop_item(data):
    """Add shop-versus-item conditional-frequency features.

    For each shop-side grouping column ``G`` (``shop_id`` with tag ``shop``,
    ``shop_review_num_level`` with tag ``shop_rev``) and each item column
    ``C`` the feature ``<C>_<tag>_prob`` is::

        rows sharing the same (C, G) pair / rows sharing the same G

    with rows counted through non-null ``instance_id`` values. The temporary
    count columns are removed before returning.

    Counts are built with ``count().rename().reset_index()`` instead of the
    dict-renaming form of ``SeriesGroupBy.agg``, which newer pandas releases
    deprecate; the two formerly copy-pasted sections are now one loop.

    Args:
        data: DataFrame with ``instance_id``, ``shop_id``,
            ``shop_review_num_level`` and the seven item columns used below.
            It is not modified.

    Returns:
        A new DataFrame with 14 ``*_prob`` columns appended.
    """

    def count_rows(frame, keys, name):
        # One row per key combination with the number of non-null instance_id.
        return frame.groupby(keys)['instance_id'].count().rename(name).reset_index()

    item_side_cols = ['item_id',
                      'item_brand_id', 'item_city_id', 'item_price_level',
                      'item_sales_level', 'item_collected_level', 'item_pv_level']

    # (grouping column, tag, progress message)
    shop_sides = [
        ('shop_id', 'shop',
         '一个shop有多少item_id,item_brand_id,item_city_id,item_price_level……'),
        ('shop_review_num_level', 'shop_rev',
         '一个shop_review_num_level有多少item_id,item_brand_id,item_city_id,item_price_level……'),
    ]

    for group_col, tag, banner in shop_sides:
        print(banner)
        total = tag + '_cnt'
        data = pd.merge(data, count_rows(data, [group_col], total),
                        on=[group_col], how='left')
        for col in item_side_cols:
            joint = col + '_' + tag + '_cnt'
            keys = [col, group_col]
            data = pd.merge(data, count_rows(data, keys, joint), on=keys, how='left')
            data[col + '_' + tag + '_prob'] = data[joint] / data[total]
            del data[joint]
        del data[total]

    return data

# 读取数据

In [25]:
# Directory that holds the input CSV files and receives the exported features.
path = '/Users/apple/Desktop/data/'

train = pd.read_csv(path+'train_all.csv')
test = pd.read_csv(path+'test_all.csv')

# Stack train and test rows under one fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same frame.
data = pd.concat([train, test], ignore_index=True)
print(data.columns.tolist())

['instance_id', 'item_id', 'item_category_list', 'item_property_list', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id', 'context_timestamp', 'context_page_id', 'predict_category_property', 'shop_id', 'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description', 'is_trade', 'time', 'day', 'hour', 'minute', 'maphour', 'mapmin', 'item_category_1', 'item_category_2']


  data = train.append(test, ignore_index=True)


In [26]:
# Show the combined train + test frame as the cell's rich output.
data

Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,shop_score_description,is_trade,time,day,hour,minute,maphour,mapmin,item_category_1,item_category_2
0,5.089880e+18,2769,836752724084922533;1852600517265062354,5977512434884267894;2072967855524022579;513128...,5982,2,6,11,10,17,...,0.987,0,2018-09-02 20:31:59,2,20,31,4,2,7,0
1,2.163600e+18,16242,836752724084922533;6670526099037031245,6241534230954727302;367082587220462692;5977512...,4705,147,6,9,10,16,...,0.970,0,2018-09-05 13:13:58,5,13,13,3,14,21,0
2,8.229730e+18,7731,836752724084922533;1916390345133212703,5977512434884267894;2636395404473730413;462193...,5385,80,4,12,12,13,...,0.957,0,2018-09-01 21:38:48,1,21,38,4,9,9,0
3,8.978460e+18,68,836752724084922533;1852600517265062354,367082587220462692;5977512434884267894;2072967...,3543,128,9,4,5,11,...,0.994,0,2018-09-02 22:58:54,2,22,58,4,14,7,0
4,3.831630e+18,556,836752724084922533;3348197449185791127,2072967855524022579;639228713552738893;3163265...,6156,103,5,11,9,14,...,0.957,0,2018-09-05 06:43:40,5,6,43,2,14,14,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99080,9.047330e+18,18525,836752724084922533;6693726201323251689,2636395404473730413;4621934203383159480;734498...,5328,21,6,7,10,13,...,0.967,0,2018-09-03 16:16:39,3,16,16,3,2,22,0
99081,1.156510e+18,3634,836752724084922533;1852600517265062354,6241534230954727302;367082587220462692;2072967...,3264,110,6,10,12,18,...,0.964,0,2018-09-06 09:53:02,6,9,53,2,9,7,0
99082,9.093200e+18,10222,836752724084922533;1916390345133212703,5977512434884267894;18773227022480660;48375106...,5385,87,4,12,13,19,...,0.982,0,2018-09-02 14:42:50,2,14,42,3,13,9,0
99083,8.695670e+17,11110,836752724084922533;3613783563199627217;1036082...,6241534230954727302;5977512434884267894;263639...,6232,2,7,7,11,14,...,0.968,0,2018-08-31 23:36:10,31,23,36,4,7,16,4


In [27]:
# Both source frames are already stacked into `data`; free their memory.
del train, test
gc.collect()

print('初始维度:', data.shape)

# pre_process hands back the same frame plus the list of original columns
# (everything except instance_id and day) that is dropped after feature building.
data, cols = pre_process(data)
print('pre_process:', data.shape)


初始维度: (99085, 35)
pre_process: (99085, 35)


In [28]:
# ---- feature engineering pipeline ----
print(data.columns.tolist())

# Stages run in this exact order; every stage after `zuhe` adds ratio features.
# user_shop must follow user_item in this sequence.
feature_stages = (
    ('zuhe', zuhe),
    ('item', item),
    ('user_item', user_item),
    ('user_shop', user_shop),
    ('shop_item', shop_item),
)
for label, stage in feature_stages:
    data = stage(data)
    print(label + ':', data.shape)
# ---- end of pipeline ----

['instance_id', 'item_id', 'item_category_list', 'item_property_list', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id', 'context_timestamp', 'context_page_id', 'predict_category_property', 'shop_id', 'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description', 'is_trade', 'time', 'day', 'hour', 'minute', 'maphour', 'mapmin', 'item_category_1', 'item_category_2']
item两两组合
user两两组合
shop两两组合
zuhe: (99085, 39)
一个item有多少brand,price salse collected level……
一个brand有多少price salse collected level……
一个city有多少item_price_level，item_sales_level，item_collected_level，item_pv_level
一个price有多少item_sales_level，item_collected_level，item_pv_level
一个item_sales_level有多少item_collected_level，item_pv_level
一个item_collected_level有多少item_pv_level
item: (99085, 58)
一个

  data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']
  data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']
  data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']
  data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']


shop_item: (99085, 116)


  data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']


In [29]:
# Remove every original input column listed in `cols`; what remains is
# instance_id, day and the engineered features.
data = data.drop(columns=cols)

# Export the feature table for all rows.
print('经过处理后,全部训练集最终维度:', data.shape)
data.to_csv(path+'all_04.csv', index=False)

# Export the rows whose day equals 7, without the day column.
data = data[data['day'] == 7].drop(columns='day')
print('经过处理后,7号训练集最终维度:', data.shape)
print(data.columns.tolist())
data.to_csv(path+'day7_04.csv', index=False)

经过处理后,全部训练集最终维度: (99085, 83)
经过处理后,7号训练集最终维度: (10184, 82)
['instance_id', 'sale_price', 'sale_collect', 'price_collect', 'gender_star', 'item_price_level_item_prob', 'item_sales_level_item_prob', 'item_collected_level_item_prob', 'item_pv_level_item_prob', 'item_city_id_brand_prob', 'item_price_level_brand_prob', 'item_sales_level_brand_prob', 'item_collected_level_brand_prob', 'item_pv_level_brand_prob', 'item_price_level_city_prob', 'item_sales_level_city_prob', 'item_collected_level_city_prob', 'item_pv_level_city_prob', 'item_sales_level_price_prob', 'item_collected_level_price_prob', 'item_pv_level_price_prob', 'item_collected_level_salse_prob', 'item_pv_level_salse_prob', 'item_pv_level_coll_prob', 'item_id_user_prob', 'item_category_list_user_prob', 'item_brand_id_user_prob', 'item_city_id_user_prob', 'item_price_level_user_prob', 'item_sales_level_user_prob', 'item_collected_level_user_prob', 'item_pv_level_user_prob', 'item_id_user_gender_prob', 'item_category_list_user_gender

In [30]:
# Re-declares the same data directory string used by the loading cell.
path = '/Users/apple/Desktop/data/'

# Reload the two feature files exported earlier in this notebook.
# NOTE(review): day7_04.csv was filtered out of the very frame that was written
# to all_04.csv, so every row of `test` is also present in `train` — confirm
# this overlap is intended before using the pair as a train/evaluation split.
train = pd.read_csv(path+'all_04.csv')
test = pd.read_csv(path+'day7_04.csv')

In [31]:
# Preview the first five rows of the reloaded feature table.
train.head(5)

Unnamed: 0,instance_id,day,sale_price,sale_collect,price_collect,gender_star,item_price_level_item_prob,item_sales_level_item_prob,item_collected_level_item_prob,item_pv_level_item_prob,...,item_sales_level_shop_prob,item_collected_level_shop_prob,item_pv_level_shop_prob,item_id_shop_rev_prob,item_brand_id_shop_rev_prob,item_city_id_shop_rev_prob,item_price_level_shop_rev_prob,item_sales_level_shop_rev_prob,item_collected_level_shop_rev_prob,item_pv_level_shop_rev_prob
0,5.08988e+18,2,116,1110,610,9,1.0,1.0,1.0,0.75,...,0.617647,0.558824,0.588235,0.000699,0.004017,0.305623,0.277855,0.19752,0.096053,0.178135
1,2.1636e+18,5,96,910,610,5,1.0,0.714286,1.0,0.571429,...,0.137652,0.060729,0.060729,0.001049,0.037009,0.076865,0.328888,0.113725,0.079113,0.105184
2,8.22973e+18,1,124,1212,412,3,1.0,1.0,1.0,1.0,...,0.151515,0.19697,0.227273,0.0003,0.038508,0.05454,0.035811,0.160923,0.215912,0.021127
3,8.97846e+18,2,49,45,95,7,1.0,1.0,1.0,1.0,...,0.333333,0.333333,0.333333,0.00017,0.00051,0.01123,0.037094,0.022631,0.014123,0.014974
4,3.83163e+18,5,115,119,59,7,1.0,1.0,1.0,1.0,...,0.2,0.2,0.2,7.4e-05,7.4e-05,0.009557,0.140317,0.163061,0.062676,0.056823
