In [1]:
import pandas as pd
from collections import Counter
import datetime
import numpy as np
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../Data/round1_ijcai_18_train_20180301/round1_ijcai_18_train_20180301.txt', sep=' ') # load the training set (space-separated text file)

In [3]:
# test_df = pd.read_csv('../Data/round1_ijcai_18_test_b_20180418.txt', sep=' ') # load test set B
test_df = pd.read_csv('../Data/round1_ijcai_18_test_a_20180301/round1_ijcai_18_test_a_20180301.txt', sep=' ') # load test set A

# 缺失值处理

In [4]:
# Drop training rows in which any of these features holds the -1 missing-value marker.
df = df[(df[['shop_review_positive_rate', 'shop_score_delivery', 'shop_score_description',
             'shop_score_service', 'item_brand_id', 'item_city_id']] != -1).all(axis=1)]

# Rebuild a contiguous 0..n-1 index now that rows have been removed.
df = df.reset_index(drop=True)

 某些字段的取值带有固定的偏移量，可以减去该偏移量进行缩放（字段名 - 偏移量）：
 
     user_age_level - 1000
     user_occupation_id - 2000
     user_star_level - 3000
     context_page_id - 4000
     shop_star_level - 4999

In [5]:
def dataNormalization(data):
    """Shift the offset-encoded level/id columns down to small integers.

    The frame is modified in place and also returned.

    For the three user columns the -1 marker is first replaced by
    ``offset - 1`` so that it is still -1 after the offset is subtracted.
    ``context_page_id`` and ``shop_star_level`` are shifted without any
    special handling of -1.
    """
    # (column, offset) pairs whose -1 marker must survive the shift.
    with_missing_marker = (
        ('user_age_level', 1000),
        ('user_occupation_id', 2000),
        ('user_star_level', 3000),
    )
    for column, offset in with_missing_marker:
        data[column] = data[column].replace(-1, offset - 1) - offset

    data['context_page_id'] = data['context_page_id'] - 4000
    data['shop_star_level'] = data['shop_star_level'] - 4999

    return data

# Apply the same offset removal to the training and the test frame.
# NOTE: dataNormalization subtracts the offsets again on every call, so this cell must run only once per load.
df = dataNormalization(df)
test_df = dataNormalization(test_df)

In [6]:
def occupationMap(value):
    """Return 1 when the occupation id is 3 or the -1 marker, otherwise 2."""
    return 1 if value in (3, -1) else 2

def userStarMap(value):
    """Return 1 for star levels below 2 (this includes -1), otherwise 2."""
    return 1 if value < 2 else 2

def ageMap(value):
    """Return 1 for age levels below 3 (this includes -1), otherwise 2."""
    return 1 if value < 3 else 2

def pageMap(value):
    """Return 2 for page ids below 10, otherwise 1."""
    return 2 if value < 10 else 1

    
def baseProcess(data):
    """Add two-valued (1/2) versions of the user and page columns.

    New columns, in creation order: ``user_gender``, ``user_age``,
    ``user_occupation``, ``user_star`` and ``context_page``. The frame is
    modified in place and also returned.
    """
    def _gender_flag(gender_id):
        # 1 marks the -1 (unknown) gender id, 2 marks every other id.
        if gender_id == -1:
            return 1
        return 2

    derived = (
        ('user_gender', 'user_gender_id', _gender_flag),
        ('user_age', 'user_age_level', ageMap),
        ('user_occupation', 'user_occupation_id', occupationMap),
        ('user_star', 'user_star_level', userStarMap),
        ('context_page', 'context_page_id', pageMap),
    )
    for new_col, source_col, mapper in derived:
        data[new_col] = data[source_col].apply(mapper)
    return data

In [None]:
# 通过对平均交易量进行可视化操作，进行离散化操作
import matplotlib.pyplot as plt

def visualData(data, colName):
    """Plot the mean of ``is_trade`` for each distinct value of ``colName``.

    A horizontal line marks the median of those per-value means. It serves as
    a visual reference when choosing cut points for discretising the column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``is_trade`` and ``colName``.
    colName : str
        Column whose distinct values form the x axis.
    """
    # Compute the grouped mean once; the original recomputed it three times.
    rate_by_value = data['is_trade'].groupby(data[colName]).mean()
    x = rate_by_value.index
    # Series.get_values() was removed in pandas 1.0; .values is the equivalent.
    y = rate_by_value.values
    m = rate_by_value.median()

    plt.figure()
    plt.plot(x, y)
    plt.hlines(m, x.min(), x.max())


# Inspect how the mean trade rate varies with the shop star level.
visualData(df, 'shop_star_level')

# 特征提取

In [7]:
def dataConvert(df):
    """Expand the three ';'-separated list columns into fixed-width integer columns.

    For each list column a ``len_*`` column holds the number of entries and
    the first k entries become ``*_0 .. *_{k-1}``, padded with -1 when the
    list is shorter. The original list column is then dropped.

    * ``item_category_list``: k = 3.
    * ``item_property_list``: k = 5. Entries are de-duplicated with
      ``np.unique``, which also sorts the tokens as strings.
    * ``predict_category_property``: k = 10. Only the part before ``':'`` of
      each entry is kept.

    The new columns are first written into the frame that was passed in; the
    returned frame is the result of the subsequent ``drop`` calls.
    """
    # Split every string once instead of once per derived column.
    categories = df['item_category_list'].apply(lambda text: text.split(';'))
    df['len_item_category'] = categories.apply(len)
    for i in range(3):
        df['item_category_%d' % i] = categories.apply(
            lambda parts, i=i: int(parts[i]) if len(parts) > i else -1)
    # Pass the axis by keyword: the positional form was removed in pandas 2.0.
    df = df.drop(['item_category_list'], axis=1)

    # Property ids come in no meaningful order, so they are de-duplicated and sorted.
    properties = df['item_property_list'].apply(lambda text: np.unique(text.split(';')))
    df['len_item_property'] = properties.apply(len)
    for i in range(5):
        df['item_property_%d' % i] = properties.apply(
            lambda parts, i=i: int(parts[i]) if len(parts) > i else -1)
    df = df.drop(['item_property_list'], axis=1)

    predictions = df['predict_category_property'].apply(lambda text: text.split(';'))
    df['len_predict_category_property'] = predictions.apply(len)
    for i in range(10):
        df['predict_category_property_%d' % i] = predictions.apply(
            lambda parts, i=i: int(parts[i].split(':')[0]) if len(parts) > i else -1)
    df = df.drop(['predict_category_property'], axis=1)
    return df


In [8]:
def timeStampConvert(df):
    """Derive hour, day-of-month and weekday columns from ``context_timestamp``.

    Timestamps are interpreted with ``datetime.datetime.fromtimestamp``, i.e.
    in the local time zone of the machine running the notebook. The frame is
    modified in place and also returned.

    Values are read by position, so the frame may carry any index. The
    original looked rows up by the labels 0..n-1 and failed on a frame whose
    index had not been reset.
    """
    # Convert each timestamp once and reuse it for all three derived columns.
    moments = [datetime.datetime.fromtimestamp(ts) for ts in df['context_timestamp'].tolist()]
    df['context_time_hour'] = [moment.hour for moment in moments]
    df['context_time_day'] = [moment.day for moment in moments]
    df['context_time_weekday'] = [moment.weekday() for moment in moments]
    return df


In [9]:
# Hour-of-day buckets, chosen from how ad clicks are distributed over the day.
def mapHour(hour):
    """Bucket an hour: 7-12 -> 1, 13-20 -> 2, everything else -> 3."""
    if 7 <= hour <= 12:
        return 1
    if 13 <= hour <= 20:
        return 2
    return 3


In [10]:
def dataProcess(data):
    """Run the first preprocessing stage on one frame.

    Applies ``baseProcess``, ``dataConvert`` and ``timeStampConvert`` in that
    order, then adds ``context_time_hour_map`` by passing
    ``context_time_hour`` through ``mapHour``.
    """
    processed = timeStampConvert(dataConvert(baseProcess(data)))
    processed['context_time_hour_map'] = processed['context_time_hour'].apply(mapHour)
    return processed
   
# Run the first preprocessing stage on both the training and the test frame.
df = dataProcess(df)
test_df = dataProcess(test_df)

In [11]:
# Sanity check: (rows, columns) of the training frame after the first preprocessing stage.
df.shape

(477334, 54)

In [None]:
# Checkpoint the step-2 frames so the later cells can restart from disk.
df.to_csv('../Output/a/train_step2_301.csv', index=False)
test_df.to_csv('../Output/a/test_step2_301.csv', index=False)

In [None]:
df = pd.read_csv('../Output/a/train_step2_301.csv') # reload the preprocessed training set
test_df = pd.read_csv('../Output/a/test_step2_301.csv') # reload the preprocessed test set

In [12]:
# Sanity check: the shape of the training frame should be the same as before the CSV round trip.
df.shape

(477334, 54)

In [13]:
def tradeRateCalculate(data, test_data, cols=['item_id', 'shop_id','user_id','item_brand_id','item_city_id']):
    """Attach previous-day trade rates per key column to both frames.

    For every ``col`` in ``cols`` and every day ``d`` present in ``data``, the
    rate ``sum(is_trade) / count(instance_id)`` over the rows of day ``d`` is
    computed per value of ``col``. It is attached as ``<col>_traderate`` to
    the rows of day ``d + 1`` in both ``data`` and ``test_data``.

    Fixes relative to the original implementation:

    * The day masks were built from the global ``df`` instead of ``data``.
    * Previous-day statistics kept the previous day's ``context_time_day``.
      The merge therefore gave each training row the rate of its own day,
      which includes its own label, and could never match the test day.
      The statistics are now re-labelled with the day they apply to.

    Parameters
    ----------
    data : pandas.DataFrame
        Labelled frame with ``context_time_day``, ``is_trade``,
        ``instance_id`` and every column in ``cols``.
    test_data : pandas.DataFrame
        Frame with ``context_time_day`` and every column in ``cols``.
    cols : list of str
        Key columns to compute a rate for. The default list is never mutated.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``data`` and ``test_data`` with one ``<col>_traderate`` column added
        per key column. As in the original, every NaN in the merged frames
        (not only in the new column) is replaced by 0.0.
    """
    for col in cols:
        rate_col = str(col) + '_traderate'
        stats = data.groupby([col, 'context_time_day'], as_index=False).agg(
            {'is_trade': 'sum', 'instance_id': 'count'})
        stats[rate_col] = stats.is_trade / stats.instance_id
        # Statistics of day d describe "yesterday" for the rows of day d + 1.
        stats['context_time_day'] = stats['context_time_day'] + 1
        lookup = stats[[col, rate_col, 'context_time_day']]

        test_data = pd.merge(test_data, lookup, on=[col, 'context_time_day'], how='left').replace(np.nan, 0.0)
        data = pd.merge(data, lookup, on=[col, 'context_time_day'], how='left').replace(np.nan, 0.0)
    return data, test_data

# Print the shape before and after to confirm that only columns (one per key column) were added.
print (df.shape)
df, test_df = tradeRateCalculate(df, test_df)
print (df.shape)

(477334, 54)
(477334, 59)


In [14]:
def dropCols(data):
    """Return a copy of ``data`` without the ``item_category_0`` column."""
    # Pass the axis by keyword: the positional form was removed in pandas 2.0.
    return data.drop(['item_category_0'], axis=1)


In [15]:
def dataLabelEncoder(df):
    """Replace id-like columns by their integer category codes.

    Each listed column is overwritten with ``pd.Categorical(column).codes``.
    The frame is modified in place and also returned.
    """
    id_columns = (
        ['item_category_1', 'item_category_2']
        + ['item_property_%d' % k for k in range(5)]
        + ['predict_category_property_%d' % k for k in range(10)]
        + ['shop_id', 'item_brand_id', 'item_city_id', 'user_id', 'item_id']
    )
    for name in id_columns:
        df[name] = pd.Categorical(df[name]).codes
    return df


In [16]:
# Discretise continuous shop scores with hand-picked cut points.
def reviewMap(score):
    """Bucket a positive-review rate: >= 0.98 -> 3, >= 0.965 -> 2, else 1."""
    for floor, bucket in ((0.98, 3), (0.965, 2)):
        if score >= floor:
            return bucket
    return 1

def serviceMap(score):
    """Bucket a service score.

    (0.945, 0.995] -> 3; (0.94, 0.945] or above 0.995 -> 2; everything else -> 1.
    """
    if score > 0.995:
        return 2
    if score > 0.945:
        return 3
    if score > 0.94:
        return 2
    return 1

def deliveryMap(score):
    """Bucket a delivery score.

    (0.945, 0.995] -> 3; (0.916, 0.945] or above 0.995 -> 2; everything else -> 1.
    """
    if score > 0.995:
        return 2
    if score > 0.945:
        return 3
    if score > 0.916:
        return 2
    return 1

def dspMap(score):
    """Bucket a description score.

    [0.94, 0.996] -> 3; (0.905, 0.94) or above 0.996 -> 2; everything else -> 1.
    """
    if score > 0.996:
        return 2
    if score >= 0.94:
        return 3
    if score > 0.905:
        return 2
    return 1

def constantMap(data):
    """Add the bucketed versions of the four continuous shop score columns.

    New columns, in creation order: ``shop_review_positive_map``,
    ``shop_map_service``, ``shop_map_delivery`` and ``shop_map_description``.
    The frame is modified in place and also returned.
    """
    bucketers = (
        ('shop_review_positive_map', 'shop_review_positive_rate', reviewMap),
        ('shop_map_service', 'shop_score_service', serviceMap),
        ('shop_map_delivery', 'shop_score_delivery', deliveryMap),
        ('shop_map_description', 'shop_score_description', dspMap),
    )
    for target, source, bucket in bucketers:
        data[target] = data[source].apply(bucket)
    return data


In [17]:
def zuheFeature(data):
    """Build pairwise combination features by concatenating level digits.

    The twelve base columns are cast to ``str``, eight combinations are formed
    by string concatenation (left operand first), and all twenty columns are
    cast to ``int``. The frame is modified in place and also returned.
    """
    base_columns = [
        'item_sales_level', 'item_price_level', 'item_collected_level', 'item_pv_level',
        'user_gender', 'user_age', 'user_occupation', 'user_star',
        'shop_review_num_level', 'shop_star_level', 'shop_map_delivery', 'shop_map_service',
    ]
    # (new column, left operand, right operand), in the order the columns are created.
    combinations = [
        ('user_gender_age', 'user_gender', 'user_age'),
        ('user_gender_occ', 'user_gender', 'user_occupation'),
        ('user_gender_star', 'user_gender', 'user_star'),
        ('shop_review_star', 'shop_review_num_level', 'shop_star_level'),
        ('shop_delivery_service', 'shop_map_delivery', 'shop_map_service'),
        ('item_collected_sales', 'item_sales_level', 'item_collected_level'),
        ('item_collected_pv', 'item_collected_level', 'item_pv_level'),
        ('item_sales_pv', 'item_sales_level', 'item_pv_level'),
    ]

    for name in base_columns:
        data[name] = data[name].astype(str)

    for combined, left, right in combinations:
        data[combined] = data[left] + data[right]

    for name in base_columns + [combined for combined, _, _ in combinations]:
        data[name] = data[name].astype(int)
    return data


In [18]:
# Click-count statistics features
def cntFeature(data, cols=['item_id','user_id','shop_id']):
    """Add ``<col>_cnt``: how often each key value appeared on all earlier days.

    For every row, ``<col>_cnt`` is the number of rows (counted through
    ``instance_id``) that share the row's value of ``col`` and whose
    ``context_time_day`` is strictly smaller than the row's day. Values never
    seen before get 0.

    Changes relative to the original implementation:

    * ``SeriesGroupBy.agg({'cnt': 'count'})`` (dict renaming, removed in
      pandas 1.0) is replaced by ``.count()``.
    * The output columns follow ``cols`` instead of a hard-coded list.
    * Each day is processed on an explicit copy, so no column is assigned
      onto a slice of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``context_time_day``, ``instance_id`` and every column in
        ``cols``. ``instance_id`` should be unique, because the counts are
        joined back on it.
    cols : list of str
        Key columns to count. The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        ``data`` left-merged with one ``<col>_cnt`` column per key column.
    """
    cnt_cols = [str(col) + '_cnt' for col in cols]
    dmin = data['context_time_day'].min()
    dmax = data['context_time_day'].max() + 1

    daily_parts = []
    for day in range(dmin, dmax):
        history = data[data.context_time_day < day]
        today = data.loc[data.context_time_day == day, list(cols) + ['instance_id']].copy()
        for col, cnt_col in zip(cols, cnt_cols):
            cnt = history.groupby(col)['instance_id'].count().to_dict()
            today[cnt_col] = today[col].apply(lambda key: cnt.get(key, 0))
        daily_parts.append(today[cnt_cols + ['instance_id']])

    # Concatenate once instead of growing a frame inside the loop.
    rdf = pd.concat(daily_parts)
    return pd.merge(data, rdf, on=['instance_id'], how='left')

def cnt1Feature(data, cols=['item_id','user_id','shop_id']):
    """Add ``<col>_cnt1``: how often each key value appeared on the previous day.

    For every row, ``<col>_cnt1`` is the number of rows (counted through
    ``instance_id``) that share the row's value of ``col`` and whose
    ``context_time_day`` equals the row's day minus one. Values not seen on
    that day get 0.

    Changes relative to the original implementation:

    * ``SeriesGroupBy.agg({'cnt': 'count'})`` (dict renaming, removed in
      pandas 1.0) is replaced by ``.count()``.
    * The output columns follow ``cols`` instead of a hard-coded list.
    * Each day is processed on an explicit copy, so no column is assigned
      onto a slice of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``context_time_day``, ``instance_id`` and every column in
        ``cols``. ``instance_id`` should be unique, because the counts are
        joined back on it.
    cols : list of str
        Key columns to count. The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        ``data`` left-merged with one ``<col>_cnt1`` column per key column.
    """
    cnt_cols = [str(col) + '_cnt1' for col in cols]
    dmin = data['context_time_day'].min()
    dmax = data['context_time_day'].max() + 1

    daily_parts = []
    for day in range(dmin, dmax):
        yesterday = data[data.context_time_day == (day - 1)]
        today = data.loc[data.context_time_day == day, list(cols) + ['instance_id']].copy()
        for col, cnt_col in zip(cols, cnt_cols):
            cnt = yesterday.groupby(col)['instance_id'].count().to_dict()
            today[cnt_col] = today[col].apply(lambda key: cnt.get(key, 0))
        daily_parts.append(today[cnt_cols + ['instance_id']])

    # Concatenate once instead of growing a frame inside the loop.
    rdf = pd.concat(daily_parts)
    return pd.merge(data, rdf, on=['instance_id'], how='left')


In [19]:
# Second feature-engineering stage, run on train and test stacked together so
# that label codes and day-based counts are consistent across both sets.
print("——————————Start——————————")
rows = df.shape[0]
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
df1 = pd.concat([df, test_df])
df1 = df1.drop_duplicates(subset='instance_id')  # keep only the first row of each instance_id
df1 = dropCols(df1)

print("——————————标签化——————————")
# Label-encode the id-like columns.
df_enc = dataLabelEncoder(df1)
print(df_enc.shape)

print("——————————离散化——————————")
# Bucket the continuous shop scores.
df_enc = constantMap(df_enc)
# 1 when all four bucketed shop scores are in bucket 3, else 0. This is a
# vectorised form of the original row-wise apply; the column name (including
# its spelling) is kept because it is part of the saved output schema.
df_enc['shop_noraml'] = (
    (df_enc.shop_review_positive_map == 3)
    & (df_enc.shop_map_service == 3)
    & (df_enc.shop_map_delivery == 3)
    & (df_enc.shop_map_description == 3)
).astype(int)
print(df_enc.shape)

print("——————————组合化——————————")
# Pairwise combination features.
df_zuhe = zuheFeature(df_enc)
print(df_zuhe.shape)

print("——————————点击统计——————————")
# Previous-day counts first, then counts over all earlier days.
df_cnt1 = cnt1Feature(df_zuhe)
df_cnt = cntFeature(df_cnt1)
print(df_cnt.shape)

print("——————————END——————————")

——————————Start——————————
——————————标签化——————————
(495652, 58)
——————————离散化——————————
(495652, 63)
——————————组合化——————————
(495652, 71)
——————————点击统计——————————
(495652, 77)
——————————END——————————


In [20]:
# label = pd.DataFrame({'is_trade':label})
# Split the stacked frame again by label presence: rows with a null is_trade go to test_df,
# rows with a label go to df. Presumably the null rows are exactly those that came from the
# test file (which would then carry no is_trade column) — confirm against the data.
test_df = df_cnt[df_cnt.is_trade.isnull()]
df = df_cnt[df_cnt.is_trade.notnull()]
# del df_enc, rows

In [21]:
# Sanity check: (rows, columns) of the final training frame.
df.shape

(477284, 77)

# 数据处理完毕，保存

In [22]:
# Write the final feature frames to disk.
df.to_csv('../Output/a/train_step3_301_final.csv', index=False)
test_df.to_csv('../Output/a/test_step3_301_final.csv', index=False)