## 特征工程

In [3]:
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

### 工具库函数

In [13]:

#第一类编码
def categories_process_first_class(cate):
    cate = str(cate)
    return int(cate[0])

#第2类编码
def categories_process_second_class(cate):
    cate = str(cate)
    if len(cate)<2:
        return 0
    else:
        return int(cate[1:])

#年龄处理，切段
def age_process(age):
    age = int(age)
    if age==0:
        return 0
    elif age<15:
        return 1
    elif age<25:
        return 2
    elif age<40:
        return 3
    elif age<60:
        return 4
    else:
        return 5

#省份处理
def process_province(hometown):
    hometown = str(hometown)
    province = int(hometown[0:2])
    return province

#城市处理
def process_city(hometown):
    hometown = str(hometown)
    if len(hometown)>1:
        city = int(hometown[2:])
    else:
        city = 0
    return city

#几点钟
def get_time_day(t):
    t = str(t)
    t=int(t[0:2])
    return t

#一天切成4段
def get_time_hour(t):
    t = str(t)
    t=int(t[2:4])
    if t<6:
        return 0
    elif t<12:
        return 1
    elif t<18:
        return 2
    else:
        return 3

#评估与计算logloss
def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

### 数据处理

In [6]:
dpath = "./data/"

train

In [7]:
train = pd.read_csv(dpath+"train.csv")
ad = pd.read_csv(dpath+"ad.csv")

In [26]:
train['clickTime_day'] = train['clickTime'].apply(get_time_day)
train['clickTime_hour']= train['clickTime'].apply(get_time_hour)

In [27]:
train.head()

Unnamed: 0,label,clickTime,conversionTime,creativeID,userID,positionID,connectionType,telecomsOperator,clickTime_day,clickTime_hour
0,0,170000,,3089,2798058,293,1,1,17,0
1,0,170000,,1259,463234,6161,1,2,17,0
2,0,170000,,4465,1857485,7434,4,1,17,0
3,0,170000,,1004,2038823,977,1,1,17,0
4,0,170000,,1887,2015141,3688,1,1,17,0


test

In [28]:
test = pd.read_csv(dpath+"test.csv")

In [29]:
test['clickTime_day'] = test['clickTime'].apply(get_time_day)
test['clickTime_hour']= test['clickTime'].apply(get_time_hour)

In [30]:
test.head()

Unnamed: 0,instanceID,label,clickTime,creativeID,userID,positionID,connectionType,telecomsOperator,clickTime_day,clickTime_hour
0,1,-1,310000,3745,1164848,3451,1,3,31,0
1,2,-1,310000,2284,2127247,1613,1,3,31,0
2,3,-1,310000,1456,2769125,5510,2,1,31,0
3,4,-1,310000,4565,9762,4113,2,3,31,0
4,5,-1,310000,49,2513636,3615,1,3,31,0


app_categories

In [9]:
app_categories = pd.read_csv(dpath+"app_categories.csv")

In [14]:
app_categories["app_categories_first_class"] = app_categories['appCategory'].apply(categories_process_first_class)
app_categories["app_categories_second_class"] = app_categories['appCategory'].apply(categories_process_second_class)

In [19]:
app_categories.head()

Unnamed: 0,appID,appCategory,app_categories_first_class,app_categories_second_class
0,14,2,2,0
1,25,203,2,3
2,68,104,1,4
3,75,402,4,2
4,83,203,2,3


user

In [22]:
user = pd.read_csv(dpath+"user.csv")

In [23]:
user['age_process'] = user['age'].apply(age_process)
user["hometown_province"] = user['hometown'].apply(process_province)
user["hometown_city"] = user['hometown'].apply(process_city)
user["residence_province"] = user['residence'].apply(process_province)
user["residence_city"] = user['residence'].apply(process_city)

In [24]:
user.head()

Unnamed: 0,userID,age,gender,education,marriageStatus,haveBaby,hometown,residence,age_process,hometown_province,hometown_city,residence_province,residence_city
0,1,42,1,0,2,0,512,503,4,51,2,50,3
1,2,18,1,5,1,0,1403,1403,2,14,3,14,3
2,3,0,2,4,0,0,0,0,0,0,0,0,0
3,4,21,2,5,3,0,607,607,2,60,7,60,7
4,5,22,2,0,0,0,0,1301,2,0,0,13,1


position

In [36]:
position = pd.read_csv(dpath+"position.csv")

### 合并数据

合并train

In [37]:
%%time
train_user = pd.merge(train,user,on='userID')
train_user_ad = pd.merge(train_user,ad,on='creativeID')
train_user_ad_app = pd.merge(train_user_ad,app_categories,on='appID')
train_user_ad_app_pos = pd.merge(train_user_ad_app,position,on='positionID')

Wall time: 8.51 s


In [38]:
train_merge = train_user_ad_app_pos
train_merge.head()

Unnamed: 0,label,clickTime,conversionTime,creativeID,userID,positionID,connectionType,telecomsOperator,clickTime_day,clickTime_hour,...,adID,camgaignID,advertiserID,appID,appPlatform,appCategory,app_categories_first_class,app_categories_second_class,sitesetID,positionType
0,0,170000,,3089,2798058,293,1,1,17,0,...,1321,83,10,434,1,108,1,8,0,3
1,0,180028,,3089,1683269,293,2,3,18,0,...,1321,83,10,434,1,108,1,8,0,3
2,0,191045,,3089,240899,293,1,2,19,1,...,1321,83,10,434,1,108,1,8,0,3
3,0,182300,,2230,2177495,293,2,1,18,3,...,2841,83,10,434,1,108,1,8,0,3
4,0,200113,,2230,417301,293,2,2,20,0,...,2841,83,10,434,1,108,1,8,0,3


In [40]:
train_merge.shape

(3749528, 32)

In [41]:
train_merge.columns

Index(['label', 'clickTime', 'conversionTime', 'creativeID', 'userID',
       'positionID', 'connectionType', 'telecomsOperator', 'clickTime_day',
       'clickTime_hour', 'age', 'gender', 'education', 'marriageStatus',
       'haveBaby', 'hometown', 'residence', 'age_process', 'hometown_province',
       'hometown_city', 'residence_province', 'residence_city', 'adID',
       'camgaignID', 'advertiserID', 'appID', 'appPlatform', 'appCategory',
       'app_categories_first_class', 'app_categories_second_class',
       'sitesetID', 'positionType'],
      dtype='object')

In [42]:
train_merge.to_csv(dpath+"train_merge.csv",index=0)

合并test

In [43]:
%%time
test_user = pd.merge(test,user,on='userID')
test_user_ad = pd.merge(test_user,ad,on='creativeID')
test_user_ad_app = pd.merge(test_user_ad,app_categories,on='appID')
test_user_ad_app_pos = pd.merge(test_user_ad_app,position,on='positionID')

Wall time: 1.66 s


In [44]:
test_merge = test_user_ad_app_pos
test_merge.head()

Unnamed: 0,instanceID,label,clickTime,creativeID,userID,positionID,connectionType,telecomsOperator,clickTime_day,clickTime_hour,...,adID,camgaignID,advertiserID,appID,appPlatform,appCategory,app_categories_first_class,app_categories_second_class,sitesetID,positionType
0,1,-1,310000,3745,1164848,3451,1,3,31,0,...,1166,430,80,14,2,2,2,0,0,1
1,178,-1,310001,3745,2170066,3451,1,2,31,0,...,1166,430,80,14,2,2,2,0,0,1
2,21839,-1,310524,3745,786179,3451,1,1,31,0,...,1166,430,80,14,2,2,2,0,0,1
3,25947,-1,310623,3745,1467866,3451,1,3,31,1,...,1166,430,80,14,2,2,2,0,0,1
4,26757,-1,310631,3745,1230630,3451,1,3,31,1,...,1166,430,80,14,2,2,2,0,0,1


In [45]:
%%time
test_merge.to_csv(dpath+"test_merge.csv",index=False)

Wall time: 9.79 s
